Diffstat (limited to 'src')
-rw-r--r--  src/btree/bt_compact.c  2652
-rw-r--r--  src/btree/bt_compare.c  213
-rw-r--r--  src/btree/bt_compress.c  3173
-rw-r--r--  src/btree/bt_conv.c  95
-rw-r--r--  src/btree/bt_curadj.c  694
-rw-r--r--  src/btree/bt_cursor.c  3076
-rw-r--r--  src/btree/bt_delete.c  541
-rw-r--r--  src/btree/bt_method.c  745
-rw-r--r--  src/btree/bt_open.c  677
-rw-r--r--  src/btree/bt_put.c  1087
-rw-r--r--  src/btree/bt_rec.c  2036
-rw-r--r--  src/btree/bt_reclaim.c  98
-rw-r--r--  src/btree/bt_recno.c  1427
-rw-r--r--  src/btree/bt_rsearch.c  513
-rw-r--r--  src/btree/bt_search.c  1028
-rw-r--r--  src/btree/bt_split.c  1332
-rw-r--r--  src/btree/bt_stat.c  669
-rw-r--r--  src/btree/bt_upgrade.c  153
-rw-r--r--  src/btree/bt_verify.c  2805
-rw-r--r--  src/btree/btree.src  290
-rw-r--r--  src/btree/btree_auto.c  207
-rw-r--r--  src/btree/btree_autop.c  291
-rw-r--r--  src/clib/atoi.c  50
-rw-r--r--  src/clib/atol.c  50
-rw-r--r--  src/clib/bsearch.c  38
-rw-r--r--  src/clib/getcwd.c  261
-rw-r--r--  src/clib/getopt.c  153
-rw-r--r--  src/clib/isalpha.c  28
-rw-r--r--  src/clib/isdigit.c  28
-rw-r--r--  src/clib/isprint.c  28
-rw-r--r--  src/clib/isspace.c  26
-rw-r--r--  src/clib/memcmp.c  62
-rw-r--r--  src/clib/memmove.c  150
-rw-r--r--  src/clib/printf.c  116
-rw-r--r--  src/clib/qsort.c  181
-rw-r--r--  src/clib/raise.c  26
-rw-r--r--  src/clib/rand.c  25
-rw-r--r--  src/clib/snprintf.c  149
-rw-r--r--  src/clib/strcasecmp.c  97
-rw-r--r--  src/clib/strcat.c  53
-rw-r--r--  src/clib/strchr.c  57
-rw-r--r--  src/clib/strdup.c  59
-rw-r--r--  src/clib/strerror.c  225
-rw-r--r--  src/clib/strncat.c  69
-rw-r--r--  src/clib/strncmp.c  61
-rw-r--r--  src/clib/strrchr.c  58
-rw-r--r--  src/clib/strsep.c  80
-rw-r--r--  src/clib/strtol.c  142
-rw-r--r--  src/clib/strtoul.c  121
-rw-r--r--  src/clib/time.c  34
-rw-r--r--  src/common/clock.c  57
-rw-r--r--  src/common/crypto_stub.c  44
-rw-r--r--  src/common/db_byteorder.c  63
-rw-r--r--  src/common/db_compint.c  555
-rw-r--r--  src/common/db_err.c  1118
-rw-r--r--  src/common/db_getlong.c  146
-rw-r--r--  src/common/db_idspace.c  85
-rw-r--r--  src/common/db_log2.c  57
-rw-r--r--  src/common/db_shash.c  104
-rw-r--r--  src/common/dbt.c  74
-rw-r--r--  src/common/mkpath.c  68
-rw-r--r--  src/common/openflags.c  51
-rw-r--r--  src/common/os_method.c  270
-rw-r--r--  src/common/util_arg.c  56
-rw-r--r--  src/common/util_cache.c  47
-rw-r--r--  src/common/util_log.c  45
-rw-r--r--  src/common/util_sig.c  110
-rw-r--r--  src/common/zerofill.c  129
-rw-r--r--  src/crypto/aes_method.c  357
-rw-r--r--  src/crypto/crypto.c  411
-rw-r--r--  src/crypto/crypto.html  638
-rw-r--r--  src/crypto/mersenne/mt19937db.c  187
-rw-r--r--  src/crypto/rijndael/rijndael-alg-fst.c  1466
-rw-r--r--  src/crypto/rijndael/rijndael-alg-fst.h  40
-rw-r--r--  src/crypto/rijndael/rijndael-api-fst.c  491
-rw-r--r--  src/crypto/rijndael/rijndael-api-fst.h  91
-rw-r--r--  src/db/crdel.src  71
-rw-r--r--  src/db/crdel_auto.c  59
-rw-r--r--  src/db/crdel_autop.c  103
-rw-r--r--  src/db/crdel_rec.c  301
-rw-r--r--  src/db/db.c  1659
-rw-r--r--  src/db/db.src  431
-rw-r--r--  src/db/db_am.c  1150
-rw-r--r--  src/db/db_auto.c  276
-rw-r--r--  src/db/db_autop.c  441
-rw-r--r--  src/db/db_backup.c  775
-rw-r--r--  src/db/db_cam.c  3506
-rw-r--r--  src/db/db_cds.c  201
-rw-r--r--  src/db/db_compact.c  1087
-rw-r--r--  src/db/db_conv.c  890
-rw-r--r--  src/db/db_copy.c  31
-rw-r--r--  src/db/db_dispatch.c  977
-rw-r--r--  src/db/db_dup.c  214
-rw-r--r--  src/db/db_iface.c  3001
-rw-r--r--  src/db/db_join.c  940
-rw-r--r--  src/db/db_meta.c  1428
-rw-r--r--  src/db/db_method.c  1117
-rw-r--r--  src/db/db_open.c  857
-rw-r--r--  src/db/db_overflow.c  705
-rw-r--r--  src/db/db_ovfl_vrfy.c  410
-rw-r--r--  src/db/db_pr.c  1956
-rw-r--r--  src/db/db_rec.c  2796
-rw-r--r--  src/db/db_reclaim.c  245
-rw-r--r--  src/db/db_remove.c  515
-rw-r--r--  src/db/db_rename.c  383
-rw-r--r--  src/db/db_ret.c  169
-rw-r--r--  src/db/db_setid.c  213
-rw-r--r--  src/db/db_setlsn.c  137
-rw-r--r--  src/db/db_sort_multiple.c  327
-rw-r--r--  src/db/db_stati.c  502
-rw-r--r--  src/db/db_truncate.c  233
-rw-r--r--  src/db/db_upg.c  527
-rw-r--r--  src/db/db_upg_opd.c  343
-rw-r--r--  src/db/db_vrfy.c  3055
-rw-r--r--  src/db/db_vrfy_stub.c  120
-rw-r--r--  src/db/db_vrfyutil.c  932
-rw-r--r--  src/db/partition.c  2059
-rw-r--r--  src/dbinc/atomic.h  220
-rw-r--r--  src/dbinc/btree.h  553
-rw-r--r--  src/dbinc/clock.h  131
-rw-r--r--  src/dbinc/crypto.h  93
-rw-r--r--  src/dbinc/cxx_int.h  77
-rw-r--r--  src/dbinc/db.in  2810
-rw-r--r--  src/dbinc/db_185.in  176
-rw-r--r--  src/dbinc/db_am.h  327
-rw-r--r--  src/dbinc/db_cxx.in  1523
-rw-r--r--  src/dbinc/db_dispatch.h  97
-rw-r--r--  src/dbinc/db_int.in  1162
-rw-r--r--  src/dbinc/db_join.h  37
-rw-r--r--  src/dbinc/db_page.h  841
-rw-r--r--  src/dbinc/db_swap.h  262
-rw-r--r--  src/dbinc/db_upgrade.h  248
-rw-r--r--  src/dbinc/db_verify.h  210
-rw-r--r--  src/dbinc/debug.h  283
-rw-r--r--  src/dbinc/fop.h  32
-rw-r--r--  src/dbinc/globals.h  105
-rw-r--r--  src/dbinc/hash.h  173
-rw-r--r--  src/dbinc/heap.h  59
-rw-r--r--  src/dbinc/hmac.h  39
-rw-r--r--  src/dbinc/lock.h  326
-rw-r--r--  src/dbinc/log.h  463
-rw-r--r--  src/dbinc/log_verify.h  207
-rw-r--r--  src/dbinc/mp.h  700
-rw-r--r--  src/dbinc/mutex.h  305
-rw-r--r--  src/dbinc/mutex_int.h  1070
-rw-r--r--  src/dbinc/os.h  178
-rw-r--r--  src/dbinc/partition.h  57
-rw-r--r--  src/dbinc/perfmon.h  103
-rw-r--r--  src/dbinc/qam.h  203
-rw-r--r--  src/dbinc/queue.h  570
-rw-r--r--  src/dbinc/region.h  329
-rw-r--r--  src/dbinc/rep.h  1102
-rw-r--r--  src/dbinc/repmgr.h  843
-rw-r--r--  src/dbinc/shqueue.h  410
-rw-r--r--  src/dbinc/tcl_db.h  316
-rw-r--r--  src/dbinc/txn.h  288
-rw-r--r--  src/dbinc/win_db.h  148
-rw-r--r--  src/dbinc/xa.h  183
-rw-r--r--  src/dbinc_auto/api_flags.in  228
-rw-r--r--  src/dbinc_auto/btree_auto.h  456
-rw-r--r--  src/dbinc_auto/btree_ext.h  147
-rw-r--r--  src/dbinc_auto/clib_ext.h  113
-rw-r--r--  src/dbinc_auto/common_ext.h  75
-rw-r--r--  src/dbinc_auto/crdel_auto.h  127
-rw-r--r--  src/dbinc_auto/crypto_ext.h  38
-rw-r--r--  src/dbinc_auto/db_auto.h  666
-rw-r--r--  src/dbinc_auto/db_ext.h  346
-rw-r--r--  src/dbinc_auto/dbreg_auto.h  43
-rw-r--r--  src/dbinc_auto/dbreg_ext.h  46
-rw-r--r--  src/dbinc_auto/env_ext.h  158
-rw-r--r--  src/dbinc_auto/ext_185_def.in  12
-rw-r--r--  src/dbinc_auto/ext_185_prot.in  19
-rw-r--r--  src/dbinc_auto/ext_def.in  66
-rw-r--r--  src/dbinc_auto/ext_prot.in  73
-rw-r--r--  src/dbinc_auto/fileops_auto.h  262
-rw-r--r--  src/dbinc_auto/fileops_ext.h  44
-rw-r--r--  src/dbinc_auto/hash_auto.h  484
-rw-r--r--  src/dbinc_auto/hash_ext.h  129
-rw-r--r--  src/dbinc_auto/heap_auto.h  146
-rw-r--r--  src/dbinc_auto/heap_ext.h  58
-rw-r--r--  src/dbinc_auto/hmac_ext.h  20
-rw-r--r--  src/dbinc_auto/int_def.in  2265
-rw-r--r--  src/dbinc_auto/lock_ext.h  78
-rw-r--r--  src/dbinc_auto/log_ext.h  208
-rw-r--r--  src/dbinc_auto/mp_ext.h  106
-rw-r--r--  src/dbinc_auto/mutex_ext.h  91
-rw-r--r--  src/dbinc_auto/os_ext.h  84
-rw-r--r--  src/dbinc_auto/qam_auto.h  174
-rw-r--r--  src/dbinc_auto/qam_ext.h  68
-rw-r--r--  src/dbinc_auto/rep_automsg.h  120
-rw-r--r--  src/dbinc_auto/rep_ext.h  151
-rw-r--r--  src/dbinc_auto/repmgr_auto.h  41
-rw-r--r--  src/dbinc_auto/repmgr_automsg.h  113
-rw-r--r--  src/dbinc_auto/repmgr_ext.h  249
-rw-r--r--  src/dbinc_auto/sequence_ext.h  17
-rw-r--r--  src/dbinc_auto/tcl_ext.h  134
-rw-r--r--  src/dbinc_auto/txn_auto.h  220
-rw-r--r--  src/dbinc_auto/txn_ext.h  93
-rw-r--r--  src/dbinc_auto/xa_ext.h  18
-rw-r--r--  src/dbreg/dbreg.c  1012
-rw-r--r--  src/dbreg/dbreg.src  37
-rw-r--r--  src/dbreg/dbreg_auto.c  35
-rw-r--r--  src/dbreg/dbreg_autop.c  43
-rw-r--r--  src/dbreg/dbreg_rec.c  399
-rw-r--r--  src/dbreg/dbreg_stat.c  140
-rw-r--r--  src/dbreg/dbreg_util.c  847
-rw-r--r--  src/env/env_alloc.c  759
-rw-r--r--  src/env/env_backup.c  166
-rw-r--r--  src/env/env_config.c  737
-rw-r--r--  src/env/env_failchk.c  558
-rw-r--r--  src/env/env_file.c  128
-rw-r--r--  src/env/env_globals.c  66
-rw-r--r--  src/env/env_method.c  1918
-rw-r--r--  src/env/env_name.c  285
-rw-r--r--  src/env/env_open.c  1262
-rw-r--r--  src/env/env_recover.c  1093
-rw-r--r--  src/env/env_region.c  1497
-rw-r--r--  src/env/env_register.c  730
-rw-r--r--  src/env/env_sig.c  201
-rw-r--r--  src/env/env_stat.c  879
-rw-r--r--  src/fileops/fileops.src  137
-rw-r--r--  src/fileops/fileops_auto.c  118
-rw-r--r--  src/fileops/fileops_autop.c  177
-rw-r--r--  src/fileops/fop_basic.c  318
-rw-r--r--  src/fileops/fop_rec.c  697
-rw-r--r--  src/fileops/fop_util.c  1841
-rw-r--r--  src/hash/hash.c  2340
-rw-r--r--  src/hash/hash.src  328
-rw-r--r--  src/hash/hash_auto.c  209
-rw-r--r--  src/hash/hash_autop.c  314
-rw-r--r--  src/hash/hash_compact.c  549
-rw-r--r--  src/hash/hash_conv.c  110
-rw-r--r--  src/hash/hash_dup.c  943
-rw-r--r--  src/hash/hash_func.c  240
-rw-r--r--  src/hash/hash_meta.c  170
-rw-r--r--  src/hash/hash_method.c  250
-rw-r--r--  src/hash/hash_open.c  584
-rw-r--r--  src/hash/hash_page.c  3182
-rw-r--r--  src/hash/hash_rec.c  1896
-rw-r--r--  src/hash/hash_reclaim.c  98
-rw-r--r--  src/hash/hash_stat.c  518
-rw-r--r--  src/hash/hash_stub.c  470
-rw-r--r--  src/hash/hash_upgrade.c  323
-rw-r--r--  src/hash/hash_verify.c  1157
-rw-r--r--  src/heap/heap.c  2812
-rw-r--r--  src/heap/heap.src  101
-rw-r--r--  src/heap/heap_auto.c  73
-rw-r--r--  src/heap/heap_autop.c  105
-rw-r--r--  src/heap/heap_backup.c  66
-rw-r--r--  src/heap/heap_conv.c  93
-rw-r--r--  src/heap/heap_method.c  168
-rw-r--r--  src/heap/heap_open.c  439
-rw-r--r--  src/heap/heap_rec.c  386
-rw-r--r--  src/heap/heap_reclaim.c  152
-rw-r--r--  src/heap/heap_stat.c  289
-rw-r--r--  src/heap/heap_stub.c  328
-rw-r--r--  src/heap/heap_verify.c  468
-rw-r--r--  src/hmac/hmac.c  223
-rw-r--r--  src/hmac/sha1.c  289
-rw-r--r--  src/lock/Design  301
-rw-r--r--  src/lock/lock.c  2020
-rw-r--r--  src/lock/lock_alloc.incl  138
-rw-r--r--  src/lock/lock_deadlock.c  1063
-rw-r--r--  src/lock/lock_failchk.c  114
-rw-r--r--  src/lock/lock_id.c  572
-rw-r--r--  src/lock/lock_list.c  365
-rw-r--r--  src/lock/lock_method.c  630
-rw-r--r--  src/lock/lock_region.c  578
-rw-r--r--  src/lock/lock_stat.c  770
-rw-r--r--  src/lock/lock_stub.c  631
-rw-r--r--  src/lock/lock_timer.c  128
-rw-r--r--  src/lock/lock_util.c  98
-rw-r--r--  src/log/log.c  1727
-rw-r--r--  src/log/log_archive.c  643
-rw-r--r--  src/log/log_compare.c  66
-rw-r--r--  src/log/log_debug.c  146
-rw-r--r--  src/log/log_get.c  1626
-rw-r--r--  src/log/log_method.c  533
-rw-r--r--  src/log/log_print.c  380
-rw-r--r--  src/log/log_put.c  2041
-rw-r--r--  src/log/log_stat.c  336
-rw-r--r--  src/log/log_verify.c  437
-rw-r--r--  src/log/log_verify_auto.c  318
-rw-r--r--  src/log/log_verify_int.c  4353
-rw-r--r--  src/log/log_verify_stub.c  79
-rw-r--r--  src/log/log_verify_util.c  2234
-rw-r--r--  src/mp/mp_alloc.c  724
-rw-r--r--  src/mp/mp_backup.c  333
-rw-r--r--  src/mp/mp_bh.c  690
-rw-r--r--  src/mp/mp_fget.c  1230
-rw-r--r--  src/mp/mp_fmethod.c  589
-rw-r--r--  src/mp/mp_fopen.c  1220
-rw-r--r--  src/mp/mp_fput.c  374
-rw-r--r--  src/mp/mp_fset.c  170
-rw-r--r--  src/mp/mp_method.c  1091
-rw-r--r--  src/mp/mp_mvcc.c  636
-rw-r--r--  src/mp/mp_region.c  620
-rw-r--r--  src/mp/mp_register.c  116
-rw-r--r--  src/mp/mp_resize.c  605
-rw-r--r--  src/mp/mp_stat.c  905
-rw-r--r--  src/mp/mp_sync.c  965
-rw-r--r--  src/mp/mp_trickle.c  112
-rw-r--r--  src/mutex/README  110
-rw-r--r--  src/mutex/mut_alloc.c  291
-rw-r--r--  src/mutex/mut_failchk.c  76
-rw-r--r--  src/mutex/mut_fcntl.c  248
-rw-r--r--  src/mutex/mut_method.c  482
-rw-r--r--  src/mutex/mut_pthread.c  770
-rw-r--r--  src/mutex/mut_region.c  468
-rw-r--r--  src/mutex/mut_stat.c  579
-rw-r--r--  src/mutex/mut_stub.c  252
-rw-r--r--  src/mutex/mut_tas.c  608
-rw-r--r--  src/mutex/mut_win32.c  589
-rw-r--r--  src/mutex/test_mutex.c  1051
-rw-r--r--  src/mutex/uts4_cc.s  26
-rw-r--r--  src/os/os_abort.c  33
-rw-r--r--  src/os/os_abs.c  24
-rw-r--r--  src/os/os_addrinfo.c  179
-rw-r--r--  src/os/os_alloc.c  464
-rw-r--r--  src/os/os_clock.c  73
-rw-r--r--  src/os/os_config.c  70
-rw-r--r--  src/os/os_cpu.c  47
-rw-r--r--  src/os/os_ctime.c  47
-rw-r--r--  src/os/os_dir.c  140
-rw-r--r--  src/os/os_errno.c  129
-rw-r--r--  src/os/os_fid.c  135
-rw-r--r--  src/os/os_flock.c  64
-rw-r--r--  src/os/os_fsync.c  104
-rw-r--r--  src/os/os_getenv.c  58
-rw-r--r--  src/os/os_handle.c  243
-rw-r--r--  src/os/os_map.c  607
-rw-r--r--  src/os/os_mkdir.c  52
-rw-r--r--  src/os/os_open.c  162
-rw-r--r--  src/os/os_path.c  27
-rw-r--r--  src/os/os_pid.c  63
-rw-r--r--  src/os/os_rename.c  53
-rw-r--r--  src/os/os_root.c  27
-rw-r--r--  src/os/os_rpath.c  36
-rw-r--r--  src/os/os_rw.c  291
-rw-r--r--  src/os/os_seek.c  66
-rw-r--r--  src/os/os_stack.c  45
-rw-r--r--  src/os/os_stat.c  108
-rw-r--r--  src/os/os_tmpdir.c  141
-rw-r--r--  src/os/os_truncate.c  63
-rw-r--r--  src/os/os_uid.c  55
-rw-r--r--  src/os/os_unlink.c  80
-rw-r--r--  src/os/os_yield.c  95
-rw-r--r--  src/os_qnx/os_qnx_fsync.c  73
-rw-r--r--  src/os_qnx/os_qnx_open.c  79
-rw-r--r--  src/os_vxworks/os_vx_abs.c  42
-rw-r--r--  src/os_vxworks/os_vx_config.c  56
-rw-r--r--  src/os_vxworks/os_vx_map.c  436
-rw-r--r--  src/os_vxworks/os_vx_rpath.c  55
-rw-r--r--  src/os_vxworks/os_vx_yield.c  49
-rw-r--r--  src/os_windows/ce_ctime.c  87
-rw-r--r--  src/os_windows/os_abs.c  33
-rw-r--r--  src/os_windows/os_clock.c  79
-rw-r--r--  src/os_windows/os_config.c  133
-rw-r--r--  src/os_windows/os_cpu.c  27
-rw-r--r--  src/os_windows/os_dir.c  122
-rw-r--r--  src/os_windows/os_errno.c  428
-rw-r--r--  src/os_windows/os_fid.c  129
-rw-r--r--  src/os_windows/os_flock.c  90
-rw-r--r--  src/os_windows/os_fsync.c  44
-rw-r--r--  src/os_windows/os_getenv.c  103
-rw-r--r--  src/os_windows/os_handle.c  167
-rw-r--r--  src/os_windows/os_map.c  397
-rw-r--r--  src/os_windows/os_mkdir.c  44
-rw-r--r--  src/os_windows/os_open.c  258
-rw-r--r--  src/os_windows/os_rename.c  82
-rw-r--r--  src/os_windows/os_rw.c  218
-rw-r--r--  src/os_windows/os_seek.c  67
-rw-r--r--  src/os_windows/os_stat.c  231
-rw-r--r--  src/os_windows/os_truncate.c  99
-rw-r--r--  src/os_windows/os_unlink.c  123
-rw-r--r--  src/os_windows/os_yield.c  35
-rw-r--r--  src/qam/qam.c  1760
-rw-r--r--  src/qam/qam.src  89
-rw-r--r--  src/qam/qam_auto.c  83
-rw-r--r--  src/qam/qam_autop.c  126
-rw-r--r--  src/qam/qam_conv.c  79
-rw-r--r--  src/qam/qam_files.c  939
-rw-r--r--  src/qam/qam_method.c  399
-rw-r--r--  src/qam/qam_open.c  346
-rw-r--r--  src/qam/qam_rec.c  687
-rw-r--r--  src/qam/qam_stat.c  255
-rw-r--r--  src/qam/qam_stub.c  339
-rw-r--r--  src/qam/qam_upgrade.c  101
-rw-r--r--  src/qam/qam_verify.c  653
-rw-r--r--  src/rep/mlease.html  1198
-rw-r--r--  src/rep/rep.msg  160
-rw-r--r--  src/rep/rep_automsg.c  1041
-rw-r--r--  src/rep/rep_backup.c  3568
-rw-r--r--  src/rep/rep_elect.c  1486
-rw-r--r--  src/rep/rep_lease.c  545
-rw-r--r--  src/rep/rep_log.c  1060
-rw-r--r--  src/rep/rep_method.c  3032
-rw-r--r--  src/rep/rep_record.c  2586
-rw-r--r--  src/rep/rep_region.c  610
-rw-r--r--  src/rep/rep_stat.c  692
-rw-r--r--  src/rep/rep_stub.c  425
-rw-r--r--  src/rep/rep_util.c  2791
-rw-r--r--  src/rep/rep_verify.c  751
-rw-r--r--  src/repmgr/repmgr.msg  119
-rw-r--r--  src/repmgr/repmgr.src  23
-rw-r--r--  src/repmgr/repmgr_auto.c  32
-rw-r--r--  src/repmgr/repmgr_automsg.c  757
-rw-r--r--  src/repmgr/repmgr_autop.c  44
-rw-r--r--  src/repmgr/repmgr_elect.c  585
-rw-r--r--  src/repmgr/repmgr_method.c  3092
-rw-r--r--  src/repmgr/repmgr_msg.c  1655
-rw-r--r--  src/repmgr/repmgr_net.c  2043
-rw-r--r--  src/repmgr/repmgr_posix.c  804
-rw-r--r--  src/repmgr/repmgr_queue.c  180
-rw-r--r--  src/repmgr/repmgr_rec.c  45
-rw-r--r--  src/repmgr/repmgr_sel.c  2096
-rw-r--r--  src/repmgr/repmgr_stat.c  363
-rw-r--r--  src/repmgr/repmgr_stub.c  262
-rw-r--r--  src/repmgr/repmgr_util.c  2086
-rw-r--r--  src/repmgr/repmgr_windows.c  849
-rw-r--r--  src/sequence/seq_stat.c  275
-rw-r--r--  src/sequence/sequence.c  1011
-rw-r--r--  src/txn/txn.c  2169
-rw-r--r--  src/txn/txn.src  120
-rw-r--r--  src/txn/txn_auto.c  93
-rw-r--r--  src/txn/txn_autop.c  175
-rw-r--r--  src/txn/txn_chkpt.c  419
-rw-r--r--  src/txn/txn_failchk.c  101
-rw-r--r--  src/txn/txn_method.c  124
-rw-r--r--  src/txn/txn_rec.c  616
-rw-r--r--  src/txn/txn_recover.c  317
-rw-r--r--  src/txn/txn_region.c  518
-rw-r--r--  src/txn/txn_stat.c  461
-rw-r--r--  src/txn/txn_util.c  696
-rw-r--r--  src/xa/xa.c  1068
-rw-r--r--  src/xa/xa_map.c  152
436 files changed, 224104 insertions, 0 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
new file mode 100644
index 00000000..b455ff23
--- /dev/null
+++ b/src/btree/bt_compact.c
@@ -0,0 +1,2652 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __bam_compact_dups __P((DBC *,
+ PAGE **, u_int32_t, int, DB_COMPACT *, int *));
+static int __bam_compact_isdone __P((DBC *, DBT *, PAGE *, int *));
+static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
+static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t));
+static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_merge __P((DBC *,
+ DBC *, u_int32_t, DBT *, DB_COMPACT *,int *));
+static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
+static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
+static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *));
+static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
+static int __bam_truncate_root_page __P((DBC *,
+ PAGE *, u_int32_t, DB_COMPACT *));
+
+#ifdef HAVE_FTRUNCATE
+static int __bam_savekey __P((DBC *, int, DBT *));
+#endif
+
+/*
+ * __bam_csearch -- isolate search code for bam_compact.
+ * This routine hides the differences between searching
+ * a BTREE and a RECNO from the rest of the code.
+ */
+#define CS_READ 0 /* We are just reading. */
+#define CS_PARENT 1 /* We want the parent too, write lock. */
+#define CS_NEXT 2 /* Get the next page. */
+#define CS_NEXT_WRITE 3 /* Get the next page and write lock. */
+#define CS_DEL 4 /* Get a stack to delete a page. */
+#define CS_START 5 /* Starting level for stack, write lock. */
+#define CS_NEXT_BOTH 6 /* Get this page and the next, write lock. */
+#define CS_GETRECNO 0x80 /* Extract record number from start. */
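+/*
+ * (Note: CS_GETRECNO is a modifier or'd into the flags above -- the
+ * initial search in __bam_compact_int below uses CS_READ | CS_GETRECNO --
+ * and is stripped with FLD_CLR before the flag is mapped to SR_* values.)
+ */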
+
+static int
+__bam_csearch(dbc, start, sflag, level)
+ DBC *dbc;
+ DBT *start;
+ u_int32_t sflag;
+ int level;
+{
+ BTREE_CURSOR *cp;
+ int not_used, ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (dbc->dbtype == DB_RECNO) {
+		/* If GETRECNO is not set then cp->recno is what we want. */
+ if (FLD_ISSET(sflag, CS_GETRECNO)) {
+ if (start == NULL || start->size == 0)
+ cp->recno = 1;
+ else if ((ret =
+ __ram_getno(dbc, start, &cp->recno, 0)) != 0)
+ return (ret);
+ FLD_CLR(sflag, CS_GETRECNO);
+ }
+ switch (sflag) {
+ case CS_READ:
+ sflag = SR_READ;
+ break;
+ case CS_NEXT:
+ sflag = SR_PARENT | SR_READ;
+ break;
+ case CS_START:
+ level = LEAFLEVEL;
+ /* FALLTHROUGH */
+ case CS_DEL:
+ case CS_NEXT_WRITE:
+ sflag = SR_STACK;
+ break;
+ case CS_NEXT_BOTH:
+ sflag = SR_BOTH | SR_NEXT | SR_WRITE;
+ break;
+ case CS_PARENT:
+ sflag = SR_PARENT | SR_WRITE;
+ break;
+ default:
+ return (__env_panic(dbc->env, EINVAL));
+ }
+ if ((ret = __bam_rsearch(dbc,
+ &cp->recno, sflag, level, &not_used)) != 0)
+ return (ret);
+ /* Reset the cursor's recno to the beginning of the page. */
+ cp->recno -= cp->csp->indx;
+ } else {
+ FLD_CLR(sflag, CS_GETRECNO);
+ switch (sflag) {
+ case CS_READ:
+ sflag = SR_READ | SR_DUPFIRST;
+ break;
+ case CS_DEL:
+ sflag = SR_DEL;
+ break;
+ case CS_NEXT:
+ sflag = SR_NEXT;
+ break;
+ case CS_NEXT_WRITE:
+ sflag = SR_NEXT | SR_WRITE;
+ break;
+ case CS_NEXT_BOTH:
+ sflag = SR_BOTH | SR_NEXT | SR_WRITE;
+ break;
+ case CS_START:
+ sflag = SR_START | SR_WRITE;
+ break;
+ case CS_PARENT:
+ sflag = SR_PARENT | SR_WRITE;
+ break;
+ default:
+ return (__env_panic(dbc->env, EINVAL));
+ }
+ if (start == NULL || start->size == 0)
+ FLD_SET(sflag, SR_MIN);
+
+ if ((ret = __bam_search(dbc,
+ PGNO_INVALID, start, sflag, level, NULL, &not_used)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_compact_int -- internal compaction routine.
+ * Called either with a cursor on the main database
+ * or a cursor initialized to the root of an off page duplicate
+ * tree.
+ * PUBLIC: int __bam_compact_int __P((DBC *,
+ * PUBLIC: DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
+ */
+int
+__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
+ DBC *dbc;
+ DBT *start, *stop;
+ u_int32_t factor;
+ int *spanp;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBC *ndbc;
+ DB_LOCK metalock, next_lock, nnext_lock, prev_lock, saved_lock;
+ DB_MPOOLFILE *dbmp;
+ ENV *env;
+ EPG *epg;
+ PAGE *pg, *ppg, *npg;
+ db_pgno_t metapgno, npgno, nnext_pgno;
+ db_pgno_t pgno, prev_pgno, ppgno, saved_pgno;
+ db_recno_t next_recno;
+ u_int32_t nentry, sflag, pgs_free;
+ int check_dups, check_trunc, clear_root, do_commit, isdone;
+ int merged, next_p, pgs_done, ret, t_ret, tdone;
+
+#ifdef DEBUG
+#define CTRACE(dbc, location, t, start, f) do { \
+ DBT __trace; \
+ DB_SET_DBT(__trace, t, strlen(t)); \
+ DEBUG_LWRITE( \
+ dbc, (dbc)->txn, location, &__trace, start, f) \
+ } while (0)
+#define PTRACE(dbc, location, p, start, f) do { \
+ char __buf[32]; \
+ (void)snprintf(__buf, \
+ sizeof(__buf), "pgno: %lu", (u_long)p); \
+ CTRACE(dbc, location, __buf, start, f); \
+ } while (0)
+#else
+#define CTRACE(dbc, location, t, start, f)
+#define PTRACE(dbc, location, p, start, f)
+#endif
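+/*
+ * (Example from the loop below: PTRACE(dbc, "Next", PGNO(pg), start, 0)
+ * logs a "pgno: <n>" trace record along with the current start key when
+ * DEBUG is defined; both macros expand to nothing otherwise.)
+ */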
+
+ ndbc = NULL;
+ pg = NULL;
+ npg = NULL;
+
+ isdone = 0;
+ tdone = 0;
+ pgs_done = 0;
+ do_commit = 0;
+ next_recno = 0;
+ next_p = 0;
+ clear_root = 0;
+ metapgno = PGNO_BASE_MD;
+ ppgno = PGNO_INVALID;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(nnext_lock);
+ LOCK_INIT(saved_lock);
+ LOCK_INIT(metalock);
+ LOCK_INIT(prev_lock);
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ check_dups = (!F_ISSET(dbc, DBC_OPD) &&
+ F_ISSET(dbc->dbp, DB_AM_DUP)) || check_trunc;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pgs_free = c_data->compact_pages_free;
+
+ /* Search down the tree for the starting point. */
+ if ((ret = __bam_csearch(dbc,
+ start, CS_READ | CS_GETRECNO, LEAFLEVEL)) != 0) {
+ /* Its not an error to compact an empty db. */
+		/* It's not an error to compact an empty db. */
+ ret = 0;
+ isdone = 1;
+ goto err;
+ }
+
+ /*
+	 * Get the first leaf page. The loop below will change pg, so
+	 * clear the stack reference to avoid putting a page twice.
+ */
+ pg = cp->csp->page;
+ cp->csp->page = NULL;
+ next_recno = cp->recno;
+next: /*
+ * This is the start of the main compaction loop. There are 3
+ * parts to the process:
+ * 1) Walk the leaf pages of the tree looking for a page to
+ * process. We do this with read locks. Save the
+ * key from the page and release it.
+ * 2) Set up a cursor stack which will write lock the page
+ * and enough of its ancestors to get the job done.
+ * This could go to the root if we might delete a subtree
+ * or we have record numbers to update.
+ * 3) Loop fetching pages after the above page and move enough
+ * data to fill it.
+ * We exit the loop if we are at the end of the leaf pages, are
+ * about to lock a new subtree (we span) or on error.
+ */
+
+ /* Walk the pages looking for something to fill up. */
+ while ((npgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
+ c_data->compact_pages_examine++;
+ PTRACE(dbc, "Next", PGNO(pg), start, 0);
+
+ /* If we have fetched the next page, get the new key. */
+ if (next_p == 1 &&
+ dbc->dbtype != DB_RECNO && NUM_ENT(pg) != 0) {
+ if ((ret = __db_ret(dbc, pg, 0, start,
+ &start->data, &start->ulen)) != 0)
+ goto err;
+ }
+ next_recno += NUM_ENT(pg);
+ if (P_FREESPACE(dbp, pg) > factor ||
+ (check_trunc && PGNO(pg) > c_data->compact_truncate))
+ break;
+ if (stop != NULL && stop->size > 0) {
+ if ((ret = __bam_compact_isdone(dbc,
+ stop, pg, &isdone)) != 0)
+ goto err;
+ if (isdone)
+ goto done;
+ }
+
+ /*
+		 * The page does not need more data or to be swapped;
+		 * check to see if we want to look at possible duplicate
+		 * trees or overflow records, and then move on to the next page.
+ */
+ cp->recno += NUM_ENT(pg);
+ next_p = 1;
+ tdone = pgs_done;
+ PTRACE(dbc, "Dups", PGNO(pg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(
+ dbc, &pg, factor, 0, c_data, &pgs_done)) != 0)
+ goto err;
+ npgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ goto err;
+ pg = NULL;
+ /*
+ * If we don't do anything we don't need to hold
+ * the lock on the previous page, so couple always.
+ */
+ if ((ret = __db_lget(dbc,
+ tdone == pgs_done ? LCK_COUPLE_ALWAYS : LCK_COUPLE,
+ npgno, DB_LOCK_READ, 0, &cp->csp->lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ }
+
+ /*
+ * When we get here we have 3 cases:
+ * 1) We've reached the end of the leaf linked list and are done.
+ * 2) A page whose freespace exceeds our target and therefore needs
+ * to have data added to it.
+ * 3) A page that doesn't have too much free space but needs to be
+ * checked for truncation.
+ * In both cases 2 and 3, we need that page's first key or record
+	 * number. We may already have it; if not, get it here.
+ */
+ if ((nentry = NUM_ENT(pg)) != 0) {
+ /* Get a copy of the first recno on the page. */
+ if (dbc->dbtype == DB_RECNO) {
+ if ((ret = __db_retcopy(dbp->env, start,
+ &cp->recno, sizeof(cp->recno),
+ &start->data, &start->ulen)) != 0)
+ goto err;
+ } else if (((next_p == 1 && npgno == PGNO_INVALID) ||
+ start->size == 0) && (ret = __db_ret(dbc,
+ pg, 0, start, &start->data, &start->ulen)) != 0)
+ goto err;
+
+ next_p = 0;
+ /*
+ * If there is no next page we can stop unless there is
+ * a possibility of moving this data to a lower numbered
+ * page.
+ */
+ if (npgno == PGNO_INVALID &&
+ (!check_trunc || PGNO(pg) <= c_data->compact_truncate ||
+ PGNO(pg) == BAM_ROOT_PGNO(dbc))) {
+ /* End of the tree, check its duplicates and exit. */
+ PTRACE(dbc, "GoDone", PGNO(pg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(dbc,
+ &pg, factor, 0, c_data, &pgs_done)) != 0)
+ goto err;
+ c_data->compact_pages_examine++;
+ isdone = 1;
+ goto done;
+ }
+ }
+
+ /* Release the page so we don't deadlock getting its parent. */
+ if ((ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(dbc, cp->csp->lock)) != 0)
+ goto err;
+ BT_STK_CLR(cp);
+ pg = NULL;
+ saved_pgno = PGNO_INVALID;
+ prev_pgno = PGNO_INVALID;
+ nnext_pgno = PGNO_INVALID;
+
+ /*
+ * We must lock the metadata page first because we cannot block
+ * while holding interior nodes of the tree pinned.
+ */
+
+ if (!LOCK_ISSET(metalock) && pgs_free == c_data->compact_pages_free &&
+ (ret = __db_lget(dbc,
+ LCK_ALWAYS, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+ /*
+ * Setup the cursor stack. There are 3 cases:
+ * 1) the page is empty and will be deleted: nentry == 0.
+ * 2) the next page has the same parent: *spanp == 0.
+ * 3) the next page has a different parent: *spanp == 1.
+ *
+ * We now need to search the tree again, getting a write lock
+ * on the page we are going to merge or delete. We do this by
+ * searching down the tree and locking as much of the subtree
+ * above the page as needed. In the case of a delete we will
+ * find the maximal subtree that can be deleted. In the case
+ * of merge if the current page and the next page are siblings
+ * with the same parent then we only need to lock the parent.
+ * Otherwise *span will be set and we need to search to find the
+ * lowest common ancestor. Dbc will be set to contain the subtree
+ * containing the page to be merged or deleted. Ndbc will contain
+ * the minimal subtree containing that page and its next sibling.
+ * In all cases for DB_RECNO we simplify things and get the whole
+ * tree if we need more than a single parent.
+ * The tree can collapse while we don't have it locked, so the
+ * page we are looking for may be gone. If so we are at
+ * the right most end of the leaf pages and are done.
+ */
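+	/*
+	 * (The TRY_LOCK and TRY_LOCK2 macros used below are defined outside
+	 * this hunk. As their use here suggests, when a no-wait lock attempt
+	 * fails they save the requested page number and lock -- e.g. in
+	 * saved_pgno/saved_lock -- and jump to the supplied label, "retry",
+	 * so the search and locking start over; hard errors leave ret != 0
+	 * for the check that follows each use.)
+	 */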
+
+retry: pg = NULL;
+ if (npg != NULL && (ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (clear_root == 1) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0)
+ goto err;
+ }
+ clear_root = 0;
+ /* Case 1 -- page is empty. */
+ if (nentry == 0) {
+ CTRACE(dbc, "Empty", "", start, 0);
+ if (next_p == 1)
+ sflag = CS_NEXT_WRITE;
+ else
+ sflag = CS_DEL;
+ if ((ret = __bam_csearch(dbc, start, sflag, LEAFLEVEL)) != 0) {
+ isdone = 1;
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ pg = cp->csp->page;
+ /* Check to see if the page is still empty. */
+ if (NUM_ENT(pg) != 0)
+ npgno = PGNO(pg);
+ else {
+ npgno = NEXT_PGNO(pg);
+ /* If this is now the root, we are very done. */
+ if (PGNO(pg) == BAM_ROOT_PGNO(dbc))
+ isdone = 1;
+ else {
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno, saved_pgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret =
+ __bam_dpages(dbc, 0, BTD_RELINK)) != 0)
+ goto err;
+ c_data->compact_pages_free++;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+ LOCK_INIT(prev_lock);
+ if ((ret = __TLPUT(dbc, next_lock)) != 0)
+ goto err;
+ LOCK_INIT(next_lock);
+ saved_pgno = PGNO_INVALID;
+ goto next_no_release;
+ }
+ }
+ goto next_page;
+ }
+
+ /* case 3 -- different parents. */
+ if (*spanp) {
+ CTRACE(dbc, "Span", "", start, 0);
+ /*
+		 * Search the tree looking for the page containing the
+		 * current key and for the next page after it.
+		 * The stack will be rooted at the page that spans
+		 * the current and next pages. The two subtrees
+		 * are returned below that. For BTREE the current
+		 * page subtree will be first, while for RECNO the
+		 * next page subtree will be first.
+ */
+ if (ndbc == NULL && (ret = __dbc_dup(dbc, &ndbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(env, ndbc != NULL);
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+
+ ncp->recno = cp->recno;
+ cp->recno = next_recno;
+
+ if ((ret = __bam_csearch(dbc, start, CS_NEXT_BOTH, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ isdone = 1;
+ ret = 0;
+ }
+ goto err;
+ }
+
+ /*
+ * Find the top of the stack for the second subtree.
+ */
+ for (epg = cp->csp - 1; epg > cp->sp; epg--)
+ if (LEVEL(epg->page) == LEAFLEVEL)
+ break;
+ DB_ASSERT(env, epg != cp->sp);
+
+ /*
+ * Copy the root. We will have two instances of the
+ * same page, be careful not to free both.
+ */
+ BT_STK_PUSH(env, ncp, cp->sp->page, cp->sp->indx,
+ cp->sp->lock, cp->sp->lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ clear_root = 1;
+
+ /* Copy the stack containing the next page. */
+ for (epg++; epg <= cp->csp; epg++) {
+ BT_STK_PUSH(env, ncp, epg->page, epg->indx,
+ epg->lock, epg->lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ }
+		/* Adjust the stack pointer to remove these items. */
+ ncp->csp--;
+ cp->csp -= ncp->csp - ncp->sp;
+
+ /*
+ * If this is RECNO then we want to swap the stacks.
+ */
+ if (dbc->dbtype == DB_RECNO) {
+ ndbc->internal = (DBC_INTERNAL *)cp;
+ dbc->internal = (DBC_INTERNAL *)ncp;
+ cp = ncp;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ cp->sp->indx--;
+ } else
+ ncp->sp->indx++;
+
+ DB_ASSERT(env,
+ NEXT_PGNO(cp->csp->page) == PGNO(ncp->csp->page));
+ pg = cp->csp->page;
+
+ /*
+ * The page may have emptied while we waited for the
+ * lock or the record we are looking for may have
+ * moved.
+ * Reset npgno so we re-get this page when we go back
+ * to the top.
+ */
+ if (NUM_ENT(pg) == 0 ||
+ (dbc->dbtype == DB_RECNO &&
+ NEXT_PGNO(cp->csp->page) != PGNO(ncp->csp->page))) {
+ npgno = PGNO(pg);
+ *spanp = 0;
+ goto next_page;
+ }
+
+ if (check_trunc && PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK2(dbc, ndbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgs_done++;
+ /* Get a fresh low numbered page. */
+ if ((ret = __db_exchange_page(dbc,
+ &cp->csp->page, ncp->csp->page,
+ PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ pg = cp->csp->page;
+ }
+ *spanp = 0;
+ PTRACE(dbc, "SDups", PGNO(ncp->csp->page), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(ndbc,
+ &ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ DB_ASSERT(env, ndbc != NULL);
+ /* Check to see if the tree collapsed. */
+ /*lint -e{794} */
+ if (PGNO(ncp->csp->page) == BAM_ROOT_PGNO(ndbc))
+ goto done;
+
+ pg = cp->csp->page;
+ npgno = NEXT_PGNO(pg);
+ PTRACE(dbc, "SDups", PGNO(pg), start, 0);
+ if (check_dups && (ret =
+ __bam_compact_dups(dbc, &cp->csp->page,
+ factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ /*
+ * We may have dropped our locks, check again
+ * to see if we still need to fill this page and
+ * we are in a spanning situation.
+ */
+
+ if (P_FREESPACE(dbp, pg) <= factor ||
+ cp->csp[-1].indx != NUM_ENT(cp->csp[-1].page) - 1)
+ goto next_page;
+
+ /*
+ * Try to move things into a single parent.
+ */
+ merged = 0;
+ for (epg = cp->sp; epg != cp->csp; epg++) {
+ PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
+ if ((ret = __bam_merge_internal(dbc,
+ ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
+ break;
+ if (merged)
+ break;
+ }
+
+ if (ret != 0 && ret != DB_LOCK_NOTGRANTED)
+ goto err1;
+ /*
+		 * If we merged the parent, then we no longer span.
+ * Otherwise if we tried to merge the parent but would
+ * block on one of the other leaf pages try again.
+ * If we did not merge any records of the parent,
+ * exit to commit any local transactions and try again.
+ */
+ if (merged || (pgs_done > 0 && ret == DB_LOCK_NOTGRANTED)) {
+ if (merged)
+ pgs_done++;
+ else
+ goto done;
+ if (cp->csp->page == NULL)
+ goto deleted;
+ npgno = PGNO(pg);
+ next_recno = cp->recno;
+ goto next_page;
+ }
+ PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
+
+		/* If we remove the next page, then we need its next page locked. */
+ npgno = NEXT_PGNO(ncp->csp->page);
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK2(dbc, ndbc, npgno,
+ nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ /*lint -e{794} */
+ if ((ret = __bam_merge(dbc,
+ ndbc, factor, stop, c_data, &isdone)) != 0)
+ goto err1;
+ pgs_done++;
+ /*
+ * __bam_merge could have freed our stack if it
+		 * deleted a page, possibly collapsing the tree.
+ */
+ if (cp->csp->page == NULL)
+ goto deleted;
+ cp->recno += NUM_ENT(pg);
+
+ if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
+ goto err1;
+ LOCK_INIT(nnext_lock);
+ nnext_pgno = PGNO_INVALID;
+
+ /* If we did not bump to the next page something did not fit. */
+ if (npgno != NEXT_PGNO(pg)) {
+ npgno = NEXT_PGNO(pg);
+ goto next_page;
+ }
+ } else {
+ /* Case 2 -- same parents. */
+ CTRACE(dbc, "Sib", "", start, 0);
+ if ((ret =
+ __bam_csearch(dbc, start, CS_PARENT, LEAFLEVEL)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ isdone = 1;
+ ret = 0;
+ }
+ goto err;
+ }
+
+ pg = cp->csp->page;
+ DB_ASSERT(env, IS_DIRTY(pg));
+ DB_ASSERT(env,
+ PGNO(pg) == BAM_ROOT_PGNO(dbc) ||
+ IS_DIRTY(cp->csp[-1].page));
+
+ /* Check to see if we moved to a new parent. */
+ if (PGNO(pg) != BAM_ROOT_PGNO(dbc) &&
+ ppgno != PGNO(cp->csp[-1].page) && pgs_done != 0) {
+ do_commit = 1;
+ goto next_page;
+ }
+
+ /* We now have a write lock, recheck the page. */
+ if ((nentry = NUM_ENT(pg)) == 0) {
+ npgno = PGNO(pg);
+ goto next_page;
+ }
+
+ /* Check duplicate trees, we have a write lock on the page. */
+ PTRACE(dbc, "SibDup", PGNO(pg), start, 0);
+ if (check_dups && (ret =
+ __bam_compact_dups(dbc, &cp->csp->page,
+ factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+ pg = cp->csp->page;
+ npgno = NEXT_PGNO(pg);
+
+ /* Check to see if the tree collapsed. */
+ if (PGNO(pg) == BAM_ROOT_PGNO(dbc))
+ goto err1;
+ DB_ASSERT(env, cp->csp - cp->sp == 1);
+
+ /* After re-locking check to see if we still need to fill. */
+ if (P_FREESPACE(dbp, pg) <= factor) {
+ if (check_trunc &&
+ PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno, saved_pgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ /* Get a fresh low numbered page. */
+ pgno = PGNO(pg);
+ if ((ret = __db_exchange_page(dbc,
+ &cp->csp->page, NULL,
+ PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ prev_pgno = PGNO_INVALID;
+ if ((ret = __TLPUT(dbc, next_lock)) != 0)
+ goto err1;
+ LOCK_INIT(next_lock);
+ saved_pgno = PGNO_INVALID;
+ pg = cp->csp->page;
+ if (pgno != PGNO(pg)) {
+ pgs_done++;
+ pgno = PGNO(pg);
+ }
+ }
+ /*
+			 * If we are going to leave this parent, commit
+ * the current transaction before continuing.
+ */
+ epg = &cp->csp[-1];
+ if ((ppgno != PGNO(epg->page) &&
+ ppgno != PGNO_INVALID) ||
+ epg->indx == NUM_ENT(epg->page) - 1)
+ do_commit = 1;
+ ppgno = PGNO(epg->page);
+ goto next_page;
+ }
+
+		/* If they have the same parent, just dup the cursor. */
+ if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0)
+ goto err1;
+ if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0)
+ goto err1;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+
+ /*
+ * ncp->recno needs to have the recno of the next page.
+ * Bump it by the number of records on the current page.
+ */
+ ncp->recno += NUM_ENT(pg);
+ }
+
+ pgno = PGNO(cp->csp->page);
+ ppgno = PGNO(cp->csp[-1].page);
+ /* Fetch pages until we fill this one. */
+ while (!isdone && npgno != PGNO_INVALID &&
+ P_FREESPACE(dbp, pg) > factor && c_data->compact_pages != 0) {
+ /*
+		 * Merging may have to free the parent page; if it does,
+		 * refetch it, but do so descending the tree.
+ */
+ epg = &cp->csp[-1];
+ if ((ppg = epg->page) == NULL) {
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ cp->csp->page, dbc->priority)) != 0)
+ goto err1;
+ pg = cp->csp->page = NULL;
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ (ret = __db_lget(dbc, 0, ppgno,
+ DB_LOCK_WRITE, 0, &epg->lock)) != 0)
+ goto err1;
+ if ((ret = __memp_fget(dbmp, &ppgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &ppg)) != 0)
+ goto err1;
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ (ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err1;
+ if ((ret = __memp_fget(dbmp, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err1;
+ epg->page = ppg;
+ cp->csp->page = pg;
+ }
+
+ /*
+ * If our current position is the last one on a parent
+ * page, then we are about to merge across different
+ * internal nodes. Thus, we need to lock higher up
+ * in the tree. We will exit the routine and commit
+ * what we have done so far. Set spanp so we know
+ * we are in this case when we come back.
+ */
+ if (epg->indx == NUM_ENT(ppg) - 1) {
+ *spanp = 1;
+ do_commit = 1;
+ npgno = PGNO(pg);
+ next_recno = cp->recno;
+ epg->page = ppg;
+ goto next_page;
+ }
+
+ /* Lock and get the next page. */
+ TRY_LOCK(dbc, npgno,
+ saved_pgno, saved_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ if ((ret = __LPUT(dbc, ncp->lock)) != 0)
+ goto err1;
+ ncp->lock = saved_lock;
+ LOCK_INIT(saved_lock);
+ saved_pgno = PGNO_INVALID;
+
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &npg)) != 0)
+ goto err1;
+
+ if (check_trunc &&
+ PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg),
+ prev_pgno, prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgno = PGNO(pg);
+ /* Get a fresh low numbered page. */
+ if ((ret = __db_exchange_page(dbc, &cp->csp->page,
+ npg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ prev_pgno = PGNO_INVALID;
+ pg = cp->csp->page;
+ if (pgno != PGNO(pg)) {
+ pgs_done++;
+ pgno = PGNO(pg);
+ }
+ }
+ c_data->compact_pages_examine++;
+
+ PTRACE(dbc, "MDups", PGNO(npg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(ndbc,
+ &npg, factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ npgno = NEXT_PGNO(npg);
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno,
+ nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+
+		/* Copy the common parent to the stack. */
+ BT_STK_PUSH(env, ncp, ppg,
+ epg->indx + 1, epg->lock, epg->lock_mode, ret);
+ if (ret != 0)
+ goto err1;
+
+ /* Put the page on the stack. */
+ BT_STK_ENTER(env, ncp, npg, 0, ncp->lock, DB_LOCK_WRITE, ret);
+
+ LOCK_INIT(ncp->lock);
+ npg = NULL;
+
+ /*
+ * Merge the pages. This will either free the next
+ * page or just update its parent pointer.
+ */
+ PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
+ if ((ret = __bam_merge(dbc,
+ ndbc, factor, stop, c_data, &isdone)) != 0)
+ goto err1;
+
+ pgs_done++;
+
+ if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
+ goto err1;
+ LOCK_INIT(nnext_lock);
+ nnext_pgno = PGNO_INVALID;
+
+ /*
+ * __bam_merge could have freed our stack if it
+		 * deleted a page, possibly collapsing the tree.
+ */
+ if (cp->csp->page == NULL)
+ goto deleted;
+ /* If we did not bump to the next page something did not fit. */
+ if (npgno != NEXT_PGNO(pg))
+ break;
+ }
+
+ /* Bottom of the main loop. Move to the next page. */
+ npgno = NEXT_PGNO(pg);
+ cp->recno += NUM_ENT(pg);
+ next_recno = cp->recno;
+
+next_page:
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (ncp->sp->page == cp->sp->page) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((ret = __bam_stkrel(ndbc,
+ pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
+ goto err;
+ }
+ /*
+ * Unlatch the tree before trying to lock the next page. We must
+ * unlatch to avoid a latch deadlock but we want to hold the
+ * lock on the parent node so this leaf cannot be unlinked.
+ */
+ pg = NULL;
+ if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0)
+ goto err;
+ if (npgno != PGNO_INVALID &&
+ (ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0)
+ goto err;
+ if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
+ goto err;
+ if ((ret = __TLPUT(dbc, saved_lock)) != 0)
+ goto err;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+
+next_no_release:
+ pg = NULL;
+
+ if (npgno == PGNO_INVALID || c_data->compact_pages == 0)
+ isdone = 1;
+ if (!isdone) {
+ /*
+ * If we are at the end of this parent commit the
+ * transaction so we don't tie things up.
+ */
+ if (do_commit && !F_ISSET(dbc, DBC_OPD) &&
+ (atomic_read(&dbp->mpf->mfp->multiversion) != 0 ||
+ pgs_done != 0)) {
+deleted: if (ndbc != NULL &&
+ ((ret = __bam_stkrel(ndbc, 0)) != 0 ||
+ (ret = __dbc_close(ndbc)) != 0))
+ goto err;
+ goto out;
+ }
+
+ /* Reget the next page to look at. */
+ cp->recno = next_recno;
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ cp->csp->lock = next_lock;
+ LOCK_INIT(next_lock);
+ next_p = 1;
+ do_commit = 0;
+ /* If we did not do anything we can drop the metalock. */
+ if (pgs_done == 0 && (ret = __LPUT(dbc, metalock)) != 0)
+ goto err;
+ goto next;
+ }
+
+done:
+ if (0) {
+ /*
+ * We come here if pg came from cp->csp->page and could
+ * have already been fput.
+ */
+err1: pg = NULL;
+ }
+err: /*
+	 * Don't release locks (STK_PGONLY) if we had an error; we could reveal
+ * a bad tree to a dirty reader. Wait till the abort to free the locks.
+ */
+ sflag = STK_CLRDBC;
+ if (dbc->txn != NULL && ret != 0)
+ sflag |= STK_PGONLY;
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (npg == ncp->csp->page)
+ npg = NULL;
+ if (ncp->sp->page == cp->sp->page) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((t_ret = __bam_stkrel(ndbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+ else if ((t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (pg == cp->csp->page)
+ pg = NULL;
+ if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+	if (pg != NULL && (t_ret =
+	    __memp_fput(dbmp,
+	    dbc->thread_info, pg, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (npg != NULL && (t_ret =
+	    __memp_fput(dbmp,
+	    dbc->thread_info, npg, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+out: *donep = isdone;
+
+ /* For OPD trees return if we did anything in the span variable. */
+ if (F_ISSET(dbc, DBC_OPD))
+ *spanp = pgs_done;
+
+ return (ret);
+}
+
+/*
+ * __bam_merge -- do actual merging of leaf pages.
+ */
+static int
+__bam_merge(dbc, ndbc, factor, stop, c_data, donep)
+ DBC *dbc, *ndbc;
+ u_int32_t factor;
+ DBT *stop;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ PAGE *pg, *npg;
+ db_indx_t nent;
+ int ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ DB_ASSERT(NULL, ndbc != NULL);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+
+ nent = NUM_ENT(npg);
+
+ /* If the page is empty just throw it away. */
+ if (nent == 0)
+ goto free_page;
+
+ /* Find if the stopping point is on this page. */
+ if (stop != NULL && stop->size != 0) {
+ if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0)
+ return (ret);
+ if (*donep)
+ return (0);
+ }
+
+ /*
+ * If there is too much data then just move records one at a time.
+ * Otherwise copy the data space over and fix up the index table.
+	 * If we are on the leftmost child we will affect our parent's
+ * index entry so we call merge_records to figure out key sizes.
+ */
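+	/*
+	 * (Illustrative arithmetic, not taken from the code: if pg has 900
+	 * bytes free and npg's items occupy 700 bytes, a whole-page copy
+	 * leaves 900 - 700 = 200 bytes free on pg, so any factor > 200
+	 * makes the test below choose __bam_merge_records instead.)
+	 */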
+ if ((dbc->dbtype == DB_BTREE &&
+ ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
+ (int)(P_FREESPACE(dbp, pg) -
+ ((dbp->pgsize - P_OVERHEAD(dbp)) -
+ P_FREESPACE(dbp, npg))) < (int)factor)
+ ret = __bam_merge_records(dbc, ndbc, factor, c_data);
+ else
+ /*lint -e{794} */
+free_page: ret = __bam_merge_pages(dbc, ndbc, c_data);
+
+ return (ret);
+}
+
+static int
+__bam_merge_records(dbc, ndbc, factor, c_data)
+ DBC *dbc, *ndbc;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk, *tmp_bk;
+ BTREE *t;
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT a, b, data, hdr;
+ ENV *env;
+ EPG *epg;
+ PAGE *pg, *npg;
+ db_indx_t adj, indx, nent, *ninp, pind;
+ int32_t adjust;
+ u_int32_t freespace, len, nksize, pfree, size;
+ int first_dup, is_dup, next_dup, n_ok, ret;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t = dbp->bt_internal;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ memset(&hdr, 0, sizeof(hdr));
+ pind = NUM_ENT(pg);
+ n_ok = 0;
+ adjust = 0;
+ ret = 0;
+
+ /* See if we want to swap out this page. */
+ if (c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(ncp->csp->page) > c_data->compact_truncate) {
+ /* Get a fresh low numbered page. */
+ if ((ret = __db_exchange_page(ndbc,
+ &ncp->csp->page, pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+
+ npg = ncp->csp->page;
+ nent = NUM_ENT(npg);
+
+ DB_ASSERT(env, nent != 0);
+
+ ninp = P_INP(dbp, npg);
+
+ /*
+ * pg is the page that is being filled, it is in the stack in cp.
+ * npg is the next page, it is in the stack in ncp.
+ */
+ freespace = P_FREESPACE(dbp, pg);
+
+ adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX;
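+	/*
+	 * (adj is the index-table stride of one logical record: P_INDX (2)
+	 * on P_LBTREE leaves, where entries are key/data pairs, and
+	 * O_INDX (1) on duplicate pages holding single items.)
+	 */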
+ /*
+ * Loop through the records and find the stopping point.
+ */
+ for (indx = 0; indx < nent; indx += adj) {
+ bk = GET_BKEYDATA(dbp, npg, indx);
+
+ /* Size of the key. */
+ size = BITEM_PSIZE(bk);
+
+ /* Size of the data. */
+ if (TYPE(pg) == P_LBTREE)
+ size += BITEM_PSIZE(GET_BKEYDATA(dbp, npg, indx + 1));
+ /*
+		 * If we are at a duplicate set, skip ahead and
+ * get the total size for the group.
+ */
+ n_ok = adj;
+ if (TYPE(pg) == P_LBTREE &&
+ indx < nent - adj &&
+ ninp[indx] == ninp[indx + adj]) {
+ do {
+ /* Size of index for key reference. */
+ size += sizeof(db_indx_t);
+ n_ok++;
+ /* Size of data item. */
+ size += BITEM_PSIZE(
+ GET_BKEYDATA(dbp, npg, indx + n_ok));
+ n_ok++;
+ } while (indx + n_ok < nent &&
+ ninp[indx] == ninp[indx + n_ok]);
+ }
+		/* If the next set will not fit on the page, we are done. */
+ if (freespace < size)
+ break;
+
+ /*
+ * Otherwise figure out if we are past the goal and if
+ * adding this set will put us closer to the goal than
+ * we are now.
+ */
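+		/*
+		 * (That is: freespace - factor is how far from the goal we
+		 * are if we stop now; factor - (freespace - size) is how far
+		 * from it adding the set would leave us. The code takes
+		 * whichever lands closer to the target free space, factor.)
+		 */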
+ if ((freespace - size) < factor) {
+ if (freespace - factor > factor - (freespace - size))
+ indx += n_ok;
+ break;
+ }
+ freespace -= size;
+ indx += n_ok - adj;
+ }
+
+ /* If we have hit the first record then there is nothing we can move. */
+ if (indx == 0)
+ goto done;
+ if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) {
+ if (indx == nent)
+ return (__bam_merge_pages(dbc, ndbc, c_data));
+ goto no_check;
+ }
+ /*
+ * We need to update npg's parent key. Avoid creating a new key
+ * that will be too big. Get what space will be available on the
+ * parents. Then if there will not be room for this key, see if
+	 * prefix compression will make it work; if not, back up till we
+ * find something that will. (Needless to say, this is a very
+ * unlikely event.) If we are deleting this page then we will
+ * need to propagate the next key to our grand parents, so we
+ * see if that will fit.
+ */
+ pfree = dbp->pgsize;
+ for (epg = &ncp->csp[-1]; epg >= ncp->sp; epg--)
+ if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
+ bi = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ /* Add back in the key we will be deleting. */
+ freespace += BINTERNAL_PSIZE(bi->len);
+ if (freespace < pfree)
+ pfree = freespace;
+ if (epg->indx != 0)
+ break;
+ }
+
+ /*
+ * If we are at the end, we will delete this page. We need to
+ * check the next parent key only if we are the leftmost page and
+ * will therefore have to propagate the key up the tree.
+ */
+ if (indx == nent) {
+ if (ncp->csp[-1].indx != 0 || ncp->csp[-1].entries == 1 ||
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp,
+ ncp->csp[-1].page, 1)->len) <= pfree)
+ return (__bam_merge_pages(dbc, ndbc, c_data));
+ indx -= adj;
+ }
+ bk = GET_BKEYDATA(dbp, npg, indx);
+ len = (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ if (indx != 0 && BINTERNAL_SIZE(len) >= pfree) {
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ } else
+ func = NULL;
+
+ /* Skip to the beginning of a duplicate set. */
+ while (indx != 0 && ninp[indx] == ninp[indx - adj])
+ indx -= adj;
+
+ while (indx != 0 && BINTERNAL_SIZE(len) >= pfree) {
+ if (B_TYPE(bk->type) != B_KEYDATA)
+ goto noprefix;
+ /*
+ * Figure out if we can truncate this key.
+ * Code borrowed from bt_split.c
+ */
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, npg, indx - adj);
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ memset(&a, 0, sizeof(a));
+ a.size = tmp_bk->len;
+ a.data = tmp_bk->data;
+ memset(&b, 0, sizeof(b));
+ b.size = bk->len;
+ b.data = bk->data;
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if (BINTERNAL_PSIZE(nksize) < pfree)
+ break;
+noprefix:
+ /* Skip to the beginning of a duplicate set. */
+ do {
+ indx -= adj;
+ } while (indx != 0 && ninp[indx] == ninp[indx - adj]);
+
+ bk = GET_BKEYDATA(dbp, npg, indx);
+ len =
+ (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ }
+
+ /*
+ * indx references the first record that will not move to the previous
+ * page. If it is 0 then we could not find a key that would fit in
+ * the parent that would permit us to move any records.
+ */
+ if (indx == 0)
+ goto done;
+ DB_ASSERT(env, indx <= nent);
+
+ /* Loop through the records and move them from npg to pg. */
+no_check: is_dup = first_dup = next_dup = 0;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ DB_ASSERT(env, IS_DIRTY(pg));
+ DB_ASSERT(env, IS_DIRTY(npg));
+ ninp = P_INP(dbp, npg);
+ do {
+ bk = GET_BKEYDATA(dbp, npg, 0);
+ /* Figure out if we are in a duplicate group or not. */
+ if ((NUM_ENT(npg) % 2) == 0) {
+ if (NUM_ENT(npg) > 2 && ninp[0] == ninp[2]) {
+ if (!is_dup) {
+ first_dup = 1;
+ is_dup = 1;
+ } else
+ first_dup = 0;
+
+ next_dup = 1;
+ } else if (next_dup) {
+ is_dup = 1;
+ first_dup = 0;
+ next_dup = 0;
+ } else
+ is_dup = 0;
+ }
+
+ if (is_dup && !first_dup && (pind % 2) == 0) {
+ /* Duplicate key. */
+ if ((ret = __bam_adjindx(dbc,
+ pg, pind, pind - P_INDX, 1)) != 0)
+ goto err;
+ if (!next_dup)
+ is_dup = 0;
+ } else switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ hdr.data = bk;
+ hdr.size = SSZA(BKEYDATA, data);
+ data.size = bk->len;
+ data.data = bk->data;
+ if ((ret = __db_pitem(dbc, pg, pind,
+ BKEYDATA_SIZE(bk->len), &hdr, &data)) != 0)
+ goto err;
+ break;
+ case B_OVERFLOW:
+ case B_DUPLICATE:
+ data.size = BOVERFLOW_SIZE;
+ data.data = bk;
+ if ((ret = __db_pitem(dbc, pg, pind,
+ BOVERFLOW_SIZE, &data, NULL)) != 0)
+ goto err;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1022",
+ "Unknown record format, page %lu, indx 0",
+ "%lu"), (u_long)PGNO(pg));
+ ret = EINVAL;
+ goto err;
+ }
+ pind++;
+ if (next_dup && (NUM_ENT(npg) % 2) == 0) {
+ if ((ret = __bam_adjindx(ndbc,
+ npg, 0, O_INDX, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_ditem(ndbc,
+ npg, 0, BITEM_SIZE(bk))) != 0)
+ goto err;
+ }
+ adjust++;
+ } while (--indx != 0);
+
+ DB_ASSERT(env, NUM_ENT(npg) != 0);
+
+ if (adjust != 0 &&
+ (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))) {
+ if (TYPE(pg) == P_LBTREE)
+ adjust /= P_INDX;
+ if ((ret = __bam_adjust(ndbc, -adjust)) != 0)
+ goto err;
+
+ if ((ret = __bam_adjust(dbc, adjust)) != 0)
+ goto err;
+ }
+
+ /* Update parent with new key. */
+ if (ndbc->dbtype == DB_BTREE &&
+ (ret = __bam_pupdate(ndbc, pg)) != 0)
+ goto err;
+
+done: if (cp->sp->page == ncp->sp->page) {
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ }
+ ret = __bam_stkrel(ndbc, STK_CLRDBC);
+
+err: return (ret);
+}
+
+static int
+__bam_merge_pages(dbc, ndbc, c_data)
+ DBC *dbc, *ndbc;
+ DB_COMPACT *c_data;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_LOCK root_lock;
+ DB_MPOOLFILE *dbmp;
+ PAGE *pg, *npg;
+ db_indx_t nent, *ninp, *pinp;
+ db_pgno_t pgno, ppgno;
+ u_int8_t *bp;
+ u_int32_t len;
+ int i, level, ret;
+
+ LOCK_INIT(root_lock);
+ COMPQUIET(ppgno, PGNO_INVALID);
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ memset(&hdr, 0, sizeof(hdr));
+ nent = NUM_ENT(npg);
+
+ /* If the page is empty just throw it away. */
+ if (nent == 0)
+ goto free_page;
+
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ DB_ASSERT(dbp->env, IS_DIRTY(pg));
+ DB_ASSERT(dbp->env, IS_DIRTY(npg));
+ DB_ASSERT(dbp->env, nent == NUM_ENT(npg));
+
+ /* Bulk copy the data to the new page. */
+ len = dbp->pgsize - HOFFSET(npg);
+ if (DBC_LOGGING(dbc)) {
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = npg;
+ hdr.size = LOFFSET(dbp, npg);
+ memset(&data, 0, sizeof(data));
+ data.data = (u_int8_t *)npg + HOFFSET(npg);
+ data.size = len;
+ if ((ret = __db_merge_log(dbp,
+ dbc->txn, &LSN(pg), 0, PGNO(pg),
+ &LSN(pg), PGNO(npg), &LSN(npg), &hdr, &data, 0)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+ LSN(npg) = LSN(pg);
+ bp = (u_int8_t *)pg + HOFFSET(pg) - len;
+ memcpy(bp, (u_int8_t *)npg + HOFFSET(npg), len);
+
+	/* Copy the index table, shifting offsets by the data already on pg. */
+ pinp = P_INP(dbp, pg) + NUM_ENT(pg);
+ ninp = P_INP(dbp, npg);
+ for (i = 0; i < NUM_ENT(npg); i++)
+ *pinp++ = *ninp++ - (dbp->pgsize - HOFFSET(pg));
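+	/*
+	 * (Each offset o from npg becomes o - (pgsize - HOFFSET(pg)): the
+	 * bulk copy above placed npg's data region, which began at
+	 * HOFFSET(npg) == pgsize - len, at HOFFSET(pg) - len on pg.)
+	 */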
+ HOFFSET(pg) -= len;
+ NUM_ENT(pg) += i;
+
+ NUM_ENT(npg) = 0;
+ HOFFSET(npg) += len;
+
+ if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * There are two cases here regarding the stack.
+		 * Either we have two two-level stacks but only ndbc
+		 * references the parent page, or we have a multilevel
+ * stack and only ndbc has an entry for the spanning
+ * page.
+ */
+ if (TYPE(pg) == P_LBTREE)
+ i /= P_INDX;
+ if ((ret = __bam_adjust(ndbc, -i)) != 0)
+ goto err;
+
+ if ((ret = __bam_adjust(dbc, i)) != 0)
+ goto err;
+ }
+
+free_page:
+ /*
+ * __bam_dpages may decide to collapse the tree.
+ * This can happen if we have the root and there
+ * are exactly 2 pointers left in it.
+ * If it can collapse the tree we must free the other
+	 * stack since it will no longer be valid. This
+	 * must be done beforehand because we cannot
+ * hold a page pinned if it might be truncated.
+ */
+ if ((ret = __db_relink(dbc,
+ ncp->csp->page, cp->csp->page, PGNO_INVALID)) != 0)
+ goto err;
+ /* Drop the duplicate reference to the sub tree root. */
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ if (PGNO(ncp->sp->page) == BAM_ROOT_PGNO(ndbc) &&
+ NUM_ENT(ncp->sp->page) == 2) {
+ if ((ret = __bam_stkrel(dbc, STK_CLRDBC | STK_PGONLY)) != 0)
+ goto err;
+ level = LEVEL(ncp->sp->page);
+ ppgno = PGNO(ncp->csp[-1].page);
+ } else
+ level = 0;
+ COMPACT_TRUNCATE(c_data);
+ if ((ret = __bam_dpages(ndbc,
+ 0, ndbc->dbtype == DB_RECNO ? 0 : BTD_UPDATE)) != 0)
+ goto err;
+ npg = NULL;
+ c_data->compact_pages_free++;
+ c_data->compact_pages--;
+ if (level != 0) {
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(ndbc, pgno, npg, 0, DB_LOCK_READ, root_lock, ret);
+ if (ret != 0)
+ goto err;
+ DB_ASSERT(dbp->env, npg != NULL);
+ if (level == LEVEL(npg))
+ level = 0;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(ndbc, root_lock)) != 0)
+ goto err;
+ npg = NULL;
+ if (level != 0) {
+ c_data->compact_levels++;
+ c_data->compact_pages_free++;
+ COMPACT_TRUNCATE(c_data);
+ if (c_data->compact_pages != 0)
+ c_data->compact_pages--;
+ }
+ }
+
+err: return (ret);
+}
+
+/*
+ * __bam_merge_internal --
+ * Merge internal nodes of the tree.
+ */
+static int
+__bam_merge_internal(dbc, ndbc, level, c_data, merged)
+ DBC *dbc, *ndbc;
+ int level;
+ DB_COMPACT *c_data;
+ int *merged;
+{
+ BINTERNAL bi, *bip, *fip;
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_LOCK root_lock;
+ DB_MPOOLFILE *dbmp;
+ EPG *epg, *save_csp, *nsave_csp;
+ PAGE *pg, *npg;
+ RINTERNAL *rk;
+ db_indx_t first, indx, pind;
+ db_pgno_t pgno, ppgno;
+ int32_t nrecs, trecs;
+ u_int16_t size;
+ u_int32_t freespace, pfree;
+ int ret;
+
+ COMPQUIET(bip, NULL);
+ COMPQUIET(ppgno, PGNO_INVALID);
+ DB_ASSERT(NULL, dbc != NULL);
+ DB_ASSERT(NULL, ndbc != NULL);
+ LOCK_INIT(root_lock);
+
+ /*
+	 * ndbc will contain the dominating parent of the subtree.
+ * dbc will have the tree containing the left child.
+ *
+ * The stacks descend to the leaf level.
+ * If this is a recno tree then both stacks will start at the root.
+ */
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ *merged = 0;
+ ret = 0;
+
+ /*
+ * Set the stacks to the level requested.
+ * Save the old value to restore when we exit.
+ */
+ save_csp = cp->csp;
+ cp->csp = &cp->csp[-level + 1];
+ pg = cp->csp->page;
+ pind = NUM_ENT(pg);
+
+ nsave_csp = ncp->csp;
+ ncp->csp = &ncp->csp[-level + 1];
+ npg = ncp->csp->page;
+ indx = NUM_ENT(npg);
+
+ /*
+	 * The caller may have two stacks that include common ancestors;
+	 * we check here for convenience.
+ */
+ if (npg == pg)
+ goto done;
+
+ if (TYPE(pg) == P_IBTREE) {
+ /*
+ * Check for overflow keys on both pages while we have
+ * them locked.
+ */
+ if ((ret =
+ __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
+ goto err;
+ if ((ret =
+ __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
+ goto err;
+ }
+
+ /*
+	 * If we are about to move data off the leftmost page of an
+	 * internal node we will need to update its parents; make sure
+	 * there will be room for the new key on all the parents in the
+	 * stack.  If not, move less data.
+ */
+ fip = NULL;
+ if (TYPE(pg) == P_IBTREE) {
+ /* See where we run out of space. */
+ freespace = P_FREESPACE(dbp, pg);
+ /*
+ * The leftmost key of an internal page is not accurate.
+ * Go up the tree to find a non-leftmost parent.
+ */
+ epg = ncp->csp;
+ while (--epg >= ncp->sp && epg->indx == 0)
+ continue;
+ fip = bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ epg = ncp->csp;
+
+ for (indx = 0;;) {
+ size = BINTERNAL_PSIZE(bip->len);
+ if (size > freespace)
+ break;
+ freespace -= size;
+ if (++indx >= NUM_ENT(npg))
+ break;
+ bip = GET_BINTERNAL(dbp, npg, indx);
+ }
+
+		/* See if we are deleting the page and we are not leftmost. */
+ if (indx == NUM_ENT(npg) && epg[-1].indx != 0)
+ goto fits;
+
+ pfree = dbp->pgsize;
+ for (epg--; epg >= ncp->sp; epg--)
+ if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
+ bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ /* Add back in the key we will be deleting. */
+ freespace += BINTERNAL_PSIZE(bip->len);
+ if (freespace < pfree)
+ pfree = freespace;
+ if (epg->indx != 0)
+ break;
+ }
+ epg = ncp->csp;
+
+ /* If we are at the end of the page we will delete it. */
+ if (indx == NUM_ENT(npg)) {
+ if (NUM_ENT(epg[-1].page) == 1)
+ goto fits;
+ bip =
+ GET_BINTERNAL(dbp, epg[-1].page, epg[-1].indx + 1);
+ } else
+ bip = GET_BINTERNAL(dbp, npg, indx);
+
+ /* Back up until we have a key that fits. */
+ while (indx != 0 && BINTERNAL_PSIZE(bip->len) > pfree) {
+ indx--;
+ bip = GET_BINTERNAL(dbp, npg, indx);
+ }
+ if (indx == 0)
+ goto done;
+ }
+
+fits: memset(&bi, 0, sizeof(bi));
+ memset(&hdr, 0, sizeof(hdr));
+ memset(&data, 0, sizeof(data));
+ trecs = 0;
+
+ /*
+ * Copy data between internal nodes till one is full
+ * or the other is empty.
+ */
+ first = 0;
+ nrecs = 0;
+ do {
+ if (dbc->dbtype == DB_BTREE) {
+ bip = GET_BINTERNAL(dbp, npg, 0);
+ size = fip == NULL ?
+ BINTERNAL_SIZE(bip->len) :
+ BINTERNAL_SIZE(fip->len);
+ if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
+ break;
+
+ if (fip == NULL) {
+ data.size = bip->len;
+ data.data = bip->data;
+ } else {
+ data.size = fip->len;
+ data.data = fip->data;
+ }
+ bi.len = data.size;
+ B_TSET(bi.type, bip->type);
+ bi.pgno = bip->pgno;
+ bi.nrecs = bip->nrecs;
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))
+ nrecs = (int32_t)bip->nrecs;
+ } else {
+ rk = GET_RINTERNAL(dbp, npg, 0);
+ size = RINTERNAL_SIZE;
+ if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
+ break;
+
+ hdr.data = rk;
+ hdr.size = size;
+ nrecs = (int32_t)rk->nrecs;
+ }
+ /*
+ * Try to lock the subtree leaf records without waiting.
+ * We must lock the subtree below the record we are merging
+		 * and the one after it, since that is where a search will
+		 * wind up if it has already looked at our parent.  After the
+		 * first move we have the current subtree already locked.
+ * If we merged any records then we will revisit this
+ * node when we merge its leaves. If not we will return
+ * NOTGRANTED and our caller will do a retry. We only
+ * need to do this if we are in a transaction. If not then
+ * we cannot abort and things will be hosed up on error
+ * anyway.
+ */
+ if (dbc->txn != NULL && (ret = __bam_lock_tree(ndbc,
+ ncp->csp, nsave_csp, first,
+ NUM_ENT(ncp->csp->page) == 1 ? 1 : 2)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED)
+ goto err;
+ break;
+ }
+ first = 1;
+ if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
+ goto err;
+ pind++;
+ if (fip != NULL) {
+			/* Reset size to that of the record being deleted. */
+ size = BINTERNAL_SIZE(bip->len);
+ fip = NULL;
+ }
+ if ((ret = __db_ditem(ndbc, npg, 0, size)) != 0)
+ goto err;
+ *merged = 1;
+ trecs += nrecs;
+ } while (--indx != 0);
+
+ if (!*merged)
+ goto done;
+
+ if (trecs != 0) {
+ cp->csp--;
+ ret = __bam_adjust(dbc, trecs);
+ if (ret != 0)
+ goto err;
+ cp->csp++;
+ ncp->csp--;
+ if ((ret = __bam_adjust(ndbc, -trecs)) != 0)
+ goto err;
+ ncp->csp++;
+ }
+
+ /*
+ * Either we emptied the page or we need to update its
+ * parent to reflect the first page we now point to.
+	 * First get rid of the bottom of the stack;
+	 * __bam_dpages will clear the rest.  Maintain transactional
+	 * locks on the leaf pages to protect changes at this level.
+ */
+ do {
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ nsave_csp->page, dbc->priority)) != 0)
+ goto err;
+ nsave_csp->page = NULL;
+ if ((ret = __TLPUT(dbc, nsave_csp->lock)) != 0)
+ goto err;
+ LOCK_INIT(nsave_csp->lock);
+ nsave_csp--;
+ } while (nsave_csp != ncp->csp);
+
+ if (NUM_ENT(npg) == 0) {
+ /*
+		 * __bam_dpages may decide to collapse the tree,
+		 * so we need to free our other stack.  The tree
+		 * will change in height and our stack will no
+		 * longer be valid.
+ */
+ cp->csp = save_csp;
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ if (PGNO(ncp->sp->page) == BAM_ROOT_PGNO(ndbc) &&
+ NUM_ENT(ncp->sp->page) == 2) {
+ if ((ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0)
+ goto err;
+ level = LEVEL(ncp->sp->page);
+ ppgno = PGNO(ncp->csp[-1].page);
+ } else
+ level = 0;
+
+ COMPACT_TRUNCATE(c_data);
+ ret = __bam_dpages(ndbc,
+ 0, ndbc->dbtype == DB_RECNO ?
+ BTD_RELINK : BTD_UPDATE | BTD_RELINK);
+ c_data->compact_pages_free++;
+ if (ret == 0 && level != 0) {
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(ndbc,
+ pgno, npg, 0, DB_LOCK_READ, root_lock, ret);
+ if (ret != 0)
+ goto err;
+ if (level == LEVEL(npg))
+ level = 0;
+ if ((ret = __LPUT(ndbc, root_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (level != 0) {
+ c_data->compact_levels++;
+ c_data->compact_pages_free++;
+ COMPACT_TRUNCATE(c_data);
+ if (c_data->compact_pages != 0)
+ c_data->compact_pages--;
+ }
+ }
+ } else {
+ ret = __bam_pupdate(ndbc, npg);
+
+ if (NUM_ENT(npg) != 0 &&
+ c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(npg) > c_data->compact_truncate &&
+ ncp->csp != ncp->sp) {
+ if ((ret = __db_exchange_page(ndbc, &ncp->csp->page,
+ pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+ if (c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
+ if ((ret = __db_exchange_page(dbc, &cp->csp->page,
+ ncp->csp->page,
+ PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+ }
+ cp->csp = save_csp;
+
+ return (ret);
+
+done:
+err: cp->csp = save_csp;
+ ncp->csp = nsave_csp;
+
+ return (ret);
+}
+
+/*
+ * __bam_compact_dups -- try to compress off page dup trees.
+ * We may or may not have a write lock on this page.
+ */
+static int
+__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
+ DBC *dbc;
+ PAGE **ppg;
+ u_int32_t factor;
+ int have_lock;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *dbmp;
+ db_indx_t i;
+ db_pgno_t pgno;
+ int ret;
+
+ ret = 0;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ for (i = 0; i < NUM_ENT(*ppg); i++) {
+ bo = GET_BOVERFLOW(dbp, *ppg, i);
+ if (B_TYPE(bo->type) == B_KEYDATA)
+ continue;
+ c_data->compact_pages_examine++;
+ if (bo->pgno > c_data->compact_truncate) {
+ (*donep)++;
+ if (!have_lock) {
+ /*
+ * The caller should have the page at
+ * least read locked. Drop the buffer
+ * and get the write lock.
+ */
+ pgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err;
+ have_lock = 1;
+ if ((ret = __memp_fget(dbmp, &pgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ }
+ if ((ret = __bam_truncate_root_page(dbc,
+ *ppg, i, c_data)) != 0)
+ goto err;
+ /* Just in case it should move. Could it? */
+ bo = GET_BOVERFLOW(dbp, *ppg, i);
+ }
+
+ if (B_TYPE(bo->type) == B_OVERFLOW) {
+ if ((ret = __db_truncate_overflow(dbc,
+ bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
+ goto err;
+ (*donep)++;
+ continue;
+ }
+ if ((ret = __bam_compact_opd(dbc, bo->pgno,
+ have_lock ? NULL : ppg, factor, c_data, donep)) != 0)
+ goto err;
+ }
+
+err:
+ return (ret);
+}
+
+/*
+ * __bam_compact_opd -- compact an off page duplicate tree.
+ *
+ * PUBLIC: int __bam_compact_opd __P((DBC *,
+ * PUBLIC: db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
+ */
+int
+__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ PAGE **ppg;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp;
+ DBC *opd;
+ DBT start;
+ DB_MPOOLFILE *dbmp;
+ ENV *env;
+ PAGE *dpg;
+ int isdone, level, ret, span, t_ret;
+ db_pgno_t pgno;
+
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ opd = NULL;
+ env = dbc->dbp->env;
+ dbmp = dbc->dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+	 * Take a peek at the root.  If it's a leaf then
+	 * there is no tree here; avoid all the trouble.
+ */
+ if ((ret = __memp_fget(dbmp, &root_pgno,
+ dbc->thread_info, dbc->txn, 0, &dpg)) != 0)
+ goto err;
+
+ level = dpg->level;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, dpg, dbc->priority)) != 0)
+ goto err;
+ if (level == LEAFLEVEL)
+ goto done;
+ if ((ret = __dbc_newopd(dbc, root_pgno, NULL, &opd)) != 0)
+ goto err;
+ if (ppg != NULL) {
+ /*
+ * The caller should have the page at
+ * least read locked. Drop the buffer
+ * and get the write lock.
+ */
+ pgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbmp, &pgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ }
+ memset(&start, 0, sizeof(start));
+ do {
+ span = 0;
+ if ((ret = __bam_compact_int(opd, &start,
+ NULL, factor, &span, c_data, &isdone)) != 0)
+ break;
+ /* For OPD the number of pages dirtied is returned in span. */
+ *donep += span;
+ } while (!isdone);
+
+ if (start.data != NULL)
+ __os_free(env, start.data);
+
+err: if (opd != NULL && (t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+done:
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
+
+/*
+ * __bam_truncate_root_page -- swap a page which is
+ * the root of an off page dup tree or the head of an overflow.
+ * The page is referenced by the pg/indx passed in.
+ */
+static int
+__bam_truncate_root_page(dbc, pg, indx, c_data)
+ DBC *dbc;
+ PAGE *pg;
+ u_int32_t indx;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BOVERFLOW *bo;
+ DB *dbp;
+ db_pgno_t *pgnop;
+ u_int32_t tlen;
+
+ COMPQUIET(c_data, NULL);
+ COMPQUIET(bo, NULL);
+ dbp = dbc->dbp;
+ if (TYPE(pg) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, pg, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ pgnop = &bo->pgno;
+ tlen = bo->tlen;
+ } else {
+ /* Tlen is not used if this is not an overflow. */
+ tlen = 0;
+ pgnop = &bi->pgno;
+ }
+ } else {
+ bo = GET_BOVERFLOW(dbp, pg, indx);
+ pgnop = &bo->pgno;
+ tlen = bo->tlen;
+ }
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pg));
+
+ return (__db_truncate_root(dbc, pg, indx, pgnop, tlen));
+}
+
+/*
+ * __bam_truncate_internal_overflow -- find overflow keys
+ *	on internal pages and, if they have high page
+ * numbers, swap them with lower pages and truncate them.
+ * Note that if there are overflow keys in the internal
+ * nodes they will get copied adding pages to the database.
+ */
+static int
+__bam_truncate_internal_overflow(dbc, page, c_data)
+ DBC *dbc;
+ PAGE *page;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BOVERFLOW *bo;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+ ret = 0;
+ for (indx = 0; indx < NUM_ENT(page); indx++) {
+ bi = GET_BINTERNAL(dbc->dbp, page, indx);
+ if (B_TYPE(bi->type) != B_OVERFLOW)
+ continue;
+ bo = (BOVERFLOW *)(bi->data);
+ if (bo->pgno > c_data->compact_truncate && (ret =
+ __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
+ break;
+ if ((ret = __db_truncate_overflow(
+ dbc, bo->pgno, NULL, c_data)) != 0)
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __bam_compact_isdone --
+ *
+ * Check to see if the stop key specified by the caller is on the
+ * current page, in which case we are done compacting.
+ */
+static int
+__bam_compact_isdone(dbc, stop, pg, isdone)
+ DBC *dbc;
+ DBT *stop;
+ PAGE *pg;
+ int *isdone;
+{
+ db_recno_t recno;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ int cmp, ret;
+
+ *isdone = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbc->dbp->bt_internal;
+
+ if (dbc->dbtype == DB_RECNO) {
+ if ((ret = __ram_getno(dbc, stop, &recno, 0)) != 0)
+ return (ret);
+ *isdone = cp->recno > recno;
+ } else {
+ DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
+ if ((ret = __bam_cmp(dbc, stop, pg, 0,
+ t->bt_compare, &cmp)) != 0)
+ return (ret);
+
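+		/*
+		 * The stop key sorts at or before the first key on this
+		 * page, so compaction has reached it.
+		 */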
+ *isdone = cmp <= 0;
+ }
+ return (0);
+}
+
+/*
+ * Lock the subtrees from the top of the stack.
+ * The 0'th child may be in the stack and locked; otherwise iterate
+ * through the records by calling __bam_lock_subtree.
+ */
+static int
+__bam_lock_tree(dbc, sp, csp, start, stop)
+ DBC *dbc;
+ EPG *sp, *csp;
+ u_int32_t start, stop;
+{
+ PAGE *cpage;
+ db_pgno_t pgno;
+ int ret;
+
+ if (dbc->dbtype == DB_RECNO)
+ pgno = GET_RINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+ else
+ pgno = GET_BINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+ cpage = (sp + 1)->page;
+ /*
+	 * First recurse down the leftmost subtree if it is in the cursor
+	 * stack.  We already have these pages latched and locked if it's a
+	 * leaf.
+ */
+ if (start == 0 && sp + 1 != csp && pgno == PGNO(cpage) &&
+ (ret = __bam_lock_tree(dbc, sp + 1, csp, 0, NUM_ENT(cpage))) != 0)
+ return (ret);
+
+ /*
+ * Then recurse on the other records on the page if needed.
+	 * If the page is in the stack then it's already locked or
+ * was processed above.
+ */
+ if (start == 0 && pgno == PGNO(cpage))
+ start = 1;
+
+ if (start == stop)
+ return (0);
+	return (__bam_lock_subtree(dbc, sp->page, start, stop));
+}
+
+/*
+ * Lock the subtree from the current node.
+ */
+static int
+__bam_lock_subtree(dbc, page, indx, stop)
+ DBC *dbc;
+ PAGE *page;
+ u_int32_t indx, stop;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *cpage;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ for (; indx < stop; indx++) {
+ if (dbc->dbtype == DB_RECNO)
+ pgno = GET_RINTERNAL(dbc->dbp, page, indx)->pgno;
+ else
+ pgno = GET_BINTERNAL(dbc->dbp, page, indx)->pgno;
+ if (LEVEL(page) - 1 == LEAFLEVEL) {
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &lock)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK)
+ return (DB_LOCK_NOTGRANTED);
+ return (ret);
+ }
+ } else {
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &cpage)) != 0)
+ return (ret);
+ ret = __bam_lock_subtree(dbc, cpage, 0, NUM_ENT(cpage));
+ if ((t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ cpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __bam_savekey -- save the key from an internal page.
+ * We need to save information so that we can
+ * fetch the next internal node of the tree.  This means
+ * we need the btree key on the current page, or the
+ * next record number.
+ */
+static int
+__bam_savekey(dbc, next, start)
+ DBC *dbc;
+ int next;
+ DBT *start;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ ENV *env;
+ PAGE *pg;
+ RINTERNAL *ri;
+ db_indx_t indx, top;
+ db_pgno_t pgno, saved_pgno;
+ int ret, t_ret;
+ u_int32_t len;
+ u_int8_t *data;
+ int level;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pg = cp->csp->page;
+ ret = 0;
+
+ if (dbc->dbtype == DB_RECNO) {
+ if (next)
+ for (indx = 0, top = NUM_ENT(pg); indx != top; indx++) {
+ ri = GET_RINTERNAL(dbp, pg, indx);
+ cp->recno += ri->nrecs;
+ }
+ return (__db_retcopy(env, start, &cp->recno,
+ sizeof(cp->recno), &start->data, &start->ulen));
+
+ }
+
+ bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+ data = bi->data;
+ len = bi->len;
+ LOCK_INIT(lock);
+ saved_pgno = PGNO_INVALID;
+	/* If there is a single record on the page it may have an empty key. */
+ while (len == 0) {
+ /*
+		 * We should not have an empty data page, since we just
+		 * compacted things, but check anyway and punt.
+ */
+ if (NUM_ENT(pg) == 0)
+ goto no_key;
+ pgno = bi->pgno;
+ level = LEVEL(pg);
+ if (pg != cp->csp->page &&
+ (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0) {
+ pg = NULL;
+ goto err;
+ }
+ pg = NULL;
+ if (level - 1 == LEAFLEVEL) {
+ TRY_LOCK(dbc, pgno, saved_pgno,
+ lock, DB_LOCK_READ, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+
+ /*
+		 * At the data level use the last key to try to avoid the
+		 * possibility that the user has a zero-length key; if they
+		 * do, we punt.
+ */
+ if (pg->level == LEAFLEVEL) {
+ bk = GET_BKEYDATA(dbp, pg, NUM_ENT(pg) - 2);
+ data = bk->data;
+ len = bk->len;
+ if (len == 0) {
+no_key: __db_errx(env, DB_STR("1023",
+ "Compact cannot handle zero length key"));
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ } else {
+ bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+ data = bi->data;
+ len = bi->len;
+ }
+ }
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(data);
+ ret = __db_goff(dbc, start, bo->tlen, bo->pgno,
+ &start->data, &start->ulen);
+ }
+ else
+ ret = __db_retcopy(env,
+ start, data, len, &start->data, &start->ulen);
+
+err: if (pg != NULL && pg != cp->csp->page &&
+ (t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ pg, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+retry: return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_truncate_ipages --
+ * Find high numbered pages in the internal nodes of a tree and
+ * swap them for lower numbered pages.
+ * PUBLIC: int __bam_truncate_ipages __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ */
+int
+__bam_truncate_ipages(dbp, ip, txn, c_data)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_COMPACT *c_data;
+{
+ BTMETA *meta;
+ BTREE *bt;
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ DBMETA *dbmeta;
+ DBT start;
+ DB_LOCK meta_lock, root_lock;
+ DB_TXN *txn_orig;
+ PAGE *pg, *root;
+ db_pgno_t pgno;
+ u_int32_t sflag;
+ int level, local_txn, ret, rlevel, t_ret;
+
+ COMPQUIET(pg, NULL);
+ dbc = NULL;
+ memset(&start, 0, sizeof(start));
+ LOCK_INIT(root_lock);
+ txn_orig = txn;
+
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ local_txn = 1;
+ txn = NULL;
+ } else
+ local_txn = 0;
+
+ level = LEAFLEVEL + 1;
+ sflag = CS_READ | CS_GETRECNO;
+ LOCK_INIT(meta_lock);
+ bt = dbp->bt_internal;
+ meta = NULL;
+ root = NULL;
+
+new_txn:
+ if (local_txn &&
+ (ret = __txn_begin(dbp->env, ip, txn_orig, &txn, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+	 * If the root is a leaf we have nothing to do.
+ * Searching an empty RECNO tree will return NOTFOUND below and loop.
+ */
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(dbc, pgno, root, 0, DB_LOCK_READ, root_lock, ret);
+ if (ret != 0)
+ goto err;
+
+ rlevel = LEVEL(root);
+ if ((ret = __memp_fput(dbp->mpf, ip, root, dbp->priority)) != 0)
+ goto err;
+ root = NULL;
+
+ if (rlevel == LEAFLEVEL)
+ goto again;
+
+ pgno = PGNO_INVALID;
+ do {
+ if ((ret = __bam_csearch(dbc, &start, sflag, level)) != 0) {
+ /* No more at this level, go up one. */
+ if (ret == DB_NOTFOUND) {
+ level++;
+ if (start.data != NULL)
+ __os_free(dbp->env, start.data);
+ memset(&start, 0, sizeof(start));
+ sflag = CS_READ | CS_GETRECNO;
+ continue;
+ }
+ goto err;
+ }
+ c_data->compact_pages_examine++;
+
+ pg = cp->csp->page;
+ pgno = PGNO(pg);
+
+ sflag = CS_NEXT | CS_GETRECNO;
+ /* Grab info about the page and drop the stack. */
+ if (pgno != BAM_ROOT_PGNO(dbc) && (ret = __bam_savekey(dbc,
+ pgno <= c_data->compact_truncate, &start)) != 0) {
+ if (ret == DB_LOCK_NOTGRANTED)
+ continue;
+ goto err;
+ }
+
+ /* We only got read locks so we can drop them. */
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ if (pgno == BAM_ROOT_PGNO(dbc))
+ break;
+
+ if (pgno <= c_data->compact_truncate)
+ continue;
+
+ /* Get the meta page lock before latching interior nodes. */
+ if (!LOCK_ISSET(meta_lock) && (ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+
+ /* Reget the page with a write latch, and its parent too. */
+ if ((ret = __bam_csearch(dbc,
+ &start, CS_PARENT | CS_GETRECNO, level)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ }
+ goto err;
+ }
+ pgno = PGNO(cp->csp->page);
+
+ if (pgno > c_data->compact_truncate) {
+ if ((ret = __db_exchange_page(dbc, &cp->csp->page,
+ NULL, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+
+ /*
+ * For RECNO we need to bump the saved key to the next
+ * page since CS_NEXT will not do that.
+ */
+ if (dbc->dbtype == DB_RECNO &&
+ (ret = __bam_savekey(dbc, 1, &start)) != 0)
+ goto err;
+
+ pg = cp->csp->page;
+ if ((ret = __bam_stkrel(dbc,
+ pgno != PGNO(pg) ? 0 : STK_NOLOCK)) != 0)
+ goto err;
+
+ /* We are locking subtrees, so drop the write locks asap. */
+ if (local_txn && pgno != PGNO(pg))
+ break;
+		/* Loop until we have worked back up to the root page. */
+ } while (pgno != BAM_ROOT_PGNO(dbc));
+
+ if ((ret = __LPUT(dbc, root_lock)) != 0)
+ goto err;
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ if (local_txn) {
+ if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
+ goto err;
+ txn = NULL;
+ LOCK_INIT(meta_lock);
+ }
+ if (pgno != bt->bt_root)
+ goto new_txn;
+
+ /*
+ * Attempt to move the subdatabase metadata and/or root pages.
+ * Grab the metadata page and verify the revision; if it's out
+ * of date, reopen and try again.
+ */
+again: if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ (bt->bt_root > c_data->compact_truncate ||
+ bt->bt_meta > c_data->compact_truncate)) {
+ if (local_txn && txn == NULL &&
+ (ret = __txn_begin(dbp->env, ip, txn_orig, &txn, 0)) != 0)
+ goto err;
+ if (dbc == NULL &&
+ (ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, bt->bt_meta, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &bt->bt_meta,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (bt->revision != dbp->mpf->mfp->revision) {
+ if ((ret = __memp_fput(dbp->mpf,
+ ip, meta, dbp->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if (local_txn) {
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ ret = __txn_abort(txn);
+ txn = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ if ((ret = __LPUT(dbc, meta_lock)) != 0)
+ goto err;
+ }
+ if ((ret = __db_reopen(dbc)) != 0)
+ goto err;
+ goto again;
+ }
+ if (PGNO(meta) > c_data->compact_truncate) {
+ dbmeta = (DBMETA *)meta;
+ ret = __db_move_metadata(dbc, &dbmeta, c_data);
+ meta = (BTMETA *)dbmeta;
+ if (ret != 0)
+ goto err;
+ }
+ if (bt->bt_root > c_data->compact_truncate) {
+ if ((ret = __db_lget(dbc, 0,
+ bt->bt_root, DB_LOCK_WRITE, 0, &root_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf,
+ &bt->bt_root, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &root)) != 0)
+ goto err;
+ c_data->compact_pages_examine++;
+ /*
+ * Bump the revision first since any reader will be
+ * blocked on the latch on the old page. That latch
+ * will get dropped when we free the page and the
+ * reader will do a __db_reopen and wait till the meta
+ * page latch is released.
+ */
+ ++dbp->mpf->mfp->revision;
+ if ((ret = __db_exchange_page(dbc,
+ &root, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ goto err;
+ if (PGNO(root) == bt->bt_root)
+ goto err;
+ if (DBC_LOGGING(dbc)) {
+ if ((ret =
+ __bam_root_log(dbp, txn, &LSN(meta), 0,
+ PGNO(meta), PGNO(root), &LSN(meta))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ bt->bt_root = meta->root = PGNO(root);
+ bt->revision = dbp->mpf->mfp->revision;
+ if ((ret = __memp_fput(dbp->mpf,
+ ip, root, dbp->priority)) != 0)
+ goto err;
+ root = NULL;
+ if (txn == NULL && (ret = __LPUT(dbc, root_lock)) != 0)
+ goto err;
+
+ }
+ if ((ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ if (local_txn) {
+ ret = __txn_commit(txn, DB_TXN_NOSYNC);
+ txn = NULL;
+ LOCK_INIT(meta_lock);
+ LOCK_INIT(root_lock);
+ }
+ }
+
+err: if (txn != NULL && ret != 0)
+ sflag = STK_PGONLY;
+ else
+ sflag = 0;
+ if (txn == NULL) {
+ if (dbc != NULL &&
+ (t_ret = __LPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL &&
+ (t_ret = __LPUT(dbc, root_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (meta != NULL && (t_ret = __memp_fput(dbp->mpf,
+ ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL && (t_ret = __memp_fput(dbp->mpf,
+ ip, root, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (local_txn &&
+ txn != NULL && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ if (start.data != NULL)
+ __os_free(dbp->env, start.data);
+ return (ret);
+}
+
+#endif
diff --git a/src/btree/bt_compare.c b/src/btree/bt_compare.c
new file mode 100644
index 00000000..5c009071
--- /dev/null
+++ b/src/btree/bt_compare.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_cmp --
+ * Compare a key to a given record.
+ *
+ * PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__bam_cmp(dbc, dbt, h, indx, func, cmpp)
+ DBC *dbc;
+ const DBT *dbt;
+ PAGE *h;
+ u_int32_t indx;
+ int (*func)__P((DB *, const DBT *, const DBT *));
+ int *cmpp;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DB *dbp;
+ DBT pg_dbt;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Returns:
+ * < 0 if dbt is < page record
+ * = 0 if dbt is = page record
+ * > 0 if dbt is > page record
+ *
+ * !!!
+ * We do not clear the pg_dbt DBT even though it's likely to contain
+ * random bits. That should be okay, because the app's comparison
+ * routine had better not be looking at fields other than data, size
+ * and app_data. We don't clear it because we go through this path a
+ * lot and it's expensive.
+ */
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)bk;
+ else {
+ pg_dbt.app_data = NULL;
+ pg_dbt.data = bk->data;
+ pg_dbt.size = bk->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ case P_IBTREE:
+ /*
+ * The following code guarantees that the left-most key on an
+ * internal page at any place in the tree sorts less than any
+ * user-specified key. The reason is that if we have reached
+ * this internal page, we know the user key must sort greater
+ * than the key we're storing for this page in any internal
+ * pages at levels above us in the tree. It then follows that
+ * any user-specified key cannot sort less than the first page
+ * which we reference, and so there's no reason to call the
+ * comparison routine. While this may save us a comparison
+ * routine call or two, the real reason for this is because
+ * we don't maintain a copy of the smallest key in the tree,
+ * so that we don't have to update all the levels of the tree
+ * should the application store a new smallest key. And, so,
+ * we may not have a key to compare, which makes doing the
+ * comparison difficult and error prone.
+ */
+ if (indx == 0) {
+ *cmpp = 1;
+ return (0);
+ }
+
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)(bi->data);
+ else {
+ pg_dbt.app_data = NULL;
+ pg_dbt.data = bi->data;
+ pg_dbt.size = bi->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+
+ /*
+ * Overflow.
+ */
+ return (__db_moff(dbc, dbt, bo->pgno, bo->tlen,
+ func == __bam_defcmp ? NULL : func, cmpp));
+}
+
+/*
+ * __bam_defcmp --
+ * Default comparison routine.
+ *
+ * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_defcmp(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+ * Returns:
+ * < 0 if a is < b
+ * = 0 if a is = b
+ * > 0 if a is > b
+ *
+ * XXX
+ * If a size_t doesn't fit into a long, or if the difference between
+ * any two characters doesn't fit into an int, this routine can lose.
+ * What we need is a signed integral type that's guaranteed to be at
+ * least as large as a size_t, and there is no such thing.
+ */
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+ if (*p1 != *p2)
+ return ((long)*p1 - (long)*p2);
+ return ((long)a->size - (long)b->size);
+}
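+
+/*
+ * For illustration (hypothetical values): with a = { "ab", 2 } and
+ * b = { "abc", 3 }, no byte differs in the first two positions, so the
+ * size difference decides and the routine returns 2 - 3 = -1; "ab"
+ * sorts before "abc".
+ */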
+
+/*
+ * __bam_defpfx --
+ * Default prefix routine.
+ *
+ * PUBLIC: size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
+ */
+size_t
+__bam_defpfx(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t cnt, len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ cnt = 1;
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt)
+ if (*p1 != *p2)
+ return (cnt);
+
+ /*
+ * They match up to the smaller of the two sizes.
+ * Collate the longer after the shorter.
+ */
+ if (a->size < b->size)
+ return (a->size + 1);
+ if (b->size < a->size)
+ return (b->size + 1);
+ return (b->size);
+}
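+
+/*
+ * Worked example (hypothetical values): for a = "foo" (size 3) and
+ * b = "food" (size 4) every shared byte matches, so the routine
+ * returns a->size + 1 = 4, meaning four bytes of b are needed to
+ * distinguish the keys.  For a = "ab" and b = "ax" the second bytes
+ * differ and it returns 2.
+ */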
diff --git a/src/btree/bt_compress.c b/src/btree/bt_compress.c
new file mode 100644
index 00000000..3f293461
--- /dev/null
+++ b/src/btree/bt_compress.c
@@ -0,0 +1,3173 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+#ifdef HAVE_COMPRESSION
+
+static int __bam_compress_marshal_data __P((DB *, const DBT *, DBT *));
+static int __bam_compress_set_dbt __P((DB *, DBT *, const void *, u_int32_t));
+static int __bam_compress_check_sort_multiple_key __P((DB *, DBT *));
+static int __bam_compress_check_sort_multiple __P((DB *, DBT *, DBT *));
+static int __bam_compress_check_sort_multiple_keyonly __P((DB *, DBT *));
+static int __bamc_compress_del_and_get_next __P((DBC *, DBT *, DBT *));
+static int __bamc_compress_get_bothc __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_multiple_key __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_multiple __P((DBC *, DBT *, DBT *,u_int32_t));
+static int __bamc_compress_get_next __P((DBC *, u_int32_t));
+static int __bamc_compress_get_next_dup __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_next_nodup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev_dup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev_nodup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_set __P((DBC *,
+ DBT *, DBT *, u_int32_t, u_int32_t));
+static int __bamc_compress_ibulk_del __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_idel __P((DBC *, u_int32_t));
+static int __bamc_compress_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __bamc_compress_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __bamc_compress_relocate __P((DBC *));
+static void __bamc_compress_reset __P((DBC *));
+static int __bamc_compress_seek __P((DBC *,
+ const DBT *, const DBT *, u_int32_t));
+static int __bamc_compress_store __P((DBC *,
+ DBT *, DBT*, DBT **, DBT **, DBT *, DBT *));
+static int __bamc_next_decompress __P((DBC *));
+static int __bamc_start_decompress __P((DBC *));
+
+/*
+ * Call __dbc_iget(), resizing DBTs if DB_BUFFER_SMALL is returned.
+ * We're always using a transient cursor when this macro is used, so
+ * we have to replace the OP with DB_CURRENT when we retry.
+ */
+#define CMP_IGET_RETRY(ret, dbc, dbt1, dbt2, flags) do { \
+ DB_ASSERT((dbc)->env, F_ISSET((dbt1), DB_DBT_USERMEM)); \
+ DB_ASSERT((dbc)->env, F_ISSET((dbt2), DB_DBT_USERMEM)); \
+ if (((ret) =__dbc_iget((dbc), \
+ (dbt1), (dbt2), (flags))) == DB_BUFFER_SMALL) { \
+ if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt1))) != 0) \
+ break; \
+ if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt2))) != 0) \
+ break; \
+ (ret) = __dbc_iget((dbc), (dbt1), (dbt2), \
+ ((flags) & ~DB_OPFLAGS_MASK) | DB_CURRENT); \
+ } \
+} while (0)
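+
+/*
+ * A sketch of a call site (hypothetical): both DBTs must be
+ * DB_DBT_USERMEM.  On DB_BUFFER_SMALL the buffers are grown and the
+ * operation is retried as DB_CURRENT on the now-positioned cursor:
+ *
+ *	CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, DB_FIRST);
+ *	if (ret != 0)
+ *		goto err;
+ */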
+
+#define CMP_INIT_DBT(dbt) do { \
+ (dbt)->data = NULL; \
+ (dbt)->size = 0; \
+ (dbt)->ulen = 0; \
+ (dbt)->doff = 0; \
+ (dbt)->dlen = 0; \
+ (dbt)->flags = DB_DBT_USERMEM; \
+ (dbt)->app_data = NULL; \
+} while (0)
+
+#define CMP_FREE_DBT(env, dbt) do { \
+ DB_ASSERT((env), F_ISSET((dbt), DB_DBT_USERMEM)); \
+ __os_free((env), (dbt)->data); \
+} while (0)
+
+#define CMP_RESIZE_DBT(ret, env, dbt) \
+ (((dbt)->size > (dbt)->ulen) ? \
+ ((((ret) = __os_realloc((env), (dbt)->size, &(dbt)->data)) \
+ != 0) ? (ret) : (((dbt)->ulen = (dbt)->size), 0)) : 0)
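+
+/*
+ * Usage sketch (hypothetical caller): CMP_RESIZE_DBT grows a
+ * DB_DBT_USERMEM buffer to dbt->size when it is too small, updating
+ * dbt->ulen.  The expression evaluates to 0 on success, or to the
+ * __os_realloc() error:
+ *
+ *	dbt->size = needed;
+ *	if (CMP_RESIZE_DBT(ret, env, dbt) != 0)
+ *		return (ret);
+ */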
+
+static int
+__bam_compress_set_dbt(dbp, dbt, data, size)
+ DB *dbp;
+ DBT *dbt;
+ const void *data;
+ u_int32_t size;
+{
+ int ret;
+
+ ret = 0;
+ DB_ASSERT(dbp->env, F_ISSET(dbt, DB_DBT_USERMEM));
+
+ dbt->size = size;
+ if (CMP_RESIZE_DBT(ret, dbp->env, dbt) != 0)
+ return (ret);
+
+ memcpy(dbt->data, data, size);
+ return (0);
+}
+
+/******************************************************************************/
+
+/*
+ * Very simple key/data stream to give __bamc_compress_merge_insert()
+ * a source of data to work on.
+ */
+struct __bam_compress_stream;
+typedef struct __bam_compress_stream BTREE_COMPRESS_STREAM;
+struct __bam_compress_stream
+{
+ int (*next)(BTREE_COMPRESS_STREAM *, DBT *, DBT *);
+
+ void *kptr, *dptr;
+ DBT *key, *data;
+};
+
+/*
+ * These function prototypes cannot go at the beginning because they rely
+ * on BTREE_COMPRESS_STREAM, defined above.
+ * The prototypes are required to avoid the Microsoft C++ compiler generating
+ * warnings about mismatching parameter lists.
+ */
+static int __bam_cs_next_done __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_keyonly_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single_keyonly
+ __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_key_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_key __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_multiple_keyonly_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_keyonly
+ __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bamc_compress_merge_insert
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *, u_int32_t));
+static int __bamc_compress_merge_delete
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+static int __bamc_compress_merge_delete_dups
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+
+/* BTREE_COMPRESS_STREAM->next() for when the data has finished. */
+static int
+__bam_cs_next_done(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ COMPQUIET(stream, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (0);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key/data pair. */
+static int
+__bam_cs_single_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ key->data = stream->key->data;
+ key->size = stream->key->size;
+ data->data = stream->data->data;
+ data->size = stream->data->size;
+ stream->next = __bam_cs_next_done;
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key/data pair. */
+static void
+__bam_cs_create_single(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ stream->next = __bam_cs_single_next;
+ stream->key = key;
+ stream->data = data;
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key. */
+static int
+__bam_cs_single_keyonly_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ key->data = stream->key->data;
+ key->size = stream->key->size;
+ if (data != NULL) {
+ data->data = NULL;
+ data->size = 0;
+ }
+ stream->next = __bam_cs_next_done;
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key. */
+static void
+__bam_cs_create_single_keyonly(stream, key)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key;
+{
+ stream->next = __bam_cs_single_keyonly_next;
+ stream->key = key;
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE_KEY
+ * format.
+ */
+static int
+__bam_cs_multiple_key_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_KEY_NEXT(stream->kptr, stream->key, key->data, key->size,
+ data->data, data->size);
+ if (key->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE_KEY
+ * format.
+ */
+static void
+__bam_cs_create_multiple_key(stream, multiple)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *multiple;
+{
+ stream->next = __bam_cs_multiple_key_next;
+ stream->key = multiple;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for two buffers in the DB_MULTIPLE format. */
+static int
+__bam_cs_multiple_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size);
+ DB_MULTIPLE_NEXT(stream->dptr, stream->data, data->data, data->size);
+ if (key->data == NULL || data->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for two buffers in the DB_MULTIPLE format. */
+static void
+__bam_cs_create_multiple(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ stream->next = __bam_cs_multiple_next;
+ stream->key = key;
+ stream->data = data;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+ DB_MULTIPLE_INIT(stream->dptr, stream->data);
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE
+ * format.
+ */
+static int
+__bam_cs_multiple_keyonly_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size);
+ if (key->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ if (data != NULL) {
+ data->data = NULL;
+ data->size = 0;
+ }
+ return (1);
+}
+
+/*
+ * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE
+ * format.
+ */
+static void
+__bam_cs_create_multiple_keyonly(stream, key)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key;
+{
+ stream->next = __bam_cs_multiple_keyonly_next;
+ stream->key = key;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+}
+
+/******************************************************************************/
+
+/*
+ * Marshal data in initial data format into destbuf, resizing destbuf if
+ * necessary.
+ */
+static int
+__bam_compress_marshal_data(dbp, data, destbuf)
+ DB *dbp;
+ const DBT *data;
+ DBT *destbuf;
+{
+ int ret;
+ u_int8_t *ptr;
+
+ ret = 0;
+ DB_ASSERT(dbp->env, F_ISSET(destbuf, DB_DBT_USERMEM));
+
+ destbuf->size = __db_compress_count_int(data->size);
+ destbuf->size += data->size;
+ if (CMP_RESIZE_DBT(ret, dbp->env, destbuf) != 0)
+ return (ret);
+
+ ptr = (u_int8_t*)destbuf->data;
+ ptr += __db_compress_int(ptr, data->size);
+ memcpy(ptr, data->data, data->size);
+
+ return (0);
+}
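+
+/*
+ * For illustration (hypothetical bytes, assuming a small length
+ * encodes as a single literal byte): a 5-byte data item "hello"
+ * marshals as the length followed by the raw bytes,
+ * { 0x05, 'h', 'e', 'l', 'l', 'o' }.
+ */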
+
+/*
+ * Unmarshal initial data from source into data - does not copy, points
+ * into source.
+ */
+#define CMP_UNMARSHAL_DATA(src, dest) do { \
+ (dest)->data = ((u_int8_t*)(src)->data) + \
+ __db_decompress_int32((u_int8_t*)(src)->data, \
+ &(dest)->size); \
+} while (0)
+
+/******************************************************************************/
+
+/*
+ * __bam_compress_dupcmp --
+ * Duplicate comparison function for compressed BTrees.
+ *
+ * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_compress_dupcmp(db, a, b)
+ DB *db;
+ const DBT *a;
+ const DBT *b;
+{
+ DBT dcmp_a, dcmp_b;
+
+ /* Decompress the initial data in a */
+ CMP_UNMARSHAL_DATA(a, &dcmp_a);
+ dcmp_a.ulen = 0;
+ dcmp_a.doff = 0;
+ dcmp_a.dlen = 0;
+ dcmp_a.flags = 0;
+ dcmp_a.app_data = 0;
+
+ /* Decompress the initial data in b */
+ CMP_UNMARSHAL_DATA(b, &dcmp_b);
+ dcmp_b.ulen = 0;
+ dcmp_b.doff = 0;
+ dcmp_b.dlen = 0;
+ dcmp_b.flags = 0;
+ dcmp_b.app_data = 0;
+
+ /* Call the user's duplicate compare function */
+ return ((BTREE *)db->bt_internal)->
+ compress_dup_compare(db, &dcmp_a, &dcmp_b);
+}
+
+/*
+ * __bam_defcompress --
+ * Default compression routine.
+ *
+ * PUBLIC: int __bam_defcompress __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, DBT *));
+ */
+int
+__bam_defcompress(dbp, prevKey, prevData, key, data, dest)
+ DB *dbp;
+ const DBT *prevKey, *prevData, *key, *data;
+ DBT *dest;
+{
+ u_int8_t *ptr;
+ const u_int8_t *k, *p;
+ size_t len, prefix, suffix;
+
+ COMPQUIET(dbp, NULL);
+
+ k = (const u_int8_t*)key->data;
+ p = (const u_int8_t*)prevKey->data;
+ len = key->size > prevKey->size ? prevKey->size : key->size;
+ for (; len-- && *k == *p; ++k, ++p)
+ continue;
+
+ prefix = (size_t)(k - (u_int8_t*)key->data);
+ suffix = key->size - prefix;
+
+ if (prefix == prevKey->size && suffix == 0) {
+ /* It's a duplicate - do prefix compression on the value */
+ k = (const u_int8_t*)data->data;
+ p = (const u_int8_t*)prevData->data;
+ len = data->size > prevData->size ? prevData->size : data->size;
+ for (; len-- && *k == *p; ++k, ++p)
+ continue;
+
+ prefix = (size_t)(k - (u_int8_t*)data->data);
+ suffix = data->size - prefix;
+
+ /* Check that we have enough space in dest */
+ dest->size = (u_int32_t)(1 + __db_compress_count_int(prefix) +
+ __db_compress_count_int(suffix) + suffix);
+ if (dest->size > dest->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Magic identifying byte */
+ ptr = (u_int8_t*)dest->data;
+ *ptr = CMP_INT_SPARE_VAL;
+ ++ptr;
+
+ /* prefix length */
+ ptr += __db_compress_int(ptr, prefix);
+
+ /* suffix length */
+ ptr += __db_compress_int(ptr, suffix);
+
+ /* suffix */
+ memcpy(ptr, k, suffix);
+
+ return (0);
+ }
+
+ /* Check that we have enough space in dest */
+ dest->size = (u_int32_t)(__db_compress_count_int(prefix) +
+ __db_compress_count_int(suffix) +
+ __db_compress_count_int(data->size) + suffix + data->size);
+ if (dest->size > dest->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* prefix length */
+ ptr = (u_int8_t*)dest->data;
+ ptr += __db_compress_int(ptr, prefix);
+
+ /* suffix length */
+ ptr += __db_compress_int(ptr, suffix);
+
+ /* data length */
+ ptr += __db_compress_int(ptr, data->size);
+
+ /* suffix */
+ memcpy(ptr, k, suffix);
+ ptr += suffix;
+
+ /* data */
+ memcpy(ptr, data->data, data->size);
+
+ return (0);
+}
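+
+/*
+ * The layout produced above, summarized (integer widths depend on
+ * __db_compress_int()):
+ *	duplicate key:	CMP_INT_SPARE_VAL, prefix length, suffix length,
+ *			data suffix bytes
+ *	new key:	prefix length, suffix length, data length,
+ *			key suffix bytes, full data bytes
+ */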
+
+/*
+ * __bam_defdecompress --
+ * Default decompression routine.
+ *
+ * PUBLIC: int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *,
+ * PUBLIC: DBT *, DBT *));
+ */
+int
+__bam_defdecompress(dbp, prevKey, prevData, compressed, destKey, destData)
+ DB *dbp;
+ const DBT *prevKey, *prevData;
+ DBT *compressed, *destKey, *destData;
+{
+ u_int8_t *s, *d;
+ u_int32_t prefix, suffix, size;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+	 * Check for the magic identifying byte, which tells us that this is
+	 * a compressed duplicate value.
+ */
+ s = (u_int8_t*)compressed->data;
+ if (*s == CMP_INT_SPARE_VAL) {
+ ++s;
+ size = 1;
+
+ /* Unmarshal prefix and suffix */
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &prefix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &suffix);
+
+ /* Check destination lengths */
+ destKey->size = prevKey->size;
+ destData->size = prefix + suffix;
+ if (destKey->size > destKey->ulen ||
+ destData->size > destData->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Write the key */
+ memcpy(destKey->data, prevKey->data, destKey->size);
+
+ /* Write the prefix */
+ if (prefix > prevData->size)
+ return (EINVAL);
+ d = (u_int8_t*)destData->data;
+ memcpy(d, prevData->data, prefix);
+ d += prefix;
+
+ /* Write the suffix */
+ size += suffix;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(d, s, suffix);
+ s += suffix;
+
+ /* Return bytes read */
+ compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data);
+ return (0);
+ }
+
+ /* Unmarshal prefix, suffix and data length */
+ size = __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &prefix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &suffix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &destData->size);
+
+ /* Check destination lengths */
+ destKey->size = prefix + suffix;
+ if (destKey->size > destKey->ulen || destData->size > destData->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Write the prefix */
+ if (prefix > prevKey->size)
+ return (EINVAL);
+ d = (u_int8_t*)destKey->data;
+ memcpy(d, prevKey->data, prefix);
+ d += prefix;
+
+ /* Write the suffix */
+ size += suffix;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(d, s, suffix);
+ s += suffix;
+
+ /* Write the data */
+ size += destData->size;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(destData->data, s, destData->size);
+ s += destData->size;
+
+ /* Return bytes read */
+ compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data);
+ return (0);
+}
+
+/******************************************************************************/
+
+/*
+ * Set dbc up to start decompressing the compressed key/data pair in
+ * cp->key1 and cp->compressed.
+ */
+static int
+__bamc_start_decompress(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+ u_int32_t datasize;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cp->prevKey = NULL;
+ cp->prevData = NULL;
+ cp->currentKey = &cp->key1;
+ cp->currentData = &cp->data1;
+ cp->compcursor = (u_int8_t*)cp->compressed.data;
+ cp->compend = cp->compcursor + cp->compressed.size;
+ cp->prevcursor = NULL;
+ cp->prev2cursor = NULL;
+
+ /* Unmarshal the first data */
+ cp->compcursor += __db_decompress_int32(cp->compcursor, &datasize);
+ ret = __bam_compress_set_dbt(dbc->dbp,
+ cp->currentData, cp->compcursor, datasize);
+
+ if (ret == 0)
+ cp->compcursor += datasize;
+ return (ret);
+}
+
+/* Decompress the next key/data pair from cp->compressed. */
+static int
+__bamc_next_decompress(dbc)
+ DBC *dbc;
+{
+ DBT compressed;
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *db;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ db = dbc->dbp;
+
+ if (cp->compcursor >= cp->compend)
+ return (DB_NOTFOUND);
+
+ cp->prevKey = cp->currentKey;
+ cp->prevData = cp->currentData;
+ cp->prev2cursor = cp->prevcursor;
+ cp->prevcursor = cp->compcursor;
+
+ if (cp->currentKey == &cp->key1) {
+ cp->currentKey = &cp->key2;
+ cp->currentData = &cp->data2;
+ } else {
+ cp->currentKey = &cp->key1;
+ cp->currentData = &cp->data1;
+ }
+
+ compressed.flags = DB_DBT_USERMEM;
+ compressed.data = (void*)cp->compcursor;
+ compressed.ulen = compressed.size =
+ (u_int32_t)(cp->compend - cp->compcursor);
+ compressed.app_data = NULL;
+
+ while ((ret = ((BTREE *)db->bt_internal)->bt_decompress(db,
+ cp->prevKey, cp->prevData, &compressed,
+ cp->currentKey, cp->currentData)) == DB_BUFFER_SMALL) {
+ if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentKey) != 0)
+ break;
+ if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentData) != 0)
+ break;
+ }
+
+ if (ret == 0)
+ cp->compcursor += compressed.size;
+ return (ret);
+}
+
+/*
+ * Store key and data into destkey and destbuf, using the compression
+ * callback given.
+ */
+static int
+__bamc_compress_store(dbc, key, data, prevKey, prevData, destkey, destbuf)
+ DBC *dbc;
+ DBT *key, *data;
+ DBT **prevKey, **prevData;
+ DBT *destkey, *destbuf;
+{
+ int ret;
+ DBT dest;
+
+	if (*prevKey == NULL) {
+ if ((ret = __bam_compress_set_dbt(dbc->dbp,
+ destkey, key->data, key->size)) != 0)
+ return (ret);
+
+ /* Marshal data - resize if it won't fit */
+ ret = __bam_compress_marshal_data(dbc->dbp, data, destbuf);
+
+ } else if (((BTREE_CURSOR *)dbc->internal)->ovflsize > destbuf->size) {
+ /*
+ * Don't write more than cp->ovflsize bytes to the destination
+ * buffer - destbuf must be at least cp->ovflsize in size.
+ */
+ dest.flags = DB_DBT_USERMEM;
+ dest.data = (u_int8_t*)destbuf->data + destbuf->size;
+ dest.ulen =
+ ((BTREE_CURSOR *)dbc->internal)->ovflsize - destbuf->size;
+ dest.size = 0;
+ dest.app_data = NULL;
+
+ ret = ((BTREE *)dbc->dbp->bt_internal)->bt_compress(
+ dbc->dbp, *prevKey, *prevData, key, data, &dest);
+
+ if (ret == 0)
+ destbuf->size += dest.size;
+ } else
+ ret = DB_BUFFER_SMALL;
+
+ if (ret == 0) {
+ *prevKey = key;
+ *prevData = data;
+ }
+
+ return (ret);
+}
+
+/*
+ * Move dbc to the correct position to start linear searching for
+ * seek_key/seek_data - the biggest key smaller than or equal to
+ * seek_key/seek_data.
+ */
+static int
+__bamc_compress_seek(dbc, seek_key, seek_data, flags)
+ DBC *dbc;
+ const DBT *seek_key;
+ const DBT *seek_data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t method;
+ DB *dbp;
+ BTREE_CURSOR *cp;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if ((ret = __bam_compress_set_dbt(
+ dbp, &cp->key1, seek_key->data, seek_key->size)) != 0)
+ return (ret);
+
+ /*
+	 * We allow seek_data to be NULL for __bamc_compress_get_set() with
+	 * DB_SET.
+ */
+ if (F_ISSET(dbp, DB_AM_DUPSORT) && seek_data != NULL) {
+ if ((ret = __bam_compress_marshal_data(
+ dbp, seek_data, &cp->compressed)) != 0)
+ return (ret);
+
+ method = DB_GET_BOTH_LTE;
+ } else
+ method = DB_SET_LTE;
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, method | flags);
+
+ if (ret == 0 &&
+ F_ISSET(dbp, DB_AM_DUPSORT) && seek_data == NULL &&
+ __db_compare_both(dbp, seek_key, 0, &cp->key1, 0) == 0) {
+ /*
+ * Some entries for seek_key might be in the previous chunk,
+ * so we need to start searching there.
+ */
+ CMP_IGET_RETRY(ret,
+ dbc, &cp->key1, &cp->compressed, DB_PREV | flags);
+ if (ret == DB_NOTFOUND) {
+			/* No previous chunk; we need the first entry. */
+ CMP_IGET_RETRY(ret,
+ dbc, &cp->key1, &cp->compressed, DB_FIRST | flags);
+ }
+ }
+
+ return (ret);
+}
+
+/* Reset the cursor to an uninitialized state */
+static void
+__bamc_compress_reset(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->currentKey = 0;
+ cp->currentData = 0;
+ cp->compcursor = 0;
+ cp->compend = 0;
+ cp->prevcursor = 0;
+ cp->prev2cursor = 0;
+
+ F_CLR(cp, C_COMPRESS_DELETED|C_COMPRESS_MODIFIED);
+}
+
+/*
+ * Duplicate the cursor and delete the current entry, move the original cursor
+ * on and then close the cursor we used to delete. We do that to make sure that
+ * the close method runs __bamc_physdel(), and actually gets rid of the deleted
+ * entry!
+ */
+static int
+__bamc_compress_del_and_get_next(dbc, nextk, nextc)
+ DBC *dbc;
+ DBT *nextk, *nextc;
+{
+ int ret, ret_n;
+ DBC *dbc_n;
+
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION | DB_SHALLOW_DUP)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ if ((ret = __dbc_idel(dbc_n, 0)) != 0)
+ goto err;
+
+ /* Read the next position */
+ CMP_IGET_RETRY(ret, dbc, nextk, nextc, DB_NEXT);
+
+ err:
+ if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = ret_n;
+
+ /* No need to relocate this cursor */
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+
+ return (ret);
+}
+
+/*
+ * Duplicate the cursor, re-locate the position that this cursor pointed to
+ * using the duplicate (it may have been deleted), and then swap
+ * the cursors. We do that to make sure that the close method runs
+ * __bamc_physdel(), and gets rid of the entry that may have been deleted.
+ */
+static int
+__bamc_compress_relocate(dbc)
+ DBC *dbc;
+{
+ int ret, t_ret;
+ BTREE_CURSOR *cp, *cp_n;
+ DBC *dbc_n;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ /* Find the position after the deleted entry again */
+ ret = __bamc_compress_get_set(
+ dbc_n, &cp->del_key, &cp->del_data, 0, 0);
+ if (ret == DB_NOTFOUND) {
+ __bamc_compress_reset(dbc_n);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+
+ F_SET(cp_n, C_COMPRESS_DELETED);
+
+ } else if (cp->currentKey != NULL) {
+ /* Find the current entry again */
+ ret = __bamc_compress_get_set(
+ dbc_n, cp->currentKey, cp->currentData,
+ F_ISSET(dbc->dbp, DB_AM_DUPSORT) ? DB_GET_BOTH : DB_SET, 0);
+
+ if (ret == DB_NOTFOUND) {
+ /* The current entry has been deleted */
+ if ((ret = __bam_compress_set_dbt(dbc_n->dbp,
+ &cp_n->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbc_n->dbp,
+ &cp_n->del_data, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ return (ret);
+ F_SET(cp_n, C_COMPRESS_DELETED);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+ }
+
+ err:
+ /*
+ * Cleanup and cursor resolution. This also clears the
+ * C_COMPRESS_MODIFIED flag.
+ */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/******************************************************************************/
+
+#define CMP_STORE(key, data) do { \
+ while ((ret = __bamc_compress_store(dbc, (key), (data), \
+ &prevDestKey, &prevDestData, &destkey, &destbuf)) \
+ == DB_BUFFER_SMALL) { \
+ if ((ret = __dbc_iput(dbc, \
+ &destkey, &destbuf, DB_KEYLAST)) != 0) \
+ goto end; \
+ prevDestKey = NULL; \
+ prevDestData = NULL; \
+ destbuf.size = 0; \
+ } \
+} while (0)
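+
+/*
+ * A note on CMP_STORE (a summary, not from the original comments): a store
+ * returns DB_BUFFER_SMALL when the chunk being built in destbuf is full,
+ * so the macro writes the finished chunk with __dbc_iput() and retries the
+ * store into the now-empty buffer; prevDestKey/prevDestData are cleared
+ * because the new chunk has no previous entry for the compression callback
+ * to delta against.
+ */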
+
+/* Merge the sorted key/data pairs from stream into the compressed database. */
+static int
+__bamc_compress_merge_insert(dbc, stream, countp, flags)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DBT ikey1, ikey2, idata1, idata2, nextk, nextc, nextd, destkey, destbuf;
+ DBT *ikey, *idata, *prevIkey, *prevIdata, *prevDestKey, *prevDestData;
+ int ret, bulk_ret, cmp, nextExists, moreCompressed, iSmallEnough;
+ int moreStream;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey1, 0, sizeof(DBT));
+ memset(&ikey2, 0, sizeof(DBT));
+ memset(&idata1, 0, sizeof(DBT));
+ memset(&idata2, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+ CMP_INIT_DBT(&nextc);
+ memset(&nextd, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ prevIkey = NULL;
+ prevIdata = NULL;
+ ikey = &ikey1;
+ idata = &idata1;
+ if (stream->next(stream, ikey, idata) == 0)
+ goto end;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ while (moreStream != 0) {
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /* Seek the ikey/idata position */
+ ret = __bamc_compress_seek(dbc, ikey, idata, 0);
+ if (ret == 0) {
+ /*
+ * Delete the key - we might overwrite it below
+ * but it's safer to just always delete it, and it
+ * doesn't seem significantly slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk,
+ &nextc);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ } else
+ goto end;
+ ret = __bamc_start_decompress(dbc);
+ } else if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+
+ /* Read the next position */
+ CMP_IGET_RETRY(ret, dbc, &nextk, &nextc, DB_FIRST);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ }
+ }
+
+ if (ret != 0)
+ goto end;
+
+ /* !nextExists || ikey/idata < nextk/nextd */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0 || iSmallEnough != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(dbp, cp->currentKey,
+ cp->currentData, ikey, idata);
+
+ if (cmp < 0) {
+store_current: CMP_STORE(cp->currentKey, cp->currentData);
+ if (ret != 0)
+ goto end;
+ } else {
+ switch (flags) {
+ case DB_KEYLAST:
+ case DB_KEYFIRST:
+ case DB_NODUPDATA:
+ if (cmp == 0 && bulk_ret == 0 &&
+ F_ISSET(dbp, DB_AM_DUPSORT)) {
+ bulk_ret = __db_duperr(dbp,
+ flags);
+
+ /*
+ * Continue until we store
+ * the current chunk,
+ * but don't insert any
+ * more entries.
+ */
+ moreStream = 0;
+ iSmallEnough = 0;
+
+ goto store_current;
+ }
+ break;
+ default:
+ break;
+ }
+
+ CMP_STORE(ikey, idata);
+ if (ret != 0)
+ goto end;
+ ++chunk_count;
+
+ /*
+ * prevDestKey/prevDestData now point to
+ * the same DBTs as ikey/idata. We don't
+ * want to overwrite them, so swap them
+ * to point to the other DBTs.
+ */
+ if (ikey == &ikey1) {
+ ikey = &ikey2;
+ idata = &idata2;
+ prevIkey = &ikey1;
+ prevIdata = &idata1;
+ } else {
+ ikey = &ikey1;
+ idata = &idata1;
+ prevIkey = &ikey2;
+ prevIdata = &idata2;
+ }
+
+ do {
+ /* Get the next input key and data */
+ if (stream->next(
+ stream, ikey, idata) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ break;
+ }
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, __db_compare_both(dbp,
+ ikey, idata, prevIkey,
+ prevIdata) >= 0);
+#endif
+
+ /* Check for duplicates in the stream */
+ } while (__db_compare_both(dbp, ikey, idata,
+ prevIkey, prevIdata) == 0);
+
+ /*
+ * Check that !nextExists ||
+ * ikey/idata < nextk/nextd
+ */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp, ikey,
+ idata, &nextk, &nextd) >= 0)
+ iSmallEnough = 0;
+ }
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ if ((ret = __dbc_iput(
+ dbc, &destkey, &destbuf, DB_KEYLAST)) != 0)
+ goto end;
+
+ if (countp != NULL)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &nextk);
+ CMP_FREE_DBT(env, &nextc);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
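+
+/*
+ * In outline (a hand-written summary, not compiled): for each chunk that
+ * overlaps the stream, the loop above deletes the chunk, then merge-sorts
+ * its decompressed entries with the stream entries while re-compressing
+ * them into destbuf; whenever destbuf fills (CMP_STORE sees
+ * DB_BUFFER_SMALL) the finished chunk is written with __dbc_iput() and a
+ * fresh chunk is started, and any partial chunk left at the end of an
+ * iteration is flushed by the __dbc_iput() call at the bottom of the loop.
+ */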
+
+/******************************************************************************/
+
+/* Remove the sorted key/data pairs in stream from the compressed database. */
+static int
+__bamc_compress_merge_delete(dbc, stream, countp)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+{
+ DBT ikey, idata, nextk, nextc, nextd, destkey, destbuf, pdestkey;
+ DBT pdestdata;
+#ifdef DIAGNOSTIC
+ DBT pikey, pidata;
+#endif
+ DBT *prevDestKey, *prevDestData;
+ int ret, bulk_ret, cmp, moreCompressed, moreStream, nextExists;
+ int iSmallEnough;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey, 0, sizeof(DBT));
+ memset(&idata, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+ CMP_INIT_DBT(&nextc);
+ memset(&nextd, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&pdestkey);
+ CMP_INIT_DBT(&pdestdata);
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ if (stream->next(stream, &ikey, &idata) == 0)
+ goto end;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ while (moreStream != 0) {
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /* Seek the ikey/idata position */
+ if ((ret = __bamc_compress_seek(dbc, &ikey, &idata, 0)) != 0)
+ goto end;
+
+ /*
+ * Delete the key - we might overwrite it below but it's safer
+ * to just always delete it, and it doesn't seem significantly
+ * slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk, &nextc);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ } else
+ goto end;
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ goto end;
+
+ /* !nextExists || ikey/idata < nextk/nextd */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0 || iSmallEnough != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(dbp, cp->currentKey,
+ cp->currentData, &ikey, &idata);
+
+ if (cmp < 0) {
+ CMP_STORE(cp->currentKey, cp->currentData);
+ if (ret != 0)
+ goto end;
+
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestkey, cp->currentKey->data,
+ cp->currentKey->size)) != 0)
+ goto end;
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestdata, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ goto end;
+ prevDestKey = &pdestkey;
+ prevDestData = &pdestdata;
+ } else {
+ if (cmp != 0) {
+ /*
+ * Continue until we store the current
+ * chunk, but don't delete any more
+ * entries.
+ */
+ bulk_ret = DB_NOTFOUND;
+ moreStream = 0;
+ iSmallEnough = 0;
+ } else
+ ++chunk_count;
+
+#ifdef DIAGNOSTIC
+ pikey = ikey;
+ pidata = idata;
+#endif
+
+ /* Get the next input key and data */
+ if (stream->next(stream, &ikey, &idata) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ }
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, moreStream == 0 ||
+ __db_compare_both(dbp, &ikey, &idata,
+ &pikey, &pidata) >= 0);
+#endif
+
+ /*
+ * Check that !nextExists ||
+ * ikey/idata < nextk/nextd
+ */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp, &ikey,
+ &idata, &nextk, &nextd) >= 0)
+ iSmallEnough = 0;
+ }
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ if ((ret = __dbc_iput(
+ dbc, &destkey, &destbuf, DB_KEYLAST)) != 0)
+ goto end;
+
+ if (countp)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &pdestkey);
+ CMP_FREE_DBT(env, &pdestdata);
+ CMP_FREE_DBT(env, &nextk);
+ CMP_FREE_DBT(env, &nextc);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/*
+ * Remove the sorted keys in stream along with all duplicate values from
+ * the compressed database.
+ */
+static int
+__bamc_compress_merge_delete_dups(dbc, stream, countp)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+{
+ DBC *dbc_n;
+ DBT ikey, nextk, noread, destkey, destbuf, pdestkey, pdestdata;
+#ifdef DIAGNOSTIC
+ DBT pikey;
+#endif
+ DBT *prevDestKey, *prevDestData;
+ int ret, ret_n, bulk_ret, cmp, moreCompressed, moreStream, nextExists;
+ int iSmallEnough, ifound;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+
+ memset(&noread, 0, sizeof(DBT));
+ noread.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ CMP_INIT_DBT(&pdestkey);
+ CMP_INIT_DBT(&pdestdata);
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ if (stream->next(stream, &ikey, NULL) == 0)
+ goto end;
+ ifound = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ iSmallEnough = 0;
+ nextExists = 0;
+ while (moreStream != 0) {
+ if (iSmallEnough != 0) {
+ if (nextExists == 0) {
+ /*
+ * We've finished deleting the last key
+ * in the database
+ */
+ if (ifound == 0)
+ bulk_ret = DB_NOTFOUND;
+ else
+ ++chunk_count;
+ break;
+ }
+
+ /* Move to the next chunk */
+ CMP_IGET_RETRY(
+ ret, dbc, &cp->key1, &cp->compressed, DB_CURRENT);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ break;
+ } else if (ret != 0)
+ goto end;
+ } else
+ /* Seek the ikey position */
+ if ((ret =
+ __bamc_compress_seek(dbc, &ikey, NULL, 0)) != 0)
+ goto end;
+
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /*
+ * Delete the key - we might overwrite it below but it's
+ * safer to just always delete it, and it doesn't seem
+ * significantly slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk, &noread);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret != 0)
+ goto end;
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ goto end;
+
+ /* !nextExists || ikey <= nextk */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(
+ dbp, cp->currentKey, NULL, &ikey, NULL);
+
+ if (cmp < 0) {
+ if ((ret = __bamc_compress_store(dbc,
+ cp->currentKey, cp->currentData,
+ &prevDestKey,
+ &prevDestData, &destkey, &destbuf)) != 0)
+ goto end;
+
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestkey, cp->currentKey->data,
+ cp->currentKey->size)) != 0)
+ goto end;
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestdata, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ goto end;
+ prevDestKey = &pdestkey;
+ prevDestData = &pdestdata;
+ } else if (cmp > 0) {
+ if (ifound == 0) {
+ /*
+ * Continue until we store the
+ * current chunk, but don't delete
+ * any more entries.
+ */
+ bulk_ret = DB_NOTFOUND;
+ moreStream = 0;
+ iSmallEnough = 0;
+ } else
+ ++chunk_count;
+
+#ifdef DIAGNOSTIC
+ pikey = ikey;
+#endif
+
+ /* Get the next input key */
+ if (stream->next(stream, &ikey, NULL) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ }
+ ifound = 0;
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, moreStream == 0 ||
+ __db_compare_both(dbp, &ikey, NULL,
+ &pikey, NULL) >= 0);
+#endif
+
+ /* Check that !nextExists || ikey <= nextk */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp,
+ &ikey, NULL, &nextk, NULL) > 0)
+ iSmallEnough = 0;
+ } else /* cmp == 0 */
+ ifound = 1;
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ /*
+ * Do the DBC->put() with a duplicate cursor, so that
+ * the main cursor's position isn't changed - we might
+ * need it to be the same in order to use DB_CURRENT
+ * above.
+ */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ goto end;
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ ret = __dbc_iput(dbc_n, &destkey, &destbuf, DB_KEYLAST);
+
+ if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = ret_n;
+
+ if (ret != 0)
+ goto end;
+
+ if (countp)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &pdestkey);
+ CMP_FREE_DBT(env, &pdestdata);
+ CMP_FREE_DBT(env, &nextk);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/******************************************************************************/
+
+/* Implements DB_PREV and DB_LAST for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t tofind;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ F_CLR(cp, C_COMPRESS_DELETED);
+
+ if (cp->prevKey != NULL) {
+ /* Return the stored previous key */
+ cp->currentKey = cp->prevKey;
+ cp->currentData = cp->prevData;
+ cp->compcursor = cp->prevcursor;
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->prevcursor = cp->prev2cursor;
+ cp->prev2cursor = 0;
+ } else {
+ if (cp->currentKey == NULL) {
+ /* No current key, so fetch the last key */
+ flags |= DB_LAST;
+ tofind = (u_int32_t)-1;
+ } else if (cp->prevcursor == 0) {
+ /*
+ * The current key is at the beginning of the
+ * compressed block, so get the last key from the
+ * previous block
+ */
+ flags |= DB_PREV;
+ tofind = (u_int32_t)-1;
+ } else {
+ /*
+ * We have to search for the previous key in the
+ * current block
+ */
+ flags |= DB_CURRENT;
+ tofind = (u_int32_t)
+ (cp->prevcursor - (u_int8_t*)cp->compressed.data);
+ }
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+ if (ret != 0)
+ return (ret);
+
+ /* Decompress until we reach tofind */
+ ret = __bamc_start_decompress(dbc);
+ while (ret == 0 && tofind > (u_int32_t)
+ (cp->compcursor - (u_int8_t*)cp->compressed.data)) {
+ ret = __bamc_next_decompress(dbc);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ }
+
+ return (ret);
+}
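+
+/*
+ * In outline (summary only): the previous entry is found by one of three
+ * routes - it was cached in prevKey/prevData by the last forward step; or
+ * the cursor is uninitialized or sits at the start of a chunk, so the last
+ * or previous chunk is fetched and decompressed to its end (tofind ==
+ * (u_int32_t)-1); or the current chunk is re-decompressed from its start
+ * up to the byte offset of the previous entry (tofind == the offset of
+ * prevcursor within the chunk).
+ */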
+
+/* Implements DB_PREV_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_dup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (EINVAL);
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise we
+ * have to set it now.
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED)) {
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+ return (ret);
+
+ if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0)
+ return (DB_NOTFOUND);
+
+ return (0);
+}
+
+/* Implements DB_PREV_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_nodup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (__bamc_compress_get_prev(dbc, flags));
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise we
+ * have to set it now.
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED))
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /*
+ * Linear search for the next non-duplicate key - this is
+ * especially inefficient for DB_PREV_NODUP, since we have to
+ * decompress from the beginning of the chunk to find previous
+ * key/data pairs. Instead we could check for key equality as we
+ * decompress.
+ */
+ do
+ if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+ return (ret);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+ return (0);
+}
+
+/* Implements DB_NEXT and DB_FIRST for __bamc_compress_get() */
+static int
+__bamc_compress_get_next(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+ F_CLR(cp, C_COMPRESS_DELETED);
+ return (0);
+ } else if (cp->currentKey) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ flags |= DB_NEXT;
+ } else
+ flags |= DB_FIRST;
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * Reset the cursor, so that
+ * __bamc_compress_get_multiple_key will end up pointing
+ * to the right place
+ */
+ __bamc_compress_reset(dbc);
+ return (DB_NOTFOUND);
+ } else if (ret != 0)
+ return (ret);
+
+ ret = __bamc_start_decompress(dbc);
+
+ return (ret);
+}
+
+/* Implements DB_NEXT_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_dup(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ /*
+ * Check that the next entry has the same key as the
+ * deleted entry.
+ */
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+ F_CLR(cp, C_COMPRESS_DELETED);
+ return (t->bt_compare(dbp,
+ cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND);
+ } else if (cp->currentKey == 0)
+ return (EINVAL);
+
+ /* Check that the next entry has the same key as the previous entry */
+ ret = __bamc_next_decompress(dbc);
+ if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0)
+ return (DB_NOTFOUND);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ if (key == NULL) {
+ /* Copy the current key to del_key */
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ key = &cp->del_key;
+ }
+
+ /* Fetch the next chunk */
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, DB_NEXT | flags);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * Reset the cursor, so that __bamc_compress_get_multiple
+ * will end up pointing to the right place
+ */
+ __bamc_compress_reset(dbc);
+ return (DB_NOTFOUND);
+ } else if (ret != 0)
+ return (ret);
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ return (ret);
+
+ /* Check the keys are the same */
+ if (t->bt_compare(dbp, cp->currentKey, key) != 0)
+ return (DB_NOTFOUND);
+
+ return (0);
+}
+
+/* Implements DB_NEXT_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_nodup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (__bamc_compress_get_next(dbc, flags));
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise
+ * we have to set it now
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED))
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /* Linear search for the next non-duplicate key */
+ do
+ if ((ret = __bamc_compress_get_next(dbc, flags)) != 0)
+ return (ret);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+ return (ret);
+}
+
+/*
+ * Implements DB_SET, DB_SET_RANGE, DB_GET_BOTH, and DB_GET_BOTH_RANGE
+ * for __bamc_compress_get()
+ */
+static int
+__bamc_compress_get_set(dbc, key, data, method, flags)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t method;
+ u_int32_t flags;
+{
+ int ret, cmp;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ if (method == DB_SET || method == DB_SET_RANGE)
+ data = NULL;
+
+ F_CLR(cp, C_COMPRESS_DELETED);
+
+ ret = __bamc_compress_seek(dbc, key, data, flags);
+ if (ret == DB_NOTFOUND)
+ CMP_IGET_RETRY(ret, dbc,
+ &cp->key1, &cp->compressed, DB_FIRST | flags);
+ if (ret != 0)
+ return (ret);
+
+ /* Decompress and perform a linear search for the key */
+ cmp = 0;
+ ret = __bamc_start_decompress(dbc);
+ while (ret == 0 && (cmp = __db_compare_both(dbp,
+ cp->currentKey, cp->currentData, key, data)) < 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ CMP_IGET_RETRY(ret, dbc,
+ &cp->key1, &cp->compressed, DB_NEXT | flags);
+ if (ret == 0)
+ ret = __bamc_start_decompress(dbc);
+ }
+ }
+
+ switch (method) {
+ case DB_SET:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * We need to exactly match the key, and if cmp != 0 we
+ * might not have - so check again here.
+ */
+ if (ret == 0 &&
+ __db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) {
+ /* We didn't find the key */
+ ret = DB_NOTFOUND;
+ }
+ break;
+ case DB_GET_BOTH:
+ if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) &&
+ __bam_defcmp(dbp, cp->currentData, data) != 0))) {
+ /* We didn't find the key/data pair */
+ ret = DB_NOTFOUND;
+ }
+ break;
+ default:
+ DB_ASSERT(dbp->env, method == 0 || method == DB_SET_RANGE);
+ }
+
+ return (ret);
+}
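+
+/*
+ * In outline (summary only): the code above seeks to the chunk that could
+ * contain key/data, decompresses forward - moving into following chunks as
+ * needed - until it reaches the first entry sorting greater than or equal
+ * to key/data, and the method-specific checks then decide whether that
+ * entry is an acceptable match for DB_SET, DB_GET_BOTH, etc.
+ */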
+
+/* Implements DB_GET_BOTHC for __bamc_compress_get() */
+static int
+__bamc_compress_get_bothc(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ int ret, cmp;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ /*
+ * Check that the data we are looking for comes after the current
+ * position.
+ */
+ if (__db_compare_both(dbp, cp->currentKey,
+ cp->currentData, cp->currentKey, data) >= 0)
+ return (DB_NOTFOUND);
+
+ cmp = 0;
+ /* Perform a linear search for the data in the current chunk */
+ while ((ret = __bamc_next_decompress(dbc)) == 0 &&
+ (cmp = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData, cp->prevKey, data)) < 0)
+ continue;
+
+ if (ret == 0)
+ return (cmp == 0 ? 0 : DB_NOTFOUND);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ /* Copy the current key to del_key */
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /* Search for the data using DB_GET_BOTH */
+ return (__bamc_compress_get_set(
+ dbc, &cp->del_key, data, DB_GET_BOTH, flags));
+}
+
+/* Implements DB_MULTIPLE_KEY for __bamc_compress_get() */
+static int
+__bamc_compress_get_multiple_key(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int8_t *writekey, *writedata;
+ void *mptr;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ DB_MULTIPLE_WRITE_INIT(mptr, data);
+ DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey, cp->currentKey->size,
+ writedata, cp->currentData->size);
+ if (writekey == NULL) {
+ data->size = cp->currentKey->size + cp->currentData->size +
+ 4 * sizeof(u_int32_t);
+ return (DB_BUFFER_SMALL);
+ }
+ DB_ASSERT(dbc->dbp->env, writedata != NULL);
+
+ memcpy(writekey, cp->currentKey->data, cp->currentKey->size);
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+
+ while ((ret = __bamc_compress_get_next(dbc, flags)) == 0) {
+ DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey,
+ cp->currentKey->size, writedata, cp->currentData->size);
+ if (writekey == NULL)
+ break;
+ DB_ASSERT(dbc->dbp->env, writedata != NULL);
+
+ /*
+ * We could choose to optimize this by just storing one
+ * copy of a key for each set of duplicate data.
+ */
+ memcpy(writekey, cp->currentKey->data, cp->currentKey->size);
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if (ret == 0)
+ /*
+ * Rewind to the previous key/data, since we can't fit
+ * this one in the buffer
+ */
+ ret = __bamc_compress_get_prev(dbc, flags);
+
+ return (ret);
+}
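+
+/*
+ * A minimal sketch of how an application walks a buffer filled by the
+ * code above, using the public bulk macros from db.h; the "buf" DBT is
+ * hypothetical and must have been returned by a DB_MULTIPLE_KEY get:
+ *
+ *	void *p;
+ *	DBT k, d;
+ *	for (DB_MULTIPLE_INIT(p, &buf);;) {
+ *		DB_MULTIPLE_KEY_NEXT(p, &buf, k.data, k.size,
+ *		    d.data, d.size);
+ *		if (p == NULL)
+ *			break;
+ *		(process k.data/k.size and d.data/d.size)
+ *	}
+ */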
+
+/* Implements DB_MULTIPLE for __bamc_compress_get() */
+static int
+__bamc_compress_get_multiple(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int8_t *writedata;
+ void *mptr;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ data->size = 0;
+
+ DB_MULTIPLE_WRITE_INIT(mptr, data);
+ DB_MULTIPLE_RESERVE_NEXT(mptr, data, writedata, cp->currentData->size);
+ data->size += cp->currentData->size + 2 * sizeof(u_int32_t);
+ if (writedata == NULL)
+ return (DB_BUFFER_SMALL);
+
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+
+ while ((ret = __bamc_compress_get_next_dup(dbc, key, flags)) == 0) {
+ DB_MULTIPLE_RESERVE_NEXT(
+ mptr, data, writedata, cp->currentData->size);
+ data->size += cp->currentData->size + 2 * sizeof(u_int32_t);
+ if (writedata == NULL) {
+ /*
+ * DBC_FROM_DB_GET indicates we need to fit all the
+ * duplicates into the buffer or return DB_BUFFER_SMALL.
+ * [#17039]
+ */
+ if (F_ISSET(dbc, DBC_FROM_DB_GET))
+ return (DB_BUFFER_SMALL);
+ break;
+ }
+
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if (ret == 0)
+ /*
+ * Rewind to the previous key/data, as that's now our current
+ * entry.
+ */
+ ret = __bamc_compress_get_prev(dbc, flags);
+
+ return (ret);
+}
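+
+/*
+ * Unlike the DB_MULTIPLE_KEY buffer above, a DB_MULTIPLE buffer carries
+ * data items only (the duplicates of the single key returned alongside
+ * it), so an application walks it with DB_MULTIPLE_NEXT; "buf" is again
+ * hypothetical:
+ *
+ *	void *p;
+ *	DBT d;
+ *	for (DB_MULTIPLE_INIT(p, &buf);;) {
+ *		DB_MULTIPLE_NEXT(p, &buf, d.data, d.size);
+ *		if (p == NULL)
+ *			break;
+ *		(process one duplicate data item)
+ *	}
+ */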
+
+/*
+ * __bamc_compress_iget --
+ * Get using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t multiple, method;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ ret = 0;
+
+ multiple = flags & (DB_MULTIPLE|DB_MULTIPLE_KEY);
+ method = flags & DB_OPFLAGS_MASK;
+ flags = flags & ~(DB_OPFLAGS_MASK|DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ switch (method) {
+ case DB_CURRENT:
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ ret = DB_KEYEMPTY;
+ else if (cp->currentKey == NULL)
+ ret = EINVAL;
+ break;
+ case DB_FIRST:
+ __bamc_compress_reset(dbc);
+ ret = __bamc_compress_get_next(dbc, flags);
+ break;
+ case DB_NEXT:
+ ret = __bamc_compress_get_next(dbc, flags);
+ break;
+ case DB_NEXT_DUP:
+ ret = __bamc_compress_get_next_dup(dbc, 0, flags);
+ break;
+ case DB_NEXT_NODUP:
+ ret = __bamc_compress_get_next_nodup(dbc, flags);
+ break;
+ case DB_LAST:
+ __bamc_compress_reset(dbc);
+ ret = __bamc_compress_get_prev(dbc, flags);
+ break;
+ case DB_PREV:
+ ret = __bamc_compress_get_prev(dbc, flags);
+ break;
+ case DB_PREV_DUP:
+ ret = __bamc_compress_get_prev_dup(dbc, flags);
+ break;
+ case DB_PREV_NODUP:
+ ret = __bamc_compress_get_prev_nodup(dbc, flags);
+ break;
+ case DB_SET:
+ if (((BTREE *)
+ dbc->dbp->bt_internal)->bt_compare == __bam_defcmp)
+ F_SET(key, DB_DBT_ISSET);
+ /* FALL THROUGH */
+ case DB_SET_RANGE:
+ ret = __bamc_compress_get_set(dbc, key, 0, method, flags);
+ break;
+ case DB_GET_BOTH:
+ if (!F_ISSET(dbc->dbp, DB_AM_DUPSORT) || ((BTREE *)dbc->dbp->
+ bt_internal)->compress_dup_compare == __bam_defcmp)
+ F_SET(data, DB_DBT_ISSET);
+ /* FALL THROUGH */
+ case DB_GET_BOTH_RANGE:
+ if (((BTREE *)
+ dbc->dbp->bt_internal)->bt_compare == __bam_defcmp)
+ F_SET(key, DB_DBT_ISSET);
+ ret = __bamc_compress_get_set(dbc, key, data, method, flags);
+ break;
+ case DB_GET_BOTHC:
+ ret = __bamc_compress_get_bothc(dbc, data, flags);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget",
+ method);
+ break;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ switch (multiple) {
+ case 0:
+ if (!F_ISSET(key, DB_DBT_ISSET))
+ ret = __db_retcopy(dbc->env, key,
+ cp->currentKey->data, cp->currentKey->size,
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ if (!F_ISSET(data, DB_DBT_ISSET) && ret == 0)
+ ret = __db_retcopy(dbc->env, data,
+ cp->currentData->data, cp->currentData->size,
+ &dbc->rdata->data, &dbc->rdata->ulen);
+ break;
+ case DB_MULTIPLE:
+ if (!F_ISSET(key, DB_DBT_ISSET))
+ ret = __db_retcopy(dbc->env, key,
+ cp->currentKey->data, cp->currentKey->size,
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ if (ret == 0)
+ ret =
+ __bamc_compress_get_multiple(dbc, key, data, flags);
+ break;
+ case DB_MULTIPLE_KEY:
+ ret = __bamc_compress_get_multiple_key(dbc, data, flags);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget",
+ multiple);
+ break;
+ }
+
+ err:
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ return (ret);
+}
+
+/*
+ * __bamc_compress_get --
+ * Get using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal,
+ C_COMPRESS_MODIFIED) &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+ tmp_flags = 0;
+ break;
+ }
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if ((ret = __bamc_compress_iget(dbc_n, key, data, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_iput --
+ * Put using a compressed cursor (internal)
+ */
+static int
+__bamc_compress_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t multi;
+ DBT kcpy, pdata, empty;
+ BTREE_COMPRESS_STREAM stream;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ ENV *env;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbc->env;
+
+ memset(&pdata, 0, sizeof(DBT));
+ memset(&empty, 0, sizeof(DBT));
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ if (flags == 0)
+ flags = DB_KEYLAST;
+
+ switch (flags) {
+ case DB_CURRENT:
+ if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto end;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __db_buildpartial(
+ dbp, cp->currentData, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ if (F_ISSET(dbp, DB_AM_DUPSORT) &&
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare(
+ dbp, cp->currentData, data) != 0) {
+ __db_errx(env, DB_STR("1032",
+ "Existing data sorts differently from put data"));
+ ret = EINVAL;
+ goto end;
+ }
+ CMP_INIT_DBT(&kcpy);
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &kcpy, cp->currentKey->data, cp->currentKey->size)) != 0)
+ goto end;
+
+ __bam_cs_create_single(&stream, &kcpy, data);
+ ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, &kcpy, data, DB_GET_BOTH_RANGE, 0);
+
+ CMP_FREE_DBT(env, &kcpy);
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ switch (multi) {
+ case 0:
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __bamc_compress_get_set(dbc, key,
+ data, DB_SET, 0)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto end;
+ if ((ret = __db_buildpartial(dbp,
+ ret == DB_NOTFOUND ? &empty :
+ cp->currentData, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ __bam_cs_create_single(&stream, key, data);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, key, data, DB_GET_BOTH_RANGE, 0);
+ break;
+ case DB_MULTIPLE:
+ if ((ret = __bam_compress_check_sort_multiple(dbp,
+ key, data)) != 0)
+ goto end;
+ __bam_cs_create_multiple(&stream, key, data);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, &key->doff, flags);
+ break;
+ case DB_MULTIPLE_KEY:
+ if ((ret = __bam_compress_check_sort_multiple_key(dbp,
+ key)) != 0)
+ goto end;
+ __bam_cs_create_multiple_key(&stream, key);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, &key->doff, flags);
+ break;
+ default:
+ return (__db_unknown_flag(
+ dbp->env, "__bamc_compress_iput", multi));
+ }
+ break;
+ case DB_NOOVERWRITE:
+ /* Check key doesn't already exist */
+ ret = __bamc_compress_get_set(dbc, key, 0, DB_SET, 0);
+ if (ret != DB_NOTFOUND) {
+ if (ret == 0)
+ ret = DB_KEYEXIST;
+ goto end;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __db_buildpartial(
+ dbp, &empty, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ __bam_cs_create_single(&stream, key, data);
+ ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, key, data, DB_GET_BOTH_RANGE, 0);
+ break;
+ default:
+ return (__db_unknown_flag(
+ dbp->env, "__bamc_compress_iput", flags));
+ }
+
+ end:
+ if (pdata.data != NULL)
+ __os_free(env, pdata.data);
+ return (ret);
+}
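+
+/*
+ * A minimal sketch of building the sorted bulk buffer consumed by the
+ * DB_MULTIPLE_KEY case above, using the public write macros from db.h;
+ * the buffer size and the key/data strings are hypothetical, and the
+ * pairs must already be in sorted order:
+ *
+ *	DBT buf;
+ *	void *p;
+ *	memset(&buf, 0, sizeof(DBT));
+ *	buf.ulen = 64 * 1024;
+ *	buf.flags = DB_DBT_USERMEM | DB_DBT_BULK;
+ *	if ((buf.data = malloc(buf.ulen)) == NULL)
+ *		return (ENOMEM);
+ *	DB_MULTIPLE_WRITE_INIT(p, &buf);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &buf, "apple", 5, "red", 3);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &buf, "pear", 4, "green", 5);
+ *	(p == NULL here would mean the buffer was too small)
+ *	ret = dbp->put(dbp, NULL, &buf, NULL, DB_MULTIPLE_KEY);
+ */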
+
+/*
+ * __bamc_compress_put --
+ * Put using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n;
+ int ret, t_ret;
+
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED)) {
+ if ((flags & DB_OPFLAGS_MASK) == DB_CURRENT &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+ }
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n,
+ (flags & DB_OPFLAGS_MASK) == DB_CURRENT ?
+ DB_POSITION : 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+
+ if ((ret = __bamc_compress_iput(dbc_n, key, data, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_idel --
+ * Del using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_COMPRESS_STREAM stream;
+ DB *dbp;
+ BTREE_CURSOR *cp;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ return (DB_KEYEMPTY);
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ goto err;
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_data,
+ cp->currentData->data, cp->currentData->size)) != 0)
+ goto err;
+
+ __bam_cs_create_single(&stream, &cp->del_key, &cp->del_data);
+ if ((ret = __bamc_compress_merge_delete(dbc, &stream, NULL)) != 0)
+ goto err;
+
+ /* Position the cursor on the entry after the key/data deleted */
+ ret = __bamc_compress_get_set(dbc, &cp->del_key, &cp->del_data, 0, 0);
+ if (ret == DB_NOTFOUND) {
+ __bamc_compress_reset(dbc);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+
+ /* Mark current as being deleted */
+ F_SET(cp, C_COMPRESS_DELETED);
+
+ err:
+ return (ret);
+}
+
+/*
+ * __bamc_compress_del --
+ * Del using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_del __P((DBC *, u_int32_t));
+ */
+int
+__bamc_compress_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+ DBC *dbc_n;
+
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED) &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if ((ret = __bamc_compress_idel(dbc_n, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_ibulk_del --
+ * Bulk del using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_ibulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_COMPRESS_STREAM stream;
+
+ switch (flags) {
+ case 0:
+ __bam_cs_create_single_keyonly(&stream, key);
+ return (__bamc_compress_merge_delete_dups(dbc, &stream, NULL));
+ case DB_MULTIPLE:
+ if ((ret = __bam_compress_check_sort_multiple_keyonly(
+ dbc->dbp, key)) != 0)
+ return (ret);
+ __bam_cs_create_multiple_keyonly(&stream, key);
+ return (__bamc_compress_merge_delete_dups(
+ dbc, &stream, &key->doff));
+ case DB_MULTIPLE_KEY:
+ if ((ret = __bam_compress_check_sort_multiple_key(
+ dbc->dbp, key)) != 0)
+ return (ret);
+ __bam_cs_create_multiple_key(&stream, key);
+ return (__bamc_compress_merge_delete(dbc, &stream, &key->doff));
+ default:
+ break;
+ }
+
+ return (__db_unknown_flag(
+ dbc->env, "__bamc_compress_ibulk_del", flags));
+}
+
+/*
+ * __bamc_compress_bulk_del --
+ * Bulk del using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_bulk_del __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+ DBC *dbc_n;
+
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+
+ if ((ret = __bamc_compress_ibulk_del(dbc_n, key, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_count --
+ * Count using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_count __P((DBC *, db_recno_t *));
+ */
+int
+__bamc_compress_count(dbc, countp)
+ DBC *dbc;
+ db_recno_t *countp;
+{
+ int ret, t_ret;
+ db_recno_t count;
+ DBT *key;
+ DBC *dbc_n;
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If the current entry is deleted use del_key, otherwise use
+ * currentKey.
+ */
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ key = &cp->del_key;
+ else
+ key = cp->currentKey;
+
+ /* Duplicate the cursor */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+
+ /* We don't care about preserving the cursor's position on error */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ /* Find the first duplicate */
+ if ((ret = __bamc_compress_get_set(dbc_n, key, 0, DB_SET, 0)) != 0)
+ goto err;
+ count = 1;
+
+ /* Count subsequent duplicates */
+ while ((ret = __bamc_compress_get_next_dup(dbc_n, key, 0)) == 0)
+ ++count;
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+
+ *countp = count;
+
+ err:
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
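+
+/*
+ * A sketch of the public call that reaches the function above; the cursor
+ * must already be positioned on an entry (the positioning get is shown
+ * with hypothetical key/data DBTs):
+ *
+ *	db_recno_t n;
+ *	if ((ret = dbc->get(dbc, &key, &data, DB_SET)) == 0)
+ *		ret = dbc->count(dbc, &n, 0);
+ */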
+
+/*
+ * __bamc_compress_cmp --
+ * Compare which compressed value is pointed to.
+ *
+ * PUBLIC: int __bamc_compress_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__bamc_compress_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DB *dbp;
+ BTREE_CURSOR *cp, *ocp;
+
+ /*
+ * At this point, we already know that the cursors point to the same
+ * DB.
+ */
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ocp = (BTREE_CURSOR *)other_dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ if (F_ISSET(ocp, C_COMPRESS_DELETED))
+ *result = __db_compare_both(
+ dbp, &cp->del_key, &cp->del_data,
+ &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1;
+ else {
+ if (ocp->currentKey == 0)
+ goto err;
+
+ *result = __db_compare_both(
+ dbp, &cp->del_key, &cp->del_data,
+ ocp->currentKey, ocp->currentData) == 0 ? 0 : 1;
+ }
+ else {
+ if (cp->currentKey == 0)
+ goto err;
+
+ if (F_ISSET(ocp, C_COMPRESS_DELETED))
+ *result = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData,
+ &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1;
+ else {
+ if (ocp->currentKey == 0)
+ goto err;
+
+ *result = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData,
+ ocp->currentKey, ocp->currentData) == 0 ? 0 : 1;
+ }
+ }
+ return (0);
+
+ err:
+ __db_errx(dbc->env, DB_STR("1033",
+ "Both cursors must be initialized before calling DBC->cmp."));
+ return (EINVAL);
+}
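+
+/*
+ * A sketch of the public call that reaches the function above; both
+ * cursors must be initialized, and *result is 0 only when they point to
+ * the same key/data pair:
+ *
+ *	int same;
+ *	if ((ret = dbc->cmp(dbc, other_dbc, &same, 0)) == 0 && same == 0)
+ *		(the two cursors are positioned on the same item)
+ */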
+
+/*
+ * __bamc_compress_dup --
+ * Duplicate the compression specific part of a btree cursor.
+ *
+ * PUBLIC: int __bamc_compress_dup __P((DBC *, DBC *, u_int32_t));
+ */
+int
+__bamc_compress_dup(orig_dbc, new_dbc, flags)
+ DBC *orig_dbc, *new_dbc;
+ u_int32_t flags;
+{
+ int ret;
+ DB *dbp;
+ BTREE_CURSOR *orig, *new;
+
+ dbp = new_dbc->dbp;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ if (orig->currentKey != NULL && !LF_ISSET(DB_SHALLOW_DUP)) {
+ new->currentKey = &new->key1;
+ new->currentData = &new->data1;
+
+ if ((ret = __bam_compress_set_dbt(dbp, new->currentKey,
+ orig->currentKey->data, orig->currentKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, new->currentData,
+ orig->currentData->data, orig->currentData->size)) != 0)
+ return (ret);
+
+ if (orig->prevKey) {
+ new->prevKey = &new->key2;
+ new->prevData = &new->data2;
+
+ if ((ret = __bam_compress_set_dbt(dbp, new->prevKey,
+ orig->prevKey->data, orig->prevKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, new->prevData,
+ orig->prevData->data, orig->prevData->size)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __bam_compress_set_dbt(dbp, &new->compressed,
+ orig->compressed.data, orig->compressed.size)) != 0)
+ return (ret);
+
+ new->compcursor = (u_int8_t*)new->compressed.data +
+ (orig->compcursor - (u_int8_t*)orig->compressed.data);
+ new->compend = (u_int8_t*)new->compressed.data +
+ (orig->compend - (u_int8_t*)orig->compressed.data);
+ new->prevcursor = orig->prevcursor == NULL ? NULL :
+ (u_int8_t*)new->compressed.data + (orig->prevcursor -
+ (u_int8_t*)orig->compressed.data);
+ new->prev2cursor = orig->prev2cursor == NULL ? NULL :
+ (u_int8_t*)new->compressed.data + (orig->prev2cursor -
+ (u_int8_t*)orig->compressed.data);
+
+ if (F_ISSET(orig, C_COMPRESS_DELETED)) {
+ if ((ret = __bam_compress_set_dbt(dbp, &new->del_key,
+ orig->del_key.data, orig->del_key.size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, &new->del_data,
+ orig->del_data.data, orig->del_data.size)) != 0)
+ return (ret);
+ }
+ }
+
+ return (0);
+}
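+
+/*
+ * A note on the pointer arithmetic above (summary): compcursor, compend,
+ * prevcursor and prev2cursor all point into compressed.data, so after the
+ * buffer is copied they are rebased by byte offset, equivalent to:
+ *
+ *	offset = orig->compcursor - (u_int8_t *)orig->compressed.data;
+ *	new->compcursor = (u_int8_t *)new->compressed.data + offset;
+ */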
+
+/*
+ * __bam_compress_salvage --
+ * Salvage the compressed data from the key/data pair
+ *
+ * PUBLIC: int __bam_compress_salvage __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: void *, int (*)(void *, const void *), DBT *, DBT *));
+ */
+int
+__bam_compress_salvage(dbp, vdp, handle, callback, key, data)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key, *data;
+{
+ DBT key1, key2, data1, data2, compressed;
+ DBT *currentKey, *currentData, *prevKey, *prevData;
+ ENV *env;
+ int ret, t_ret;
+ u_int8_t *compcursor, *compend;
+ u_int32_t datasize, size;
+
+ env = dbp->env;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ memset(&compressed, 0, sizeof(DBT));
+
+ key1.flags = DB_DBT_USERMEM;
+ key2.flags = DB_DBT_USERMEM;
+ data1.flags = DB_DBT_USERMEM;
+ data2.flags = DB_DBT_USERMEM;
+ compressed.flags = DB_DBT_USERMEM;
+
+ prevKey = NULL;
+ prevData = NULL;
+ currentKey = key;
+ currentData = &data2;
+ compcursor = (u_int8_t*)data->data;
+ compend = compcursor + data->size;
+
+ if (data->size == 0) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+
+ /* Unmarshal the first data */
+ size = __db_decompress_count_int(compcursor);
+ if (size == 0xFF || compcursor + size > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+ compcursor += __db_decompress_int32(compcursor, &datasize);
+
+ if (compcursor + datasize > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+ if ((ret = __bam_compress_set_dbt(
+ dbp, currentData, compcursor, datasize)) != 0)
+ goto err;
+ compcursor += datasize;
+
+ /* Output first data (first key has already been output by our caller) */
+ if ((ret = __db_vrfy_prdbt(
+ currentData, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ goto err;
+
+ while (compcursor < compend) {
+ prevKey = currentKey;
+ prevData = currentData;
+
+ if (currentKey == &key1) {
+ currentKey = &key2;
+ currentData = &data2;
+ } else {
+ currentKey = &key1;
+ currentData = &data1;
+ }
+
+ compressed.data = (void*)compcursor;
+ compressed.ulen = compressed.size =
+ (u_int32_t)(compend - compcursor);
+
+ /* Decompress the next key/data pair */
+ while ((ret = ((BTREE *)dbp->bt_internal)->bt_decompress(
+ dbp, prevKey, prevData,
+ &compressed, currentKey, currentData)) == DB_BUFFER_SMALL) {
+ if (CMP_RESIZE_DBT(ret, env, currentKey) != 0)
+ break;
+ if (CMP_RESIZE_DBT(ret, env, currentData) != 0)
+ break;
+ }
+
+ if (ret == EINVAL) {
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+ compcursor += compressed.size;
+
+ if (compcursor > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ }
+
+ /* Output the next key/data pair */
+ if ((ret = __db_vrfy_prdbt(
+ currentKey, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_prdbt(
+ currentData, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ goto err;
+ }
+
+ if (0) {
+ unknown_data:
+ /*
+ * Make sure we output a data value for the key that's
+ * already been output
+ */
+ DB_INIT_DBT(
+ compressed, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+ if ((t_ret = __db_vrfy_prdbt(
+ &compressed, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ ret = t_ret;
+ }
+
+ err:
+ __os_free(env, key1.data);
+ __os_free(env, key2.data);
+ __os_free(env, data1.data);
+ __os_free(env, data2.data);
+ return (ret);
+}
+
+/*
+ * __bam_compress_count --
+ * Calculate key and entry counts for the compressed BTree
+ *
+ * PUBLIC: int __bam_compress_count __P((DBC *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_compress_count(dbc, nkeysp, ndatap)
+ DBC *dbc;
+ u_int32_t *nkeysp, *ndatap;
+{
+ int ret, t_ret;
+ u_int32_t nkeys, ndata;
+ DB *dbp;
+ BTREE *t;
+ DBC *dbc_n;
+ BTREE_CURSOR *cp_n;
+
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ /* Duplicate the cursor */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+
+ /* We don't care about preserving the cursor's position on error */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+ nkeys = 0;
+ ndata = 0;
+
+ CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1, &cp_n->compressed, DB_FIRST);
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __bamc_start_decompress(dbc_n)) != 0)
+ goto err;
+ nkeys += 1;
+
+ for (;;) {
+ ndata += 1;
+
+ ret = __bamc_next_decompress(dbc_n);
+ if (ret == DB_NOTFOUND) {
+ if (cp_n->currentKey == &cp_n->key1) {
+ /*
+ * Make sure that the previous key isn't
+ * overwritten when we fetch the next chunk.
+ */
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &cp_n->key2, cp_n->key1.data,
+ cp_n->key1.size)) != 0)
+ goto err;
+ }
+
+ CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1,
+ &cp_n->compressed, DB_NEXT);
+ if (ret != 0)
+ goto err;
+
+ ret = __bamc_start_decompress(dbc_n);
+
+ cp_n->prevKey = &cp_n->key2;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0)
+ nkeys += 1;
+ }
+
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0) {
+ if (nkeysp != NULL)
+ *nkeysp = nkeys;
+ if (ndatap != NULL)
+ *ndatap = ndata;
+ }
+
+ return (ret);
+}
+
+/*
+ * Check if the key/data pairs in the bulk buffer are sorted.
+ */
+static int
+__bam_compress_check_sort_multiple_key(dbp, key)
+ DB *dbp;
+ DBT *key;
+{
+#ifdef DIAGNOSTIC
+ void *kptr;
+ DBT key1, data1, key2, data2;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ DB_MULTIPLE_INIT(kptr, key);
+ DB_MULTIPLE_KEY_NEXT(kptr, key,
+ key2.data, key2.size, data2.data, data2.size);
+ /* No key/data pair in the bulk buffer */
+ if (kptr == NULL)
+ return (0);
+
+ for (;;) {
+ DB_MULTIPLE_KEY_NEXT(kptr, key,
+ key1.data, key1.size, data1.data, data1.size);
+ if (kptr == NULL)
+ break;
+ if (__db_compare_both(dbp, &key1, &data1, &key2, &data2) < 0) {
+ __db_errx(dbp->env, DB_STR("1170",
+ "The key/data pairs in the buffer are not sorted."));
+ return (EINVAL);
+ }
+ key2.data = key1.data;
+ key2.size = key1.size;
+ data2.data = data1.data;
+ data2.size = data1.size;
+ }
+#else
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(key, NULL);
+#endif
+ return (0);
+}
+
+/*
+ * Check if the key/data pairs in the bulk buffer are sorted.
+ */
+static int
+__bam_compress_check_sort_multiple(dbp, key, data)
+ DB *dbp;
+ DBT *key, *data;
+{
+#ifdef DIAGNOSTIC
+ void *kptr, *dptr;
+ DBT key1, data1, key2, data2;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ DB_MULTIPLE_INIT(kptr, key);
+ DB_MULTIPLE_INIT(dptr, data);
+ DB_MULTIPLE_NEXT(kptr, key, key2.data, key2.size);
+ DB_MULTIPLE_NEXT(dptr, data, data2.data, data2.size);
+ /* No key/data pair in the bulk buffer */
+ if (kptr == NULL || dptr == NULL)
+ return (0);
+
+ for (;;) {
+ DB_MULTIPLE_NEXT(kptr, key, key1.data, key1.size);
+ DB_MULTIPLE_NEXT(dptr, data, data1.data, data1.size);
+ if (kptr == NULL || dptr == NULL)
+ break;
+ if (__db_compare_both(dbp, &key1, &data1, &key2, &data2) < 0) {
+ __db_errx(dbp->env, DB_STR("1171",
+ "The key/data pairs in the buffer are not sorted."));
+ return (EINVAL);
+ }
+ key2.data = key1.data;
+ key2.size = key1.size;
+ data2.data = data1.data;
+ data2.size = data1.size;
+ }
+#else
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+#endif
+ return (0);
+}
+
+/*
+ * Check if the keys in the bulk buffer are sorted.
+ */
+static int
+__bam_compress_check_sort_multiple_keyonly(dbp, key)
+ DB *dbp;
+ DBT *key;
+{
+#ifdef DIAGNOSTIC
+ void *kptr;
+ DBT key1, key2;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+
+ DB_MULTIPLE_INIT(kptr, key);
+ DB_MULTIPLE_NEXT(kptr, key, key2.data, key2.size);
+ /* No DBT item in the bulk buffer */
+ if (kptr == NULL)
+ return (0);
+
+ for (;;) {
+ DB_MULTIPLE_NEXT(kptr, key, key1.data, key1.size);
+ if (kptr == NULL)
+ break;
+ if (__db_compare_both(dbp, &key1, NULL, &key2, NULL) < 0) {
+ __db_errx(dbp->env, DB_STR("1172",
+ "The DBT items in the buffer are not sorted"));
+ return (EINVAL);
+ }
+ key2.data = key1.data;
+ key2.size = key1.size;
+ }
+#else
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(key, NULL);
+#endif
+ return (0);
+}
+
+#endif
diff --git a/src/btree/bt_conv.c b/src/btree/bt_conv.c
new file mode 100644
index 00000000..348ce5c2
--- /dev/null
+++ b/src/btree/bt_conv.c
@@ -0,0 +1,95 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgin __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __bam_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __bam_mswap --
+ * Swap the bytes on the btree metadata page.
+ *
+ * PUBLIC: int __bam_mswap __P((ENV *, PAGE *));
+ */
+int
+__bam_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ p += sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* minkey */
+ SWAP32(p); /* re_len */
+ SWAP32(p); /* re_pad */
+ SWAP32(p); /* root */
+ p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+
+ return (0);
+}
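+
+/*
+ * For reference (see dbinc/db_swap.h): SWAP32 both byte-swaps the 32-bit
+ * value at "p" and advances "p" past it, which is why the fields above are
+ * swapped simply by naming them in their on-disk order.
+ */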
diff --git a/src/btree/bt_curadj.c b/src/btree/bt_curadj.c
new file mode 100644
index 00000000..78606009
--- /dev/null
+++ b/src/btree/bt_curadj.c
@@ -0,0 +1,694 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t));
+static int __bam_ca_delete_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ram_ca_delete_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_di_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_dup_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_undodup_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_rsplit_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_split_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_undosplit_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * Cursor adjustments are logged if they are for subtransactions. This is
+ * because it's possible for a subtransaction to adjust cursors which will
+ * still be active after the subtransaction aborts, and so which must be
+ * restored to their previous locations. Cursors that can be both affected
+ * by our cursor adjustments and active after our transaction aborts can
+ * only be found in our parent transaction -- cursors in other transactions,
+ * including other child transactions of our parent, must have conflicting
+ * locker IDs, and so cannot be affected by adjustments in this transaction.
+ */
+
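+/*
+ * Illustrative sketch (not from the original source; the variable names
+ * are hypothetical): the scenario described above arises from
+ * application code such as:
+ *
+ *	DB_TXN *parent, *child;
+ *	DBC *dbc;
+ *
+ *	env->txn_begin(env, NULL, &parent, 0);
+ *	dbp->cursor(dbp, parent, &dbc, 0);
+ *	env->txn_begin(env, parent, &child, 0);
+ *	(an insert or delete in child adjusts dbc, and the
+ *	adjustment is logged)
+ *	child->abort(child);
+ *	(recovery restores dbc to its pre-child position)
+ *
+ * Cursors in unrelated transactions hold conflicting locks on the
+ * pages child modifies, so only parent's cursors need restoring.
+ */
+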
+/*
+ * __bam_ca_delete_func --
+ * Callback function for walking cursors to update them due to a delete.
+ */
+static int
+__bam_ca_delete_func(dbc, my_dbc, countp, pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ BTREE_CURSOR *cp;
+ u_int32_t del;
+
+ COMPQUIET(my_dbc, NULL);
+ del = *(u_int32_t *)args;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp->pgno == pgno && cp->indx == indx &&
+ !MVCC_SKIP_CURADJ(dbc, pgno)) {
+ /*
+ * [#8032] This assert is checking for possible race
+ * conditions where we hold a cursor position without
+ * a lock. Unfortunately, there are paths in the
+ * Btree code that do not satisfy these conditions.
+ * None of them are known to be a problem, but this
+ * assert should be re-activated when the Btree stack
+ * code is re-written.
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ if (del) {
+ F_SET(cp, C_DELETED);
+ /*
+ * If we're deleting the item, we can't
+ * keep a streaming offset cached.
+ */
+ cp->stream_start_pgno = PGNO_INVALID;
+ } else
+ F_CLR(cp, C_DELETED);
+
+#ifdef HAVE_COMPRESSION
+ /*
+ * We also set the C_COMPRESS_MODIFIED flag, which
+ * prompts the compression code to look for it's
+ * current entry again if it needs to.
+ *
+ * The flag isn't cleared, because the compression
+ * code still needs to do that even for an entry that
+ * becomes undeleted.
+ *
+ * This flag also needs to be set if an entry is
+ * updated, but since the compression code always
+ * deletes before an update, setting it here is
+ * sufficient.
+ */
+ F_SET(cp, C_COMPRESS_MODIFIED);
+#endif
+
+ ++(*countp);
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_delete --
+ * Update the cursors when items are deleted and when already deleted
+ * items are overwritten. Return the number of relevant cursors found.
+ *
+ * PUBLIC: int __bam_ca_delete __P((DB *,
+ * PUBLIC: db_pgno_t, u_int32_t, int, u_int32_t *));
+ */
+int
+__bam_ca_delete(dbp, pgno, indx, del, countp)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int del;
+ u_int32_t *countp;
+{
+ int ret;
+ u_int32_t count;
+
+ /*
+ * Adjust the cursors. We have the page write locked, so the
+ * only other cursors that can be pointing at a page are
+ * those in the same thread of control. Unfortunately, we don't
+ * know that they're using the same DB handle, so traverse
+ * all matching DB handles in the same ENV, then all cursors
+ * on each matching DB handle.
+ *
+ * Each cursor is single-threaded, so we only need to lock the
+ * list of DBs and then the list of cursors in each DB.
+ */
+ if ((ret = __db_walk_cursors(dbp, NULL,
+ __bam_ca_delete_func, &count, pgno, indx, &del)) != 0)
+ return (ret);
+
+ if (countp != NULL)
+ *countp = count;
+ return (0);
+}
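+
+/*
+ * Illustrative sketch (not from the original source): the walk above
+ * is what gives duplicated cursors consistent delete semantics at the
+ * API level.  With error handling omitted:
+ *
+ *	DBC *c1, *c2;
+ *
+ *	dbp->cursor(dbp, txn, &c1, 0);
+ *	c1->get(c1, &key, &data, DB_FIRST);
+ *	c1->dup(c1, &c2, DB_POSITION);
+ *	c1->del(c1, 0);
+ *	ret = c2->get(c2, &key, &data, DB_CURRENT);
+ *	(ret == DB_KEYEMPTY: this walk set C_DELETED on c2 as well)
+ */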
+
+static int
+__ram_ca_delete_func(dbc, my_dbc, countp, root_pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t root_pgno;
+ u_int32_t indx;
+ void *args;
+{
+ COMPQUIET(indx, 0);
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(args, NULL);
+
+ if (dbc->internal->root == root_pgno &&
+ !MVCC_SKIP_CURADJ(dbc, root_pgno)) {
+ (*countp)++;
+ return (EEXIST);
+ }
+ return (0);
+}
+
+/*
+ * __ram_ca_delete --
+ * Return if any relevant cursors found.
+ *
+ * PUBLIC: int __ram_ca_delete __P((DB *, db_pgno_t, u_int32_t *));
+ */
+int
+__ram_ca_delete(dbp, root_pgno, foundp)
+ DB *dbp;
+ db_pgno_t root_pgno;
+ u_int32_t *foundp;
+{
+ int ret;
+
+ if ((ret = __db_walk_cursors(dbp, NULL, __ram_ca_delete_func,
+ foundp, root_pgno, 0, NULL)) != 0 && ret != EEXIST)
+ return (ret);
+
+ return (0);
+}
+
+struct __bam_ca_di_args {
+ int adjust;
+ DB_TXN *my_txn;
+};
+
+static int
+__bam_ca_di_func(dbc, my_dbc, foundp, pgno, indx, vargs)
+ DBC *dbc, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ DBC_INTERNAL *cp;
+ struct __bam_ca_di_args *args;
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+
+ cp = dbc->internal;
+ args = vargs;
+ if (cp->pgno == pgno && cp->indx >= indx &&
+ (dbc == my_dbc || !MVCC_SKIP_CURADJ(dbc, pgno))) {
+ /* Cursor indices should never be negative. */
+ DB_ASSERT(dbc->dbp->env, cp->indx != 0 || args->adjust > 0);
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ cp->indx += args->adjust;
+ if (args->my_txn != NULL && args->my_txn != dbc->txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+/*
+ * __bam_ca_di --
+ * Adjust the cursors during a delete or insert.
+ *
+ * PUBLIC: int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_di(my_dbc, pgno, indx, adjust)
+ DBC *my_dbc;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int adjust;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+ struct __bam_ca_di_args args;
+
+ dbp = my_dbc->dbp;
+ args.adjust = adjust;
+ args.my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ if ((ret = __db_walk_cursors(dbp, my_dbc, __bam_ca_di_func,
+ &found, pgno, indx, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp, my_dbc->txn, &lsn, 0,
+ DB_CA_DI, pgno, 0, 0, (u_int32_t)adjust, indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_opd_cursor -- create a new opd cursor.
+ */
+static int
+__bam_opd_cursor(dbp, dbc, first, tpgno, ti)
+ DB *dbp;
+ DBC *dbc;
+ db_pgno_t tpgno;
+ u_int32_t first, ti;
+{
+ BTREE_CURSOR *cp, *orig_cp;
+ DBC *dbc_nopd;
+ int ret;
+
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ dbc_nopd = NULL;
+
+ /*
+ * Allocate a new cursor and create the stack. If duplicates
+ * are sorted, we've just created an off-page duplicate Btree.
+ * If duplicates aren't sorted, we've just created a Recno tree.
+ *
+ * Note that in order to get here at all, there shouldn't be
+ * an old off-page dup cursor--to augment the checking __dbc_newopd
+ * will do, assert this.
+ */
+ DB_ASSERT(dbp->env, orig_cp->opd == NULL);
+ if ((ret = __dbc_newopd(dbc, tpgno, orig_cp->opd, &dbc_nopd)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc_nopd->internal;
+ cp->pgno = tpgno;
+ cp->indx = ti;
+
+ if (dbp->dup_compare == NULL) {
+ /*
+ * Converting to off-page Recno trees is tricky. The
+ * record number for the cursor is the index + 1 (to
+ * convert to 1-based record numbers).
+ */
+ cp->recno = ti + 1;
+ }
+
+ /*
+ * Transfer the deleted flag from the top-level cursor to the
+ * created one.
+ */
+ if (F_ISSET(orig_cp, C_DELETED)) {
+ F_SET(cp, C_DELETED);
+ F_CLR(orig_cp, C_DELETED);
+ }
+
+ /* Stack the cursors and reset the initial cursor's index. */
+ orig_cp->opd = dbc_nopd;
+ orig_cp->indx = first;
+ return (0);
+}
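+
+/*
+ * Worked example (illustrative, not from the original source): the
+ * conversion above is off-by-one by design.  A cursor on the third
+ * duplicate has ti == 2, and in an off-page Recno tree the same item
+ * is record number 3 (ti + 1), because record numbers are 1-based
+ * while page indices are 0-based.
+ */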
+
+struct __bam_ca_dup_args {
+ db_pgno_t tpgno;
+ db_indx_t first, ti;
+ DB_TXN *my_txn;
+};
+
+static int
+__bam_ca_dup_func(dbc, my_dbc, foundp, fpgno, fi, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t fpgno;
+ u_int32_t fi;
+ void *vargs;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *dbp;
+ int ret;
+ struct __bam_ca_dup_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+
+ /*
+ * Since we may rescan the list, check whether this cursor
+ * has already been converted.
+ */
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ if (orig_cp->opd != NULL)
+ return (0);
+
+ /* Find cursors pointing to this record. */
+ if (orig_cp->pgno != fpgno || orig_cp->indx != fi ||
+ MVCC_SKIP_CURADJ(dbc, fpgno))
+ return (0);
+
+ dbp = dbc->dbp;
+ args = vargs;
+
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+
+ if ((ret = __bam_opd_cursor(dbp,
+ dbc, args->first, args->tpgno, args->ti)) != 0) {
+ MUTEX_LOCK(dbp->env, dbp->mutex);
+ return (ret);
+ }
+ if (args->my_txn != NULL && args->my_txn != dbc->txn)
+ *foundp = 1;
+ /* We released the mutex to get a cursor, start over. */
+ return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_ca_dup --
+ * Adjust the cursors when moving items from a leaf page to a duplicates
+ * page.
+ *
+ * PUBLIC: int __bam_ca_dup __P((DBC *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_dup(my_dbc, first, fpgno, fi, tpgno, ti)
+ DBC *my_dbc;
+ db_pgno_t fpgno, tpgno;
+ u_int32_t first, fi, ti;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret, t_ret;
+ u_int32_t found;
+ struct __bam_ca_dup_args args;
+
+ dbp = my_dbc->dbp;
+
+ args.first = first;
+ args.tpgno = tpgno;
+ args.ti = ti;
+ args.my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ if ((ret = __db_walk_cursors(dbp,
+ my_dbc, __bam_ca_dup_func, &found, fpgno, fi, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((t_ret = __bam_curadj_log(dbp, my_dbc->txn,
+ &lsn, 0, DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+static int
+__bam_ca_undodup_func(dbc, my_dbc, countp, fpgno, fi, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t fpgno;
+ u_int32_t fi;
+ void *vargs;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *dbp;
+ int ret;
+ struct __bam_ca_dup_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ args = vargs;
+ /*
+ * A note on the orig_cp->opd != NULL requirement here:
+ * it's possible that there's a cursor that refers to
+ * the same duplicate set, but which has no opd cursor,
+ * because it refers to a different item and we took
+ * care of it while processing a previous record.
+ */
+ if (orig_cp->pgno != fpgno ||
+ orig_cp->indx != args->first ||
+ orig_cp->opd == NULL || ((BTREE_CURSOR *)
+ orig_cp->opd->internal)->indx != args->ti ||
+ MVCC_SKIP_CURADJ(dbc, fpgno))
+ return (0);
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+ if ((ret = __dbc_close(orig_cp->opd)) != 0) {
+ MUTEX_LOCK(dbp->env, dbp->mutex);
+ return (ret);
+ }
+ orig_cp->opd = NULL;
+ orig_cp->indx = fi;
+ /*
+ * We released the mutex to free a cursor,
+ * start over.
+ */
+ return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_ca_undodup --
+ * Adjust the cursors when returning items to a leaf page
+ * from a duplicate page.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undodup __P((DB *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__bam_ca_undodup(dbp, first, fpgno, fi, ti)
+ DB *dbp;
+ db_pgno_t fpgno;
+ u_int32_t first, fi, ti;
+{
+ u_int32_t count;
+ struct __bam_ca_dup_args args;
+
+ args.first = first;
+ args.ti = ti;
+ return (__db_walk_cursors(dbp, NULL,
+ __bam_ca_undodup_func, &count, fpgno, fi, &args));
+}
+
+static int
+__bam_ca_rsplit_func(dbc, my_dbc, foundp, fpgno, indx, args)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t fpgno;
+ u_int32_t indx;
+ void *args;
+{
+ db_pgno_t tpgno;
+
+ COMPQUIET(indx, 0);
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+
+ tpgno = *(db_pgno_t *)args;
+ if (dbc->internal->pgno == fpgno &&
+ !MVCC_SKIP_CURADJ(dbc, fpgno)) {
+ dbc->internal->pgno = tpgno;
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ dbc->internal->lock_mode != DB_LOCK_NG);
+ */
+ if (IS_SUBTRANSACTION(my_dbc->txn) && dbc->txn != my_dbc->txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_rsplit --
+ * Adjust the cursors when doing reverse splits.
+ *
+ * PUBLIC: int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t));
+ */
+int
+__bam_ca_rsplit(my_dbc, fpgno, tpgno)
+ DBC* my_dbc;
+ db_pgno_t fpgno, tpgno;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+
+ dbp = my_dbc->dbp;
+
+ if ((ret = __db_walk_cursors(dbp, my_dbc,
+ __bam_ca_rsplit_func, &found, fpgno, 0, &tpgno)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp, my_dbc->txn,
+ &lsn, 0, DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+struct __bam_ca_split_args {
+ db_pgno_t lpgno, rpgno;
+ int cleft;
+ DB_TXN *my_txn;
+};
+
+static int
+__bam_ca_split_func(dbc, my_dbc, foundp, ppgno, split_indx, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t ppgno;
+ u_int32_t split_indx;
+ void *vargs;
+{
+ DBC_INTERNAL *cp;
+ struct __bam_ca_split_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+ cp = dbc->internal;
+ args = vargs;
+ if (cp->pgno == ppgno &&
+ !MVCC_SKIP_CURADJ(dbc, ppgno)) {
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ if (args->my_txn != NULL && args->my_txn != dbc->txn)
+ *foundp = 1;
+ if (cp->indx < split_indx) {
+ if (args->cleft)
+ cp->pgno = args->lpgno;
+ } else {
+ cp->pgno = args->rpgno;
+ cp->indx -= split_indx;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_split --
+ * Adjust the cursors when splitting a page.
+ *
+ * PUBLIC: int __bam_ca_split __P((DBC *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft)
+ DBC *my_dbc;
+ db_pgno_t ppgno, lpgno, rpgno;
+ u_int32_t split_indx;
+ int cleft;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+ struct __bam_ca_split_args args;
+
+ dbp = my_dbc->dbp;
+
+ /*
+ * If splitting the page that a cursor was on, the cursor has to be
+ * adjusted to point to the same record as before the split. Most
+ * of the time we don't adjust pointers to the left page, because
+ * we're going to copy its contents back over the original page. If
+ * the cursor is on the right page, it is decremented by the number of
+ * records split to the left page.
+ */
+ args.lpgno = lpgno;
+ args.rpgno = rpgno;
+ args.cleft = cleft;
+ args.my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+ if ((ret = __db_walk_cursors(dbp, my_dbc,
+ __bam_ca_split_func, &found, ppgno, split_indx, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp,
+ my_dbc->txn, &lsn, 0, DB_CA_SPLIT, ppgno, rpgno,
+ cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
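+
+/*
+ * Worked example (illustrative, not from the original source): say
+ * page P splits at split_indx == 5 into left page L and right page R.
+ * A cursor at (P, 7) is past the split point, so it moves to
+ * (R, 7 - 5) == (R, 2).  A cursor at (P, 3) keeps its index and stays
+ * on P, unless cleft is set because the left page is really a new
+ * page, in which case it moves to (L, 3).
+ */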
+
+static int
+__bam_ca_undosplit_func(dbc, my_dbc, foundp, frompgno, split_indx, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t frompgno;
+ u_int32_t split_indx;
+ void *vargs;
+{
+ DBC_INTERNAL *cp;
+ struct __bam_ca_split_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(foundp, NULL);
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+ cp = dbc->internal;
+ args = vargs;
+ if (cp->pgno == args->rpgno &&
+ !MVCC_SKIP_CURADJ(dbc, args->rpgno)) {
+ cp->pgno = frompgno;
+ cp->indx += split_indx;
+ } else if (cp->pgno == args->lpgno &&
+ !MVCC_SKIP_CURADJ(dbc, args->lpgno))
+ cp->pgno = frompgno;
+
+ return (0);
+}
+
+/*
+ * __bam_ca_undosplit --
+ * Adjust the cursors when undoing a split of a page.
+ * If we grew a level we will execute this for both the
+ * left and the right pages.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undosplit __P((DB *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_undosplit(dbp, frompgno, topgno, lpgno, split_indx)
+ DB *dbp;
+ db_pgno_t frompgno, topgno, lpgno;
+ u_int32_t split_indx;
+{
+ u_int32_t count;
+ struct __bam_ca_split_args args;
+
+ /*
+ * When backing out a split, we move the cursor back
+ * to the original offset and bump it by the split_indx.
+ */
+ args.lpgno = lpgno;
+ args.rpgno = topgno;
+ return (__db_walk_cursors(dbp, NULL,
+ __bam_ca_undosplit_func, &count, frompgno, split_indx, &args));
+}
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
new file mode 100644
index 00000000..860c31ce
--- /dev/null
+++ b/src/btree/bt_cursor.c
@@ -0,0 +1,3076 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_bulk __P((DBC *, DBT *, u_int32_t));
+static int __bamc_close __P((DBC *, db_pgno_t, int *));
+static int __bamc_del __P((DBC *, u_int32_t));
+static int __bamc_destroy __P((DBC *));
+static int __bamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bamc_getstack __P((DBC *));
+static int __bamc_next __P((DBC *, int, int));
+static int __bamc_physdel __P((DBC *));
+static int __bamc_prev __P((DBC *));
+static int __bamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bamc_search __P((DBC *,
+ db_pgno_t, const DBT *, u_int32_t, int *));
+static int __bamc_writelock __P((DBC *));
+static int __bam_getboth_finddatum __P((DBC *, DBT *, u_int32_t));
+static int __bam_getbothc __P((DBC *, DBT *));
+static int __bam_get_prev __P((DBC *));
+static int __bam_isopd __P((DBC *, db_pgno_t *));
+#ifdef HAVE_COMPRESSION
+static int __bam_getlte __P((DBC *, DBT *, DBT *));
+#endif
+
+/*
+ * Acquire a new page/lock. If we hold a page/lock, discard the page, and
+ * lock-couple the lock.
+ *
+ * !!!
+ * We have to handle both where we have a lock to lock-couple and where we
+ * don't -- we don't duplicate locks when we duplicate cursors if we are
+ * running in a transaction environment as there's no point if locks are
+ * never discarded. This means that the cursor may or may not hold a lock.
+ * In the case where we are descending the tree we always want to unlock
+ * the held interior page so we use ACQUIRE_COUPLE.
+ */
+#undef ACQUIRE
+#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, flags, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ if ((pagep) != NULL) { \
+ ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } else \
+ ret = 0; \
+ if ((ret) == 0 && STD_LOCKING(dbc)) \
+ ret = __db_lget( \
+ dbc, LCK_COUPLE, lpgno, mode, flags, &(lock)); \
+ if ((ret) == 0) \
+ ret = __memp_fget(__mpf, &(fpgno), \
+ (dbc)->thread_info, (dbc)->txn, 0, &(pagep)); \
+} while (0)
+
+/* Acquire a new page/lock for a cursor. */
+#undef ACQUIRE_CUR
+#define ACQUIRE_CUR(dbc, mode, p, flags, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ if (p != __cp->pgno) \
+ __cp->pgno = PGNO_INVALID; \
+ ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, flags, ret); \
+ if ((ret) == 0) { \
+ __cp->pgno = p; \
+ __cp->lock_mode = (mode); \
+ } \
+} while (0)
+
+/*
+ * Acquire a write lock if we don't already have one.
+ *
+ * !!!
+ * See ACQUIRE macro on why we handle cursors that don't have locks.
+ */
+#undef ACQUIRE_WRITE_LOCK
+#define ACQUIRE_WRITE_LOCK(dbc, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __get_page = 0; \
+ ret = 0; \
+ if (STD_LOCKING(dbc) && __cp->lock_mode != DB_LOCK_WRITE) { \
+ if (__cp->page != NULL) { \
+ (ret) = __memp_fput(__mpf, (dbc)->thread_info, \
+ __cp->page, (dbc)->priority); \
+ __cp->page = NULL; \
+ __get_page = 1; \
+ if ((ret) != 0) \
+ break; \
+ } \
+ if (((ret) = __db_lget((dbc), \
+ LOCK_ISSET(__cp->lock) ? LCK_COUPLE : 0, \
+ __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) != 0) \
+ break; \
+ __cp->lock_mode = DB_LOCK_WRITE; \
+ if (__get_page == 0) \
+ break; \
+ (ret) = __memp_fget(__mpf, &__cp->pgno, \
+ (dbc)->thread_info, \
+ (dbc)->txn, DB_MPOOL_DIRTY, &__cp->page); \
+ } \
+} while (0)
+
+/* Discard the current page/lock for a cursor. */
+#undef DISCARD_CUR
+#define DISCARD_CUR(dbc, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __t_ret; \
+ if ((__cp->page) != NULL) { \
+ __t_ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, __cp->page, dbc->priority);\
+ __cp->page = NULL; \
+ } else \
+ __t_ret = 0; \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ __t_ret = __TLPUT((dbc), __cp->lock); \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ if ((ret) == 0 && !LOCK_ISSET(__cp->lock)) \
+ __cp->lock_mode = DB_LOCK_NG; \
+ __cp->stream_start_pgno = PGNO_INVALID; \
+} while (0)
+
+/* If on-page item is a deleted record. */
+#undef IS_DELETED
+#define IS_DELETED(dbp, page, indx) \
+ B_DISSET(GET_BKEYDATA(dbp, page, \
+ (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type)
+#undef IS_CUR_DELETED
+#define IS_CUR_DELETED(dbc) \
+ IS_DELETED((dbc)->dbp, (dbc)->internal->page, (dbc)->internal->indx)
+
+/*
+ * Test to see if two cursors could point to duplicates of the same key.
+ * In the case of off-page duplicates they are the same, as the cursors
+ * will be in the same off-page duplicate tree. In the case of on-page
+ * duplicates, the key index offsets must be the same. For the last test,
+ * as the original cursor may not have a valid page pointer, we use the
+ * current cursor's.
+ */
+#undef IS_DUPLICATE
+#define IS_DUPLICATE(dbc, i1, i2) \
+ (P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i1] == \
+ P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i2])
+#undef IS_CUR_DUPLICATE
+#define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \
+ (F_ISSET(dbc, DBC_OPD) || \
+ (orig_pgno == (dbc)->internal->pgno && \
+ IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx)))
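+
+/*
+ * Illustrative note (not from the original source): on-page duplicates
+ * share one physical copy of the key, so the page's index array holds
+ * the same key offset for each pair.  A P_LBTREE page holding key K
+ * with two data items D1 and D2 has:
+ *
+ *	inp[0] == inp[2]	(both point at K)
+ *	inp[1] != inp[3]	(D1 and D2)
+ *
+ * which is exactly what IS_DUPLICATE(dbc, 0, 2) tests.
+ */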
+
+/*
+ * __bamc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __bamc_init __P((DBC *, DBTYPE));
+ */
+int
+__bamc_init(dbc, dbtype)
+ DBC *dbc;
+ DBTYPE dbtype;
+{
+ ENV *env;
+ int ret;
+#ifdef HAVE_COMPRESSION
+ BTREE_CURSOR *cp;
+#endif
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL) {
+ if ((ret = __os_calloc(
+ env, 1, sizeof(BTREE_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ cp = (BTREE_CURSOR*)dbc->internal;
+ cp->compressed.flags = DB_DBT_USERMEM;
+ cp->key1.flags = DB_DBT_USERMEM;
+ cp->key2.flags = DB_DBT_USERMEM;
+ cp->data1.flags = DB_DBT_USERMEM;
+ cp->data2.flags = DB_DBT_USERMEM;
+ cp->del_key.flags = DB_DBT_USERMEM;
+ cp->del_data.flags = DB_DBT_USERMEM;
+#endif
+ }
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ if (dbtype == DB_BTREE) {
+ dbc->am_bulk = __bam_bulk;
+ dbc->am_close = __bamc_close;
+ dbc->am_del = __bamc_del;
+ dbc->am_destroy = __bamc_destroy;
+ dbc->am_get = __bamc_get;
+ dbc->am_put = __bamc_put;
+ dbc->am_writelock = __bamc_writelock;
+ } else {
+ dbc->am_bulk = __bam_bulk;
+ dbc->am_close = __bamc_close;
+ dbc->am_del = __ramc_del;
+ dbc->am_destroy = __bamc_destroy;
+ dbc->am_get = __ramc_get;
+ dbc->am_put = __ramc_put;
+ dbc->am_writelock = __bamc_writelock;
+ }
+
+ return (0);
+}
+
+/*
+ * __bamc_refresh
+ * Set things up properly for cursor re-use.
+ *
+ * PUBLIC: int __bamc_refresh __P((DBC *));
+ */
+int
+__bamc_refresh(dbc)
+ DBC *dbc;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If our caller set the root page number, it's because the root was
+ * known. This is always the case for off page dup cursors. Else,
+ * pull it out of our internal information, unless this is a subdb.
+ */
+ if (cp->root == PGNO_INVALID && t->bt_meta == PGNO_BASE_MD)
+ cp->root = t->bt_root;
+
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+
+ if (cp->sp == NULL) {
+ cp->sp = cp->stack;
+ cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
+ }
+ BT_STK_CLR(cp);
+
+#ifdef HAVE_COMPRESSION
+ /* Initialize compression */
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->currentKey = 0;
+ cp->currentData = 0;
+ cp->compcursor = 0;
+ cp->compend = 0;
+ cp->prevcursor = 0;
+ cp->prev2cursor = 0;
+#endif
+
+ /*
+ * The btree leaf page data structures require that two key/data pairs
+ * (or four items) fit on a page, but other than that there's no fixed
+ * requirement. The btree off-page duplicates only require two items,
+ * to be exact, but requiring four for them as well seems reasonable.
+ *
+ * Recno uses the btree bt_ovflsize value -- it's close enough.
+ */
+ cp->ovflsize = B_MINKEY_TO_OVFLSIZE(
+ dbp, F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey, dbp->pgsize);
+
+ cp->recno = RECNO_OOB;
+ cp->order = INVALID_ORDER;
+ cp->flags = 0;
+
+ /* Initialize for record numbers. */
+ if (F_ISSET(dbc, DBC_OPD) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_AM_RECNUM)) {
+ F_SET(cp, C_RECNUM);
+
+ /*
+ * All btrees that support record numbers, optionally standard
+ * recno trees, and all off-page duplicate recno trees have
+ * mutable record numbers.
+ */
+ if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) ||
+ F_ISSET(dbp, DB_AM_RECNUM | DB_AM_RENUMBER))
+ F_SET(cp, C_RENUMBER);
+ }
+
+ return (0);
+}
+
+/*
+ * __bamc_close --
+ * Close down the cursor.
+ */
+static int
+__bamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ BTREE_CURSOR *cp, *cp_opd, *cp_c;
+ DB *dbp;
+ DBC *dbc_opd, *dbc_c;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ int cdb_lock, ret;
+ u_int32_t count;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp_opd = (dbc_opd = cp->opd) == NULL ?
+ NULL : (BTREE_CURSOR *)dbc_opd->internal;
+ cdb_lock = ret = 0;
+
+ /*
+ * There are 3 ways this function is called:
+ *
+ * 1. Closing a primary cursor: we get called with a pointer to a
+ * primary cursor that has a NULL opd field. This happens when
+ * closing a btree/recno database cursor without an associated
+ * off-page duplicate tree.
+ *
+ * 2. Closing a primary and an off-page duplicate cursor stack: we
+ * get called with a pointer to the primary cursor which has a
+ * non-NULL opd field. This happens when closing a btree cursor
+ * into database with an associated off-page btree/recno duplicate
+ * tree. (It can't be a primary recno database, recno databases
+ * don't support duplicates.)
+ *
+ * 3. Closing an off-page duplicate cursor stack: we get called with
+ * a pointer to the off-page duplicate cursor. This happens when
+ * closing a non-btree database that has an associated off-page
+ * btree/recno duplicate tree or for a btree database when the
+ * opd tree is not empty (root_pgno == PGNO_INVALID).
+ *
+ * If either the primary or off-page duplicate cursor deleted a btree
+ * key/data pair, check to see if the item is still referenced by a
+ * different cursor. If it is, confirm that cursor's delete flag is
+ * set and leave it to that cursor to do the delete.
+ *
+ * NB: The test for == 0 below is correct. Our caller already removed
+ * our cursor argument from the active queue, we won't find it when we
+ * search the queue in __bam_ca_delete().
+ * NB: It can't be true that both the primary and off-page duplicate
+ * cursors have deleted a btree key/data pair. Either the primary
+ * cursor may have deleted an item and there's no off-page duplicate
+ * cursor, or there's an off-page duplicate cursor and it may have
+ * deleted an item.
+ *
+ * Primary recno databases aren't an issue here. Recno keys are either
+ * deleted immediately or never deleted, and do not have to be handled
+ * here.
+ *
+ * Off-page duplicate recno databases are an issue here, cases #2 and
+ * #3 above can both be off-page recno databases. The problem is the
+ * same as the final problem for off-page duplicate btree databases.
+ * If we no longer need the off-page duplicate tree, we want to remove
+ * it. For off-page duplicate btrees, we are done with the tree when
+ * we delete the last item it contains, i.e., there can be no further
+ * references to it when it's empty. For off-page duplicate recnos,
+ * we remove items from the tree as the application calls the remove
+ * function, so we are done with the tree when we close the last cursor
+ * that references it.
+ *
+ * We optionally take the root page number from our caller. If the
+ * primary database is a btree, we can get it ourselves because dbc
+ * is the primary cursor. If the primary database is not a btree,
+ * the problem is that we may be dealing with a stack of pages. The
+ * cursor we're using to do the delete points at the bottom of that
+ * stack and we need the top of the stack.
+ */
+ if (F_ISSET(cp, C_DELETED)) {
+ dbc_c = dbc;
+ switch (dbc->dbtype) {
+ case DB_BTREE: /* Case #1, #3. */
+ if ((ret = __bam_ca_delete(
+ dbp, cp->pgno, cp->indx, 1, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if (!F_ISSET(dbc, DBC_OPD)) /* Case #1. */
+ goto done;
+ /* Case #3. */
+ if ((ret = __ram_ca_delete(dbp, cp->root, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ env, "DbCursor.close", dbc->dbtype);
+ goto err;
+ }
+ }
+
+ if (dbc_opd == NULL)
+ goto done;
+
+ if (F_ISSET(cp_opd, C_DELETED)) { /* Case #2. */
+ /*
+ * We will not have been provided a root page number. Acquire
+ * one from the primary database.
+ */
+ if ((h = cp->page) == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+ root_pgno = GET_BOVERFLOW(dbp, h, cp->indx + O_INDX)->pgno;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ dbc_c = dbc_opd;
+ switch (dbc_opd->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_ca_delete(
+ dbp, cp_opd->pgno, cp_opd->indx, 1, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if ((ret =
+ __ram_ca_delete(dbp, cp_opd->root, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ env, "DbCursor.close", dbc->dbtype);
+ goto err;
+ }
+ }
+ goto done;
+
+lock: cp_c = (BTREE_CURSOR *)dbc_c->internal;
+
+ /*
+ * If this is CDB, upgrade the lock if necessary. While we acquired
+ * the write lock to logically delete the record, we released it when
+ * we returned from that call, and so may not be holding a write lock
+ * at the moment.
+ */
+ if (CDB_LOCKING(env)) {
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) {
+ if ((ret = __lock_get(env,
+ dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt,
+ DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ cdb_lock = 1;
+ }
+ goto do_del;
+ }
+
+ /*
+ * The variable dbc_c has been initialized to reference the cursor in
+ * which we're going to do the delete. Initialize the cursor's lock
+ * structures as necessary.
+ *
+ * First, we may not need to acquire any locks. If we're in case #3,
+ * that is, the primary database isn't a btree database, our caller
+ * is responsible for acquiring any necessary locks before calling us.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ goto do_del;
+
+ /*
+ * Otherwise, acquire a write lock on the primary database's page.
+ *
+ * Lock the primary database page, regardless of whether we're deleting
+ * an item on a primary database page or an off-page duplicates page.
+ *
+ * If the cursor that did the initial logical deletion (and had a write
+ * lock) is not the same cursor doing the physical deletion (which may
+ * have only ever had a read lock on the item), we need to upgrade to a
+ * write lock. The confusion comes as follows:
+ *
+ * C1 created, acquires item read lock
+ * C2 dup C1, create C2, also has item read lock.
+ * C1 acquire write lock, delete item
+ * C1 close
+ * C2 close, needs a write lock to physically delete item.
+ *
+ * If we're in a TXN, we know that C2 will be able to acquire the write
+ * lock, because no locker other than the one shared by C1 and C2 can
+ * acquire a write lock -- the original write lock C1 acquired was never
+ * discarded.
+ *
+ * If we're not in a TXN, it's nastier. Other cursors might acquire
+ * read locks on the item after C1 closed, discarding its write lock,
+ * and such locks would prevent C2 from acquiring a read lock. That's
+ * OK, though, we'll simply wait until we can acquire a write lock, or
+ * we'll deadlock. (Which better not happen, since we're not in a TXN.)
+ *
+ * There are similar scenarios with dirty reads, where the cursor may
+ * have downgraded its write lock to a was-write lock.
+ */
+ if (STD_LOCKING(dbc))
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ goto err;
+
+do_del: /*
+ * If the delete occurred in a Btree, we're going to look at the page
+ * to see if the item has to be physically deleted. Otherwise, we do
+ * not need the actual page (and it may not even exist, it might have
+ * been truncated from the file after an allocation aborted).
+ *
+ * Delete the on-page physical item referenced by the cursor.
+ */
+ if (F_ISSET(dbc_c, DBC_OPD))
+ LOCK_CHECK_OFF(dbc_c->thread_info);
+ if (dbc_c->dbtype == DB_BTREE) {
+ if ((ret = __memp_fget(mpf, &cp_c->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp_c->page)) != 0)
+ goto err_c;
+ if ((ret = __bamc_physdel(dbc_c)) != 0)
+ goto err_c;
+ }
+
+ /*
+ * If we're not working in an off-page duplicate tree, then we're
+ * done.
+ */
+ if (!F_ISSET(dbc_c, DBC_OPD) || root_pgno == PGNO_INVALID)
+ goto done;
+
+ /*
+ * We may have just deleted the last element in the off-page duplicate
+ * tree, and closed the last cursor in the tree. For an off-page btree
+ * there are no other cursors in the tree by definition, if the tree is
+ * empty. For an off-page recno we know we have closed the last cursor
+ * in the tree because the __ram_ca_delete call above returned 0 only
+ * in that case. So, if the off-page duplicate tree is empty at this
+ * point, we want to remove it.
+ */
+ if (((h = dbc_c->internal->page) == NULL || h->pgno != root_pgno) &&
+ (ret = __memp_fget(mpf,
+ &root_pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err_c;
+ if ((count = NUM_ENT(h)) == 0) {
+ if (h != dbc_c->internal->page)
+ DISCARD_CUR(dbc_c, ret);
+ else
+ dbc_c->internal->page = NULL;
+ if (ret == 0)
+ ret = __db_free(dbc, h, 0);
+ } else if (h != dbc_c->internal->page)
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+
+err_c: if (F_ISSET(dbc_c, DBC_OPD))
+ LOCK_CHECK_ON(dbc_c->thread_info);
+ if (ret != 0)
+ goto err;
+
+ if (count != 0)
+ goto done;
+
+ /*
+ * When removing the tree, we have to do one of two things. If this is
+ * case #2, that is, the primary tree is a btree, delete the key that's
+ * associated with the tree from the btree leaf page. We know we are
+ * the only reference to it and we already have the correct lock. We
+ * detect this case because the cursor that was passed to us references
+ * an off-page duplicate cursor.
+ *
+ * If this is case #3, that is, the primary tree isn't a btree, pass
+ * the information back to our caller, it's their job to do cleanup on
+ * the primary page.
+ */
+ if (dbc_opd != NULL) {
+ if ((ret = __memp_fget(mpf, &cp->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto err;
+ if ((ret = __bamc_physdel(dbc)) != 0)
+ goto err;
+ } else
+ *rmroot = 1;
+err:
+done: /*
+ * Discard the page references and locks, and confirm that the stack
+ * has been emptied.
+ */
+ if (dbc_opd != NULL)
+ DISCARD_CUR(dbc_opd, ret);
+ DISCARD_CUR(dbc, ret);
+
+ /* Downgrade any CDB lock we acquired. */
+ if (cdb_lock)
+ (void)__lock_downgrade(env, &dbc->mylock, DB_LOCK_IWRITE, 0);
+
+ return (ret);
+}
+
+/*
+ * __bamc_cmp --
+ * Compare two btree cursors for equality.
+ *
+ * This function is only called with two cursors that point to the same item.
+ * It only distinguishes cursors pointing to deleted and undeleted items at
+ * the same location.
+ *
+ * PUBLIC: int __bamc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__bamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ ENV *env;
+ BTREE_CURSOR *bcp, *obcp;
+
+ env = dbc->env;
+ bcp = (BTREE_CURSOR *)dbc->internal;
+ obcp = (BTREE_CURSOR *)other_dbc->internal;
+
+ DB_ASSERT(env, bcp->pgno == obcp->pgno);
+ DB_ASSERT(env, bcp->indx == obcp->indx);
+
+ /* Check to see if both cursors have the same deleted flag. */
+ *result =
+ ((F_ISSET(bcp, C_DELETED)) == F_ISSET(obcp, C_DELETED)) ? 0 : 1;
+ return (0);
+}
+
+/*
+ * __bamc_destroy --
+ * Close a single cursor -- internal version.
+ */
+static int
+__bamc_destroy(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ ENV *env;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structures. */
+ if (cp->sp != cp->stack)
+ __os_free(env, cp->sp);
+
+#ifdef HAVE_COMPRESSION
+ /* Free the memory used for compression */
+ __os_free(env, cp->compressed.data);
+ __os_free(env, cp->key1.data);
+ __os_free(env, cp->key2.data);
+ __os_free(env, cp->data1.data);
+ __os_free(env, cp->data2.data);
+ __os_free(env, cp->del_key.data);
+ __os_free(env, cp->del_data.data);
+#endif
+
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __bamc_count --
+ * Return a count of on and off-page duplicates.
+ *
+ * PUBLIC: int __bamc_count __P((DBC *, db_recno_t *));
+ */
+int
+__bamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ db_indx_t indx, top;
+ db_recno_t recno;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Called with the top-level cursor that may reference an off-page
+ * duplicates tree. We don't have to acquire any new locks, we have
+ * to have a read lock to even get here.
+ */
+ if (cp->opd == NULL) {
+ /*
+ * On-page duplicates, get the page and count.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * Move back to the beginning of the set of duplicates and
+ * then count forward.
+ */
+ for (indx = cp->indx;; indx -= P_INDX)
+ if (indx == 0 ||
+ !IS_DUPLICATE(dbc, indx, indx - P_INDX))
+ break;
+ for (recno = 0,
+ top = NUM_ENT(cp->page) - P_INDX;; indx += P_INDX) {
+ if (!IS_DELETED(dbp, cp->page, indx))
+ ++recno;
+ if (indx == top ||
+ !IS_DUPLICATE(dbc, indx, indx + P_INDX))
+ break;
+ }
+ } else {
+ /*
+ * Off-page duplicates tree, get the root page of the off-page
+ * duplicate tree.
+ */
+ if ((ret = __memp_fget(mpf, &cp->opd->internal->root,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * If the page is an internal page use the page's count as it's
+ * up-to-date and reflects the status of cursors in the tree.
+ * If the page is a leaf page for unsorted duplicates, use the
+ * page's count as well: cursors don't mark items deleted on such
+ * a page and wait, they delete items immediately.
+ * If the page is a leaf page for sorted duplicates, there may
+ * be cursors on the page marking deleted items -- count by hand.
+ */
+ if (TYPE(cp->page) == P_LDUP)
+ for (recno = 0, indx = 0,
+ top = NUM_ENT(cp->page) - O_INDX;; indx += O_INDX) {
+ if (!IS_DELETED(dbp, cp->page, indx))
+ ++recno;
+ if (indx == top)
+ break;
+ }
+ else
+ recno = RE_NREC(cp->page);
+ }
+
+ *recnop = recno;
+
+ ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+ cp->page = NULL;
+
+ return (ret);
+}
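+
+/*
+ * Illustrative sketch (not from the original source): this function
+ * backs the public DBcursor->count method.  Typical use, with error
+ * handling omitted:
+ *
+ *	db_recno_t count;
+ *
+ *	dbc->get(dbc, &key, &data, DB_SET);
+ *	dbc->count(dbc, &count, 0);
+ *	(count is the number of undeleted duplicates of key)
+ */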
+
+/*
+ * __bamc_del --
+ * Delete using a cursor.
+ */
+static int
+__bamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+ u_int32_t count;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+ COMPQUIET(flags, 0);
+
+ /* If the item was already deleted, return failure. */
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_KEYEMPTY);
+
+ /*
+ * This code is always called with a page lock but no page.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ /*
+ * We don't physically delete the record until the cursor moves, so
+ * we have to have a long-lived write lock on the page instead of
+ * a long-lived read lock. Note, we have to have a read lock to even
+ * get here.
+ *
+ * If we're maintaining record numbers, we lock the entire tree, else
+ * we lock the single page.
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ if ((ret = __bamc_getstack(dbc)) != 0)
+ goto err;
+ cp->page = cp->csp->page;
+ } else {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, cp->pgno, 0, ret);
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Mark the page dirty. */
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cdel_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Set the intent-to-delete flag on the page. */
+ if (TYPE(cp->page) == P_LBTREE)
+ B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx + O_INDX)->type);
+ else
+ B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type);
+
+err: /*
+ * If we've been successful so far and the tree has record numbers,
+ * adjust the record counts. Either way, release acquired page(s).
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ cp->csp->page = cp->page;
+ if (ret == 0)
+ ret = __bam_adjust(dbc, -1);
+ (void)__bam_stkrel(dbc, 0);
+ } else
+ if (cp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+
+ /*
+ * Update the cursors last, after all chance of recoverable failure
+ * is past.
+ */
+ if (ret == 0)
+ ret = __bam_ca_delete(dbp, cp->pgno, cp->indx, 1, &count);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ return (ret);
+}
+
+/*
+ * __bamc_dup --
+ * Duplicate a btree cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __bamc_dup __P((DBC *, DBC *, u_int32_t));
+ */
+int
+__bamc_dup(orig_dbc, new_dbc, flags)
+ DBC *orig_dbc, *new_dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *orig, *new;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ new->ovflsize = orig->ovflsize;
+ new->recno = orig->recno;
+ new->flags = orig->flags;
+
+#ifdef HAVE_COMPRESSION
+ /* Copy the compression state */
+ return (__bamc_compress_dup(orig_dbc, new_dbc, flags));
+#else
+ COMPQUIET(flags, 0);
+
+ return (0);
+#endif
+}
+
+/*
+ * __bamc_get --
+ * Get using a cursor (btree).
+ */
+static int
+__bamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ db_pgno_t orig_pgno;
+ db_indx_t orig_indx;
+ int exact, newopd, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ orig_pgno = cp->pgno;
+ orig_indx = cp->indx;
+
+ newopd = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ /* It's not possible to return a deleted record. */
+ if (F_ISSET(cp, C_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+ * write lock, but upgrading to a write lock has no better
+ * chance of succeeding now instead of later, so don't try.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_FIRST:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * There are two ways to get here based on DBcursor->get
+ * with the DB_GET_BOTH/DB_GET_BOTH_RANGE flags set:
+ *
+ * 1. Searching a sorted off-page duplicate tree: do a tree
+ * search.
+ *
+ * 2. Searching btree: do a tree search. If it returns a
+ * reference to off-page duplicate tree, return immediately
+ * and let our caller deal with it. If the search doesn't
+ * return a reference to off-page duplicate tree, continue
+ * with an on-page search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, data, flags, &exact)) != 0)
+ goto err;
+ if (flags == DB_GET_BOTH) {
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ }
+
+ /*
+ * We didn't require an exact match, so the search may
+ * have returned an entry past the end of the page,
+ * or we may be referencing a deleted record. If so,
+ * move to the next entry.
+ */
+ if ((cp->indx == NUM_ENT(cp->page) ||
+ IS_CUR_DELETED(dbc)) &&
+ (ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, key, flags, &exact)) != 0)
+ return (ret);
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop)) {
+ newopd = 1;
+ break;
+ }
+ if ((ret =
+ __bam_getboth_finddatum(dbc, data, flags)) != 0)
+ goto err;
+ }
+ break;
+#ifdef HAVE_COMPRESSION
+ case DB_SET_LTE:
+ if ((ret = __bam_getlte(dbc, key, NULL)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH_LTE:
+ if ((ret = __bam_getlte(dbc, key, data)) != 0)
+ goto err;
+ break;
+#endif
+ case DB_GET_BOTHC:
+ if ((ret = __bam_getbothc(dbc, data)) != 0)
+ goto err;
+ break;
+ case DB_LAST:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_NEXT:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0)
+ goto err;
+ } else
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ break;
+ case DB_NEXT_DUP:
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_NEXT_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_PREV:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ goto err;
+ } else
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ break;
+ case DB_PREV_DUP:
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_PREV_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_SET:
+ case DB_SET_RECNO:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, key, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_SET_RANGE:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, key, flags, &exact)) != 0)
+ goto err;
+
+ /*
+ * As we didn't require an exact match, the search function
+ * may have returned an entry past the end of the page. Or,
+ * we may be referencing a deleted record. If so, move to
+ * the next entry.
+ */
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc))
+ if ((ret = __bamc_next(dbc, 0, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_get", flags);
+ goto err;
+ }
+
+ /*
+ * We may have moved to an off-page duplicate tree. Return that
+ * information to our caller.
+ */
+ if (newopd && pgnop != NULL)
+ (void)__bam_isopd(dbc, pgnop);
+
+err: /*
+ * Regardless of whether we were successful or not, if the cursor
+ * moved, clear the delete flag, DBcursor->get never references a
+ * deleted key, if it moved at all.
+ */
+ if (F_ISSET(cp, C_DELETED) &&
+ (cp->pgno != orig_pgno || cp->indx != orig_indx))
+ F_CLR(cp, C_DELETED);
+
+ return (ret);
+}
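+
+/*
+ * Illustrative sketch (not from the original source): the DB_SET_RANGE
+ * case above is what positions a cursor on the smallest key greater
+ * than or equal to a search key:
+ *
+ *	DBT key, data;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "app";
+ *	key.size = 3;
+ *	ret = dbc->get(dbc, &key, &data, DB_SET_RANGE);
+ *	(on success, key/data hold the first pair with key >= "app";
+ *	an inexact match that lands past the last page entry falls
+ *	through to __bamc_next, as coded above)
+ */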
+
+static int
+__bam_get_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DBT key, data;
+ db_pgno_t pgno;
+ int ret;
+
+ if ((ret = __bamc_prev(dbc)) != 0)
+ return (ret);
+
+ if (__bam_isopd(dbc, &pgno)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ return (ret);
+ if ((ret = cp->opd->am_get(cp->opd,
+ &key, &data, DB_LAST, NULL)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_bulk -- Return bulk data from a btree.
+ */
+static int
+__bam_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ PAGE *pg;
+ db_indx_t *inp, indx, pg_keyoff;
+ int32_t *endp, key_off, *offp, *saveoffp;
+ u_int8_t *dbuf, *dp, *np;
+ u_int32_t key_size, pagesize, size, space;
+ int adj, is_key, need_pg, next_key, no_dup, rec_key, ret;
+
+ ret = 0;
+ key_off = 0;
+ size = 0;
+ pagesize = dbc->dbp->pgsize;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * dp tracks the beginning of the page in the buffer.
+ * np is the next place to copy things into the buffer.
+ * dbuf always stays at the beginning of the buffer.
+ */
+ dbuf = data->data;
+ np = dp = dbuf;
+
+ /* Keep track of space that is left. There is a termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (int32_t *)((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+
+ key_size = 0;
+
+ /*
+ * Distinguish between BTREE and RECNO.
+ * There are no keys in RECNO. If MULTIPLE_KEY is specified
+ * then we return the record numbers.
+ * is_key indicates that multiple btree keys are returned.
+ * rec_key is set if we are returning record numbers.
+ * next_key is set if we are going after the next key rather than dup.
+ */
+ if (dbc->dbtype == DB_BTREE) {
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ rec_key = 0;
+ next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ adj = 2;
+ } else {
+ is_key = 0;
+ rec_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ next_key = LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ adj = 1;
+ }
+ no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
+
+next_pg:
+ indx = cp->indx;
+ pg = cp->page;
+
+ inp = P_INP(dbc->dbp, pg);
+ /* The current page is not yet in the buffer. */
+ need_pg = 1;
+
+ /*
+ * Keep track of the offset of the current key on the page.
+ * If we are returning keys, set it to 0 first so we force
+ * the copy of the key to the buffer.
+ */
+ pg_keyoff = 0;
+ if (is_key == 0)
+ pg_keyoff = inp[indx];
+
+ do {
+ if (IS_DELETED(dbc->dbp, pg, indx)) {
+ if (dbc->dbtype != DB_RECNO)
+ continue;
+
+ cp->recno++;
+ /*
+ * If we are not returning recnos then we
+ * need to fill in every slot so the user
+ * can calculate the record numbers.
+ */
+ if (rec_key != 0)
+ continue;
+
+ space -= 2 * sizeof(*offp);
+ /* Check if space has underflowed. */
+ if (space > data->ulen)
+ goto back_up;
+
+ /* Just mark the empty recno slots. */
+ *offp-- = 0;
+ *offp-- = 0;
+ continue;
+ }
+
+ /*
+ * Check to see if we have a new key.
+ * If so, then see if we need to put the
+ * key into the buffer. If it's already there
+ * then we just point to it.
+ */
+ if (is_key && pg_keyoff != inp[indx]) {
+ bk = GET_BKEYDATA(dbc->dbp, pg, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = key_size = bo->tlen;
+ if (key_size > space)
+ goto get_key_space;
+ if ((ret = __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= key_size;
+ key_off = (int32_t)(np - dbuf);
+ np += key_size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+get_key_space:
+ /* If nothing was added, then error. */
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ pagesize, 1024);
+ return
+ (DB_BUFFER_SMALL);
+ }
+ /*
+ * We need to back up to the
+ * last record put into the
+ * buffer so that it is
+ * CURRENT.
+ */
+ if (indx != 0)
+ indx -= P_INDX;
+ else {
+ if ((ret =
+ __bam_get_prev(
+ dbc)) != 0)
+ return (ret);
+ indx = cp->indx;
+ pg = cp->page;
+ }
+ break;
+ }
+ /*
+ * Move the data part of the page
+ * to the buffer.
+ */
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ key_size = bk->len;
+ key_off = (int32_t)((inp[indx] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ pg_keyoff = inp[indx];
+ }
+ }
+
+ /*
+ * Reserve space for the pointers and sizes.
+ * Either key/data pair or just for a data item.
+ */
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (rec_key)
+ space -= sizeof(*offp);
+
+ /* Check to see if space has underflowed. */
+ if (space > data->ulen)
+ goto back_up;
+
+ /*
+ * Determine if the next record is in the
+ * buffer already or if it needs to be copied in.
+ * If we have an off page dup, then copy as many
+ * as will fit into the buffer.
+ */
+ bk = GET_BKEYDATA(dbc->dbp, pg, indx + adj - 1);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ bo = (BOVERFLOW *)bk;
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ /*
+ * We pass the offset of the current key.
+ * On return we check to see if offp has
+ * moved to see if any data fit.
+ */
+ saveoffp = offp;
+ if ((ret = __bam_bulk_duplicates(dbc, bo->pgno,
+ dbuf, is_key ? offp + P_INDX : NULL,
+ &offp, &np, &space, no_dup)) != 0) {
+ if (ret == DB_BUFFER_SMALL) {
+ size = space;
+ space = 0;
+ /* If nothing was added, then error. */
+ if (offp == saveoffp) {
+ offp += 2;
+ goto back_up;
+ }
+ goto get_space;
+ }
+ return (ret);
+ }
+ } else if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = bo->tlen;
+ if (size > space)
+ goto back_up;
+ if ((ret =
+ __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= size;
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ } else if (rec_key)
+ *offp-- = (int32_t)cp->recno;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ *offp-- = (int32_t)size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+back_up:
+ /*
+ * Back up the index so that the
+ * last record in the buffer is CURRENT
+ */
+ if (indx >= adj)
+ indx -= adj;
+ else {
+ if ((ret =
+ __bam_get_prev(dbc)) != 0 &&
+ ret != DB_NOTFOUND)
+ return (ret);
+ indx = cp->indx;
+ pg = cp->page;
+ }
+ if (dbc->dbtype == DB_RECNO)
+ cp->recno--;
+get_space:
+ /*
+ * Error if nothing was added to the
+ * buffer, or if this is a DBP->get,
+ * which must return all of the data.
+ */
+ if (offp >=
+ (is_key ? &endp[-1] : endp) ||
+ F_ISSET(dbc, DBC_FROM_DB_GET)) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ break;
+ }
+ memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ /*
+ * Add the offsets and sizes to the end of the buffer.
+ * First add the key info then the data info.
+ */
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ } else if (rec_key)
+ *offp-- = (int32_t)cp->recno;
+ *offp-- = (int32_t)((inp[indx + adj - 1] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ *offp-- = bk->len;
+ }
+ if (dbc->dbtype == DB_RECNO)
+ cp->recno++;
+ else if (no_dup) {
+ while (indx + adj < NUM_ENT(pg) &&
+ pg_keyoff == inp[indx + adj])
+ indx += adj;
+ }
+ /*
+ * Stop when we either run off the page or we move to the next key and
+ * we are not returning multiple keys.
+ */
+ } while ((indx += adj) < NUM_ENT(pg) &&
+ (next_key || pg_keyoff == inp[indx]));
+
+ /* If we ran off the page, try the next page. */
+ if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
+ cp->indx = indx;
+ ret = __bamc_next(dbc, 0, 1);
+ if (ret == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ }
+
+ /*
+ * If we did a DBP->get, we must return an error if we did not
+ * return all the data for the current key: the caller has no
+ * way to know the fetch was incomplete, nor any interface to
+ * fetch the balance.
+ */
+
+ if (ret == 0 && indx < pg->entries &&
+ F_ISSET(dbc, DBC_TRANSIENT) && pg_keyoff == inp[indx]) {
+ data->size = (data->ulen - space) + size;
+ return (DB_BUFFER_SMALL);
+ }
+ /*
+ * Must leave the index pointing at the last record fetched.
+ * If we are not fetching keys, we may have stepped to the
+ * next key.
+ */
+ if (ret == DB_BUFFER_SMALL || next_key || pg_keyoff == inp[indx])
+ cp->indx = indx;
+ else
+ cp->indx = indx - P_INDX;
+
+ if (rec_key == 1)
+ *offp = RECNO_OOB;
+ else
+ *offp = -1;
+ return (0);
+}
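+
+/*
+ * Illustrative sketch (not from the original source): __bam_bulk fills
+ * a user buffer with page data and builds the offset/length table
+ * backward from the buffer's end.  Applications drive it through
+ * DBcursor->get with DB_MULTIPLE_KEY and unpack with the DB_MULTIPLE
+ * macros; the buffer size below is a hypothetical choice:
+ *
+ *	DBT key, data;
+ *	void *p, *kp, *dp;
+ *	size_t klen, dlen;
+ *	u_int8_t buf[64 * 1024];
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	data.data = buf;
+ *	data.ulen = sizeof(buf);
+ *	data.flags = DB_DBT_USERMEM;
+ *	while (dbc->get(dbc, &key, &data,
+ *	    DB_MULTIPLE_KEY | DB_NEXT) == 0) {
+ *		DB_MULTIPLE_INIT(p, &data);
+ *		for (;;) {
+ *			DB_MULTIPLE_KEY_NEXT(p, &data,
+ *			    kp, klen, dp, dlen);
+ *			if (kp == NULL)
+ *				break;
+ *			(process kp/klen and dp/dlen)
+ *		}
+ *	}
+ */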
+
+/*
+ * __bam_bulk_overflow --
+ * Dump overflow record into the buffer.
+ * The space requirements have already been checked.
+ * PUBLIC: int __bam_bulk_overflow
+ * PUBLIC: __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *));
+ */
+int
+__bam_bulk_overflow(dbc, len, pgno, dp)
+ DBC *dbc;
+ u_int32_t len;
+ db_pgno_t pgno;
+ u_int8_t *dp;
+{
+ DBT dbt;
+
+ memset(&dbt, 0, sizeof(dbt));
+ F_SET(&dbt, DB_DBT_USERMEM);
+ dbt.ulen = len;
+ dbt.data = (void *)dp;
+ return (__db_goff(dbc, &dbt, len, pgno, NULL, NULL));
+}
+
+/*
+ * __bam_bulk_duplicates --
+ *	Put as many off-page duplicates as will fit into the buffer.
+ * This routine will adjust the cursor to reflect the position in
+ * the off-page duplicate tree.
+ * PUBLIC: int __bam_bulk_duplicates __P((DBC *,
+ * PUBLIC: db_pgno_t, u_int8_t *, int32_t *,
+ * PUBLIC: int32_t **, u_int8_t **, u_int32_t *, int));
+ */
+int
+__bam_bulk_duplicates(dbc, pgno, dbuf, keyoff, offpp, dpp, spacep, no_dup)
+ DBC *dbc;
+ db_pgno_t pgno;
+ u_int8_t *dbuf;
+ int32_t *keyoff, **offpp;
+ u_int8_t **dpp;
+ u_int32_t *spacep;
+ int no_dup;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBC *opd;
+ DBT key, data;
+ PAGE *pg;
+ db_indx_t indx, *inp;
+ int32_t *offp;
+ u_int32_t pagesize, size, space;
+ u_int8_t *dp, *np;
+ int first, need_pg, ret, t_ret;
+
+ ret = 0;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ opd = cp->opd;
+
+ if (opd == NULL) {
+ if ((ret = __dbc_newopd(dbc, pgno, NULL, &opd)) != 0)
+ return (ret);
+ cp->opd = opd;
+ if ((ret = opd->am_get(opd,
+ &key, &data, DB_FIRST, NULL)) != 0)
+ goto close_opd;
+ }
+
+ pagesize = opd->dbp->pgsize;
+ cp = (BTREE_CURSOR *)opd->internal;
+ space = *spacep;
+ /* Get current offset slot. */
+ offp = *offpp;
+
+ /*
+ * np is the next place to put data.
+ * dp is the beginning of the current page in the buffer.
+ */
+ np = dp = *dpp;
+ first = 1;
+ indx = cp->indx;
+
+ do {
+ /* Fetch the current record. No initial move. */
+ if ((ret = __bamc_next(opd, 0, 0)) != 0)
+ break;
+ pg = cp->page;
+ indx = cp->indx;
+ inp = P_INP(dbp, pg);
+ /* We need to copy the page to the buffer. */
+ need_pg = 1;
+
+ do {
+ if (IS_DELETED(dbp, pg, indx))
+ goto contin;
+ bk = GET_BKEYDATA(dbp, pg, indx);
+ space -= 2 * sizeof(*offp);
+ /* Allocate space for key if needed. */
+ if (first == 0 && keyoff != NULL)
+ space -= 2 * sizeof(*offp);
+
+ /* Did space underflow? */
+ if (space > *spacep) {
+ ret = DB_BUFFER_SMALL;
+ if (first == 1) {
+ /* Get the absolute value. */
+ space = -(int32_t)space;
+ space = *spacep + space;
+ if (need_pg)
+ space += pagesize - HOFFSET(pg);
+ }
+ break;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = bo->tlen;
+ if (size > space) {
+ ret = DB_BUFFER_SMALL;
+ space = *spacep + size;
+ break;
+ }
+ if (first == 0 && keyoff != NULL) {
+ *offp-- = keyoff[0];
+ *offp-- = keyoff[-1];
+ }
+ if ((ret = __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= size;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+ ret = DB_BUFFER_SMALL;
+ /* Return space required. */
+ space = *spacep + size;
+ break;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ if (first == 0 && keyoff != NULL) {
+ *offp-- = keyoff[0];
+ *offp-- = keyoff[-1];
+ }
+ size = bk->len;
+ *offp-- = (int32_t)((inp[indx] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ }
+ *offp-- = (int32_t)size;
+ first = 0;
+ if (no_dup)
+ break;
+contin:
+ indx++;
+ if (opd->dbtype == DB_RECNO)
+ cp->recno++;
+ } while (indx < NUM_ENT(pg));
+ if (no_dup)
+ break;
+ cp->indx = indx;
+
+ } while (ret == 0);
+
+ /* Return the updated information. */
+ *spacep = space;
+ *offpp = offp;
+ *dpp = np;
+
+ /*
+	 * If we ran out of space, back up the cursor.
+	 * If we did not return any dups, or we reached the end, close the
+	 * opd cursor.
+ */
+ if (ret == DB_BUFFER_SMALL) {
+ if (opd->dbtype == DB_RECNO) {
+ if (--cp->recno == 0)
+ goto close_opd;
+ } else if (indx != 0)
+ cp->indx--;
+ else {
+ t_ret = __bamc_prev(opd);
+ if (t_ret == DB_NOTFOUND)
+ goto close_opd;
+ if (t_ret != 0)
+ ret = t_ret;
+ }
+ } else if (keyoff == NULL && ret == DB_NOTFOUND) {
+ cp->indx--;
+ if (opd->dbtype == DB_RECNO)
+ --cp->recno;
+ } else if (indx == 0 || ret == DB_NOTFOUND) {
+close_opd:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+ ((BTREE_CURSOR *)dbc->internal)->opd = NULL;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
+
+/*
+ * __bam_getbothc --
+ * Search for a matching data item on a join.
+ */
+static int
+__bam_getbothc(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ int cmp, exact, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+	 * write lock, but upgrading to a write lock has no better
+	 * chance of succeeding now than later, so don't try.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * An off-page duplicate cursor. Search the remaining duplicates
+ * for one which matches (do a normal btree search, then verify
+ * that the retrieved record is greater than the original one).
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * Check to make sure the desired item comes strictly after
+ * the current position; if it doesn't, return DB_NOTFOUND.
+ */
+ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
+ dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare,
+ &cmp)) != 0)
+ return (ret);
+
+ if (cmp <= 0)
+ return (DB_NOTFOUND);
+
+ /* Discard the current page, we're going to do a full search. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+
+ return (__bamc_search(dbc,
+ PGNO_INVALID, data, DB_GET_BOTH, &exact));
+ }
+
+ /*
+ * We're doing a DBC->get(DB_GET_BOTHC) and we're already searching
+ * a set of on-page duplicates (either sorted or unsorted). Continue
+ * a linear search from after the current position.
+ *
+ * (Note that we could have just finished a "set" of one duplicate,
+ * i.e. not a duplicate at all, but the following check will always
+ * return DB_NOTFOUND in this case, which is the desired behavior.)
+ */
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ return (DB_NOTFOUND);
+ cp->indx += P_INDX;
+
+ return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH));
+}
+
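+/*
+ * Application-side sketch (illustrative, not part of the original source):
+ * the DB_GET_BOTH family of flags handled above is driven from DBC->get,
+ * e.g. positioning the cursor on an exact key/data pair:
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "fruit";
+ *	key.size = sizeof("fruit") - 1;
+ *	data.data = "apple";
+ *	data.size = sizeof("apple") - 1;
+ *	ret = dbc->get(dbc, &key, &data, DB_GET_BOTH);
+ */
+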
+#ifdef HAVE_COMPRESSION
+/*
+ * __bam_getlte --
+ * Search for the largest entry <= key/data - used by compression.
+ *
+ * data == NULL indicates the DB_SET_LTE flag
+ * data != NULL indicates the DB_GET_BOTH_LTE flag
+ *
+ * Only works for a primary cursor - not an OPD cursor. Handles the
+ * OPD manipulation as well - no need to return to the caller to
+ * perform more OPD movements.
+ */
+static int
+__bam_getlte(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ BTREE_CURSOR *cp, *ocp;
+ DB *dbp;
+ db_pgno_t pgno;
+ int exact, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /* Begin by searching for the key */
+ ret = __bamc_search(dbc, PGNO_INVALID, key, DB_SET_RANGE, &exact);
+ if (ret == DB_NOTFOUND)
+ goto find_last;
+ if (ret != 0)
+ goto end;
+
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) {
+ /*
+ * Move to the next entry if we're past the end of the
+ * page or on a deleted entry.
+ */
+ ret = __bamc_next(dbc, 0, 0);
+ if (ret == DB_NOTFOUND)
+ goto find_last;
+ if (ret != 0)
+ goto end;
+
+		/*
+		 * Check if we're still on the correct key. Note that
+		 * "exact" first receives the comparison result here; it
+		 * is converted to an exact-match flag just below.
+		 */
+ if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx,
+ ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ }
+
+ if (exact == 0) {
+ ret = __bam_get_prev(dbc);
+ goto end;
+ }
+
+ if (__bam_isopd(dbc, &pgno)) {
+ /*
+ * We want to do unusual things with off-page duplicates, so
+ * deal with them here rather than returning to handle them.
+ */
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ goto end;
+
+ /* Search for the correct duplicate */
+ ret = __bamc_search(cp->opd, PGNO_INVALID, data,
+ data == NULL ? DB_FIRST : DB_SET_RANGE, &exact);
+ if (ret == DB_NOTFOUND)
+ goto find_last_dup;
+ if (ret != 0)
+ goto end;
+
+ ocp = (BTREE_CURSOR *)cp->opd->internal;
+ if (ocp->indx == NUM_ENT(ocp->page) ||
+ IS_CUR_DELETED(cp->opd)) {
+ /*
+ * Move to the next entry if we're past the end of the
+ * page or on a deleted entry.
+ */
+ ret = __bamc_next(cp->opd, 0, 0);
+ if (ret == DB_NOTFOUND)
+ goto find_last_dup;
+ if (ret != 0)
+ goto end;
+
+ if (data != NULL) {
+ /* Check if we're still on the correct data */
+ if ((ret = __bam_cmp(
+ dbc, data, ocp->page, ocp->indx,
+ dbp->dup_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ } else
+ exact = 1;
+ }
+
+ if (exact == 0) {
+ /* Move to the previous entry */
+ ret = __bamc_prev(cp->opd);
+ if (ret == DB_NOTFOUND) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto end;
+ cp->opd = NULL;
+ ret = __bam_get_prev(dbc);
+ }
+ }
+ } else if (data != NULL) {
+ /*
+ * If we got an exact match with on-page duplicates, we need to
+ * search in them.
+ */
+ ret = __bam_getboth_finddatum(dbc, data, DB_GET_BOTH_RANGE);
+ if (ret == DB_NOTFOUND)
+ exact = 0;
+ else if (ret != 0)
+ goto end;
+ else {
+ /* Check if we're still on the correct data */
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ }
+
+ if (exact == 0) {
+ ret = __bam_get_prev(dbc);
+ }
+ }
+
+ end:
+ return (ret);
+
+ find_last:
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ return (ret);
+
+ if (__bam_isopd(dbc, &pgno)) {
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ return (ret);
+ find_last_dup:
+ if ((ret = __bamc_search(
+ cp->opd, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ return (ret);
+ }
+
+ return (ret);
+}
+#endif
+
+/*
+ * __bam_getboth_finddatum --
+ * Find a matching on-page data item.
+ */
+static int
+__bam_getboth_finddatum(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t base, lim, top;
+ int cmp, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cmp = 0;
+
+ /*
+ * Called (sometimes indirectly) from DBC->get to search on-page data
+ * item(s) for a matching value. If the original flag was DB_GET_BOTH
+ * or DB_GET_BOTH_RANGE, the cursor is set to the first undeleted data
+ * item for the key. If the original flag was DB_GET_BOTHC, the cursor
+ * argument is set to the first data item we can potentially return.
+ * In both cases, there may or may not be additional duplicate data
+ * items to search.
+ *
+ * If the duplicates are not sorted, do a linear search.
+ */
+ if (dbp->dup_compare == NULL) {
+ for (;; cp->indx += P_INDX) {
+ if (!IS_CUR_DELETED(dbc)) {
+ if ((ret = __bam_cmp(
+ dbc, data, cp->page, cp->indx + O_INDX,
+ __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ return (0);
+ }
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ break;
+ }
+ return (DB_NOTFOUND);
+ }
+
+ /*
+ * If the duplicates are sorted, do a binary search. The reason for
+ * this is that large pages and small key/data pairs result in large
+ * numbers of on-page duplicates before they get pushed off-page.
+ *
+ * Find the top and bottom of the duplicate set. Binary search
+ * requires at least two items, don't loop if there's only one.
+ */
+ for (base = top = cp->indx; top < NUM_ENT(cp->page); top += P_INDX)
+ if (!IS_DUPLICATE(dbc, cp->indx, top))
+ break;
+ if (base == (top - P_INDX)) {
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE))
+ return (0);
+ cp->indx = top;
+		return (DB_NOTFOUND);
+ }
+
+ for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) {
+ cp->indx = base + ((lim >> 1) * P_INDX);
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0) {
+ /*
+ * XXX
+ * No duplicate duplicates in sorted duplicate sets,
+ * so there can be only one.
+ */
+ if (!IS_CUR_DELETED(dbc))
+ return (0);
+ break;
+ }
+ if (cmp > 0) {
+ base = cp->indx + P_INDX;
+ --lim;
+ }
+ }
+
+ /* No match found; if we're looking for an exact match, we're done. */
+ if (flags == DB_GET_BOTH)
+ return (DB_NOTFOUND);
+
+ /*
+ * Base is the smallest index greater than the data item, may be zero
+	 * Base is the smallest index greater than the data item; it may be
+	 * zero or a last + O_INDX index, and the item there may be deleted.
+	 * Find an undeleted item.
+ cp->indx = base;
+ while (cp->indx < top && IS_CUR_DELETED(dbc))
+ cp->indx += P_INDX;
+ return (cp->indx < top ? 0 : DB_NOTFOUND);
+}
+
+/*
+ * __bamc_put --
+ * Put using a cursor.
+ */
+static int
+__bamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ db_pgno_t root_pgno;
+ int cmp, exact, own, ret, stack;
+ u_int32_t iiop;
+ void *arg;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+split: ret = stack = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_AFTER:
+ case DB_BEFORE:
+ iiop = flags;
+ own = 1;
+
+ /* Acquire the current page with a write lock. */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ if (ret != 0)
+ goto err;
+ if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ own = 0;
+ /*
+ * Searching off-page, sorted duplicate tree: do a tree search
+ * for the correct item; __bamc_search returns the smallest
+ * slot greater than the key, use it.
+ *
+ * See comment below regarding where we can start the search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bamc_search(dbc,
+ F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno,
+ data, flags, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (exact != 0) {
+ if (flags == DB_OVERWRITE_DUP ||
+ IS_DELETED(dbp, cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /*
+ * Searching a btree.
+ *
+ * If we've done a split, we can start the search from the
+ * parent of the split page, which __bam_split returned
+ * for us in root_pgno, unless we're in a Btree with record
+ * numbering. In that case, we'll need the true root page
+ * in order to adjust the record count.
+ */
+ if ((ret = __bamc_search(dbc,
+ F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, key,
+ flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
+ DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /*
+ * If we don't have an exact match, __bamc_search returned
+ * the smallest slot greater than the key, use it.
+ */
+ if (!exact) {
+ iiop = DB_KEYFIRST;
+ break;
+
+ /*
+ * Check for NOOVERWRITE. It is possible that there
+ * is a key with an empty duplicate page attached.
+ */
+ } else if (flags == DB_NOOVERWRITE && !IS_CUR_DELETED(dbc)) {
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ ret = __bam_opd_exists(dbc, *pgnop);
+ else
+ ret = DB_KEYEXIST;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * If duplicates aren't supported, replace the current item.
+ */
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+
+ /*
+ * If we find a matching entry, it may be an off-page duplicate
+ * tree. Return the page number to our caller, we need a new
+ * cursor.
+ */
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ goto done;
+
+ /* If the duplicates aren't sorted, move to the right slot. */
+ if (dbp->dup_compare == NULL) {
+ if (flags == DB_KEYFIRST)
+ iiop = DB_BEFORE;
+ else
+ for (;; cp->indx += P_INDX)
+ if (cp->indx + P_INDX >=
+ NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx,
+ cp->indx + P_INDX)) {
+ iiop = DB_AFTER;
+ break;
+ }
+ break;
+ }
+
+ /*
+ * We know that we're looking at the first of a set of sorted
+ * on-page duplicates. Walk the list to find the right slot.
+ */
+ for (;; cp->indx += P_INDX) {
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ goto err;
+ if (cmp < 0) {
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (cmp == 0) {
+ if (flags == DB_OVERWRITE_DUP ||
+ IS_DELETED(dbp, cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ P_INP(dbp, ((PAGE *)cp->page))[cp->indx] !=
+ P_INP(dbp, ((PAGE *)cp->page))[cp->indx + P_INDX]) {
+ iiop = DB_AFTER;
+ break;
+ }
+ }
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_put", flags);
+ goto err;
+ }
+
+ switch (ret = __bam_iitem(dbc, key, data, iiop, 0)) {
+ case 0:
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * To split, we need a key for the page. Either use the key
+ * argument or get a copy of the key from the page.
+ */
+ if (flags == DB_AFTER ||
+ flags == DB_BEFORE || flags == DB_CURRENT) {
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbc, cp->page, 0, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ arg = &dbt;
+ } else
+ arg = F_ISSET(dbc, DBC_OPD) ? data : key;
+
+ /*
+ * Discard any locks and pinned pages (the locks are discarded
+ * even if we're running with transactions, as they lock pages
+ * that we're sorry we ever acquired). If stack is set and the
+ * cursor entries are valid, they point to the same entries as
+ * the stack, don't free them twice.
+ */
+ if (stack)
+ ret = __bam_stkrel(dbc, STK_CLRDBC | STK_NOLOCK);
+ else
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * SR [#6059]
+ * If we do not own a lock on the page any more, then clear the
+ * cursor so we don't point at it. Even if we call __bam_stkrel
+ * above we still may have entered the routine with the cursor
+ * positioned to a particular record. This is in the case
+ * where C_RECNUM is set.
+ */
+ if (own == 0) {
+ cp->pgno = PGNO_INVALID;
+ cp->indx = 0;
+ }
+
+ /* Split the tree. */
+ if ((ret = __bam_split(dbc, arg, &root_pgno)) != 0)
+ return (ret);
+
+ goto split;
+ default:
+ goto err;
+ }
+
+err:
+done: /*
+ * If we inserted a key into the first or last slot of the tree,
+ * remember where it was so we can do it more quickly next time.
+ * If the tree has record numbers, we need a complete stack so
+ * that we can adjust the record counts, so skipping the tree search
+ * isn't possible. For subdatabases we need to be careful that the
+ * page does not move from one db to another, so we track its LSN.
+ *
+ * If there are duplicates and we are inserting into the last slot,
+ * the cursor will point _to_ the last item, not after it, which
+ * is why we subtract P_INDX below.
+ */
+
+ t = dbp->bt_internal;
+ if (ret == 0 && TYPE(cp->page) == P_LBTREE &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST) &&
+ !F_ISSET(cp, C_RECNUM) &&
+ (!F_ISSET(dbp, DB_AM_SUBDB) ||
+ (LOGGING_ON(dbp->env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE))) &&
+ ((NEXT_PGNO(cp->page) == PGNO_INVALID &&
+ cp->indx >= NUM_ENT(cp->page) - P_INDX) ||
+ (PREV_PGNO(cp->page) == PGNO_INVALID && cp->indx == 0))) {
+ t->bt_lpgno = cp->pgno;
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ t->bt_llsn = LSN(cp->page);
+ } else
+ t->bt_lpgno = PGNO_INVALID;
+ /*
+ * Discard any pages pinned in the tree and their locks, except for
+ * the leaf page. Note, the leaf page participated in any stack we
+ * acquired, and so we have to adjust the stack as necessary. If
+ * there was only a single page on the stack, we don't have to free
+ * further stack pages.
+ */
+ if (stack && BT_STK_POP(cp) != NULL)
+ (void)__bam_stkrel(dbc, 0);
+
+ /*
+ * Regardless of whether we were successful or not, clear the delete
+ * flag. If we're successful, we either moved the cursor or the item
+ * is no longer deleted. If we're not successful, then we're just a
+ * copy, no need to have the flag set.
+ *
+ * We may have instantiated off-page duplicate cursors during the put,
+ * so clear the deleted bit from the off-page duplicate cursor as well.
+ */
+ F_CLR(cp, C_DELETED);
+ if (cp->opd != NULL) {
+ cp = (BTREE_CURSOR *)cp->opd->internal;
+ F_CLR(cp, C_DELETED);
+ }
+
+ return (ret);
+}
+
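+/*
+ * Application-side sketch (illustrative, not part of the original source):
+ * the flags dispatched above arrive through DBC->put. With DB_CURRENT the
+ * key parameter is ignored and the data of the item at the cursor is
+ * replaced:
+ *
+ *	if ((ret = dbc->put(dbc, &key, &data, DB_KEYFIRST)) != 0)
+ *		goto err;
+ *	if ((ret = dbc->put(dbc, &key, &data, DB_CURRENT)) != 0)
+ *		goto err;
+ */
+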
+/*
+ * __bamc_rget --
+ * Return the record number for a cursor.
+ *
+ * PUBLIC: int __bamc_rget __P((DBC *, DBT *));
+ */
+int
+__bamc_rget(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ db_recno_t recno;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it.
+ * Get a copy of the key.
+ * Release the page, making sure we don't release it twice.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbc, cp->page, cp->indx, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+ cp->page = NULL;
+ if (ret != 0)
+ return (ret);
+
+ if ((ret = __bam_search(dbc, PGNO_INVALID, &dbt,
+ F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
+ 1, &recno, &exact)) != 0)
+ goto err;
+
+ ret = __db_retcopy(dbc->env, data,
+ &recno, sizeof(recno), &dbc->rdata->data, &dbc->rdata->ulen);
+
+ /* Release the stack. */
+err: if ((t_ret = __bam_stkrel(dbc, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
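+/*
+ * Application-side sketch (illustrative, not part of the original source):
+ * with DB_RECNUM configured on the database, the record number computed
+ * above is requested through DBC->get with DB_GET_RECNO and returned in
+ * the data DBT:
+ *
+ *	db_recno_t recno;
+ *
+ *	if ((ret = dbc->get(dbc, &key, &data, DB_GET_RECNO)) == 0)
+ *		memcpy(&recno, data.data, sizeof(recno));
+ */
+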
+/*
+ * __bamc_writelock --
+ * Upgrade the cursor to a write lock.
+ */
+static int
+__bamc_writelock(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (cp->lock_mode == DB_LOCK_WRITE)
+ return (0);
+
+ /*
+ * When writing to an off-page duplicate tree, we need to have the
+ * appropriate page in the primary tree locked. The general DBC
+ * code calls us first with the primary cursor so we can acquire the
+ * appropriate lock.
+ */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ return (ret);
+}
+
+/*
+ * __bamc_next --
+ * Move to the next record.
+ */
+static int
+__bamc_next(dbc, initial_move, deleted_okay)
+ DBC *dbc;
+ int initial_move, deleted_okay;
+{
+ BTREE_CURSOR *cp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (initial_move)
+ cp->indx += adjust;
+
+ for (;;) {
+ /*
+ * If at the end of the page, move to a subsequent page.
+ *
+ * !!!
+ * Check for >= NUM_ENT. If the original search landed us on
+ * NUM_ENT, we may have incremented indx before the test.
+ */
+ if (cp->indx >= NUM_ENT(cp->page)) {
+ if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ cp->indx = 0;
+ continue;
+ }
+ if (!deleted_okay && IS_CUR_DELETED(dbc)) {
+ cp->indx += adjust;
+ continue;
+ }
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bamc_prev --
+ * Move to the previous record.
+ */
+static int
+__bamc_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ for (;;) {
+ /* If at the beginning of the page, move to a previous one. */
+ if (cp->indx == 0) {
+ if ((pgno =
+ PREV_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+
+ if ((cp->indx = NUM_ENT(cp->page)) == 0)
+ continue;
+ }
+
+ /* Ignore deleted records. */
+ cp->indx -= adjust;
+ if (IS_CUR_DELETED(dbc))
+ continue;
+
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bamc_search --
+ * Move to a specified record.
+ */
+static int
+__bamc_search(dbc, root_pgno, key, flags, exactp)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ const DBT *key;
+ u_int32_t flags;
+ int *exactp;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ PAGE *h;
+ db_indx_t base, indx, *inp, lim;
+ db_pgno_t bt_lpgno;
+ db_recno_t recno;
+ u_int32_t sflags;
+ int bulk, cmp, ret, t_ret;
+
+ COMPQUIET(cmp, 0);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ret = 0;
+ bulk = (F_ISSET(dbc, DBC_BULK) && cp->pgno != PGNO_INVALID);
+
+ /*
+ * Find an entry in the database. Discard any lock we currently hold,
+ * we're going to search the tree.
+ */
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ return (ret);
+
+ switch (flags) {
+ case DB_FIRST:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MIN;
+ goto search;
+ case DB_LAST:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MAX;
+ goto search;
+ case DB_SET_RECNO:
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ return (ret);
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT;
+ if ((ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp)) != 0)
+ return (ret);
+ goto done;
+ case DB_SET:
+ case DB_GET_BOTH:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT;
+ if (bulk)
+ break;
+ goto search;
+ case DB_GET_BOTH_RANGE:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND);
+ goto search;
+ case DB_SET_RANGE:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_DUPFIRST;
+ goto search;
+ case DB_KEYFIRST:
+ case DB_NOOVERWRITE:
+ sflags = SR_KEYFIRST;
+ break;
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ sflags = SR_KEYLAST;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__bamc_search", flags));
+ }
+
+ /*
+ * If the application has a history of inserting into the first or last
+ * pages of the database, we check those pages first to avoid doing a
+ * full search. Similarly, if the cursor is configured as a bulk
+ * cursor, check whether this operation belongs on the same page as the
+ * last one.
+ */
+ if (bulk)
+ bt_lpgno = cp->pgno;
+ else {
+ if (F_ISSET(dbc, DBC_OPD))
+ goto search;
+
+ /*
+ * !!!
+ * We do not mutex protect the t->bt_lpgno field, which means
+ * that it can only be used in an advisory manner. If we find
+		 * that it can only be used in an advisory manner. If we find
+		 * a page we can use, great. If we don't, we don't care; we do
+ * variable, otherwise we might acquire a lock for a page and
+ * then read a different page because it changed underfoot.
+ */
+ bt_lpgno = t->bt_lpgno;
+ }
+
+ /*
+ * If the tree has no history of insertion, do it the slow way.
+ */
+ if (bt_lpgno == PGNO_INVALID)
+ goto search;
+
+ /*
+ * Lock and retrieve the page on which we last inserted.
+ *
+ * The page may not exist: if a transaction created the page
+ * and then aborted, the page might have been truncated from
+ * the end of the file. We don't want to wait on the lock.
+ * The page may not even be relevant to this search.
+ */
+ h = NULL;
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, bt_lpgno, DB_LOCK_NOWAIT, ret);
+ if (ret != 0) {
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED ||
+ ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto fast_miss;
+ }
+
+ h = cp->page;
+ inp = P_INP(dbp, h);
+
+ /*
+ * It's okay if the page type isn't right or it's empty, it
+ * just means that the world changed.
+ */
+ if (TYPE(h) != P_LBTREE || NUM_ENT(h) == 0)
+ goto fast_miss;
+
+ /* Verify that this page cannot have moved to another db. */
+ if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ LOG_COMPARE(&t->bt_llsn, &LSN(h)) != 0)
+ goto fast_miss;
+
+ /*
+	 * Test whether we're at the beginning or end of the tree and
+	 * whether the new item sorts before/after the first/last page
+	 * entry. We only try to catch inserts into the middle of the
+	 * tree for bulk cursors.
+ */
+ if (h->next_pgno == PGNO_INVALID) {
+ indx = NUM_ENT(h) - P_INDX;
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+ if (cmp > 0) {
+ if (FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ else
+ indx += P_INDX;
+ }
+ if (cmp >= 0)
+ goto fast_hit;
+ }
+ if (h->prev_pgno == PGNO_INVALID) {
+ indx = 0;
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+ if (cmp < 0 && FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ if (cmp <= 0)
+ goto fast_hit;
+ }
+ if (bulk) {
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX);
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+
+ if (cmp == 0)
+ goto fast_hit;
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
+ lim, P_INDX);
+ }
+ /*
+ * No match found: base is the smallest index greater than
+ * the key and may be zero or NUM_ENT(h).
+ */
+ indx = base;
+ if (indx > 0 && indx < NUM_ENT(h)) {
+ if (FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ goto fast_hit;
+ }
+ }
+ goto fast_miss;
+
+fast_hit:
+ if (cmp == 0) {
+ /*
+ * Found a duplicate. Deal with DB_KEYFIRST / DB_KEYLAST.
+ */
+ if (FLD_ISSET(sflags, SR_DUPFIRST))
+ while (indx > 0 && inp[indx - P_INDX] == inp[indx])
+ indx -= P_INDX;
+ else if (FLD_ISSET(sflags, SR_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ inp[indx] == inp[indx + P_INDX])
+ indx += P_INDX;
+ }
+
+ /* Set the exact match flag, we may have found a duplicate. */
+ *exactp = (cmp == 0);
+
+ /*
+ * Insert the entry in the stack. (Our caller is likely to
+ * call __bam_stkrel() after our return.)
+ */
+ BT_STK_CLR(cp);
+ BT_STK_ENTER(dbp->env,
+ cp, h, indx, cp->lock, cp->lock_mode, ret);
+ if (ret != 0)
+ return (ret);
+ goto done;
+
+fast_miss:
+ /*
+ * This was not the right page, so we do not need to retain
+ * the lock even in the presence of transactions.
+ *
+ * This is also an error path, so ret may have been set.
+ */
+ DISCARD_CUR(dbc, ret);
+ cp->pgno = PGNO_INVALID;
+ if ((t_ret = __LPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+search:
+ if ((ret = __bam_search(dbc, root_pgno,
+ key, sflags, 1, NULL, exactp)) != 0)
+ return (ret);
+
+done: /* Initialize the cursor from the stack. */
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ /* If on an empty page or a deleted record, move to the next one. */
+ if (flags == DB_FIRST &&
+ (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)))
+ if ((ret = __bamc_next(dbc, 0, 0)) != 0)
+ return (ret);
+ if (flags == DB_LAST &&
+ (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)))
+ if ((ret = __bamc_prev(dbc)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bamc_physdel --
+ * Physically remove an item from the page.
+ */
+static int
+__bamc_physdel(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT key;
+ DB_LOCK next_lock, prev_lock;
+ db_pgno_t pgno;
+ int delete_page, empty_page, exact, ret;
+
+ dbp = dbc->dbp;
+ memset(&key, 0, sizeof(DBT));
+ cp = (BTREE_CURSOR *)dbc->internal;
+ delete_page = empty_page = ret = 0;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(prev_lock);
+
+ /* If the page is going to be emptied, consider deleting it. */
+ delete_page = empty_page =
+ NUM_ENT(cp->page) == (TYPE(cp->page) == P_LBTREE ? 2 : 1);
+
+ /*
+ * Check if the application turned off reverse splits. Applications
+ * can't turn off reverse splits in off-page duplicate trees, that
+ * space will never be reused unless the exact same key is specified.
+ */
+ if (delete_page &&
+ !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_AM_REVSPLITOFF))
+ delete_page = 0;
+
+ /*
+ * We never delete the last leaf page. (Not really true -- we delete
+ * the last leaf page of off-page duplicate trees, but that's handled
+ * by our caller, not down here.)
+ */
+ if (delete_page && cp->pgno == BAM_ROOT_PGNO(dbc))
+ delete_page = 0;
+
+ /*
+ * To delete a leaf page other than an empty root page, we need a
+ * copy of a key from the page. Use the 0th page index since it's
+ * the last key the page held.
+ *
+ * !!!
+ * Note that because __bamc_physdel is always called from a cursor
+ * close, it should be safe to use the cursor's own "my_rkey" memory
+ * to temporarily hold this key. We shouldn't own any returned-data
+ * memory of interest--if we do, we're in trouble anyway.
+ */
+ if (delete_page) {
+ if ((ret = __db_ret(dbc, cp->page, 0, &key,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ }
+
+ /*
+	 * Delete the items. If the page isn't empty, we adjust the cursors.
+ *
+ * !!!
+ * The following operations to delete a page may deadlock. The easy
+ * scenario is if we're deleting an item because we're closing cursors
+ * because we've already deadlocked and want to call txn->abort. If
+ * we fail due to deadlock, we'll leave a locked, possibly empty page
+ * in the tree, which won't be empty long because we'll undo the delete
+ * when we undo the transaction's modifications.
+ *
+ * !!!
+ * Delete the key item first, otherwise the on-page duplicate checks
+ * in __bam_ditem() won't work!
+ */
+ if ((ret = __memp_dirty(dbp->mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if (TYPE(cp->page) == P_LBTREE) {
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc,
+ PGNO(cp->page), cp->indx, -1)) != 0)
+ goto err;
+ }
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ /* Clear the deleted flag, the item is gone. */
+ F_CLR(cp, C_DELETED);
+
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc, PGNO(cp->page), cp->indx, -1)) != 0)
+ goto err;
+
+ /*
+ * Need to downgrade write locks here or non-txn locks will get stuck.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED)) {
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ goto err;
+ cp->lock_mode = DB_LOCK_WWRITE;
+ if (cp->page != NULL &&
+ (ret = __memp_shared(dbp->mpf, cp->page)) != 0)
+ goto err;
+ }
+ /* If we're not going to try and delete the page, we're done. */
+ if (!delete_page)
+ return (0);
+
+ /*
+ * Lock the previous and next pages before latching the parent
+ * sub tree.
+ */
+ if (STD_LOCKING(dbc)) {
+ if ((pgno = PREV_PGNO(cp->page)) != PGNO_INVALID &&
+ (ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &prev_lock)) != 0)
+ return (ret);
+ if ((pgno = NEXT_PGNO(cp->page)) != PGNO_INVALID &&
+ (ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &next_lock)) != 0) {
+			(void)__TLPUT(dbc, prev_lock);
+ return (ret);
+ }
+ }
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+ ret = __bam_search(dbc, PGNO_INVALID, &key, SR_DEL, 0, NULL, &exact);
+
+ /*
+ * If everything worked, delete the stack, otherwise, release the
+ * stack and page locks without further damage.
+ */
+ if (ret == 0)
+ ret = __bam_dpages(dbc, 1, BTD_RELINK);
+ else
+ (void)__bam_stkrel(dbc, 0);
+
+err: if (ret != 0)
+ F_SET(dbc, DBC_ERROR);
+ (void)__TLPUT(dbc, prev_lock);
+ (void)__TLPUT(dbc, next_lock);
+ return (ret);
+}
+
+/*
+ * __bamc_getstack --
+ * Acquire a full stack for a cursor.
+ */
+static int
+__bamc_getstack(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it. The caller of this
+ * routine has to already hold a read lock on the page, so there
+ * is no additional lock to acquire.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ /* Get a copy of a key from the page. */
+ memset(&dbt, 0, sizeof(DBT));
+ ret = __db_ret(dbc, h, 0, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ /* Get a write-locked stack for the page. */
+ exact = 0;
+ ret = __bam_search(dbc, PGNO_INVALID,
+ &dbt, SR_KEYFIRST, 1, NULL, &exact);
+
+ return (ret);
+}
+
+/*
+ * __bam_isopd --
+ * Return if the cursor references an off-page duplicate tree via its
+ * page number.
+ */
+static int
+__bam_isopd(dbc, pgnop)
+ DBC *dbc;
+ db_pgno_t *pgnop;
+{
+ BOVERFLOW *bo;
+
+ if (TYPE(dbc->internal->page) != P_LBTREE)
+ return (0);
+
+ bo = GET_BOVERFLOW(dbc->dbp,
+ dbc->internal->page, dbc->internal->indx + O_INDX);
+ if (B_TYPE(bo->type) == B_DUPLICATE) {
+ *pgnop = bo->pgno;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __bam_opd_exists --
+ * Return if the current position has any data.
+ * PUBLIC: int __bam_opd_exists __P((DBC *, db_pgno_t));
+ */
+int
+__bam_opd_exists(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ PAGE *h;
+ int ret;
+
+ if ((ret = __memp_fget(dbc->dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ /*
+ * We always collapse OPD trees so we only need to check
+ * the number of entries on the root. If there is a non-empty
+ * tree then there will be duplicates.
+ */
+ if (NUM_ENT(h) == 0)
+ ret = 0;
+ else
+ ret = DB_KEYEXIST;
+
+ (void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority);
+
+ return (ret);
+}
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
new file mode 100644
index 00000000..37496b3f
--- /dev/null
+++ b/src/btree/bt_delete.c
@@ -0,0 +1,541 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_ditem --
+ * Delete one or more entries from a page.
+ *
+ * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
+ */
+int
+__bam_ditem(dbc, h, indx)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ u_int32_t nbytes;
+ int ret;
+ db_indx_t *inp;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /* The page should already have been dirtied by our caller. */
+ DB_ASSERT(dbp->env, IS_DIRTY(h));
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, indx);
+ switch (B_TYPE(bi->type)) {
+ case B_DUPLICATE:
+ case B_KEYDATA:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ if ((ret =
+ __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ case P_LBTREE:
+ /*
+ * If it's a duplicate key, discard the index and don't touch
+ * the actual page item.
+ *
+ * !!!
+ * This works because no data item can have an index matching
+ * any other index so even if the data item is in a key "slot",
+ * it won't match any other index.
+ */
+ if ((indx % 2) == 0) {
+ /*
+ * Check for a duplicate after us on the page. NOTE:
+ * we have to delete the key item before deleting the
+ * data item, otherwise the "indx + P_INDX" calculation
+ * won't work!
+ */
+ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
+ inp[indx] == inp[indx + P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx + O_INDX, 0));
+ /*
+ * Check for a duplicate before us on the page. It
+ * doesn't matter if we delete the key item before or
+ * after the data item for the purposes of this one.
+ */
+ if (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx - P_INDX, 0));
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case B_OVERFLOW:
+ nbytes = BOVERFLOW_SIZE;
+ if ((ret = __db_doff(
+ dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
+ return (ret);
+ break;
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(bk->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+
+ /* Delete the item and mark the page dirty. */
+ if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_adjindx --
+ * Adjust an index on the page.
+ *
+ * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+ */
+int
+__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, indx_copy;
+ int is_insert;
+{
+ DB *dbp;
+ db_indx_t copy, *inp;
+ int ret;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0,
+ PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ /* Shuffle the indices and mark the page dirty. */
+ if (is_insert) {
+ copy = inp[indx_copy];
+ if (indx != NUM_ENT(h))
+ memmove(&inp[indx + O_INDX], &inp[indx],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ inp[indx] = copy;
+ ++NUM_ENT(h);
+ } else {
+ --NUM_ENT(h);
+ if (indx != NUM_ENT(h))
+ memmove(&inp[indx], &inp[indx + O_INDX],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_dpages --
+ * Delete a set of locked pages.
+ *
+ * PUBLIC: int __bam_dpages __P((DBC *, int, int));
+ */
+int
+__bam_dpages(dbc, use_top, flags)
+ DBC *dbc;
+ int use_top;
+ int flags;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b;
+ DB_LOCK c_lock, p_lock;
+ DB_MPOOLFILE *mpf;
+ EPG *epg, *save_sp, *stack_epg;
+ PAGE *child, *parent;
+ db_indx_t nitems;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int done, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ nitems = 0;
+ pgno = PGNO_INVALID;
+
+ /*
+ * We have the entire stack of deletable pages locked.
+ *
+	 * Btree calls us with a stack in which the first page is to have a
+	 * single item deleted, and the rest of the pages are to be removed.
+	 *
+	 * Recno always has a stack to the root and __bam_merge operations
+	 * may have unneeded items in the stack. We find the lowest page
+	 * in the stack that has more than one record in it and start there.
+ */
+ ret = 0;
+ if (use_top)
+ stack_epg = cp->sp;
+ else
+ for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg)
+ if (NUM_ENT(stack_epg->page) > 1)
+ break;
+ epg = stack_epg;
+ /*
+ * !!!
+ * There is an interesting deadlock situation here. We have to relink
+ * the leaf page chain around the leaf page being deleted. Consider
+ * a cursor walking through the leaf pages, that has the previous page
+ * read-locked and is waiting on a lock for the page we're deleting.
+ * It will deadlock here. Before we unlink the subtree, we relink the
+ * leaf page chain.
+ */
+ if (LF_ISSET(BTD_RELINK) && LEVEL(cp->csp->page) == 1 &&
+ (ret = __db_relink(dbc, cp->csp->page, NULL, PGNO_INVALID)) != 0)
+ goto discard;
+
+ /*
+ * Delete the last item that references the underlying pages that are
+ * to be deleted, and adjust cursors that reference that page. Then,
+ * save that page's page number and item count and release it. If
+ * the application isn't retaining locks because it's running without
+ * transactions, this lets the rest of the tree get back to business
+ * immediately.
+ */
+ if ((ret = __memp_dirty(mpf,
+ &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto discard;
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto discard;
+ if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
+ goto discard;
+
+ if (LF_ISSET(BTD_UPDATE) && epg->indx == 0) {
+ save_sp = cp->csp;
+ cp->csp = epg;
+ ret = __bam_pupdate(dbc, epg->page);
+ cp->csp = save_sp;
+ if (ret != 0)
+ goto discard;
+ }
+
+ pgno = PGNO(epg->page);
+ nitems = NUM_ENT(epg->page);
+
+ ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority);
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_inc;
+
+ /* Then, discard any pages that we don't care about. */
+discard: for (epg = cp->sp; epg < stack_epg; ++epg) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Free the rest of the pages in the stack. */
+ while (++epg <= cp->csp) {
+ if ((ret = __memp_dirty(mpf, &epg->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ /*
+ * Delete page entries so they will be restored as part of
+ * recovery. We don't need to do cursor adjustment here as
+ * the pages are being emptied by definition and so cannot
+ * be referenced by a cursor.
+ */
+ if (NUM_ENT(epg->page) != 0) {
+ DB_ASSERT(dbp->env, LEVEL(epg->page) != 1);
+
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto err;
+ /*
+ * Sheer paranoia: if we find any pages that aren't
+ * emptied by the delete, someone else added an item
+ * while we were walking the tree, and we discontinue
+ * the delete. Shouldn't be possible, but we check
+ * regardless.
+ */
+ if (NUM_ENT(epg->page) != 0)
+ goto err;
+ }
+
+ ret = __db_free(dbc, epg->page, 0);
+ if (cp->page == epg->page)
+ cp->page = NULL;
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_inc;
+ }
+
+ if (0) {
+err_inc: ++epg;
+err: for (; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ (void)__memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority);
+ epg->page = NULL;
+ }
+ (void)__TLPUT(dbc, epg->lock);
+ }
+ BT_STK_CLR(cp);
+ return (ret);
+ }
+ BT_STK_CLR(cp);
+
+ /*
+ * If we just deleted the next-to-last item from the root page, the
+ * tree can collapse one or more levels. While there remains only a
+ * single item on the root page, write lock the last page referenced
+ * by the root page and copy it over the root page.
+ * Note that if pgno is the root of a btree database then the root
+ * cannot change as we have it locked.
+ */
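+	/*
+	 * An illustrative before/after (not from the original source):
+	 *
+	 *	before:	root -> only-child -> subtree...
+	 *	after:	root (copy of only-child) -> subtree...
+	 *
+	 * The child page is then freed and any cursors referencing it are
+	 * adjusted to reference the root.
+	 */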
+ if (nitems != 1)
+ return (0);
+ root_pgno = BAM_ROOT_PGNO(dbc);
+ if (pgno != root_pgno)
+ return (0);
+
+ for (done = 0; !done;) {
+ /* Initialize. */
+ parent = child = NULL;
+ LOCK_INIT(p_lock);
+ LOCK_INIT(c_lock);
+
+ /* Get the root. */
+ root_pgno = cp->root;
+ BAM_GET_ROOT(dbc, root_pgno,
+ parent, DB_MPOOL_DIRTY, DB_LOCK_WRITE, p_lock, ret);
+
+ DB_ASSERT(dbp->env, parent != NULL);
+ if (ret != 0 || NUM_ENT(parent) != 1)
+ goto stop;
+
+ switch (TYPE(parent)) {
+ case P_IBTREE:
+ /*
+ * If this is overflow, then try to delete it.
+ * The child may or may not still point at it.
+ */
+ bi = GET_BINTERNAL(dbp, parent, 0);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ if ((ret = __db_doff(dbc,
+ ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ goto stop;
+ pgno = bi->pgno;
+ break;
+ case P_IRECNO:
+ pgno = GET_RINTERNAL(dbp, parent, 0)->pgno;
+ break;
+ default:
+ goto stop;
+ }
+
+ /* Lock the child page. */
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
+ goto stop;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &child)) != 0)
+ goto stop;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&a, 0, sizeof(a));
+ a.data = child;
+ a.size = dbp->pgsize;
+ memset(&b, 0, sizeof(b));
+ b.data = P_ENTRY(dbp, parent, 0);
+ b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
+ BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
+ if ((ret = __bam_rsplit_log(dbp, dbc->txn,
+ &child->lsn, 0, PGNO(child), &a, PGNO(parent),
+ RE_NREC(parent), &b, &parent->lsn)) != 0)
+ goto stop;
+ } else
+ LSN_NOT_LOGGED(child->lsn);
+
+ /*
+ * Make the switch.
+ *
+ * One fixup -- internal pages below the top level do not store
+ * a record count, so we have to preserve it if we're not
+ * converting to a leaf page. Note also that we are about to
+ * overwrite the parent page, including its LSN. This is OK
+ * because the log message we wrote describing this update
+ * stores its LSN on the child page. When the child is copied
+ * onto the parent, the correct LSN is copied into place.
+ */
+ COMPQUIET(rcnt, 0);
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ rcnt = RE_NREC(parent);
+ memcpy(parent, child, dbp->pgsize);
+ PGNO(parent) = root_pgno;
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ RE_NREC_SET(parent, rcnt);
+
+ /* Adjust the cursors. */
+ if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
+ goto stop;
+
+ /*
+ * Free the page copied onto the root page and discard its
+ * lock. (The call to __db_free() discards our reference
+ * to the page.)
+ */
+ if ((ret = __db_free(dbc, child, 0)) != 0) {
+ child = NULL;
+ goto stop;
+ }
+ child = NULL;
+
+ if (0) {
+stop: done = 1;
+ }
+ if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (parent != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ parent, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (child != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ child, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_pupdate --
+ * Update parent key pointers up the tree.
+ *
+ * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
+ */
+int
+__bam_pupdate(dbc, lpg)
+ DBC *dbc;
+ PAGE *lpg;
+{
+ BTREE_CURSOR *cp;
+ ENV *env;
+ EPG *epg;
+ int ret;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * Update the parents up the tree. __bam_pinsert only looks at the
+	 * left child if it is a leaf page, so we don't need to change it. We
+ * just do a delete and insert; a replace is possible but reusing
+ * pinsert is better.
+ */
+ for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) {
+ if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ epg->indx--;
+ if ((ret = __bam_pinsert(dbc, epg, 0,
+ lpg, epg[1].page, BPI_NORECNUM | BPI_REPLACE)) != 0) {
+ if (ret == DB_NEEDSPLIT) {
+ /* This should not happen. */
+ __db_errx(env, DB_STR_A("1020",
+ "Not enough room in parent: %s: page %lu",
+ "%s %lu"), dbc->dbp->fname,
+ (u_long)PGNO(epg->page));
+ ret = __env_panic(env, EINVAL);
+ }
+ epg->indx++;
+ return (ret);
+ }
+ epg->indx++;
+ }
+ return (ret);
+}
diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c
new file mode 100644
index 00000000..5cf93d2e
--- /dev/null
+++ b/src/btree/bt_method.c
@@ -0,0 +1,745 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/qam.h"
+
+static int __bam_set_bt_minkey __P((DB *, u_int32_t));
+static int __bam_get_bt_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __bam_get_bt_prefix
+ __P((DB *, size_t(**)(DB *, const DBT *, const DBT *)));
+static int __bam_set_bt_prefix
+ __P((DB *, size_t(*)(DB *, const DBT *, const DBT *)));
+static int __bam_get_bt_compress __P((DB *,
+ int (**)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+static int __ram_get_re_delim __P((DB *, int *));
+static int __ram_set_re_delim __P((DB *, int));
+static int __ram_set_re_len __P((DB *, u_int32_t));
+static int __ram_set_re_pad __P((DB *, int));
+static int __ram_get_re_source __P((DB *, const char **));
+static int __ram_set_re_source __P((DB *, const char *));
+
+/*
+ * __bam_db_create --
+ * Btree specific initialization of the DB structure.
+ *
+ * PUBLIC: int __bam_db_create __P((DB *));
+ */
+int
+__bam_db_create(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ int ret;
+
+ /* Allocate and initialize the private btree structure. */
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(BTREE), &t)) != 0)
+ return (ret);
+ dbp->bt_internal = t;
+
+ t->bt_minkey = DEFMINKEYPAGE; /* Btree */
+ t->bt_compare = __bam_defcmp;
+ t->bt_prefix = __bam_defpfx;
+#ifdef HAVE_COMPRESSION
+ t->bt_compress = NULL;
+ t->bt_decompress = NULL;
+ t->compress_dup_compare = NULL;
+
+ /*
+ * DB_AM_COMPRESS may have been set in __bam_metachk before the
+ * bt_internal structure existed.
+ */
+ if (F_ISSET(dbp, DB_AM_COMPRESS) &&
+ (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
+ return (ret);
+#endif
+
+ dbp->get_bt_compare = __bam_get_bt_compare;
+ dbp->set_bt_compare = __bam_set_bt_compare;
+ dbp->get_bt_minkey = __bam_get_bt_minkey;
+ dbp->set_bt_minkey = __bam_set_bt_minkey;
+ dbp->get_bt_prefix = __bam_get_bt_prefix;
+ dbp->set_bt_prefix = __bam_set_bt_prefix;
+ dbp->get_bt_compress = __bam_get_bt_compress;
+ dbp->set_bt_compress = __bam_set_bt_compress;
+
+ t->re_pad = ' '; /* Recno */
+ t->re_delim = '\n';
+ t->re_eof = 1;
+
+ dbp->get_re_delim = __ram_get_re_delim;
+ dbp->set_re_delim = __ram_set_re_delim;
+ dbp->get_re_len = __ram_get_re_len;
+ dbp->set_re_len = __ram_set_re_len;
+ dbp->get_re_pad = __ram_get_re_pad;
+ dbp->set_re_pad = __ram_set_re_pad;
+ dbp->get_re_source = __ram_get_re_source;
+ dbp->set_re_source = __ram_set_re_source;
+
+ return (0);
+}
+
+/*
+ * __bam_db_close --
+ * Btree specific discard of the DB structure.
+ *
+ * PUBLIC: int __bam_db_close __P((DB *));
+ */
+int
+__bam_db_close(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+
+ if ((t = dbp->bt_internal) == NULL)
+ return (0);
+ /* Recno */
+ /* Close any backing source file descriptor. */
+ if (t->re_fp != NULL)
+ (void)fclose(t->re_fp);
+
+ /* Free any backing source file name. */
+ if (t->re_source != NULL)
+ __os_free(dbp->env, t->re_source);
+
+ __os_free(dbp->env, t);
+ dbp->bt_internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_map_flags --
+ * Map Btree specific flags from public to the internal values.
+ *
+ * PUBLIC: void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__bam_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_DUP)) {
+ FLD_SET(*outflagsp, DB_AM_DUP);
+ FLD_CLR(*inflagsp, DB_DUP);
+ }
+ if (FLD_ISSET(*inflagsp, DB_DUPSORT)) {
+ FLD_SET(*outflagsp, DB_AM_DUP | DB_AM_DUPSORT);
+ FLD_CLR(*inflagsp, DB_DUPSORT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_RECNUM)) {
+ FLD_SET(*outflagsp, DB_AM_RECNUM);
+ FLD_CLR(*inflagsp, DB_RECNUM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REVSPLITOFF)) {
+ FLD_SET(*outflagsp, DB_AM_REVSPLITOFF);
+ FLD_CLR(*inflagsp, DB_REVSPLITOFF);
+ }
+}
+
+/*
+ * __bam_set_flags --
+ * Set Btree specific flags.
+ *
+ * PUBLIC: int __bam_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__bam_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ BTREE *t;
+ u_int32_t flags;
+
+ t = dbp->bt_internal;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF))
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+
+ /*
+ * The DB_DUP and DB_DUPSORT flags are shared by the Hash
+ * and Btree access methods.
+ */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT))
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF))
+		DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT) && F_ISSET(dbp, DB_AM_RECNUM))
+ goto incompat;
+
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (LF_ISSET(DB_RECNUM) && F_ISSET(dbp, DB_AM_DUP))
+ goto incompat;
+
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (LF_ISSET(DB_RECNUM) && LF_ISSET(DB_DUP | DB_DUPSORT))
+ goto incompat;
+
+#ifdef HAVE_COMPRESSION
+ /* DB_RECNUM is incompatible with compression */
+ if (LF_ISSET(DB_RECNUM) && DB_IS_COMPRESSED(dbp)) {
+ __db_errx(dbp->env, DB_STR("1024",
+ "DB_RECNUM cannot be used with compression"));
+ return (EINVAL);
+ }
+
+ /* DB_DUP without DB_DUPSORT is incompatible with compression */
+ if (LF_ISSET(DB_DUP) && !LF_ISSET(DB_DUPSORT) &&
+ !F_ISSET(dbp, DB_AM_DUPSORT) && DB_IS_COMPRESSED(dbp)) {
+ __db_errx(dbp->env, DB_STR("1025",
+ "DB_DUP cannot be used with compression without DB_DUPSORT"));
+ return (EINVAL);
+ }
+#endif
+
+ if (LF_ISSET(DB_DUPSORT) && dbp->dup_compare == NULL) {
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ dbp->dup_compare = __bam_compress_dupcmp;
+ t->compress_dup_compare = __bam_defcmp;
+ } else
+#endif
+ dbp->dup_compare = __bam_defcmp;
+ }
+
+ __bam_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+
+incompat:
+ return (__db_ferr(dbp->env, "DB->set_flags", 1));
+}
+
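+/*
+ * Usage sketch (illustrative only; not part of this file): an application
+ * enables sorted duplicates through the public method before DB->open,
+ * which dispatches to __bam_set_flags above:
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, NULL, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbp->set_flags(dbp, DB_DUPSORT)) != 0)
+ *		goto err;
+ *
+ * When no duplicate comparator has been configured, DB_DUPSORT installs
+ * __bam_defcmp (or the compression-aware wrapper) as dup_compare.
+ */
+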
+/*
+ * __bam_get_bt_compare --
+ * Get the comparison function.
+ */
+static int
+__bam_get_bt_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (funcp != NULL)
+ *funcp = t->bt_compare;
+
+ return (0);
+}
+
+/*
+ * __bam_set_bt_compare --
+ * Set the comparison function.
+ *
+ * PUBLIC: int __bam_set_bt_compare
+ * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ */
+int
+__bam_set_bt_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /*
+ * Can't default the prefix routine if the user supplies a comparison
+ * routine; shortening the keys can break their comparison algorithm.
+ */
+ t->bt_compare = func;
+ if (t->bt_prefix == __bam_defpfx)
+ t->bt_prefix = NULL;
+
+ return (0);
+}
+
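+/*
+ * Usage sketch (illustrative only; "compare_uint32" is a hypothetical
+ * application callback): a custom comparator must impose a total order
+ * on all keys and is installed before DB->open:
+ *
+ *	static int
+ *	compare_uint32(dbp, a, b)
+ *		DB *dbp;
+ *		const DBT *a, *b;
+ *	{
+ *		u_int32_t ai, bi;
+ *
+ *		memcpy(&ai, a->data, sizeof(u_int32_t));
+ *		memcpy(&bi, b->data, sizeof(u_int32_t));
+ *		return (ai < bi ? -1 : (ai > bi ? 1 : 0));
+ *	}
+ *
+ *	if ((ret = dbp->set_bt_compare(dbp, compare_uint32)) != 0)
+ *		goto err;
+ *
+ * Note the side effect in __bam_set_bt_compare above: installing a
+ * comparator clears a defaulted prefix routine, since key shortening
+ * could break the new ordering.
+ */
+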
+/*
+ * __bam_get_bt_compress --
+ * Get the compression functions.
+ */
+static int
+__bam_get_bt_compress(dbp, compressp, decompressp)
+ DB *dbp;
+ int (**compressp) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ int (**decompressp) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+{
+#ifdef HAVE_COMPRESSION
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (compressp != NULL)
+ *compressp = t->bt_compress;
+ if (decompressp != NULL)
+ *decompressp = t->bt_decompress;
+
+ return (0);
+#else
+ COMPQUIET(compressp, NULL);
+ COMPQUIET(decompressp, NULL);
+
+ __db_errx(dbp->env, DB_STR("1026",
+ "compression support has not been compiled in"));
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __bam_set_bt_compress --
+ * Set the compression functions.
+ *
+ * PUBLIC: int __bam_set_bt_compress __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, DBT *),
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ */
+int
+__bam_set_bt_compress(dbp, compress, decompress)
+ DB *dbp;
+ int (*compress) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ int (*decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+{
+#ifdef HAVE_COMPRESSION
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compress");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /* compression is incompatible with DB_RECNUM */
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(dbp->env, DB_STR("1027",
+ "compression cannot be used with DB_RECNUM"));
+ return (EINVAL);
+ }
+
+ /* compression is incompatible with DB_DUP without DB_DUPSORT */
+ if (F_ISSET(dbp, DB_AM_DUP) && !F_ISSET(dbp, DB_AM_DUPSORT)) {
+ __db_errx(dbp->env, DB_STR("1028",
+ "compression cannot be used with DB_DUP without DB_DUPSORT"));
+ return (EINVAL);
+ }
+
+ if (compress != 0 && decompress != 0) {
+ t->bt_compress = compress;
+ t->bt_decompress = decompress;
+ } else if (compress == 0 && decompress == 0) {
+ t->bt_compress = __bam_defcompress;
+ t->bt_decompress = __bam_defdecompress;
+ } else {
+ __db_errx(dbp->env, DB_STR("1029",
+ "to enable compression you need to supply both function arguments"));
+ return (EINVAL);
+ }
+ F_SET(dbp, DB_AM_COMPRESS);
+
+	/*
+	 * Copy dup_compare to compress_dup_compare, and use the compression
+	 * duplicate compare.
+	 */
+ if (F_ISSET(dbp, DB_AM_DUPSORT)) {
+ t->compress_dup_compare = dbp->dup_compare;
+ dbp->dup_compare = __bam_compress_dupcmp;
+ }
+
+ return (0);
+#else
+ COMPQUIET(compress, NULL);
+ COMPQUIET(decompress, NULL);
+
+ __db_errx(dbp->env, DB_STR("1030",
+ "compression support has not been compiled in"));
+ return (EINVAL);
+#endif
+}
+
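+/*
+ * Usage sketch (illustrative only): passing NULL for both callbacks
+ * selects the built-in __bam_defcompress/__bam_defdecompress pair, the
+ * common way to turn compression on:
+ *
+ *	if ((ret = dbp->set_bt_compress(dbp, NULL, NULL)) != 0)
+ *		goto err;
+ *
+ * Supplying exactly one of the two callbacks is rejected with EINVAL by
+ * the code above; custom compression always requires both.
+ */
+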
+/*
+ * __bam_get_bt_minkey --
+ * Get the minimum keys per page.
+ *
+ * PUBLIC: int __bam_get_bt_minkey __P((DB *, u_int32_t *));
+ */
+int
+__bam_get_bt_minkey(dbp, bt_minkeyp)
+ DB *dbp;
+ u_int32_t *bt_minkeyp;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+ *bt_minkeyp = t->bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_minkey --
+ * Set the minimum keys per page.
+ */
+static int
+__bam_set_bt_minkey(dbp, bt_minkey)
+ DB *dbp;
+ u_int32_t bt_minkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_minkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_minkey < 2) {
+ __db_errx(dbp->env, DB_STR("1031",
+ "minimum bt_minkey value is 2"));
+ return (EINVAL);
+ }
+
+ t->bt_minkey = bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_get_bt_prefix --
+ * Get the prefix function.
+ */
+static int
+__bam_get_bt_prefix(dbp, funcp)
+ DB *dbp;
+ size_t (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+ if (funcp != NULL)
+ *funcp = t->bt_prefix;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_prefix --
+ * Set the prefix function.
+ */
+static int
+__bam_set_bt_prefix(dbp, func)
+ DB *dbp;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_prefix");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ t->bt_prefix = func;
+ return (0);
+}
+
+/*
+ * __bam_copy_config --
+ *	Copy the configuration of one DB handle to another.
+ *
+ * PUBLIC: void __bam_copy_config __P((DB *, DB *, u_int32_t));
+ */
+void
+__bam_copy_config(src, dst, nparts)
+ DB *src, *dst;
+ u_int32_t nparts;
+{
+ BTREE *s, *d;
+
+ COMPQUIET(nparts, 0);
+
+ s = src->bt_internal;
+ d = dst->bt_internal;
+ d->bt_compare = s->bt_compare;
+	d->bt_minkey = s->bt_minkey;
+ d->bt_prefix = s->bt_prefix;
+#ifdef HAVE_COMPRESSION
+ d->bt_compress = s->bt_compress;
+ d->bt_decompress = s->bt_decompress;
+ d->compress_dup_compare = s->compress_dup_compare;
+#endif
+}
+
+/*
+ * __ram_map_flags --
+ * Map Recno specific flags from public to the internal values.
+ *
+ * PUBLIC: void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__ram_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_RENUMBER)) {
+ FLD_SET(*outflagsp, DB_AM_RENUMBER);
+ FLD_CLR(*inflagsp, DB_RENUMBER);
+ }
+ if (FLD_ISSET(*inflagsp, DB_SNAPSHOT)) {
+ FLD_SET(*outflagsp, DB_AM_SNAPSHOT);
+ FLD_CLR(*inflagsp, DB_SNAPSHOT);
+ }
+}
+
+/*
+ * __ram_set_flags --
+ * Set Recno specific flags.
+ *
+ * PUBLIC: int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__ram_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ u_int32_t flags;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) {
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ }
+
+ __ram_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+}
+
+/*
+ * __ram_get_re_delim --
+ * Get the variable-length input record delimiter.
+ */
+static int
+__ram_get_re_delim(dbp, re_delimp)
+ DB *dbp;
+ int *re_delimp;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ t = dbp->bt_internal;
+ *re_delimp = t->re_delim;
+ return (0);
+}
+
+/*
+ * __ram_set_re_delim --
+ * Set the variable-length input record delimiter.
+ */
+static int
+__ram_set_re_delim(dbp, re_delim)
+ DB *dbp;
+ int re_delim;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_delim");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ t->re_delim = re_delim;
+ F_SET(dbp, DB_AM_DELIMITER);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_len --
+ * Get the variable-length input record length.
+ *
+ * PUBLIC: int __ram_get_re_len __P((DB *, u_int32_t *));
+ */
+int
+__ram_get_re_len(dbp, re_lenp)
+ DB *dbp;
+ u_int32_t *re_lenp;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ /*
+ * This has to work for all access methods, before or after opening the
+ * database. When the record length is set with __ram_set_re_len, the
+ * value in both the BTREE and QUEUE structs will be correct.
+	 * Otherwise, this only makes sense after the database is opened, in
+ * which case we know the type.
+ */
+ if (dbp->type == DB_QUEUE) {
+ q = dbp->q_internal;
+ *re_lenp = q->re_len;
+ } else {
+ t = dbp->bt_internal;
+ *re_lenp = t->re_len;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_len --
+ * Set the variable-length input record length.
+ */
+static int
+__ram_set_re_len(dbp, re_len)
+ DB *dbp;
+ u_int32_t re_len;
+{
+ BTREE *t;
+#ifdef HAVE_QUEUE
+ QUEUE *q;
+#endif
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_len");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_len = re_len;
+
+#ifdef HAVE_QUEUE
+ q = dbp->q_internal;
+ q->re_len = re_len;
+#endif
+
+ F_SET(dbp, DB_AM_FIXEDLEN);
+
+ return (0);
+}
+
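+/*
+ * Usage sketch (illustrative only): fixed-length Recno (or Queue) records
+ * are configured before DB->open; short records are filled out with the
+ * pad character:
+ *
+ *	if ((ret = dbp->set_re_len(dbp, 64)) != 0)
+ *		goto err;
+ *	if ((ret = dbp->set_re_pad(dbp, ' ')) != 0)
+ *		goto err;
+ *
+ * Both values are mirrored into the QUEUE struct when Queue support is
+ * compiled in, which is why the get accessors work for either method.
+ */
+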
+/*
+ * __ram_get_re_pad --
+ * Get the fixed-length record pad character.
+ *
+ * PUBLIC: int __ram_get_re_pad __P((DB *, int *));
+ */
+int
+__ram_get_re_pad(dbp, re_padp)
+ DB *dbp;
+ int *re_padp;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ /*
+ * This has to work for all access methods, before or after opening the
+	 * database. When the pad character is set with __ram_set_re_pad, the
+	 * value in both the BTREE and QUEUE structs will be correct.
+	 * Otherwise, this only makes sense after the database is opened, in
+ * which case we know the type.
+ */
+ if (dbp->type == DB_QUEUE) {
+ q = dbp->q_internal;
+ *re_padp = q->re_pad;
+ } else {
+ t = dbp->bt_internal;
+ *re_padp = t->re_pad;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_pad --
+ * Set the fixed-length record pad character.
+ */
+static int
+__ram_set_re_pad(dbp, re_pad)
+ DB *dbp;
+ int re_pad;
+{
+ BTREE *t;
+#ifdef HAVE_QUEUE
+ QUEUE *q;
+#endif
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_pad");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_pad = re_pad;
+
+#ifdef HAVE_QUEUE
+ q = dbp->q_internal;
+ q->re_pad = re_pad;
+#endif
+
+ F_SET(dbp, DB_AM_PAD);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_source --
+ * Get the backing source file name.
+ */
+static int
+__ram_get_re_source(dbp, re_sourcep)
+ DB *dbp;
+ const char **re_sourcep;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ *re_sourcep = t->re_source;
+ return (0);
+}
+
+/*
+ * __ram_set_re_source --
+ * Set the backing source file name.
+ */
+static int
+__ram_set_re_source(dbp, re_source)
+ DB *dbp;
+ const char *re_source;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_source");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ return (__os_strdup(dbp->env, re_source, &t->re_source));
+}
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
new file mode 100644
index 00000000..7be141c1
--- /dev/null
+++ b/src/btree/bt_open.c
@@ -0,0 +1,677 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/fop.h"
+
+static void __bam_init_meta __P((DB *, BTMETA *, db_pgno_t, DB_LSN *));
+
+/*
+ * __bam_open --
+ * Open a btree.
+ *
+ * PUBLIC: int __bam_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+
+ COMPQUIET(name, NULL);
+ t = dbp->bt_internal;
+
+ /*
+ * We don't permit the user to specify a prefix routine if they didn't
+	 * also specify a comparison routine; they can't know enough about our
+ * comparison routine to get it right.
+ */
+ if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) {
+ __db_errx(dbp->env, DB_STR("1006",
+"prefix comparison may not be specified for default comparison routine"));
+ return (EINVAL);
+ }
+
+ /*
+ * Verify that the bt_minkey value specified won't cause the
+ * calculation of ovflsize to underflow [#2406] for this pagesize.
+ */
+ if (B_MINKEY_TO_OVFLSIZE(dbp, t->bt_minkey, dbp->pgsize) >
+ B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
+ __db_errx(dbp->env, DB_STR_A("1007",
+ "bt_minkey value of %lu too high for page size of %lu",
+ "%lu %lu"), (u_long)t->bt_minkey, (u_long)dbp->pgsize);
+ return (EINVAL);
+ }
+
+ /* Start up the tree. */
+ return (__bam_read_root(dbp, ip, txn, base_pgno, flags));
+}
+
+/*
+ * __bam_metachk --
+ *	Check a btree metadata page, verifying its version and flags against
+ *	the application's DB handle.
+ *
+ * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *));
+ */
+int
+__bam_metachk(dbp, name, btm)
+ DB *dbp;
+ const char *name;
+ BTMETA *btm;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * At this point, all we know is that the magic number is for a Btree.
+ * Check the version, the database may be out of date.
+ */
+ vers = btm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 6:
+ case 7:
+ __db_errx(env, DB_STR_A("1008",
+ "%s: btree version %lu requires a version upgrade",
+ "%s %lu"), name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 8:
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1009",
+ "%s: unsupported btree version: %lu", "%s %lu"),
+ name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __bam_mswap(env, (PAGE *)btm)) != 0)
+ return (ret);
+
+ /*
+ * Check application info against metadata info, and set info, flags,
+ * and type based on metadata info.
+ */
+ if ((ret =
+ __db_fchk(env, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0)
+ return (ret);
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNO)) {
+ if (dbp->type == DB_BTREE)
+ goto wrong_type;
+ dbp->type = DB_RECNO;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ } else {
+ if (dbp->type == DB_RECNO)
+ goto wrong_type;
+ dbp->type = DB_BTREE;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ else
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR_A("1010",
+ "%s: DB_DUP specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) {
+ if (dbp->type != DB_BTREE)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_RECNUM);
+
+ if ((ret = __db_fcchk(env,
+ "DB->open", dbp->flags, DB_AM_DUP, DB_AM_RECNUM)) != 0)
+ return (ret);
+ } else
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env, DB_STR_A("1011",
+ "%s: DB_RECNUM specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_FIXEDLEN);
+ } else
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ __db_errx(env, DB_STR_A("1012",
+ "%s: DB_FIXEDLEN specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_RENUMBER);
+ } else
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env, DB_STR_A("1013",
+ "%s: DB_RENUMBER specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ else
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env, DB_STR_A("1014",
+ "%s: multiple databases specified but not supported by file",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ F_SET(dbp, DB_AM_DUPSORT);
+ } else
+ if (dbp->dup_compare != NULL) {
+ __db_errx(env, DB_STR_A("1015",
+ "%s: duplicate sort specified but not supported in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
+ F_SET(dbp, DB_AM_COMPRESS);
+ if ((BTREE *)dbp->bt_internal != NULL &&
+ !DB_IS_COMPRESSED(dbp) &&
+ (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ if ((BTREE *)dbp->bt_internal != NULL &&
+ DB_IS_COMPRESSED(dbp)) {
+ __db_errx(env, DB_STR_A("1016",
+	    "%s: compression specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+ }
+#else
+ if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
+ __db_errx(env, DB_STR_A("1017",
+ "%s: compression support has not been compiled in", "%s"),
+ name);
+ return (EINVAL);
+ }
+#endif
+
+ /* Set the page size. */
+ dbp->pgsize = btm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+
+wrong_type:
+ if (dbp->type == DB_BTREE)
+ __db_errx(env, DB_STR("1018",
+ "open method type is Btree, database type is Recno"));
+ else
+ __db_errx(env, DB_STR("1019",
+ "open method type is Recno, database type is Btree"));
+ return (EINVAL);
+}
+
+/*
+ * __bam_read_root --
+ * Read the root page and check a tree.
+ *
+ * PUBLIC: int __bam_read_root __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_read_root(dbp, ip, txn, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ LOCK_INIT(metalock);
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /* Get a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc,
+ F_ISSET(dbp, DB_AM_RECOVER) ? DB_RECOVER : 0)) != 0)
+ return (ret);
+
+ /* Get the metadata page. */
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ /*
+ * If the magic number is set, the tree has been created. Correct
+ * any fields that may not be right. Note, all of the local flags
+ * were set by DB->open.
+ *
+ * Otherwise, we'd better be in recovery or abort, in which case the
+ * metadata page will be created/initialized elsewhere.
+ *
+ * Ignore the last_pgno on the metadata page for snapshot transactions:
+ * we may be reading an old version of the page, and we've already
+ * set last_pgno from the file size. The only time this would matter
+ * is if we don't have ftruncate and there are some free pages at the
+ * end of the file: we could end up with holes.
+ */
+ if (meta->dbmeta.magic == DB_BTREEMAGIC) {
+ t->bt_minkey = meta->minkey;
+ t->re_pad = (int)meta->re_pad;
+ t->re_len = meta->re_len;
+
+ t->bt_meta = base_pgno;
+ t->bt_root = meta->root;
+ t->revision = dbp->mpf->mfp->revision;
+ if (PGNO(meta) == PGNO_BASE_MD &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ (txn == NULL || !F_ISSET(txn, TXN_SNAPSHOT)) && (ret =
+ __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno)) != 0)
+ goto err;
+ } else {
+ DB_ASSERT(dbp->env,
+ IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
+ }
+
+ /*
+ * !!!
+ * If creating a subdatabase, we've already done an insert when
+ * we put the subdatabase's entry into the master database, so
+ * our last-page-inserted value is wrongly initialized for the
+ * master database, not the subdatabase we're creating. I'm not
+ * sure where the *right* place to clear this value is, it's not
+ * intuitively obvious that it belongs here.
+ */
+ t->bt_lpgno = PGNO_INVALID;
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bam_init_meta --
+ *
+ * Initialize a btree meta-data page. The following fields may need
+ * to be updated later: last_pgno, root.
+ */
+static void
+__bam_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+ BTREE *t;
+#ifdef HAVE_PARTITION
+ DB_PARTITION *part;
+#endif
+ ENV *env;
+
+ env = dbp->env;
+ t = dbp->bt_internal;
+
+ memset(meta, 0, sizeof(BTMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_BTREEMAGIC;
+ meta->dbmeta.version = DB_BTREEVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_BTREEMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = pgno;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, BTM_DUP);
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+ F_SET(&meta->dbmeta, BTM_FIXEDLEN);
+ if (F_ISSET(dbp, DB_AM_RECNUM))
+ F_SET(&meta->dbmeta, BTM_RECNUM);
+ if (F_ISSET(dbp, DB_AM_RENUMBER))
+ F_SET(&meta->dbmeta, BTM_RENUMBER);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, BTM_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, BTM_DUPSORT);
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ F_SET(&meta->dbmeta, BTM_COMPRESS);
+#endif
+ if (dbp->type == DB_RECNO)
+ F_SET(&meta->dbmeta, BTM_RECNO);
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ meta->minkey = t->bt_minkey;
+ meta->re_len = t->re_len;
+ meta->re_pad = (u_int32_t)t->re_pad;
+
+#ifdef HAVE_PARTITION
+ if ((part = dbp->p_internal) != NULL) {
+ meta->dbmeta.nparts = part->nparts;
+ if (F_ISSET(part, PART_CALLBACK))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK);
+ if (F_ISSET(part, PART_RANGE))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE);
+ }
+#endif
+}
+
+/*
+ * __bam_new_file --
+ * Create the necessary pages to begin a new database file.
+ *
+ * This code appears more complex than it is because of the two cases (named
+ * and unnamed). The way to read the code is that for each page being created,
+ * there are three parts: 1) a "get page" chunk (which either uses malloc'd
+ * memory or calls __memp_fget), 2) the initialization, and 3) the "put page"
+ * chunk which either does a fop write or an __memp_fput.
+ *
+ * PUBLIC: int __bam_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__bam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ BTMETA *meta;
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ PAGE *root;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ root = NULL;
+ meta = NULL;
+ buf = NULL;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build the meta-data page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->root = 1;
+ meta->dbmeta.last_pgno = 1;
+ if ((ret =
+ __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Build the root page. */
+ pgno = 1;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &root)) != 0)
+ goto err;
+ P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
+ LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+ LSN_NOT_LOGGED(root->lsn);
+ if ((ret =
+ __db_log_page(dbp, txn, &root->lsn, pgno, root)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, root, dbp->priority);
+ root = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build the meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (BTMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->root = 1;
+ meta->dbmeta.last_pgno = 1;
+ if ((ret = __db_pgout(
+ dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Build the root page. */
+#ifdef DIAGNOSTIC
+ memset(buf, CLEAR_BYTE, dbp->pgsize);
+#endif
+ root = (PAGE *)buf;
+ P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
+ LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+ LSN_NOT_LOGGED(root->lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, root->pgno, root, &pdbt)) != 0)
+ goto err;
+ if ((ret =
+ __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA,
+ fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ root = NULL;
+ }
+
+err: if (buf != NULL)
+ __os_free(env, buf);
+ else {
+ if (meta != NULL &&
+ (t_ret = __memp_fput(mpf, ip,
+ meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL &&
+ (t_ret = __memp_fput(mpf, ip,
+ root, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
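+/*
+ * Layout sketch (informational): after __bam_new_file, a new database
+ * file contains exactly two pages:
+ *
+ *	pgno 0: BTMETA (magic DB_BTREEMAGIC, root = 1, last_pgno = 1)
+ *	pgno 1: empty leaf (P_LBTREE, or P_LRECNO for Recno, LEAFLEVEL)
+ *
+ * Subdatabases get the equivalent pair from __bam_new_subdb below, with
+ * both pages allocated out of the master database's file instead.
+ */
+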
+/*
+ * __bam_new_subdb --
+ * Create a metadata page and a root page for a new btree.
+ *
+ * PUBLIC: int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__bam_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ BTMETA *meta;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *root;
+ int ret, t_ret;
+
+ env = mdbp->env;
+ mpf = mdbp->mpf;
+ dbc = NULL;
+ meta = NULL;
+ root = NULL;
+
+ if ((ret = __db_cursor(mdbp, ip, txn,
+ &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /* Get, and optionally create the metadata page. */
+ if ((ret = __db_lget(dbc,
+ 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ /* Build meta-data page. */
+ lsn = meta->dbmeta.lsn;
+ __bam_init_meta(dbp, meta, dbp->meta_pgno, &lsn);
+ if ((ret = __db_log_page(mdbp,
+ txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0)
+ goto err;
+
+ /* Create and initialize a root page. */
+ if ((ret = __db_new(dbc,
+ dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, NULL, &root)) != 0)
+ goto err;
+ root->level = LEAFLEVEL;
+
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP)
+ txn != NULL &&
+#endif
+ (ret = __bam_root_log(mdbp, txn, &meta->dbmeta.lsn, 0,
+ meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0)
+ goto err;
+
+ meta->root = root->pgno;
+ if ((ret =
+ __db_log_page(mdbp, txn, &root->lsn, root->pgno, root)) != 0)
+ goto err;
+
+ /* Release the metadata and root pages. */
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if ((ret = __memp_fput(mpf, ip, root, dbc->priority)) != 0)
+ goto err;
+ root = NULL;
+err:
+ if (meta != NULL)
+ if ((t_ret = __memp_fput(mpf, ip,
+ meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL)
+ if ((t_ret = __memp_fput(mpf, ip,
+ root, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/btree/bt_put.c b/src/btree/bt_put.c
new file mode 100644
index 00000000..13316181
--- /dev/null
+++ b/src/btree/bt_put.c
@@ -0,0 +1,1087 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_build
+ __P((DBC *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_dup_check __P((DBC *, u_int32_t,
+ PAGE *, u_int32_t, u_int32_t, db_indx_t *));
+static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_ovput
+ __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
+static u_int32_t
+ __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
+
+/*
+ * __bam_iitem --
+ * Insert an item into the tree.
+ *
+ * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+ */
+int
+__bam_iitem(dbc, key, data, op, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t op, flags;
+{
+ BKEYDATA *bk, bk_tmp;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT bk_hdr, tdbt;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_indx_t cnt, indx;
+ u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace;
+ char tmp_ch;
+ int cmp, bigkey, bigdata, del, dupadjust;
+ int padrec, replace, ret, t_ret, was_deleted;
+
+ COMPQUIET(cnt, 0);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ h = cp->page;
+ indx = cp->indx;
+ del = dupadjust = replace = was_deleted = 0;
+
+ /*
+ * Fixed-length records with partial puts: it's an error to specify
+	 * anything other than a simple overwrite.
+ */
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+ F_ISSET(data, DB_DBT_PARTIAL) && data->size != data->dlen)
+ return (__db_rec_repl(env, data->size, data->dlen));
+
+ /*
+ * Figure out how much space the data will take, including if it's a
+ * partial record.
+ *
+ * Fixed-length records: it's an error to specify a record that's
+ * longer than the fixed-length, and we never require less than
+ * the fixed-length record size.
+ */
+ data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+ __bam_partsize(dbp, op, data, h, indx) : data->size;
+ padrec = 0;
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ if (data_size > t->re_len)
+ return (__db_rec_toobig(env, data_size, t->re_len));
+
+ /* Records that are deleted anyway needn't be padded out. */
+ if (!LF_ISSET(BI_DELETED) && data_size < t->re_len) {
+ padrec = 1;
+ data_size = t->re_len;
+ }
+ }
+
+ /*
+ * Handle partial puts or short fixed-length records: check whether we
+ * can just append the data or else build the real record. We can't
+ * append if there are secondaries: we need the whole data item for the
+ * application's secondary callback.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare == NULL &&
+ F_ISSET(data, DB_DBT_PARTIAL) && !DB_IS_PRIMARY(dbp)) {
+ bk = GET_BKEYDATA(
+ dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ /*
+ * If the item is an overflow type, and the input DBT is
+ * partial, and begins at the length of the current item then
+ * it is an append. Avoid deleting and re-creating the entire
+ * offpage item.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ data->doff == ((BOVERFLOW *)bk)->tlen) {
+ /*
+ * If the cursor has not already cached the last page
+			 * in the offpage chain, we need to walk the chain
+ * to be sure that the page has been read.
+ */
+ if (cp->stream_start_pgno != ((BOVERFLOW *)bk)->pgno ||
+ cp->stream_off > data->doff || data->doff >
+ cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ memset(&tdbt, 0, sizeof(DBT));
+ tdbt.doff = data->doff - 1;
+ /*
+ * Set the length to 1, to force __db_goff
+ * to do the traversal.
+ */
+ tdbt.dlen = tdbt.ulen = 1;
+ tdbt.data = &tmp_ch;
+ tdbt.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ /*
+ * Read to the last page. It will be cached
+ * in the cursor.
+ */
+ if ((ret = __db_goff(
+ dbc, &tdbt, ((BOVERFLOW *)bk)->tlen,
+ ((BOVERFLOW *)bk)->pgno, NULL, NULL)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Since this is an append, dlen is irrelevant (there
+ * are no bytes to overwrite). We need the caller's
+ * DBT size to end up with the total size of the item.
+ * From now on, use dlen as the length of the user's
+ * data that we are going to append.
+ * Don't futz with the caller's DBT any more than we
+ * have to in order to send back the size.
+ */
+ tdbt = *data;
+ tdbt.dlen = data->size;
+ tdbt.size = data_size;
+ data = &tdbt;
+ F_SET(data, DB_DBT_STREAMING);
+ }
+ }
+ if (!F_ISSET(data, DB_DBT_STREAMING) &&
+ (padrec || F_ISSET(data, DB_DBT_PARTIAL))) {
+ tdbt = *data;
+ if ((ret =
+ __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
+ return (ret);
+ data = &tdbt;
+ }
+
+ /*
+ * If the user has specified a duplicate comparison function, return
+ * an error if DB_CURRENT was specified and the replacement data
+ * doesn't compare equal to the current data. This stops apps from
+ * screwing up the duplicate sort order. We have to do this after
+ * we build the real record so that we're comparing the real items.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+ if ((ret = __bam_cmp(dbc, data, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp != 0) {
+ __db_errx(env, DB_STR("1004",
+ "Existing data sorts differently from put data"));
+ return (EINVAL);
+ }
+ }
+
+ /*
+ * If the key or data item won't fit on a page, we'll have to store
+ * them on overflow pages.
+ */
+ needed = 0;
+ bigdata = data_size > cp->ovflsize;
+ switch (op) {
+ case DB_KEYFIRST:
+ /* We're adding a new key and data pair. */
+ bigkey = key->size > cp->ovflsize;
+ if (bigkey)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(key->size);
+ if (bigdata)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(data_size);
+ break;
+ case DB_AFTER:
+ case DB_BEFORE:
+ case DB_CURRENT:
+ /*
+ * We're either overwriting the data item of a key/data pair
+ * or we're creating a new on-page duplicate and only adding
+ * a data item.
+ *
+ * !!!
+ * We're not currently correcting for space reclaimed from
+ * already deleted items, but I don't think it's worth the
+ * complexity.
+ */
+ bigkey = 0;
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(dbp, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (B_TYPE(bk->type) == B_KEYDATA)
+ have_bytes = BKEYDATA_PSIZE(bk->len);
+ else
+ have_bytes = BOVERFLOW_PSIZE;
+ need_bytes = 0;
+ } else {
+ have_bytes = 0;
+ need_bytes = sizeof(db_indx_t);
+ }
+ if (bigdata)
+ need_bytes += BOVERFLOW_PSIZE;
+ else
+ need_bytes += BKEYDATA_PSIZE(data_size);
+
+ if (have_bytes < need_bytes)
+ needed += need_bytes - have_bytes;
+ break;
+ default:
+ return (__db_unknown_flag(env, "DB->put", op));
+ }
+
+ /* Split the page if there's not enough room. */
+ if (P_FREESPACE(dbp, h) < needed)
+ return (DB_NEEDSPLIT);
+
+ /*
+ * Check to see if we will convert to off page duplicates -- if
+ * so, we'll need a page.
+ */
+ if (F_ISSET(dbp, DB_AM_DUP) &&
+ TYPE(h) == P_LBTREE && op != DB_KEYFIRST &&
+ P_FREESPACE(dbp, h) - needed <= dbp->pgsize / 2 &&
+ __bam_dup_check(dbc, op, h, indx, needed, &cnt)) {
+ pages = 1;
+ dupadjust = 1;
+ } else
+ pages = 0;
+
+ /*
+ * If we are not using transactions and there is a page limit
+ * set on the file, then figure out if things will fit before
+ * taking action.
+ */
+ if (dbc->txn == NULL && mpf->mfp->maxpgno != 0) {
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ if (bigdata)
+ pages += ((data_size - 1) / pagespace) + 1;
+ if (bigkey)
+ pages += ((key->size - 1) / pagespace) + 1;
+
+ if (pages > (mpf->mfp->maxpgno - mpf->mfp->last_pgno))
+ return (__db_space_err(dbp));
+ }
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+ ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0);
+ if (cp->csp->page == cp->page)
+ cp->csp->page = h;
+ cp->page = h;
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * The code breaks it up into five cases:
+ *
+ * 1. Insert a new key/data pair.
+ * 2. Append a new data item (a new duplicate).
+ * 3. Insert a new data item (a new duplicate).
+ * 4. Delete and re-add the data item (overflow item).
+ * 5. Overwrite the data item.
+ */
+ switch (op) {
+ case DB_KEYFIRST: /* 1. Insert a new key/data pair. */
+ if (bigkey) {
+ if ((ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0)
+ return (ret);
+ } else
+ if ((ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(key->size), NULL, key)) != 0)
+ return (ret);
+
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ ++indx;
+ break;
+ case DB_AFTER: /* 2. Append a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret =
+ __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0)
+ return (ret);
+ if ((ret =
+ __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0)
+ return (ret);
+
+ indx += 3;
+
+ cp->indx += 2;
+ } else {
+ ++indx;
+ cp->indx += 1;
+ }
+ break;
+ case DB_BEFORE: /* 3. Insert a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0)
+ return (ret);
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+
+ ++indx;
+ }
+ break;
+ case DB_CURRENT:
+ /*
+ * Clear the cursor's deleted flag. The problem is that if
+ * we deadlock or fail while deleting the overflow item or
+ * replacing the non-overflow item, a subsequent cursor close
+ * will try and remove the item because the cursor's delete
+ * flag is set.
+ */
+ if ((ret = __bam_ca_delete(dbp, PGNO(h), indx, 0, NULL)) != 0)
+ return (ret);
+
+ if (TYPE(h) == P_LBTREE)
+ ++indx;
+ bk = GET_BKEYDATA(dbp, h, indx);
+
+ /*
+ * In a Btree deleted records aren't counted (deleted records
+ * are counted in a Recno because all accesses are based on
+ * record number). If it's a Btree and it's a DB_CURRENT
+ * operation overwriting a previously deleted record, increment
+ * the record count.
+ */
+ if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
+ was_deleted = B_DISSET(bk->type);
+
+ /*
+ * 4. Delete and re-add the data item.
+ *
+ * If we're changing the type of the on-page structure, or we
+ * are referencing offpage items, we have to delete and then
+ * re-add the item. We do not do any cursor adjustments here
+ * because we're going to immediately re-add the item into the
+ * same slot.
+ */
+ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+ /*
+ * If streaming, don't delete the overflow item,
+ * just delete the item pointing to the overflow item.
+ * It will be added back in later, with the new size.
+ * We can't simply adjust the size of the item on the
+ * page, because there is no easy way to log a
+ * modification.
+ */
+ if (F_ISSET(data, DB_DBT_STREAMING)) {
+ if ((ret = __db_ditem(
+ dbc, h, indx, BOVERFLOW_SIZE)) != 0)
+ return (ret);
+ } else if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+ return (ret);
+ del = 1;
+ break;
+ }
+
+ /* 5. Overwrite the data item. */
+ replace = 1;
+ break;
+ default:
+ return (__db_unknown_flag(env, "DB->put", op));
+ }
+
+ /* Add the data. */
+ if (bigdata) {
+ /*
+ * We do not have to handle deleted (BI_DELETED) records
+ * in this case; the actual records should never be created.
+ */
+ DB_ASSERT(env, !LF_ISSET(BI_DELETED));
+ ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, data);
+ } else {
+ if (LF_ISSET(BI_DELETED)) {
+ B_TSET_DELETED(bk_tmp.type, B_KEYDATA);
+ bk_tmp.len = data->size;
+ bk_hdr.data = &bk_tmp;
+ bk_hdr.size = SSZA(BKEYDATA, data);
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), &bk_hdr, data);
+ } else if (replace)
+ ret = __bam_ritem(dbc, h, indx, data, 0);
+ else
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, data);
+ }
+ if (ret != 0) {
+ if (del == 1 && (t_ret =
+ __bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) {
+ __db_err(env, t_ret, DB_STR("1005",
+ "cursor adjustment after delete failed"));
+ return (__env_panic(env, t_ret));
+ }
+ return (ret);
+ }
+
+ /*
+ * Re-position the cursors if necessary and reset the current cursor
+ * to point to the new item.
+ */
+ if (op != DB_CURRENT) {
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
+ }
+
+ /*
+ * If we've changed the record count, update the tree. There's no
+	 * need to adjust the count when the operation was performed on the
+	 * current record, unless that record was previously deleted.
+ */
+ if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
+ if ((ret = __bam_adjust(dbc, 1)) != 0)
+ return (ret);
+
+ /*
+ * If a Btree leaf page is at least 50% full and we may have added or
+ * modified a duplicate data item, see if the set of duplicates takes
+ * up at least 25% of the space on the page. If it does, move it onto
+ * its own page.
+ */
+ if (dupadjust &&
+ (ret = __bam_dup_convert(dbc, h, indx - O_INDX, cnt)) != 0)
+ return (ret);
+
+ /* If we've modified a recno file, set the flag. */
+ if (dbc->dbtype == DB_RECNO)
+ t->re_modified = 1;
+
+ return (ret);
+}
+
+/*
+ * __bam_partsize --
+ * Figure out how much space a partial data item is in total.
+ */
+static u_int32_t
+__bam_partsize(dbp, op, data, h, indx)
+ DB *dbp;
+ u_int32_t op, indx;
+ DBT *data;
+ PAGE *h;
+{
+ BKEYDATA *bk;
+ u_int32_t nbytes;
+
+ /*
+ * If the record doesn't already exist, it's simply the data we're
+ * provided.
+ */
+ if (op != DB_CURRENT)
+ return (data->doff + data->size);
+
+ /*
+ * Otherwise, it's the data provided plus any already existing data
+ * that we're not replacing.
+ */
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ nbytes =
+ B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+
+ return (__db_partsize(nbytes, data));
+}
+
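+/*
+ * Worked example (illustrative, assuming __db_partsize's usual arithmetic
+ * of replacing dlen bytes at doff with size bytes): for an existing
+ * 20-byte record and a partial put with doff = 10, dlen = 5 and size = 8,
+ * the result is 20 - 5 + 8 = 23 bytes.  If the put extends past the end
+ * of the record, the result is simply doff + size, and __bam_build below
+ * fills any gap with nul or pad bytes.
+ */
+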
+/*
+ * __bam_build --
+ * Build the real record for a partial put, or short fixed-length record.
+ */
+static int
+__bam_build(dbc, op, dbt, h, indx, nbytes)
+ DBC *dbc;
+ u_int32_t op, indx, nbytes;
+ DBT *dbt;
+ PAGE *h;
+{
+ BKEYDATA *bk, tbk;
+ BOVERFLOW *bo;
+ BTREE *t;
+ DB *dbp;
+ DBT copy, *rdata;
+ u_int32_t len, tlen;
+ u_int8_t *p;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+
+	/* We use the record data return memory; it's only a short-term use. */
+ rdata = &dbc->my_rdata;
+ if (rdata->ulen < nbytes) {
+ if ((ret = __os_realloc(dbp->env,
+ nbytes, &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ }
+ rdata->ulen = nbytes;
+ }
+
+ /*
+ * We use nul or pad bytes for any part of the record that isn't
+ * specified; get it over with.
+ */
+ memset(rdata->data,
+ F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_pad : 0, nbytes);
+
+ /*
+ * In the next clauses, we need to do three things: a) set p to point
+ * to the place at which to copy the user's data, b) set tlen to the
+ * total length of the record, not including the bytes contributed by
+ * the user, and c) copy any valid data from an existing record. If
+ * it's not a partial put (this code is called for both partial puts
+ * and fixed-length record padding) or it's a new key, we can cut to
+ * the chase.
+ */
+ if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
+ p = (u_int8_t *)rdata->data + dbt->doff;
+ tlen = dbt->doff;
+ goto user_copy;
+ }
+
+ /* Find the current record. */
+ if (indx < NUM_ENT(h)) {
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ?
+ O_INDX : 0));
+ bo = (BOVERFLOW *)bk;
+ } else {
+ bk = &tbk;
+ B_TSET(bk->type, B_KEYDATA);
+ bk->len = 0;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ /*
+ * In the case of an overflow record, we shift things around
+ * in the current record rather than allocate a separate copy.
+ */
+ memset(&copy, 0, sizeof(copy));
+ if ((ret = __db_goff(dbc, &copy, bo->tlen, bo->pgno,
+ &rdata->data, &rdata->ulen)) != 0)
+ return (ret);
+
+ /* Skip any leading data from the original record. */
+ tlen = dbt->doff;
+ p = (u_int8_t *)rdata->data + dbt->doff;
+
+ /*
+ * Copy in any trailing data from the original record.
+ *
+ * If the original record was larger than the original offset
+ * plus the bytes being deleted, there is trailing data in the
+ * original record we need to preserve. If we aren't deleting
+ * the same number of bytes as we're inserting, copy it up or
+ * down, into place.
+ *
+ * Use memmove(), the regions may overlap.
+ */
+ if (bo->tlen > dbt->doff + dbt->dlen) {
+ len = bo->tlen - (dbt->doff + dbt->dlen);
+ if (dbt->dlen != dbt->size)
+ memmove(p + dbt->size, p + dbt->dlen, len);
+ tlen += len;
+ }
+ } else {
+ /* Copy in any leading data from the original record. */
+ memcpy(rdata->data,
+ bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
+ tlen = dbt->doff;
+ p = (u_int8_t *)rdata->data + dbt->doff;
+
+ /* Copy in any trailing data from the original record. */
+ len = dbt->doff + dbt->dlen;
+ if (bk->len > len) {
+ memcpy(p + dbt->size, bk->data + len, bk->len - len);
+ tlen += bk->len - len;
+ }
+ }
+
+user_copy:
+ /*
+ * Copy in the application provided data -- p and tlen must have been
+ * initialized above.
+ */
+ memcpy(p, dbt->data, dbt->size);
+ tlen += dbt->size;
+
+ /* Set the DBT to reference our new record. */
+ rdata->size = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : tlen;
+ rdata->dlen = 0;
+ rdata->doff = 0;
+ rdata->flags = 0;
+ *dbt = *rdata;
+ return (0);
+}
+
+/*
+ * __bam_ritem --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *, u_int32_t));
+ */
+int
+__bam_ritem(dbc, h, indx, data, typeflag)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *data;
+ u_int32_t typeflag;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT orig, repl;
+ db_indx_t min, prefix, suffix;
+ u_int32_t len;
+ int ret;
+ u_int8_t *dp, *p, *t, type;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Replace a single item onto a page. The logic figuring out where
+ * to insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling.
+ */
+ bk = GET_BKEYDATA(dbp, h, indx);
+ len = bk->len;
+ dp = bk->data;
+ type = bk->type;
+ typeflag = B_DISSET(type);
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ /*
+ * We might as well check to see if the two data items share
+ * a common prefix and suffix -- it can save us a lot of log
+		 * space if they're large.
+ */
+ min = data->size < len ? data->size : len;
+ for (prefix = 0,
+ p = dp, t = data->data;
+ prefix < min && *p == *t; ++prefix, ++p, ++t)
+ ;
+
+ min -= prefix;
+ for (suffix = 0,
+ p = (u_int8_t *)dp + len - 1,
+ t = (u_int8_t *)data->data + data->size - 1;
+ suffix < min && *p == *t; ++suffix, --p, --t)
+ ;
+
+ /* We only log the parts of the keys that have changed. */
+ orig.data = (u_int8_t *)dp + prefix;
+ orig.size = len - (prefix + suffix);
+ repl.data = (u_int8_t *)data->data + prefix;
+ repl.size = data->size - (prefix + suffix);
+ if ((ret = __bam_repl_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h),
+ &LSN(h), (u_int32_t)indx, typeflag,
+ &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ return (__bam_ritem_nolog(dbc, h, indx, NULL, data, type));
+}
+
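+/*
+ * Worked example (illustrative only): replacing the 6-byte item "abcdef"
+ * with "abXYef" in __bam_ritem above yields prefix = 2 and suffix = 2, so
+ * only the changed 2-byte spans "cd" (orig) and "XY" (repl) are written
+ * to the log, rather than both full items.
+ */
+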
+/*
+ * __bam_ritem_nolog --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem_nolog __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, DBT *, DBT *, u_int32_t));
+ */
+int
+__bam_ritem_nolog(dbc, h, indx, hdr, data, type)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *hdr, *data;
+ u_int32_t type;
+{
+ BKEYDATA *bk;
+ BINTERNAL *bi;
+ DB *dbp;
+ db_indx_t cnt, off, lo, ln;
+ db_indx_t *inp;
+ int32_t nbytes;
+ u_int8_t *p, *t;
+
+ dbp = dbc->dbp;
+ /*
+ * Set references to the first in-use byte on the page and the
+ * first byte of the item being replaced.
+ */
+ inp = P_INP(dbp, h);
+ p = (u_int8_t *)h + HOFFSET(h);
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ t = (u_int8_t *)bi;
+ lo = (db_indx_t)BINTERNAL_SIZE(bi->len);
+ if (data == NULL) {
+ DB_ASSERT(dbp->env, hdr != NULL);
+			bi = (BINTERNAL *)hdr->data;
+ P_16_COPY(&bi->len, &cnt);
+ ln = (db_indx_t)BINTERNAL_SIZE(cnt);
+ } else
+ ln = (db_indx_t)BINTERNAL_SIZE(data->size);
+ } else {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ t = (u_int8_t *)bk;
+ lo = (db_indx_t)BKEYDATA_SIZE(bk->len);
+ ln = (db_indx_t)BKEYDATA_SIZE(data->size);
+ }
+
+ /*
+ * If the entry is growing in size, shift the beginning of the data
+ * part of the page down. If the entry is shrinking in size, shift
+ * the beginning of the data part of the page up. Use memmove(3),
+ * the regions overlap.
+ */
+ if (lo != ln) {
+ nbytes = (int32_t)(lo - ln); /* Signed difference. */
+ if (p == t) /* First index is fast. */
+ inp[indx] += (u_int32_t)nbytes;
+ else { /* Else, shift the page. */
+ memmove(p + nbytes, p, (size_t)(t - p));
+
+ /* Adjust the indices' offsets. */
+ off = (u_int32_t)inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
+ if (inp[cnt] <= off)
+ inp[cnt] += (u_int32_t)nbytes;
+ }
+
+ /* Clean up the page and adjust the item's reference. */
+ HOFFSET(h) += (u_int32_t)nbytes;
+ t += nbytes;
+ }
+
+ /* Copy the new item onto the page. */
+ if (TYPE(h) == P_IBTREE) {
+ DB_ASSERT(dbp->env, hdr != NULL);
+ memcpy(t, hdr->data, hdr->size);
+ bi = (BINTERNAL *)t;
+ if (data != NULL && data->size != 0)
+ memcpy(bi->data, data->data, data->size);
+ } else {
+ bk = (BKEYDATA *)t;
+ bk->len = data->size;
+ B_TSET(bk->type, type);
+ memcpy(bk->data, data->data, bk->len);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_irep --
+ * Replace an item on an internal page.
+ *
+ * PUBLIC: int __bam_irep __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *));
+ */
+int
+__bam_irep(dbc, h, indx, hdr, data)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *hdr;
+ DBT *data;
+{
+ BINTERNAL *bi, *bn;
+ DB *dbp;
+ DBT dbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ bi = GET_BINTERNAL(dbp, h, indx);
+	bn = (BINTERNAL *)hdr->data;
+
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+
+ if (DBC_LOGGING(dbc)) {
+ dbt.data = bi;
+ dbt.size = BINTERNAL_SIZE(bi->len);
+ if ((ret = __bam_irep_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h),
+ &LSN(h), (u_int32_t)indx, TYPE(h), hdr, data, &dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ return (__bam_ritem_nolog(dbc, h, indx, hdr, data, bn->type));
+}
+
+/*
+ * __bam_dup_check --
+ * Check to see if the duplicate set at indx should have its own page.
+ */
+static int
+__bam_dup_check(dbc, op, h, indx, sz, cntp)
+ DBC *dbc;
+ u_int32_t op;
+ PAGE *h;
+ u_int32_t indx, sz;
+ db_indx_t *cntp;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ db_indx_t cnt, first, *inp;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /*
+ * Count the duplicate records and calculate how much room they're
+ * using on the page.
+ */
+ while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+
+ /* Count the key once. */
+ bk = GET_BKEYDATA(dbp, h, indx);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+
+ /*
+ * Sum up all the data items.
+ * Account for the record being inserted. If we are replacing it,
+ * don't count it twice.
+ *
+ * We execute the loop with first == indx to get the size of the
+ * first record.
+ */
+ cnt = op == DB_CURRENT ? 0 : 1;
+ for (first = indx;
+ indx < NUM_ENT(h) && inp[first] == inp[indx];
+ ++cnt, indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+ }
+
+ /*
+ * We have to do these checks when the user is replacing the cursor's
+ * data item -- if the application replaces a duplicate item with a
+ * larger data item, it can increase the amount of space used by the
+ * duplicates, requiring this check. But that means we may have done
+ * this check when it wasn't a duplicate item after all.
+ */
+ if (cnt == 1)
+ return (0);
+
+ /*
+ * If this set of duplicates is using more than 25% of the page, move
+ * them off. The choice of 25% is a WAG, but the value must be small
+ * enough that we can always split a page without putting duplicates
+ * on two different pages.
+ */
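+	/*
+	 * For example, with a hypothetical 8192-byte page, a duplicate
+	 * set whose key plus data items total 2048 bytes or more is
+	 * moved to its own page.
+	 */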
+ if (sz < dbp->pgsize / 4)
+ return (0);
+
+ *cntp = cnt;
+ return (1);
+}
+
+/*
+ * __bam_dup_convert --
+ * Move a set of duplicates off-page and into their own tree.
+ */
+static int
+__bam_dup_convert(dbc, h, indx, cnt)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, cnt;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT hdr;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *dp;
+ db_indx_t cpindx, dindx, first, *inp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ inp = P_INP(dbp, h);
+
+ /* Move to the beginning of the dup set. */
+ while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+
+ /* Get a new page. */
+ if ((ret = __db_new(dbc,
+ dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &lock, &dp)) != 0)
+ return (ret);
+ P_INIT(dp, dbp->pgsize, dp->pgno,
+ PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+ /*
+ * Move this set of duplicates off the page. First points to the first
+ * key of the first duplicate key/data pair, cnt is the number of pairs
+ * we're dealing with.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ first = indx;
+ dindx = indx;
+ cpindx = 0;
+ do {
+ /* Move cursors referencing the old entry to the new entry. */
+ if ((ret = __bam_ca_dup(dbc, first,
+ PGNO(h), indx, PGNO(dp), cpindx)) != 0)
+ goto err;
+
+		/*
+		 * Copy the entry to the new page.  If the off-page
+		 * duplicate page is a Btree page (i.e., dup_compare is
+		 * non-NULL; we use Btree pages for sorted dups and Recno
+		 * pages for unsorted dups), move all entries normally,
+		 * even deleted ones.  If it's a Recno page, deleted
+		 * entries are discarded (if the deleted entry is an
+		 * overflow item, free up those pages).
+		 */
+ bk = GET_BKEYDATA(dbp, h, dindx + 1);
+ hdr.data = bk;
+ hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+ if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
+ /*
+ * Unsorted dups, i.e. recno page, and we have
+ * a deleted entry, don't move it, but if it was
+ * an overflow entry, we need to free those pages.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc,
+ (GET_BOVERFLOW(dbp, h, dindx + 1))->pgno)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_pitem(
+ dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
+ goto err;
+ ++cpindx;
+ }
+ /* Delete all but the last reference to the key. */
+ if (cnt != 1) {
+ if ((ret = __bam_adjindx(dbc,
+ h, dindx, first + 1, 0)) != 0)
+ goto err;
+ } else
+ dindx++;
+
+ /* Delete the data item. */
+ if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
+ goto err;
+ indx += P_INDX;
+ } while (--cnt);
+
+ /* Put in a new data item that points to the duplicates page. */
+ if ((ret = __bam_ovput(dbc,
+ B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
+ goto err;
+
+ /* Adjust cursors for all the above movements. */
+ ret = __bam_ca_di(dbc,
+ PGNO(h), first + P_INDX, (int)(first + P_INDX - indx));
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, dp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+}
+
+/*
+ * __bam_ovput --
+ * Build an item for an off-page duplicates page or overflow page and
+ * insert it on the page.
+ */
+static int
+__bam_ovput(dbc, type, pgno, h, indx, item)
+ DBC *dbc;
+ u_int32_t type, indx;
+ db_pgno_t pgno;
+ PAGE *h;
+ DBT *item;
+{
+ BOVERFLOW bo;
+ DBT hdr;
+ int ret;
+
+ UMRW_SET(bo.unused1);
+ B_TSET(bo.type, type);
+ UMRW_SET(bo.unused2);
+
+	/*
+	 * If we're creating an overflow item, do so and acquire the page
+	 * number for it.  If we're creating an off-page duplicates tree,
+	 * we have been given the page number as an argument.
+	 */
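+	/*
+	 * For example, __bam_dup_convert above creates the on-page
+	 * reference to a new duplicates page with
+	 *
+	 *	__bam_ovput(dbc, B_DUPLICATE, dp->pgno, h, first + 1, NULL)
+	 *
+	 * while overflow callers pass B_OVERFLOW and the item to be
+	 * stored off-page.
+	 */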
+ if (type == B_OVERFLOW) {
+ if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0)
+ return (ret);
+ bo.tlen = item->size;
+ } else {
+ bo.pgno = pgno;
+ bo.tlen = 0;
+ }
+
+ /* Store the new record on the page. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bo;
+ hdr.size = BOVERFLOW_SIZE;
+ return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
+}
diff --git a/src/btree/bt_rec.c b/src/btree/bt_rec.c
new file mode 100644
index 00000000..026564b6
--- /dev/null
+++ b/src/btree/bt_rec.c
@@ -0,0 +1,2036 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
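+/*
+ * IS_BTREE_PAGE --
+ *	True for btree-format pages (internal, leaf or sorted-duplicate),
+ *	as opposed to recno-format pages.
+ */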
+#define IS_BTREE_PAGE(pagep) \
+ (TYPE(pagep) == P_IBTREE || \
+ TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP)
+
+/*
+ * __bam_split_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN *plsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, parent_pgno;
+ u_int32_t opflags, size;
+ int cmp, l_update, p_update, r_update, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_read, ip, 0);
+
+ opflags = OP_MODE_GET(argp->opflags);
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ (opflags & SPL_RECNO) ? DB_RECNO : DB_BTREE,
+ PGNO_INVALID, DB_RECOVER, NULL, &dbc)) != 0)
+ goto out;
+ if (opflags & SPL_NRECS)
+ F_SET((BTREE_CURSOR *)dbc->internal, C_RECNUM);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
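+	/*
+	 * A minimal sketch of the hazard being avoided (hypothetical
+	 * offsets): the DBT payload may begin at an odd byte offset in
+	 * the log record, so a direct cast such as
+	 *
+	 *	sp = (PAGE *)argp->pg.data;
+	 *
+	 * could fault on strict-alignment hardware; memory returned by
+	 * __os_malloc is suitably aligned for any structure.
+	 */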
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ parent_pgno = argp->ppgno;
+ rootsplit = parent_pgno == pgno;
+
+ /* Get the pages going down the tree. */
+ REC_FGET(mpf, ip, parent_pgno, &pp, left);
+left: REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (pp != NULL) {
+ if (rootsplit)
+ plsnp = &LSN(argp->pg.data);
+ else
+ plsnp = &argp->plsn;
+ cmp = LOG_COMPARE(&LSN(pp), plsnp);
+ CHECK_LSN(env, op, cmp, &LSN(pp), plsnp);
+ if (cmp == 0)
+ p_update = 1;
+ }
+
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ }
+
+ /*
+ * Drop the latches on the lower level pages before
+ * getting an exclusive latch on the higher level page.
+ */
+		if (lp != NULL && (ret = __memp_fput(mpf,
+		    ip, lp, file_dbp->priority)) != 0)
+			goto out;
+		lp = NULL;
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+ /*
+ * If the parent page is wrong, update it.
+ * For recno the insert into an existing parent
+ * was logged separately.
+ * If it is a root page update initialize the page and
+ * update the record counts if needed.
+ * Then insert the record for the right hand child page.
+ */
+ if (p_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+
+ if (rootsplit) {
+ P_INIT(pp, file_dbp->pgsize, pgno, PGNO_INVALID,
+ PGNO_INVALID, _lp->level + 1,
+ (opflags & SPL_RECNO) ?
+ P_IRECNO : P_IBTREE);
+ if (opflags & SPL_NRECS) {
+ RE_NREC_SET(pp,
+ __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp));
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp,
+ argp->pindx, argp->pentry.size,
+ &argp->pentry, NULL)) != 0)
+ goto out;
+
+ } else if (opflags & SPL_NRECS)
+ goto recno;
+ if ((ret = __db_pitem_nolog(dbc, pp, argp->pindx + 1,
+ argp->rentry.size, &argp->rentry, NULL)) != 0)
+ goto out;
+recno: pp->lsn = *lsnp;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->npgno, &np, done);
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ }
+ }
+ } else {
+ /*
+ * If it's a root split and the left child ever existed, update
+		 * its LSN; otherwise it's the split page.  If the right
+		 * child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ }
+ /*
+ * Drop the lower level pages before getting an exclusive
+ * latch on the parent.
+ */
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+
+ /*
+		 * Check the state of the split page.  If it's a root split
+		 * then that's the root page; otherwise it's the left page.
+ */
+ if (rootsplit) {
+ DB_ASSERT(env, pgno == argp->ppgno);
+ if (lp != NULL && (ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = pp;
+ pp = NULL;
+ }
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(lsnp, &LSN(lp));
+ CHECK_ABORT(env, op, cmp, &LSN(lp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, argp->pg.data, argp->pg.size);
+				if ((ret = __memp_fput(mpf,
+				    ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ }
+
+ /*
+		 * Next we can update the parent, removing the new index.
+		 * If the tree maintains record numbers, that update was
+		 * logged separately.
+ */
+ if (pp != NULL) {
+ DB_ASSERT(env, !rootsplit);
+ cmp = LOG_COMPARE(lsnp, &LSN(pp));
+ CHECK_ABORT(env, op, cmp, &LSN(pp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if ((opflags & SPL_NRECS) == 0) {
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp,
+ pp, argp->pindx + 1)->len);
+
+ if ((ret = __db_ditem(dbc, pp,
+ argp->pindx + 1, size)) != 0)
+ goto out;
+ }
+ pp->lsn = argp->plsn;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that are left. */
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_split_48_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_48_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_48_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_48_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN *plsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, parent_pgno;
+ u_int32_t ptype, size;
+ int cmp, l_update, p_update, r_update, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_48_read, ip, 0);
+
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ (argp->opflags & SPL_RECNO) ? DB_RECNO : DB_BTREE,
+ PGNO_INVALID, DB_RECOVER, NULL, &dbc)) != 0)
+ goto out;
+ if (argp->opflags & SPL_NRECS)
+ F_SET((BTREE_CURSOR *)dbc->internal, C_RECNUM);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ parent_pgno = argp->ppgno;
+ rootsplit = parent_pgno == pgno;
+
+ /* Get the pages going down the tree. */
+ REC_FGET(mpf, ip, parent_pgno, &pp, left);
+left: REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (pp != NULL) {
+ if (rootsplit)
+ plsnp = &LSN(argp->pg.data);
+ else
+ plsnp = &argp->plsn;
+ cmp = LOG_COMPARE(&LSN(pp), plsnp);
+ CHECK_LSN(env, op, cmp, &LSN(pp), plsnp);
+ if (cmp == 0)
+ p_update = 1;
+ }
+
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ }
+
+ /*
+ * Drop the latches on the lower level pages before
+ * getting an exclusive latch on the higher level page.
+ */
+		if (lp != NULL && (ret = __memp_fput(mpf,
+		    ip, lp, file_dbp->priority)) != 0)
+			goto out;
+		lp = NULL;
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+ /*
+ * If the parent page is wrong, update it.
+ * Initialize the page. If it is a root page update
+ * the record counts if needed and put the first record in.
+ * Then insert the record for the right hand child page.
+ */
+ if (p_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if (argp->opflags & SPL_RECNO)
+ ptype = P_IRECNO;
+ else
+ ptype = P_IBTREE;
+
+ if (rootsplit) {
+ P_INIT(pp, file_dbp->pgsize, pgno, PGNO_INVALID,
+ PGNO_INVALID, _lp->level + 1, ptype);
+ if (argp->opflags & SPL_NRECS) {
+ RE_NREC_SET(pp,
+ __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp));
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp,
+ argp->pindx, argp->pentry.size,
+ &argp->pentry, NULL)) != 0)
+ goto out;
+
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp, argp->pindx + 1,
+ argp->rentry.size, &argp->rentry, NULL)) != 0)
+ goto out;
+ pp->lsn = *lsnp;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->npgno, &np, done);
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ }
+ }
+ } else {
+ /*
+ * If it's a root split and the left child ever existed, update
+		 * its LSN; otherwise it's the split page.  If the right
+		 * child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ }
+ /*
+ * Drop the lower level pages before getting an exclusive
+ * latch on the parent.
+ */
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+
+ /*
+		 * Check the state of the split page.  If it's a root split
+		 * then that's the root page; otherwise it's the left page.
+ */
+ if (rootsplit) {
+ DB_ASSERT(env, pgno == argp->ppgno);
+ if (lp != NULL && (ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = pp;
+ pp = NULL;
+ }
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(lsnp, &LSN(lp));
+ CHECK_ABORT(env, op, cmp, &LSN(lp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, argp->pg.data, argp->pg.size);
+				if ((ret = __memp_fput(mpf,
+				    ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ }
+
+ /*
+		 * Next we can update the parent, removing the new index.
+ */
+ if (pp != NULL) {
+ DB_ASSERT(env, !rootsplit);
+ cmp = LOG_COMPARE(lsnp, &LSN(pp));
+ CHECK_ABORT(env, op, cmp, &LSN(pp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if (argp->opflags & SPL_RECNO)
+ size = RINTERNAL_SIZE;
+ else
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp,
+ pp, argp->pindx + 1)->len);
+
+ if ((ret = __db_ditem(dbc, pp,
+ argp->pindx + 1, size)) != 0)
+ goto out;
+ pp->lsn = argp->plsn;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that are left. */
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_split_42_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, root_pgno;
+ u_int32_t ptype;
+ int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_42_read, ip, 0);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ root_pgno = argp->root_pgno;
+ rootsplit = root_pgno != PGNO_INVALID;
+ REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (rootsplit) {
+ REC_FGET(mpf, ip, root_pgno, &pp, do_left);
+ cmp = LOG_COMPARE(&LSN(pp), &LSN(argp->pg.data));
+ CHECK_LSN(env, op,
+ cmp, &LSN(pp), &LSN(argp->pg.data));
+ p_update = cmp == 0;
+ }
+
+do_left: if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ }
+
+ /*
+ * If the parent page is wrong, update it. This is of interest
+ * only if it was a root split, since root splits create parent
+ * pages. All other splits modify a parent page, but those are
+ * separately logged and recovered.
+ */
+ if (rootsplit && p_update) {
+ if (IS_BTREE_PAGE(sp)) {
+ ptype = P_IBTREE;
+ rc = argp->opflags & SPL_NRECS ? 1 : 0;
+ } else {
+ ptype = P_IRECNO;
+ rc = 1;
+ }
+
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ P_INIT(pp, file_dbp->pgsize, root_pgno,
+ PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype);
+ RE_NREC_SET(pp, rc ? __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp) : 0);
+
+ pp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, 0, &np)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(
+ file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf, ip,
+ np, file_dbp->priority)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ } else {
+ /*
+ * If the split page is wrong, replace its contents with the
+ * logged page contents. If the page doesn't exist, it means
+ * that the create of the page never happened, nor did any of
+ * the adds onto the page that caused the split, and there's
+ * really no undo-ing to be done.
+ */
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_EDIT, &pp)) != 0) {
+ pp = NULL;
+ goto lrundo;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(pp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ memcpy(pp, argp->pg.data, argp->pg.size);
+ if ((ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+ /*
+ * If it's a root split and the left child ever existed, update
+ * its LSN. (If it's not a root split, we've updated the left
+ * page already -- it's the same as the split page.) If the
+ * right child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+lrundo: if ((rootsplit && lp != NULL) || rp != NULL) {
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ if ((ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ if ((ret = __memp_fput(mpf, ip,
+ rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+				if ((ret = __memp_fput(mpf,
+				    ip, np, file_dbp->priority)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that weren't dirtied. */
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_rsplit_recover --
+ * Recovery function for a reverse split.
+ *
+ * PUBLIC: int __bam_rsplit_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rsplit_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rsplit_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_rsplit_print);
+ REC_INTRO(__bam_rsplit_read, ip, 1);
+
+ /* Fix the root page. */
+ pgno = root_pgno = argp->root_pgno;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto do_page;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->rootlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->rootlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Copy the new data to the root page. If it is not now a
+ * leaf page we need to restore the record number. We could
+ * try to determine if C_RECNUM was set in the btree, but
+ * that's not really necessary since the field is not used
+ * otherwise.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ rcnt = RE_NREC(pagep);
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ if (LEVEL(pagep) > LEAFLEVEL)
+ RE_NREC_SET(pagep, rcnt);
+ pagep->pgno = root_pgno;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, root_pgno,
+ argp->nrec, PGNO_INVALID, pagep->level + 1,
+ IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO);
+ if ((ret = __db_pitem(dbc, pagep, 0,
+ argp->rootent.size, &argp->rootent, NULL)) != 0)
+ goto out;
+ pagep->lsn = argp->rootlsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+do_page:
+ /*
+ * Fix the page copied over the root page. It's possible that the
+ * page never made it to disk, or was truncated so if the page
+ * doesn't exist, it's okay and there's nothing further to do.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN));
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_adj_recover --
+ * Recovery function for adj.
+ *
+ * PUBLIC: int __bam_adj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_adj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_adj_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_adj_print);
+ REC_INTRO(__bam_adj_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
+ goto out;
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
+ goto out;
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_cadjust_recover --
+ * Recovery function for the adjust of a count change in an internal
+ * page.
+ *
+ * PUBLIC: int __bam_cadjust_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cadjust_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cadjust_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_cadjust_print);
+ REC_INTRO(__bam_cadjust_read, ip, 0);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ } else {
+ GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ }
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ } else {
+ GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ }
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_cdel_recover --
+ * Recovery function for the intent-to-delete of a cursor record.
+ *
+ * PUBLIC: int __bam_cdel_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cdel_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cdel_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ u_int32_t indx;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_cdel_print);
+ REC_INTRO(__bam_cdel_read, ip, 0);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
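+		/*
+		 * On a P_LBTREE leaf, entries are key/data pairs and the
+		 * delete flag lives on the data item, one slot (O_INDX)
+		 * past the key; other page types store single items, so
+		 * the logged index is used unchanged.
+		 */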
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DSET(GET_BKEYDATA(file_dbp, pagep, indx)->type);
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DCLR(GET_BKEYDATA(file_dbp, pagep, indx)->type);
+
+ if ((ret = __bam_ca_delete(
+ file_dbp, argp->pgno, argp->indx, 0, NULL)) != 0)
+ goto out;
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_repl_recover --
+ * Recovery function for page item replacement.
+ *
+ * PUBLIC: int __bam_repl_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_repl_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_repl_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+ u_int32_t len;
+ u_int8_t *dp, *p;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_repl_print);
+ REC_INTRO(__bam_repl_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Need to redo update described.
+ *
+ * Re-build the replacement item.
+ */
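+		/*
+		 * For illustration, with hypothetical values: given an
+		 * original item "abcdef", prefix 2, suffix 1 and a
+		 * replacement "XY", the rebuilt item is "ab" "XY" "f" and
+		 * dbt.size is 2 + 1 + 2 == 5.
+		 */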
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
+ dp = bk->data;
+ len = bk->len;
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->repl.size;
+ if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
+ goto out;
+ p = dbt.data;
+ memcpy(p, dp, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->repl.data, argp->repl.size);
+ p += argp->repl.size;
+ memcpy(p, dp + (len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc, pagep, argp->indx, &dbt, 0);
+ __os_free(env, dbt.data);
+ if (ret != 0)
+ goto out;
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Need to undo update described.
+ *
+ * Re-build the original item.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
+ dp = bk->data;
+ len = bk->len;
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->orig.size;
+ if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
+ goto out;
+ p = dbt.data;
+ memcpy(p, dp, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->orig.data, argp->orig.size);
+ p += argp->orig.size;
+ memcpy(p, dp + (len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc, pagep, argp->indx, &dbt, 0);
+ __os_free(env, dbt.data);
+ if (ret != 0)
+ goto out;
+
+ /* Reset the deleted flag, if necessary. */
+ if (argp->isdeleted)
+ B_DSET(GET_BKEYDATA(file_dbp, pagep, argp->indx)->type);
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_irep_recover --
+ * Recovery function for internal page item replacement.
+ *
+ * PUBLIC: int __bam_irep_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_irep_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_irep_args *argp;
+ BINTERNAL *bn;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_irep_print);
+ REC_INTRO(__bam_irep_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bn = (BINTERNAL *)argp->hdr.data;
+ if ((ret = __bam_ritem_nolog(dbc,
+ pagep, argp->indx, &argp->hdr, &argp->data, bn->type)) != 0)
+ goto out;
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bn = (BINTERNAL *)argp->old.data;
+ if ((ret = __bam_ritem_nolog(dbc,
+ pagep, argp->indx, &argp->old, NULL, bn->type)) != 0)
+ goto out;
+ LSN(pagep) = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_root_recover --
+ * Recovery function for setting the root page on the meta-data page.
+ *
+ * PUBLIC: int __bam_root_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_root_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_root_args *argp;
+ DB_THREAD_INFO *ip;
+ BTMETA *meta;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ REC_PRINT(__bam_root_print);
+ REC_INTRO(__bam_root_read, ip, 0);
+
+ if ((ret = __memp_fget(mpf, &argp->meta_pgno, ip, NULL,
+ 0, &meta)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ meta->root = argp->root_pgno;
+ meta->dbmeta.lsn = *lsnp;
+ ((BTREE *)file_dbp->bt_internal)->bt_root = meta->root;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Nothing to undo except lsn. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ meta->dbmeta.lsn = argp->meta_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_curadj_recover --
+ * Transaction abort function to undo cursor adjustments.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_curadj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_curadj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_curadj_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int ret;
+
+ COMPQUIET(mpf, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_curadj_print);
+ REC_INTRO(__bam_curadj_read, ip, 1);
+
+ ret = 0;
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ switch (argp->mode) {
+ case DB_CA_DI:
+ if ((ret = __bam_ca_di(dbc, argp->from_pgno,
+ argp->from_indx, -(int)argp->first_indx)) != 0)
+ goto out;
+ break;
+ case DB_CA_DUP:
+ if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
+ argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_RSPLIT:
+ if ((ret =
+ __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_SPLIT:
+ if ((ret = __bam_ca_undosplit(file_dbp, argp->from_pgno,
+ argp->to_pgno, argp->left_pgno, argp->from_indx)) != 0)
+ goto out;
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_rcuradj_recover --
+ * Transaction abort function to undo cursor adjustments in rrecno.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_rcuradj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rcuradj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rcuradj_args *argp;
+ DB_THREAD_INFO *ip;
+ BTREE_CURSOR *cp;
+ DB *file_dbp;
+ DBC *dbc, *rdbc;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(mpf, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ rdbc = NULL;
+ REC_PRINT(__bam_rcuradj_print);
+ REC_INTRO(__bam_rcuradj_read, ip, 1);
+
+ ret = t_ret = 0;
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ /*
+	 * We don't know whether we're in an offpage dup set, and thus
+	 * don't know whether the dbc REC_INTRO handed us is of a
+	 * reasonable type.  It's certainly unset, so if this is an
+	 * offpage dup set, we don't have an OPD cursor.  The simplest
+	 * solution is just to allocate a whole new cursor for our use;
+	 * we're only really using it to pass some state into __ram_ca,
+	 * and this way we don't need to make this function know
+	 * anything about how offpage dups work.
+ */
+ if ((ret = __db_cursor_int(file_dbp, NULL,
+ NULL, DB_RECNO, argp->root, DB_RECOVER, NULL, &rdbc)) != 0)
+ goto out;
+
+ cp = (BTREE_CURSOR *)rdbc->internal;
+ F_SET(cp, C_RENUMBER);
+ cp->recno = argp->recno;
+
+ switch (argp->mode) {
+ case CA_DELETE:
+ /*
+ * The way to undo a delete is with an insert. Since
+ * we're undoing it, the delete flag must be set.
+ */
+ F_SET(cp, C_DELETED);
+ F_SET(cp, C_RENUMBER); /* Just in case. */
+ cp->order = argp->order;
+ if ((ret = __ram_ca(rdbc, CA_ICURRENT, NULL)) != 0)
+ goto out;
+ break;
+ case CA_IAFTER:
+ case CA_IBEFORE:
+ case CA_ICURRENT:
+ /*
+ * The way to undo an insert is with a delete. The delete
+ * flag is unset to start with.
+ */
+ F_CLR(cp, C_DELETED);
+ cp->order = INVALID_ORDER;
+ if ((ret = __ram_ca(rdbc, CA_DELETE, NULL)) != 0)
+ goto out;
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: if (rdbc != NULL && (t_ret = __dbc_close(rdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ REC_CLOSE;
+}
+
+/*
+ * __bam_merge_44_recover --
+ * Recovery function for merge.
+ *
+ * PUBLIC: int __bam_merge_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_merge_44_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_merge_44_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t indx, *ninp, *pinp;
+ u_int32_t size;
+ u_int8_t *bp;
+ int cmp_n, cmp_p, i, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_merge_44_print);
+ REC_INTRO(__bam_merge_44_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+		/*
+		 * If a header is provided, the page is empty; initialize
+		 * it and copy over the needed data.
+		 */
+ DB_ASSERT(env, argp->hdr.size == 0 || NUM_ENT(pagep) == 0);
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->hdr.size != 0) {
+ P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
+ PREV_PGNO(argp->hdr.data),
+ NEXT_PGNO(argp->hdr.data),
+ LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
+ }
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ /* Copy the data segment. */
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table offset past the current entries. */
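+			/*
+			 * With hypothetical numbers: on a 4096-byte page
+			 * whose HOFFSET is currently 3896, a logged offset
+			 * of 4000 is stored as 4000 - (4096 - 3896) ==
+			 * 3800, shifting the logged offsets to account for
+			 * the data already on this page.
+			 */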
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = argp->ind.data;
+ for (i = 0;
+ i < (int)(argp->ind.size / sizeof(*ninp)); i++)
+ *pinp++ = *ninp++
+ - (file_dbp->pgsize - HOFFSET(pagep));
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /*
+ * Since logging is logical at the page level
+ * we cannot just truncate the data space. Delete
+ * the proper number of items from the logical end
+ * of the page.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ for (i = 0; i < (int)(argp->ind.size / sizeof(*ninp)); i++) {
+ indx = NUM_ENT(pagep) - 1;
+ if (P_INP(file_dbp, pagep)[indx] ==
+ P_INP(file_dbp, pagep)[indx - P_INDX]) {
+ NUM_ENT(pagep)--;
+ continue;
+ }
+ switch (TYPE(pagep)) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ bk = GET_BKEYDATA(file_dbp, pagep, indx);
+ size = BITEM_SIZE(bk);
+ break;
+
+ case P_IBTREE:
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp, pagep, indx)->len);
+ break;
+ case P_IRECNO:
+ size = RINTERNAL_SIZE;
+ break;
+
+ default:
+ ret = __db_pgfmt(env, PGNO(pagep));
+ goto out;
+ }
+ if ((ret =
+ __db_ditem(dbc, pagep, indx, size)) != 0)
+ goto out;
+ }
+ if (argp->ind.size == 0)
+ HOFFSET(pagep) = file_dbp->pgsize;
+ pagep->lsn = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+			ret = __db_pgerr(file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to truncate the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ HOFFSET(pagep) = file_dbp->pgsize;
+ NUM_ENT(pagep) = 0;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /* Need to put the data back on the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = argp->ind.data;
+ for (i = 0;
+ i < (int)(argp->ind.size / sizeof(*ninp)); i++)
+ *pinp++ = *ninp++;
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) = i;
+ }
+ pagep->lsn = argp->nlsn;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ goto out;
+done:	*lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_relink_43_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __bam_relink_43_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_relink_43_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_relink_43_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_relink_43_print);
+ REC_INTRO(__bam_relink_43_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next2;
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ pagep->prev_pgno = argp->prev;
+ pagep->lsn = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->next, ret);
+ goto out;
+ } else
+ goto prev;
+ }
+
+ modified = 0;
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the remove or undo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the remove or redo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_next;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->prev, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ modified = 0;
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ modified = 1;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_prev;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
diff --git a/src/btree/bt_reclaim.c b/src/btree/bt_reclaim.c
new file mode 100644
index 00000000..f465cc5a
--- /dev/null
+++ b/src/btree/bt_reclaim.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+/*
+ * __bam_reclaim --
+ * Free a database.
+ *
+ * PUBLIC: int __bam_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+ */
+int
+__bam_reclaim(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_LOCK meta_lock;
+ int ret, t_ret;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Write lock the metapage for deallocations. */
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+
+ /* Avoid locking every page; we have the handle locked exclusively. */
+ F_SET(dbc, DBC_DONTLOCK);
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc, DB_LOCK_WRITE,
+ PGNO_INVALID, __db_reclaim_callback, &flags);
+
+ if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the cursor. */
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __bam_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__bam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ u_int32_t count;
+ int ret;
+
+#ifdef HAVE_COMPRESSION
+ u_int32_t comp_count;
+
+ comp_count = 0;
+ if (DB_IS_COMPRESSED(dbc->dbp) &&
+ (ret = __bam_compress_count(dbc, NULL, &comp_count)) != 0)
+ return (ret);
+#endif
+
+ count = 0;
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc,
+ DB_LOCK_WRITE, PGNO_INVALID, __db_truncate_callback, &count);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp)) {
+ if (countp != NULL)
+ *countp = comp_count;
+ } else
+#endif
+ if (countp != NULL)
+ *countp = count;
+
+ return (ret);
+}
diff --git a/src/btree/bt_recno.c b/src/btree/bt_recno.c
new file mode 100644
index 00000000..9356a742
--- /dev/null
+++ b/src/btree/bt_recno.c
@@ -0,0 +1,1427 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+static int __ram_ca_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ram_ca_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're
+ * re-numbering records, it means the record was implicitly created. We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested. If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag. This is different from the B+tree code.
+ * It also maintains whether the cursor references a deleted record in the
+ * cursor, and it doesn't always check the on-page value.
+ */
+#define CD_SET(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) \
+ F_SET(cp, C_DELETED); \
+}
+#define CD_CLR(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) { \
+ F_CLR(cp, C_DELETED); \
+ cp->order = INVALID_ORDER; \
+ } \
+}
+#define CD_ISSET(cp) \
+ (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
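+
+/*
+ * A worked example (illustrative, not part of the original source): in
+ * a renumbering tree, deleting the record under a cursor leaves that
+ * cursor positioned "between" records, so CD_SET marks it C_DELETED;
+ * the next cursor motion calls CD_CLR, which clears the flag and
+ * resets the order to INVALID_ORDER. In a non-renumbering tree all
+ * three macros are no-ops (CD_ISSET is always 0), because the on-page
+ * delete marker carries the state instead.
+ */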
+
+/*
+ * Macros for comparing the ordering of two cursors.
+ * cp1 comes before cp2 iff one of the following holds:
+ * cp1's recno is less than cp2's recno
+ * recnos are equal, both deleted, and cp1's order is less than cp2's
+ * recnos are equal, cp1 deleted, and cp2 not deleted
+ */
+#define C_LESSTHAN(cp1, cp2) \
+ (((cp1)->recno < (cp2)->recno) || \
+ (((cp1)->recno == (cp2)->recno) && \
+ ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
+ (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
+
+/*
+ * cp1 is equal to cp2 iff their recnos and delete flags are identical,
+ * and if the delete flag is set their orders are also identical.
+ */
+#define C_EQUAL(cp1, cp2) \
+ ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
+ (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
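+
+/*
+ * A worked example of the ordering (illustrative only): given cursors
+ * A = {recno 3, deleted, order 1}, B = {recno 3, deleted, order 2} and
+ * C = {recno 3, not deleted}, C_LESSTHAN ranks A before B (equal
+ * recnos, both deleted, lower order first) and both A and B before C
+ * (at equal recnos a deleted cursor sorts before an undeleted one).
+ * C_EQUAL holds only for two cursors agreeing on recno, delete flag
+ * and, when deleted, order.
+ */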
+
+/*
+ * Do we need to log the current cursor adjustment?
+ */
+#define CURADJ_LOG(dbc) \
+ (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
+
+/*
+ * After a search, copy the found page into the cursor, discarding any
+ * currently held lock.
+ */
+#define STACK_TO_CURSOR(cp, ret) { \
+ int __t_ret; \
+ (cp)->page = (cp)->csp->page; \
+ (cp)->pgno = (cp)->csp->page->pgno; \
+ (cp)->indx = (cp)->csp->indx; \
+ if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ (cp)->lock = (cp)->csp->lock; \
+ (cp)->lock_mode = (cp)->csp->lock_mode; \
+}
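+
+/*
+ * The callers below use this macro in a fixed pattern, shown here for
+ * reference only:
+ *
+ *	STACK_TO_CURSOR(cp, ret);
+ *	if (ret != 0)
+ *		goto err;
+ *
+ * The macro can fail while discarding the previously held lock, so the
+ * error must be checked before the copied page is used.
+ */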
+
+/*
+ * __ram_open --
+ * Recno open function.
+ *
+ * PUBLIC: int __ram_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__ram_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DBC *dbc;
+ int ret, t_ret;
+
+ COMPQUIET(name, NULL);
+ t = dbp->bt_internal;
+
+ /* Start up the tree. */
+ if ((ret = __bam_read_root(dbp, ip, txn, base_pgno, flags)) != 0)
+ return (ret);
+
+ /*
+ * If the user specified a source tree, open it and map it in.
+ *
+ * !!!
+ * We don't complain if the user specified transactions or threads.
+ * It's possible to make it work, but you'd better know what you're
+ * doing!
+ */
+ if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
+ return (ret);
+
+ /* If we're snapshotting an underlying source file, do it now. */
+ if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
+ /* Allocate a cursor. */
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Do the snapshot. */
+ if ((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
+ ret = 0;
+
+ /* Discard the cursor. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+/*
+ * __ram_append --
+ * Recno append function.
+ *
+ * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
+ */
+int
+__ram_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Make sure we've read in all of the backing source file. If
+ * we found the record or it simply didn't exist, add the
+ * user's record.
+ */
+ ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
+
+ /* Return the record number. */
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbc->env, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ if (!DB_RETOK_DBCPUT(ret))
+ F_SET(dbc, DBC_ERROR);
+ return (ret);
+}
+
+/*
+ * __ramc_del --
+ * Recno DBC->del function.
+ *
+ * PUBLIC: int __ramc_del __P((DBC *, u_int32_t));
+ */
+int
+__ramc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BKEYDATA bk;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, data;
+ DB_LOCK next_lock, prev_lock;
+ DB_LSN lsn;
+ db_pgno_t npgno, ppgno, save_npgno, save_ppgno;
+ int exact, nc, ret, stack, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ stack = 0;
+ save_npgno = save_ppgno = PGNO_INVALID;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(prev_lock);
+ COMPQUIET(flags, 0);
+
+ /*
+ * The semantics of cursors during delete are as follows: in
+ * non-renumbering recnos, records are replaced with a marker
+ * containing a delete flag. If the record referenced by this cursor
+ * has already been deleted, we will detect that as part of the delete
+ * operation, and fail.
+ *
+ * In renumbering recnos, cursors which represent deleted items
+ * are flagged with the C_DELETED flag, and it is an error to
+ * call c_del a second time without an intervening cursor motion.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+
+ /* Search the tree for the key; delete only deletes exact matches. */
+retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ stack = 1;
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag can only mean
+ * that this record was implicitly created. Applications aren't
+ * permitted to delete records they never created; return an error.
+ *
+ * If not re-numbering records, the on-page deleted flag means that
+ * this record was implicitly created, or was deleted at some time.
+ * The former is an error because applications aren't permitted to
+ * delete records they never created; the latter is an error because
+ * if the record was "deleted", we could never have found it.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (F_ISSET(cp, C_RENUMBER)) {
+ /* If we are going to drop the page, lock its neighbors. */
+ if (STD_LOCKING(dbc) && NUM_ENT(cp->page) == 1 &&
+ PGNO(cp->page) != BAM_ROOT_PGNO(dbc)) {
+ if ((npgno = NEXT_PGNO(cp->page)) != PGNO_INVALID)
+ TRY_LOCK(dbc, npgno, save_npgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ if ((ppgno = PREV_PGNO(cp->page)) != PGNO_INVALID)
+ TRY_LOCK(dbc, ppgno, save_ppgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ /* Delete the item, adjust the counts, adjust the cursors. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+ if ((ret = __bam_adjust(dbc, -1)) != 0)
+ goto err;
+ if ((ret = __ram_ca(dbc, CA_DELETE, &nc)) != 0)
+ goto err;
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
+ CA_DELETE, BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+
+ /*
+ * If the page is empty, delete it.
+ *
+ * We never delete a root page. First, root pages of primary
+ * databases never go away, recno or otherwise. However, if
+ * it's the root page of an off-page duplicates database, then
+ * it can be deleted. We don't delete it here because we have
+ * no way of telling the primary database page holder (e.g.,
+ * the hash access method) that its page element should be
+ * cleaned up because the underlying tree is gone. So, we keep
+ * the page around until the last cursor referencing the empty
+ * tree is closed, and then clean it up.
+ */
+ if (NUM_ENT(cp->page) == 0 &&
+ PGNO(cp->page) != BAM_ROOT_PGNO(dbc)) {
+ /*
+ * We want to delete a single item out of the last page
+ * that we're not deleting.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+ ret = __bam_dpages(dbc, 0, BTD_RELINK);
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ /*
+ * Regardless of the return from __bam_dpages, it will
+ * discard our stack and pinned page.
+ */
+ stack = 0;
+ cp->page = NULL;
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+ }
+ } else {
+ /* Use a delete/put pair to replace the record with a marker. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ B_TSET_DELETED(bk.type, B_KEYDATA);
+ bk.len = 0;
+ DB_INIT_DBT(hdr, &bk, SSZA(BKEYDATA, data));
+ DB_INIT_DBT(data, "", 0);
+ if ((ret = __db_pitem(dbc,
+ cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+ goto err;
+ }
+
+ t->re_modified = 1;
+
+err: if (!DB_RETOK_DBCDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, next_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, prev_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ramc_get --
+ * Recno DBC->get function.
+ *
+ * PUBLIC: int __ramc_get
+ * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ramc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int cmp, exact, ret;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+retry: switch (flags) {
+ case DB_CURRENT:
+ /*
+ * If we're using mutable records and the deleted flag is
+ * set, the cursor is pointing at a nonexistent record;
+ * return an error.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+ break;
+ case DB_NEXT_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * next duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the next item assuredly is
+ * a dup, so we set flags to DB_NEXT and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_NEXT_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_NEXT
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ flags = DB_NEXT;
+ /*
+ * If record numbers are mutable: if we just deleted a record,
+ * we have to avoid incrementing the record number so that we
+ * return the right record by virtue of renumbering the tree.
+ */
+ if (CD_ISSET(cp)) {
+ /*
+ * Clear the flag, we've moved off the deleted record.
+ */
+ CD_CLR(cp);
+ break;
+ }
+
+ if (cp->recno != RECNO_OOB) {
+ ++cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ cp->recno = 1;
+ break;
+ case DB_PREV_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * previous duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the previous item assuredly
+ * is a dup, so we set flags to DB_PREV and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_PREV_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_PREV
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_PREV:
+ flags = DB_PREV;
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ --cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ flags = DB_PREV;
+ if (((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+ if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
+ goto err;
+ if (cp->recno == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_GET_BOTHC:
+ /*
+ * If we're doing a join and these are offpage dups,
+ * we want to keep searching forward from after the
+ * current cursor position. Increment the recno by 1,
+ * then proceed as for a DB_SET.
+ *
+ * Otherwise, we know there is no additional matching
+ * data, as recnos don't have dups; return DB_NOTFOUND.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno++;
+ break;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * If we're searching a set of off-page dups, we start
+ * a new linear search from the first record. Otherwise,
+ * we compare the single data item associated with the
+ * requested record for a match.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__ramc_get", flags);
+ goto err;
+ }
+
+ /*
+ * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
+ * called __ram_update() to make sure sufficient records have been
+ * read from the backing source file. Do it now for DB_CURRENT (if
+ * the current record was deleted we may need more records from the
+ * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
+ * (We don't have to test for flags == DB_FIRST, because the switch
+ * statement above re-set flags to DB_NEXT in that case.)
+ */
+ if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
+ __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+
+ for (;; ++cp->recno) {
+ /* Search the tree for the record. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno,
+ F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
+ 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this
+ * record was implicitly created. If not re-numbering records,
+ * the on-page deleted flag means this record was implicitly
+ * created, or it was deleted at some time. Regardless, we
+ * skip such records if doing cursor next/prev operations or
+ * walking through off-page duplicates, and fail if they were
+ * requested explicitly by the application.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
+ switch (flags) {
+ case DB_NEXT:
+ case DB_PREV:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ PERFMON4(env, race, ramc_get,
+ dbp->fname, dbp->dname, cp->page, flags);
+ goto retry;
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * If we're an OPD tree, we don't care about
+ * matching a record number on a DB_GET_BOTH
+ * -- everything belongs to the same tree. A
+ * normal recno should give up and return
+ * DB_NOTFOUND if the matching recno is deleted.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ continue;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ default:
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
+ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
+ __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ break;
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ } else
+ break;
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (!F_ISSET(dbc, DBC_OPD) && !F_ISSET(key, DB_DBT_ISSET)) {
+ ret = __db_retcopy(dbp->env,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ramc_put --
+ * Recno DBC->put function.
+ *
+ * PUBLIC: int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ramc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ ENV *env;
+ u_int32_t iiflags;
+ int exact, nc, ret, t_ret;
+ void *arg;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * DB_KEYFIRST and DB_KEYLAST mean different things if they're
+ * used in an off-page duplicate tree. If we're an off-page
+ * duplicate tree, they really mean "put at the beginning of the
+ * tree" and "put at the end of the tree" respectively, so translate
+ * them to something else.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ switch (flags) {
+ case DB_KEYFIRST:
+ cp->recno = 1;
+ flags = DB_BEFORE;
+ break;
+ case DB_KEYLAST:
+ if ((ret = __ram_add(dbc,
+ &cp->recno, data, DB_APPEND, 0)) != 0)
+ return (ret);
+ if (CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn,
+ &lsn, 0, CA_ICURRENT,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ return (ret);
+ return (0);
+ default:
+ break;
+ }
+
+ /*
+ * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
+ * no duplicates, these are identical and mean "put the given
+ * datum at the given recno".
+ */
+ if (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP) {
+ ret = __ram_getno(dbc, key, &cp->recno, 1);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &cp->recno, data, flags, 0);
+ return (ret);
+ }
+
+ /*
+ * If we're putting with a cursor that's marked C_DELETED, we need to
+ * take special care; the cursor doesn't "really" reference the item
+ * corresponding to its current recno, but instead is "between" that
+ * record and the current one. Translate the actual insert into
+ * DB_BEFORE, and let the __ram_ca work out the gory details of what
+ * should wind up pointing where.
+ */
+ if (CD_ISSET(cp))
+ iiflags = DB_BEFORE;
+ else
+ iiflags = flags;
+
+split: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_INSERT, 1, &exact)) != 0)
+ goto err;
+ /*
+ * An inexact match is okay; it just means we're one record past the
+ * end, which is reasonable if we're marked deleted.
+ */
+ DB_ASSERT(env, exact || CD_ISSET(cp));
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ ret = __bam_iitem(dbc, key, data, iiflags, 0);
+ t_ret = __bam_stkrel(dbc, STK_CLRDBC);
+
+ if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
+ ret = t_ret;
+ else if (ret == DB_NEEDSPLIT) {
+ arg = &cp->recno;
+ if ((ret = __bam_split(dbc, arg, NULL)) != 0)
+ goto err;
+ goto split;
+ }
+ if (ret != 0)
+ goto err;
+
+ switch (flags) { /* Adjust the cursors. */
+ case DB_AFTER:
+ if ((ret = __ram_ca(dbc, CA_IAFTER, &nc)) != 0)
+ goto err;
+
+ /*
+ * We only need to adjust this cursor forward if we truly added
+ * the item after the current recno, rather than remapping it
+ * to DB_BEFORE.
+ */
+ if (iiflags == DB_AFTER)
+ ++cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_BEFORE:
+ if ((ret = __ram_ca(dbc, CA_IBEFORE, &nc)) != 0)
+ goto err;
+ --cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_CURRENT:
+ /*
+ * We only need to do an adjustment if we actually
+ * added an item, which we only would have done if the
+ * cursor was marked deleted.
+ */
+ if (!CD_ISSET(cp))
+ break;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if ((ret = __ram_ca(dbc, CA_ICURRENT, &nc)) != 0)
+ goto err;
+ if (nc > 0 && CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp,
+ dbc->txn, &lsn, 0, CA_ICURRENT,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ default:
+ break;
+ }
+
+ /* Return the key if we've created a new record. */
+ if (!F_ISSET(dbc, DBC_OPD) &&
+ (flags == DB_AFTER || flags == DB_BEFORE) && key != NULL)
+ ret = __db_retcopy(env, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ if (!DB_RETOK_DBCDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ return (ret);
+}
+
+static int
+__ram_ca_getorder(dbc, my_dbc, orderp, root_pgno, recno, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *orderp;
+ db_pgno_t root_pgno;
+ u_int32_t recno;
+ void *args;
+{
+ BTREE_CURSOR *cp;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(args, NULL);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (root_pgno == BAM_ROOT_PGNO(dbc) &&
+ recno == cp->recno && CD_ISSET(cp) &&
+ *orderp <= cp->order &&
+ !MVCC_SKIP_CURADJ(dbc, BAM_ROOT_PGNO(dbc)))
+ *orderp = cp->order;
+ return (0);
+}
+
+static int
+__ram_ca_setorder(dbc, my_dbc, foundp, pgno, order, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t order;
+ void *args;
+{
+ BTREE_CURSOR *cp, *cp_arg;
+ int adjusted;
+ ca_recno_arg op;
+ db_recno_t recno;
+
+ COMPQUIET(pgno, 0);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp_arg = (BTREE_CURSOR *)my_dbc->internal;
+ op = *(ca_recno_arg *)args;
+
+ if (cp_arg->root != cp->root ||
+ MVCC_SKIP_CURADJ(dbc, BAM_ROOT_PGNO(dbc)))
+ return (0);
+ ++(*foundp);
+ adjusted = 0;
+ recno = cp_arg->recno;
+ switch (op) {
+ case CA_DELETE:
+ if (recno < cp->recno) {
+ --cp->recno;
+ /*
+ * If the adjustment made them equal,
+ * we have to merge the orders.
+ */
+ if (recno == cp->recno && CD_ISSET(cp))
+ cp->order += order;
+ } else if (recno == cp->recno &&
+ !CD_ISSET(cp)) {
+ CD_SET(cp);
+ cp->order = order;
+ /*
+ * If we're deleting the item, we can't
+ * keep a streaming offset cached.
+ */
+ cp->stream_start_pgno = PGNO_INVALID;
+ }
+ break;
+ case CA_IBEFORE:
+ /*
+ * IBEFORE is just like IAFTER, except that we
+ * adjust cursors on the current record too.
+ */
+ if (C_EQUAL(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ goto iafter;
+ case CA_ICURRENT:
+ /*
+ * If the original cursor wasn't deleted, we
+ * just did a replacement and so there's no
+ * need to adjust anything--we shouldn't have
+ * gotten this far. Otherwise, we behave
+ * much like an IAFTER, except that all
+ * cursors pointing to the current item get
+ * marked undeleted and point to the new
+ * item.
+ */
+ DB_ASSERT(dbc->dbp->env, CD_ISSET(cp_arg));
+ if (C_EQUAL(cp_arg, cp)) {
+ CD_CLR(cp);
+ break;
+ }
+ /* FALLTHROUGH */
+ case CA_IAFTER:
+iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ if (recno == cp->recno && adjusted)
+ /*
+ * If we've moved this cursor's recno,
+ * split its order number--i.e.,
+ * decrement it by enough so that
+ * the lowest cursor moved has order 1.
+ * cp_arg->order is the split point,
+ * so decrement by one less than that.
+ */
+ cp->order -= (cp_arg->order - 1);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __ram_ca --
+ * Adjust cursors. Returns the number of relevant cursors.
+ *
+ * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg, int *));
+ */
+int
+__ram_ca(dbc_arg, op, foundp)
+ DBC *dbc_arg;
+ ca_recno_arg op;
+ int *foundp;
+{
+ BTREE_CURSOR *cp_arg;
+ DB *dbp;
+ ENV *env;
+ db_recno_t recno;
+ u_int32_t found, order;
+ int ret;
+
+ dbp = dbc_arg->dbp;
+ env = dbp->env;
+ cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
+ recno = cp_arg->recno;
+
+ /*
+ * It only makes sense to adjust cursors if we're a renumbering
+ * recno; we should only be called if this is one.
+ */
+ DB_ASSERT(env, F_ISSET(cp_arg, C_RENUMBER));
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * If we're doing a delete, we need to find the highest
+ * order of any cursor currently pointing at this item,
+ * so we can assign a higher order to the newly deleted
+ * cursor. Unfortunately, this requires a second pass through
+ * the cursor list.
+ */
+ if (op == CA_DELETE) {
+ if ((ret = __db_walk_cursors(dbp, NULL, __ram_ca_getorder,
+ &order, BAM_ROOT_PGNO(dbc_arg), recno, NULL)) != 0)
+ return (ret);
+ order++;
+ } else
+ order = INVALID_ORDER;
+
+ if ((ret = __db_walk_cursors(dbp, dbc_arg,
+ __ram_ca_setorder, &found, 0, order, &op)) != 0)
+ return (ret);
+ if (foundp != NULL)
+ *foundp = (int)found;
+ return (0);
+}
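+
+/*
+ * A worked example (illustrative, not from the original source): three
+ * cursors sit on recno 5, two of them already marked deleted with
+ * orders 1 and 2. A CA_DELETE through the third cursor makes the
+ * __ram_ca_getorder pass report 2 as the highest existing order, so
+ * the newly deleted cursor is assigned order 3, preserving the
+ * relative ordering that C_LESSTHAN depends on.
+ */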
+
+/*
+ * __ram_getno --
+ * Check the user's record number, and make sure we've seen it.
+ *
+ * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+ */
+int
+__ram_getno(dbc, key, rep, can_create)
+ DBC *dbc;
+ const DBT *key;
+ db_recno_t *rep;
+ int can_create;
+{
+ DB *dbp;
+ db_recno_t recno;
+
+ dbp = dbc->dbp;
+
+ /* If passed an empty DBT from Java, key->data may be NULL. */
+ if (key->size != sizeof(db_recno_t)) {
+ __db_errx(dbp->env, DB_STR("1001",
+ "illegal record number size"));
+ return (EINVAL);
+ }
+
+ /* Check the user's record number. */
+ if ((recno = *(db_recno_t *)key->data) == 0) {
+ __db_errx(dbp->env, DB_STR("1002",
+ "illegal record number of 0"));
+ return (EINVAL);
+ }
+ if (rep != NULL)
+ *rep = recno;
+
+ /*
+ * Btree can neither create records nor read them in. Recno can
+ * do both, see if we can find the record.
+ */
+ return (dbc->dbtype == DB_RECNO ?
+ __ram_update(dbc, recno, can_create) : 0);
+}
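+
+/*
+ * A minimal caller sketch (illustrative only; it assumes an open
+ * cursor dbc and shows that the key must carry a db_recno_t):
+ *
+ *	db_recno_t found, recno;
+ *	DBT key;
+ *
+ *	recno = 42;
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &recno;
+ *	key.size = sizeof(recno);
+ *	ret = __ram_getno(dbc, &key, &found, 0);
+ *
+ * A key of any other size, or a record number of 0, returns EINVAL.
+ */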
+
+/*
+ * __ram_update --
+ * Ensure the tree has records up to and including the specified one.
+ */
+static int
+__ram_update(dbc, recno, can_create)
+ DBC *dbc;
+ db_recno_t recno;
+ int can_create;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT *rdata;
+ db_recno_t nrecs;
+ int ret;
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+
+ /*
+ * If we can't create records and we've read the entire backing input
+ * file, we're done.
+ */
+ if (!can_create && t->re_eof)
+ return (0);
+
+ /*
+ * If we haven't seen this record yet, try to get it from the original
+ * file.
+ */
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ if (!t->re_eof && recno > nrecs) {
+ if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If we can create records, create empty ones up to the requested
+ * record.
+ */
+ if (!can_create || recno <= nrecs + 1)
+ return (0);
+
+ rdata = &dbc->my_rdata;
+ rdata->flags = 0;
+ rdata->size = 0;
+
+ while (recno > ++nrecs)
+ if ((ret = __ram_add(dbc,
+ &nrecs, rdata, 0, BI_DELETED)) != 0)
+ return (ret);
+ return (0);
+}
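+
+/*
+ * For example (illustrative): with 10 records in the tree, asking for
+ * record 15 with can_create set appends the empty, implicitly created
+ * (BI_DELETED) records 11 through 14; record 15 itself is then added
+ * by the caller.
+ */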
+
+/*
+ * __ram_source --
+ * Load information about the backing file.
+ */
+static int
+__ram_source(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ ENV *env;
+ char *source;
+ int ret;
+
+ env = dbp->env;
+ t = dbp->bt_internal;
+
+ /* Find the real name, and swap out the one we had before. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, t->re_source, NULL, &source)) != 0)
+ return (ret);
+ __os_free(env, t->re_source);
+ t->re_source = source;
+
+ /*
+ * !!!
+ * It's possible that the backing source file is read-only. We don't
+ * much care other than we'll complain if there are any modifications
+ * when it comes time to write the database back to the source.
+ */
+ if ((t->re_fp = fopen(t->re_source, "rb")) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ return (ret);
+ }
+
+ t->re_eof = 0;
+ return (0);
+}
+
+/*
+ * __ram_writeback --
+ * Rewrite the backing file.
+ *
+ * PUBLIC: int __ram_writeback __P((DB *));
+ */
+int
+__ram_writeback(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ DBC *dbc;
+ DBT key, data;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ FILE *fp;
+ db_recno_t keyno;
+ int ret, t_ret;
+ u_int8_t delim, *pad;
+
+ t = dbp->bt_internal;
+ env = dbp->env;
+ fp = NULL;
+ pad = NULL;
+
+ /* If the file wasn't modified, we're done. */
+ if (!t->re_modified)
+ return (0);
+
+ /* If there's no backing source file, we're done. */
+ if (t->re_source == NULL) {
+ t->re_modified = 0;
+ return (0);
+ }
+
+ /*
+ * We step through the records, writing each one out. Use the record
+ * number and the dbp->get() function, instead of a cursor, so we find
+ * and write out "deleted" or non-existent records. The DB handle may
+ * be threaded, so allocate memory as we go.
+ */
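+ /*
+ * For example (illustrative): in a fixed-length recno with record 3
+ * deleted, __db_get returns DB_KEYEMPTY for key 3 and the loop below
+ * writes re_len pad bytes in its place, so the backing file's record
+ * offsets stay intact.
+ */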
+ memset(&key, 0, sizeof(key));
+ key.size = sizeof(db_recno_t);
+ key.data = &keyno;
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_REALLOC);
+
+ /* Allocate a cursor. */
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Read any remaining records into the tree.
+ *
+ * !!!
+ * This is why we can't support transactions when applications specify
+ * backing (re_source) files. At this point we have to read in the
+ * rest of the records from the file so that we can write all of the
+ * records back out again, which could modify a page for which we'd
+ * have to log changes and which we don't have locked. This could be
+ * partially fixed by taking a snapshot of the entire file during the
+ * DB->open, as DB->open is transaction protected. But if a checkpoint
+ * then occurs, the part of the log holding the copy of the file could
+ * be discarded, and that would make it impossible to recover in the
+ * face of disaster. This could all probably be fixed, but it would
+ * require transaction protecting the backing source file.
+ *
+ * XXX
+ * This could be made to work now that we have transactions protecting
+ * file operations. Margo has specifically asked for the privilege of
+ * doing this work.
+ */
+ if ((ret =
+ __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * Close any existing file handle and re-open the file, truncating it.
+ */
+ if (t->re_fp != NULL) {
+ if (fclose(t->re_fp) != 0) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ goto err;
+ }
+ t->re_fp = NULL;
+ }
+ if ((fp = fopen(t->re_source, "wb")) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ goto err;
+ }
+
+ /*
+ * We'll need the delimiter if we're doing variable-length records,
+ * and the pad character if we're doing fixed-length records.
+ */
+ delim = t->re_delim;
+ for (keyno = 1;; ++keyno) {
+ switch (ret = __db_get(dbp, ip, NULL, &key, &data, 0)) {
+ case 0:
+ if (data.size != 0 &&
+ fwrite(data.data, 1, data.size, fp) != data.size)
+ goto write_err;
+ break;
+ case DB_KEYEMPTY:
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ if (pad == NULL) {
+ if ((ret = __os_malloc(
+ env, t->re_len, &pad)) != 0)
+ goto err;
+ memset(pad, t->re_pad, t->re_len);
+ }
+ if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
+ goto write_err;
+ }
+ break;
+ case DB_NOTFOUND:
+ ret = 0;
+ goto done;
+ default:
+ goto err;
+ }
+ if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+ fwrite(&delim, 1, 1, fp) != 1) {
+write_err: ret = __os_get_errno();
+ __db_err(env, ret, DB_STR_A("1003",
+ "%s: write failed to backing file", "%s"),
+ t->re_source);
+ goto err;
+ }
+ }
+
+err:
+done: /* Close the file descriptor. */
+ if (fp != NULL && fclose(fp) != 0) {
+ t_ret = __os_get_errno();
+ __db_err(env, t_ret, "%s", t->re_source);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard the cursor. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard memory allocated to hold the data items. */
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ if (pad != NULL)
+ __os_free(env, pad);
+
+ if (ret == 0)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_sread --
+ * Read records from a source file.
+ */
+static int
+__ram_sread(dbc, top)
+ DBC *dbc;
+ db_recno_t top;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, *rdata;
+ db_recno_t recno;
+ size_t len;
+ int ch, ret, was_modified;
+
+ t = dbc->dbp->bt_internal;
+ dbp = dbc->dbp;
+ was_modified = t->re_modified;
+
+ if ((ret = __bam_nrecs(dbc, &recno)) != 0)
+ return (ret);
+
+ /*
+ * Use the record key return memory; it's only a short-term use.
+ * The record data return memory is used by __bam_iitem, which
+ * we'll indirectly call, so use the key so as not to collide.
+ */
+ len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
+ rdata = &dbc->my_rkey;
+ if (rdata->ulen < len) {
+ if ((ret = __os_realloc(
+ dbp->env, len, &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ }
+ rdata->ulen = (u_int32_t)len;
+ }
+
+ memset(&data, 0, sizeof(data));
+ while (recno < top) {
+ data.data = rdata->data;
+ data.size = 0;
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+ for (len = t->re_len; len > 0; --len) {
+ if ((ch = fgetc(t->re_fp)) == EOF) {
+ if (data.size == 0)
+ goto eof;
+ break;
+ }
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ }
+ else
+ for (;;) {
+ if ((ch = fgetc(t->re_fp)) == EOF) {
+ if (data.size == 0)
+ goto eof;
+ break;
+ }
+ if (ch == t->re_delim)
+ break;
+
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ if (data.size == rdata->ulen) {
+ if ((ret = __os_realloc(dbp->env,
+ rdata->ulen *= 2,
+ &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ } else
+ data.data = rdata->data;
+ }
+ }
+
+ /*
+ * Another process may have read this record from the input
+ * file and stored it into the database already, in which
+ * case we don't need to repeat that operation. We detect
+ * this by checking if the last record we've read is greater
+ * than or equal to the number of records in the database.
+ */
+ if (t->re_last >= recno) {
+ ++recno;
+ if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
+ goto err;
+ }
+ ++t->re_last;
+ }
+
+ if (0) {
+eof: t->re_eof = 1;
+ ret = DB_NOTFOUND;
+ }
+err: if (!was_modified)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_add --
+ * Add records into the tree.
+ */
+static int
+__ram_add(dbc, recnop, data, flags, bi_flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ DBT *data;
+ u_int32_t flags, bi_flags;
+{
+ BTREE_CURSOR *cp;
+ int exact, ret, stack, t_ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: /* Find the slot for insertion. */
+ if ((ret = __bam_rsearch(dbc, recnop,
+ SR_INSERT | (flags == DB_APPEND ? SR_APPEND : 0), 1, &exact)) != 0)
+ return (ret);
+ stack = 1;
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
+ !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
+
+ /*
+ * The application may modify the data based on the selected record
+ * number.
+ */
+ if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
+ (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
+ goto err;
+
+ /*
+ * Select the arguments for __bam_iitem() and do the insert. If the
+ * key is an exact match, or we're replacing the data item with a
+ * new data item, replace the current item. If the key isn't an exact
+ * match, we're inserting a new key/data pair before the search
+ * location.
+ */
+ switch (ret = __bam_iitem(dbc,
+ NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
+ case 0:
+ /*
+ * Don't adjust anything.
+ *
+ * If we inserted a record, no cursors need adjusting because
+ * the only new record it's possible to insert is at the very
+ * end of the tree. The necessary adjustments to the internal
+ * page counts were made by __bam_iitem().
+ *
+ * If we overwrote a record, no cursors need adjusting because
+ * future DBcursor->get calls will simply return the underlying
+ * record (there's no adjustment made for the DB_CURRENT flag
+ * when a cursor get operation immediately follows a cursor
+ * delete operation, and the normal adjustment for the DB_NEXT
+ * flag is still correct).
+ */
+ break;
+ case DB_NEEDSPLIT:
+ /* Discard the stack of pages and split the page. */
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ stack = 0;
+
+ if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
+ goto err;
+
+ goto retry;
+ /* NOTREACHED */
+ default:
+ goto err;
+ }
+
+err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/btree/bt_rsearch.c b/src/btree/bt_rsearch.c
new file mode 100644
index 00000000..36d1c667
--- /dev/null
+++ b/src/btree/bt_rsearch.c
@@ -0,0 +1,513 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_rsearch --
+ * Search a btree for a record number.
+ *
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+ */
+int
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+ int stop, *exactp;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t adjust, deloffset, indx, top;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg;
+ db_recno_t recno, t_recno, total;
+ u_int32_t get_mode;
+ int ret, stack, t_ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ h = NULL;
+ ret = 0;
+
+ BT_STK_CLR(cp);
+
+ /*
+ * There are several ways we search a btree tree. The flags argument
+ * specifies if we're acquiring read or write locks and if we are
+ * locking pairs of pages. In addition, if we're adding or deleting
+ * an item, we have to lock the entire tree, regardless. See btree.h
+ * for more details.
+ *
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in the tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
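+ /*
+ * For example (from callers elsewhere in this change): __ramc_del
+ * searches with SR_DELETE, __ramc_put with SR_INSERT, __ram_add with
+ * SR_INSERT (plus SR_APPEND for DB_APPEND), and __ramc_get with
+ * SR_FIND, or SR_FIND_WR when the cursor is marked DBC_RMW.
+ */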
+
+ if ((ret = __bam_get_root(dbc, PGNO_INVALID, stop, flags, &stack)) != 0)
+ goto done;
+ lock_mode = cp->csp->lock_mode;
+ get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+ lock = cp->csp->lock;
+ h = cp->csp->page;
+
+ BT_STK_CLR(cp);
+ /*
+ * If appending to the tree, set the record number now -- we have the
+ * root page locked.
+ *
+ * Delete only deletes exact matches; read only returns exact matches.
+ * Note, this is different from __bam_search(), which returns non-exact
+ * matches for read.
+ *
+ * The record may not exist. We can only return the correct location
+ * for the record immediately after the last record in the tree, so do
+ * a fast check now.
+ */
+ total = RE_NREC(h);
+ if (LF_ISSET(SR_APPEND)) {
+ *exactp = 0;
+ *recnop = recno = total + 1;
+ } else {
+ recno = *recnop;
+ if (recno <= total)
+ *exactp = 1;
+ else {
+ *exactp = 0;
+ if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
+ /*
+ * Keep the page locked for serializability.
+ *
+ * XXX
+ * This leaves the root page locked, which will
+ * eliminate any concurrency. A possible fix
+ * would be to lock the last leaf page instead.
+ */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ if ((t_ret =
+ __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_NOTFOUND;
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * !!!
+ * Record numbers in the tree are 0-based, but the recno is
+ * 1-based. All of the calculations below have to take this
+ * into account.
+ */
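+ /*
+ * For example (illustrative): a search for recno 3 on a P_LRECNO
+ * leaf ends at page index 2.
+ */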
+ for (total = 0;;) {
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h) - 2;
+ goto enter;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h) - 1;
+ goto enter;
+ }
+ recno -= total;
+ /*
+ * There may be logically deleted records on the page.
+ * If there are enough, the record may not exist.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ adjust = P_INDX;
+ deloffset = O_INDX;
+ } else {
+ adjust = O_INDX;
+ deloffset = 0;
+ }
+ for (t_recno = 0, indx = 0;; indx += adjust) {
+ if (indx >= NUM_ENT(h)) {
+ *exactp = 0;
+ if (!LF_ISSET(SR_PAST_EOF) ||
+ recno > t_recno + 1) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info,
+ h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __TLPUT(dbc,
+ lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+ if (!B_DISSET(GET_BKEYDATA(dbp, h,
+ indx + deloffset)->type) &&
+ ++t_recno == recno)
+ break;
+ }
+
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(SR_BOTH))
+ goto get_prev;
+ goto done;
+ case P_IBTREE:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h);
+ bi = GET_BINTERNAL(dbp, h, indx - 1);
+ } else for (indx = 0, top = NUM_ENT(h);;) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (++indx == top || total + bi->nrecs >= recno)
+ break;
+ total += bi->nrecs;
+ }
+ pg = bi->pgno;
+ break;
+ case P_LRECNO:
+ if (LF_ISSET(SR_MAX))
+ recno = NUM_ENT(h);
+ else
+ recno -= total;
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ --recno;
+enter: BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(SR_BOTH)) {
+get_prev: DB_ASSERT(env, LF_ISSET(SR_NEXT));
+ /*
+ * We have a NEXT tree; now add the subtree
+ * that points to the previous page.
+ */
+ cp->csp++;
+ indx = cp->sp->indx - 1;
+ h = cp->sp->page;
+ if (TYPE(h) == P_IRECNO) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ pg = ri->pgno;
+ } else {
+ DB_ASSERT(env, TYPE(h) == P_IBTREE);
+ bi = GET_BINTERNAL(dbp, h, indx);
+ pg = bi->pgno;
+ }
+ LF_CLR(SR_NEXT | SR_BOTH);
+ LF_SET(SR_MAX);
+ stack = 1;
+ h = NULL;
+ goto lock_next;
+ }
+ goto done;
+ case P_IRECNO:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h);
+ ri = GET_RINTERNAL(dbp, h, indx - 1);
+ } else for (indx = 0, top = NUM_ENT(h);;) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ if (++indx == top || total + ri->nrecs >= recno)
+ break;
+ total += ri->nrecs;
+ }
+ pg = ri->pgno;
+ break;
+ default:
+ ret = __db_pgfmt(env, h->pgno);
+ goto done;
+ }
+ --indx;
+
+ /* Return if this is the lowest page wanted. */
+ if (stop == LEVEL(h)) {
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (stack) {
+ BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ h = NULL;
+
+ lock_mode = DB_LOCK_WRITE;
+ get_mode = DB_MPOOL_DIRTY;
+ if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else if (LF_ISSET(SR_NEXT)) {
+ /*
+ * For RECNO, if we are doing a NEXT search, the
+ * search recno is the one we are looking for,
+ * but we want to keep the stack from the spanning
+ * node on down. We only know we have the spanning
+ * node when its child's index is 0, so save
+ * each node and discard the tree when we find out
+ * it's not needed.
+ */
+ if (indx != 0 && cp->sp->page != NULL) {
+ BT_STK_POP(cp);
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ }
+
+ BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ h = NULL;
+ if (ret != 0)
+ goto err;
+lock_next: if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else {
+ /*
+ * Decide if we want to return a pointer to the next
+ * page in the stack. If we do, write lock it and
+ * never unlock it.
+ */
+ if ((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
+ (LEVEL(h) - 1) == LEAFLEVEL)
+ stack = 1;
+
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ lock_mode = stack &&
+ LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * If we fail, discard the lock we held. This
+ * is OK because this only happens when we are
+ * descending the tree holding read-locks.
+ */
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+ }
+
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+ goto err;
+ }
+ /* NOTREACHED */
+
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ BT_STK_POP(cp);
+ (void)__bam_stkrel(dbc, 0);
+
+done:
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ return (ret);
+}
+
+/*
+ * __bam_adjust --
+ * Adjust the tree after adding or deleting a record.
+ *
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
+ */
+int
+__bam_adjust(dbc, adjust)
+ DBC *dbc;
+ int32_t adjust;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ EPG *epg;
+ PAGE *h;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = BAM_ROOT_PGNO(dbc);
+
+ /* Update the record counts for the tree. */
+ for (epg = cp->sp; epg <= cp->csp; ++epg) {
+ h = epg->page;
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
+ ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0);
+ epg->page = h;
+ if (ret != 0)
+ return (ret);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(h), 0, PGNO(h), &LSN(h),
+ (u_int32_t)epg->indx, adjust,
+ PGNO(h) == root_pgno ?
+ CAD_UPDATEROOT : 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ if (TYPE(h) == P_IBTREE)
+ GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
+ adjust;
+ else
+ GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
+ adjust;
+
+ if (PGNO(h) == root_pgno)
+ RE_NREC_ADJ(h, adjust);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __bam_nrecs --
+ * Return the number of records in the tree.
+ *
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
+ */
+int
+__bam_nrecs(dbc, rep)
+ DBC *dbc;
+ db_recno_t *rep;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ COMPQUIET(h, NULL);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ LOCK_INIT(lock);
+
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(dbc, pgno, h, 0, DB_LOCK_READ, lock, ret);
+ if (ret != 0)
+ goto err;
+ DB_ASSERT(dbp->env, h != NULL);
+
+ *rep = RE_NREC(h);
+
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+err: if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_total --
+ * Return the number of records below a page.
+ *
+ * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
+ */
+db_recno_t
+__bam_total(dbp, h)
+ DB *dbp;
+ PAGE *h;
+{
+ db_recno_t nrecs;
+ db_indx_t indx, top;
+
+ nrecs = 0;
+ top = NUM_ENT(h);
+
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += P_INDX)
+ if (!B_DISSET(
+ GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
+ ++nrecs;
+ break;
+ case P_LDUP:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
+ ++nrecs;
+ break;
+ case P_IBTREE:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
+ break;
+ case P_LRECNO:
+ nrecs = NUM_ENT(h);
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;
+ break;
+ }
+
+ return (nrecs);
+}
diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c
new file mode 100644
index 00000000..e809a852
--- /dev/null
+++ b/src/btree/bt_search.c
@@ -0,0 +1,1028 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_get_root --
+ * Fetch the root of a tree and see if we want to keep
+ * it in the stack.
+ *
+ * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
+ */
+int
+__bam_get_root(dbc, root_pgno, slevel, flags, stack)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int slevel;
+ u_int32_t flags;
+ int *stack;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_lockmode_t lock_mode;
+ u_int32_t get_mode;
+ int ret, t_ret;
+
+ COMPQUIET(h, NULL);
+ LOCK_INIT(lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ /*
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in the tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+try_again:
+ *stack = LF_ISSET(SR_STACK) &&
+ (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM));
+ lock_mode = DB_LOCK_READ;
+ if (*stack ||
+ LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE)))
+ lock_mode = DB_LOCK_WRITE;
+
+ /*
+ * Get the root. If the root happens to be a leaf page then
+ * we are supposed to get a read lock on it before latching
+ * it. So if we have not locked it, do a try-get first.
+ * If we can't get the root shared, then get a lock on it and
+ * then wait for the latch.
+ */
+retry: if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+ else if (LOCK_ISSET(lock) || !STD_LOCKING(dbc) ||
+ F_ISSET(dbc, DBC_DOWNREV) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM))
+ get_mode = 0;
+ else
+ get_mode = DB_MPOOL_TRY;
+
+ BAM_GET_ROOT(dbc, root_pgno, h, get_mode, lock_mode, lock, ret);
+ if (ret == DB_LOCK_NOTGRANTED && get_mode == DB_MPOOL_TRY) {
+ DB_ASSERT(dbp->env, !LOCK_ISSET(lock));
+ if ((ret = __db_lget(dbc, 0,
+ root_pgno == PGNO_INVALID ? BAM_ROOT_PGNO(dbc) : root_pgno,
+ lock_mode, 0, &lock)) != 0)
+ return (ret);
+ goto retry;
+ }
+ if (ret != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ DB_ASSERT(dbp->env, TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO ||
+ TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO || TYPE(h) == P_LDUP);
+
+ /*
+ * Decide if we need to dirty and/or lock this page.
+ * We must not hold the latch while we get the lock.
+ */
+ if (!*stack &&
+ ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
+ LEVEL(h) == LEAFLEVEL ||
+ (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
+ *stack = 1;
+ /* If we already have the write lock, we are done. */
+ if (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)) {
+ if (lock_mode == DB_LOCK_WRITE)
+ goto done;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Now that we know what level the root is at, do we need a
+ * write lock? If not, or if we got the lock before latching,
+ * we are done.
+ */
+ if (LEVEL(h) != LEAFLEVEL || LF_ISSET(SR_WRITE)) {
+ lock_mode = DB_LOCK_WRITE;
+ /* Drop the read lock if we got it above. */
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ } else if (LOCK_ISSET(lock))
+ goto done;
+ if (!STD_LOCKING(dbc)) {
+ if (lock_mode != DB_LOCK_WRITE)
+ goto done;
+ if ((ret = __memp_dirty(mpf, &h, dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0) {
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ return (ret);
+ }
+ } else {
+ /* Try to lock the page without waiting first. */
+ if ((ret = __db_lget(dbc, 0, root_pgno,
+ lock_mode, DB_LOCK_NOWAIT, &lock)) == 0) {
+ if (lock_mode == DB_LOCK_WRITE && (ret =
+ __memp_dirty(mpf, &h, dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0) {
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h,
+ dbc->priority);
+ return (ret);
+ }
+ goto done;
+ }
+
+ t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED)
+ ret = 0;
+ if (ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ return (ret);
+ get_mode = 0;
+ if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+
+ if ((ret = __db_lget(dbc,
+ 0, root_pgno, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf,
+ &root_pgno, dbc->thread_info, dbc->txn,
+ (atomic_read(&mpf->mfp->multiversion) == 0 &&
+ lock_mode == DB_LOCK_WRITE) ? DB_MPOOL_DIRTY : 0,
+ &h)) != 0) {
+ /* Did not read it, release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ }
+ /*
+ * While dirtying or locking the page we had to drop the latch,
+ * so someone else may have gotten in and split the root.
+ */
+ if (!((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
+ LEVEL(h) == LEAFLEVEL ||
+ (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
+ /* Someone else split the root, start over. */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ goto try_again;
+ } else if (atomic_read(&mpf->mfp->multiversion) != 0 &&
+ lock_mode == DB_LOCK_WRITE && (ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ (void)__LPUT(dbc, lock);
+ }
+ }
+
+done: BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret);
+
+ return (ret);
+}
+
+/*
+ * __bam_search --
+ * Search a btree for a key.
+ *
+ * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
+ * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
+ */
+int
+__bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ const DBT *key;
+ u_int32_t flags;
+ int slevel, *exactp;
+ db_recno_t *recnop;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock, saved_lock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h, *parent_h;
+ db_indx_t base, i, indx, *inp, lim;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg, saved_pg, start_pgno;
+ db_recno_t recno;
+ int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
+ int getlock, was_next;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t get_mode, wait;
+ u_int8_t level, saved_level;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ h = NULL;
+ parent_h = NULL;
+ t = dbp->bt_internal;
+ recno = 0;
+ t_ret = 0;
+
+ BT_STK_CLR(cp);
+ LOCK_INIT(saved_lock);
+ LOCK_INIT(lock);
+ was_next = LF_ISSET(SR_NEXT);
+ wait = DB_LOCK_NOWAIT;
+
+ /*
+ * There are several ways we search a btree. The flags argument
+ * specifies if we're acquiring read or write latches, if we position
+ * to the first or last item in a set of duplicates, if we return
+ * deleted items, and if we are latching pairs of pages. In addition,
+ * if we're modifying record numbers, we have to latch the entire tree
+ * regardless. See btree.h for more details.
+ */
+
+ start_pgno = saved_pg = root_pgno;
+ saved_level = MAXBTREELEVEL;
+retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
+ goto err;
+ lock_mode = cp->csp->lock_mode;
+ get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+ h = cp->csp->page;
+ root_pgno = pg = PGNO(h);
+ lock = cp->csp->lock;
+ set_stack = stack;
+ /*
+ * Determine if we need to lock interior nodes.
+ * If we have record numbers we always lock. Otherwise we only
+ * need to do this if we are write locking and we are returning
+ * a stack of nodes. SR_NEXT will eventually get a stack and
+ * release the locks above that level.
+ */
+ if (F_ISSET(dbc, DBC_DOWNREV)) {
+ getlock = 1;
+ wait = 0;
+ } else
+ getlock = F_ISSET(cp, C_RECNUM) ||
+ (lock_mode == DB_LOCK_WRITE &&
+ (stack || LF_ISSET(SR_NEXT | SR_DEL)));
+
+ /*
+ * If we are asked for a level that is above the root,
+ * just return the root. This can happen if the tree
+ * collapses while we are trying to lock the root.
+ */
+ if (!LF_ISSET(SR_START) && LEVEL(h) < slevel)
+ goto done;
+
+ BT_STK_CLR(cp);
+
+ /* Choose a comparison function. */
+ func = F_ISSET(dbc, DBC_OPD) ?
+ (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
+ t->bt_compare;
+
+ for (;;) {
+ if (TYPE(h) == P_LBTREE)
+ adjust = P_INDX;
+ else {
+ /*
+ * It is possible to catch an internal page as a change
+ * is being backed out. Its leaf pages will be locked
+ * but we must be sure we get to one. If the page
+ * is not populated enough lock it.
+ */
+ if (TYPE(h) != P_LDUP && NUM_ENT(h) == 0) {
+ getlock = 1;
+ level = LEVEL(h) + 1;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0)
+ goto err;
+ goto lock_next;
+ }
+ adjust = O_INDX;
+ }
+ inp = P_INP(dbp, h);
+ if (LF_ISSET(SR_MIN | SR_MAX)) {
+ if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0)
+ indx = 0;
+ else if (TYPE(h) == P_LBTREE)
+ indx = NUM_ENT(h) - 2;
+ else
+ indx = NUM_ENT(h) - 1;
+
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
+ if (LF_ISSET(SR_NEXT))
+ goto get_next;
+ goto found;
+ }
+ goto next;
+ }
+ /*
+ * Do a binary search on the current page. If we're searching
+ * a Btree leaf page, we have to walk the indices in groups of
+ * two. If we're searching an internal page or a off-page dup
+ * page, they're an index per page item. If we find an exact
+ * match on a leaf page, we're done.
+ */
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ func, &cmp)) != 0)
+ goto err;
+ if (cmp == 0) {
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) &&
+ LEVEL(h) == slevel)) {
+ if (LF_ISSET(SR_NEXT))
+ goto get_next;
+ goto found;
+ }
+ goto next;
+ }
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
+ lim, adjust);
+ }
+
+ /*
+ * No match found. Base is the smallest index greater than
+ * key and may be zero or a last + O_INDX index.
+ *
+ * If it's a leaf page or the stopping point,
+ * return base as the "found" value.
+ * Delete only deletes exact matches.
+ */
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
+ *exactp = 0;
+
+ if (LF_ISSET(SR_EXACT)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ BT_STK_NUM(env, cp, h, base, ret);
+ if ((t_ret =
+ __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (LF_ISSET(SR_NEXT)) {
+get_next: /*
+ * The caller could have asked for a NEXT
+ * at the root if the tree recently collapsed.
+ */
+ if (PGNO(h) == root_pgno) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ indx = cp->sp->indx + 1;
+ if (indx == NUM_ENT(cp->sp->page)) {
+ ret = DB_NOTFOUND;
+ cp->csp++;
+ goto err;
+ }
+ /*
+ * If we want both the key page and the next
+ * page, push the key page on the stack;
+ * otherwise, save the root of the subtree
+ * and drop the rest of the subtree.
+ * Search down again starting at the
+ * next child of the root of this subtree.
+ */
+ LF_SET(SR_MIN);
+ LF_CLR(SR_NEXT);
+ set_stack = stack = 1;
+ if (LF_ISSET(SR_BOTH)) {
+ cp->csp++;
+ BT_STK_PUSH(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ h = cp->sp->page;
+ pg = GET_BINTERNAL(dbp, h, indx)->pgno;
+ level = LEVEL(h);
+ h = NULL;
+ goto lock_next;
+ } else {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info,
+ h, dbc->priority)) != 0)
+ goto err;
+ h = cp->sp->page;
+ cp->sp->page = NULL;
+ lock = cp->sp->lock;
+ LOCK_INIT(cp->sp->lock);
+ if ((ret = __bam_stkrel(dbc,
+ STK_NOLOCK)) != 0)
+ goto err;
+ goto next;
+ }
+ }
+
+ /*
+ * !!!
+ * Possibly returning a deleted record -- DB_SET_RANGE,
+ * DB_KEYFIRST and DB_KEYLAST don't require an exact
+ * match, and we don't want to walk multiple pages here
+ * to find an undeleted record. This is handled by the
+ * calling routine.
+ */
+ if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
+ cp->csp++;
+ BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * If it's not a leaf page, record the internal page (which is
+ * a parent page for the key). Decrement the base by 1 if it's
+ * non-zero so that if a split later occurs, the inserted page
+ * will be to the right of the saved page.
+ */
+ indx = base > 0 ? base - O_INDX : base;
+
+ /*
+ * If we're trying to calculate the record number, sum up
+ * all the record numbers on this page up to the indx point.
+ */
+next: if (recnop != NULL)
+ for (i = 0; i < indx; ++i)
+ recno += GET_BINTERNAL(dbp, h, i)->nrecs;
+
+ pg = GET_BINTERNAL(dbp, h, indx)->pgno;
+ level = LEVEL(h);
+
+ /* See if we are at the level to start stacking. */
+ if (LF_ISSET(SR_START) && slevel == level)
+ set_stack = stack = 1;
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ if (slevel == LEVEL(h)) {
+ BT_STK_NUM(env, cp, h, indx, ret);
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ BT_STK_NUMPUSH(env, cp, h, indx, ret);
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ } else if (stack) {
+ /* Return if this is the lowest page wanted. */
+ if (LF_ISSET(SR_PARENT) && slevel == level) {
+ BT_STK_ENTER(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) {
+ /*
+ * There was a page with a singleton pointer
+ * to a non-empty subtree.
+ */
+ cp->csp--;
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ set_stack = stack = 0;
+ goto do_del;
+ }
+ BT_STK_PUSH(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+
+ LOCK_INIT(lock);
+ get_mode = DB_MPOOL_DIRTY;
+ lock_mode = DB_LOCK_WRITE;
+ getlock = 1;
+ goto lock_next;
+ } else {
+ /*
+ * Decide if we want to return a reference to the next
+ * page in the return stack. If so, latch it and don't
+ * unlatch it. We will want to stack things on the
+ * next iteration. The stack variable cannot be
+ * set until we leave this clause. If we are locking
+ * then we must lock this level before getting the page.
+ */
+ if ((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(slevel + 1) >= (level - 1)) ||
+ (level - 1) == LEAFLEVEL)
+ set_stack = 1;
+
+ /*
+ * Check for a normal search. If so, we need to
+ * latch-couple the parent/child buffers.
+ */
+ if (!LF_ISSET(SR_DEL | SR_NEXT)) {
+ parent_h = h;
+ goto lock_next;
+ }
+
+ /*
+ * Returning a subtree. See if we have hit the start
+ * point; if so, save the parent and set stack.
+ * Otherwise, free the parent and temporarily
+ * save this one.
+ * For SR_DEL we need to find a page with one entry.
+ * For SR_NEXT we want to find the minimal subtree
+ * that contains the key and the next page.
+ * We save pages as long as we are at the right
+ * edge of the subtree. When we leave the right
+ * edge, then drop the subtree.
+ */
+
+ if (LF_ISSET(SR_DEL) && NUM_ENT(h) == 1) {
+ /*
+ * We are pushing things onto the stack;
+ * set the stack variable now to indicate this
+ * has happened.
+ */
+ stack = set_stack = 1;
+ LF_SET(SR_WRITE);
+ /* Push the parent. */
+ cp->csp++;
+ /* Push this node. */
+ BT_STK_PUSH(env, cp, h,
+ indx, lock, DB_LOCK_NG, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ } else {
+ /*
+ * See if we want to save the tree so far.
+ * If we are looking for the next key,
+ * then we must save this node if we are
+ * at the end of the page. If not, then
+ * discard anything we have saved so far.
+ * For delete, only keep one node until
+ * we find a singleton.
+ */
+do_del: if (cp->csp->page != NULL) {
+ if (LF_ISSET(SR_NEXT) &&
+ indx == NUM_ENT(h) - 1)
+ cp->csp++;
+ else if ((ret =
+ __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ }
+ /* Save this node. */
+ BT_STK_ENTER(env, cp,
+ h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ }
+
+lock_next: h = NULL;
+
+ if (set_stack && LF_ISSET(SR_WRITE)) {
+ lock_mode = DB_LOCK_WRITE;
+ get_mode = DB_MPOOL_DIRTY;
+ getlock = 1;
+ }
+ /*
+ * If we are retrying and we are back at the same
+ * page then we already have it locked. If we are
+ * at a different page we want to lock couple and
+ * release that lock.
+ */
+ if (level - 1 == saved_level) {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ lock = saved_lock;
+ LOCK_INIT(saved_lock);
+ saved_level = MAXBTREELEVEL;
+ if (pg == saved_pg)
+ goto skip_lock;
+ }
+ if ((getlock || level - 1 == LEAFLEVEL) &&
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS,
+ pg, lock_mode, wait, &lock)) != 0) {
+ /*
+ * If we are doing DEL or NEXT then we
+ * have an extra level saved in the stack,
+ * push it so it will get freed.
+ */
+ if (LF_ISSET(SR_DEL | SR_NEXT) && !stack)
+ cp->csp++;
+ PERFMON6(env, race, bam_search, dbp->fname,
+ dbp->dname, ret, h, parent_h, flags);
+ /*
+ * If we fail, discard the lock we held.
+ * This is ok because we will either search
+ * again or exit without actually looking
+ * at the data.
+ */
+ if ((t_ret = __LPUT(dbc, lock)) != 0)
+ ret = t_ret;
+ /*
+ * If we blocked at a different level release
+ * the previous saved lock.
+ */
+ if ((t_ret = __LPUT(dbc, saved_lock)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (wait == 0 || (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK))
+ goto err;
+
+ /* Release the parent if we are holding it. */
+ if (parent_h != NULL &&
+ (ret = __memp_fput(mpf, dbc->thread_info,
+ parent_h, dbc->priority)) != 0)
+ goto err;
+ parent_h = NULL;
+
+ BT_STK_POP(cp);
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, pg, lock_mode, 0, &saved_lock)) != 0)
+ goto err;
+ /*
+ * A very strange case: if this page was
+ * freed while we waited, then we cannot hold
+ * the lock on it while we re-get the root
+ * latch, because allocation is one place
+ * where we lock while holding a latch.
+ * We want to hold the lock but must ensure
+ * that the page is not free or cannot become
+ * free. If we are at the LEAF level we can
+ * hold on to the lock if the page is still
+ * of the right type. Otherwise we need to
+ * be sure this page cannot move to an off-page
+ * duplicate tree (which is not locked) and
+ * masquerade as the page we want.
+ */
+
+ /*
+ * If the page is not at leaf level
+ * then see if OPD trees are around.
+ * If the page could appear as an
+ * interior offpage duplicate node
+ * at the right level, then it will
+ * not be locked and could subsequently be
+ * freed. If there are multiple
+ * databases in the file then they
+ * could have OPDs.
+ */
+ if (level - 1 > LEAFLEVEL &&
+ (F_ISSET(dbp, DB_AM_SUBDB) ||
+ (dbp->type == DB_BTREE &&
+ F_ISSET(dbp, DB_AM_DUPSORT))))
+ goto drop_lock;
+
+ /*
+ * Take a look at the page. If it got
+ * freed it may be entirely gone.
+ */
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0 &&
+ ret != DB_PAGE_NOTFOUND)
+ goto err;
+
+ /*
+ * Check for right level and page type.
+ */
+ if (ret != 0 || LEVEL(h) != level - 1 ||
+ (LEVEL(h) == LEAFLEVEL ?
+ TYPE(h) != (dbc->dbtype == DB_BTREE ?
+ P_LBTREE : P_LRECNO) :
+ TYPE(h) != (dbc->dbtype == DB_BTREE ?
+ P_IBTREE : P_IRECNO))) {
+drop_lock: ret = __LPUT(dbc, saved_lock);
+ if (ret != 0)
+ goto err;
+ pg = root_pgno;
+ saved_level = MAXBTREELEVEL;
+ }
+ if (h != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ if (was_next) {
+ LF_CLR(SR_MIN);
+ LF_SET(SR_NEXT);
+ }
+ /*
+ * We have the lock but we dropped the
+ * latch so we need to search again. If
+ * we get back to the same page then all
+ * is good, otherwise we need to try to
+ * lock the new page.
+ */
+ saved_pg = pg;
+ saved_level = level - 1;
+ goto retry;
+ }
+skip_lock: stack = set_stack;
+ }
+ /* Get the child page. */
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+ goto err;
+ /* Release the parent. */
+ if (parent_h != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, parent_h, dbc->priority)) != 0)
+ goto err;
+ parent_h = NULL;
+ }
+ /* NOTREACHED */
+
+found: *exactp = 1;
+
+ /*
+ * If we got here, we know that we have a Btree leaf or off-page
+ * duplicates page. If it's a Btree leaf page, we have to handle
+ * on-page duplicates.
+ *
+ * If there are duplicates, go to the first/last one. This is
+ * safe because we know that we're not going to leave the page,
+ * all duplicate sets that are not on overflow pages exist on a
+ * single leaf page.
+ */
+ if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) {
+ if (LF_ISSET(SR_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ inp[indx] == inp[indx + P_INDX])
+ indx += P_INDX;
+ else if (LF_ISSET(SR_DUPFIRST))
+ while (indx > 0 &&
+ inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+ }
+
+ /*
+ * Now check if we are allowed to return deleted items; if not, then
+ * find the next (or previous) non-deleted duplicate entry. (We do
+ * not move from the original found key on the basis of the SR_DELNO
+ * flag.)
+ */
+ DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO));
+ if (LF_ISSET(SR_DELNO)) {
+ deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
+ if (LF_ISSET(SR_DUPLAST))
+ while (B_DISSET(GET_BKEYDATA(dbp,
+ h, indx + deloffset)->type) && indx > 0 &&
+ inp[indx] == inp[indx - adjust])
+ indx -= adjust;
+ else
+ while (B_DISSET(GET_BKEYDATA(dbp,
+ h, indx + deloffset)->type) &&
+ indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
+ inp[indx] == inp[indx + adjust])
+ indx += adjust;
+
+ /*
+ * If we weren't able to find a non-deleted duplicate, return
+ * DB_NOTFOUND.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /*
+ * Increment the record counter to point to the found element.
+ * Ignore any deleted key/data pairs. There doesn't need to
+ * be any correction for duplicates, as Btree doesn't support
+ * duplicates and record numbers in the same tree.
+ */
+ if (recnop != NULL) {
+ DB_ASSERT(env, TYPE(h) == P_LBTREE);
+
+ for (i = 0; i < indx; i += P_INDX)
+ if (!B_DISSET(
+ GET_BKEYDATA(dbp, h, i + O_INDX)->type))
+ ++recno;
+
+ /* Correct the number for a 0-base. */
+ *recnop = recno + 1;
+ }
+ }
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ BT_STK_NUM(env, cp, h, indx, ret);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ } else {
+ if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
+ cp->csp++;
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ }
+ if (ret != 0)
+ goto err;
+
+ cp->csp->lock = lock;
+ DB_ASSERT(env, parent_h == NULL);
+
+done:
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ if ((ret = __LPUT(dbc, saved_lock)) != 0)
+ return (ret);
+
+ return (0);
+
+err: if (ret == 0)
+ ret = t_ret;
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (parent_h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, parent_h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Keep any not-found page locked for serializability. */
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ (void)__LPUT(dbc, saved_lock);
+
+ BT_STK_POP(cp);
+ (void)__bam_stkrel(dbc, 0);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
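+
+#ifdef BAM_SEARCH_EXAMPLE
+/*
+ * __bam_search_example --
+ * A minimal usage sketch, not part of the original source (hence the
+ * guard): latch a leaf page and its parent as a write-locked pair, the
+ * way __bam_split re-acquires pages. The function name, guard macro and
+ * error handling are illustrative assumptions.
+ */
+static int
+__bam_search_example(dbc, key)
+ DBC *dbc;
+ const DBT *key;
+{
+ int exact, ret;
+
+ /* Descend to the leaf, returning leaf and parent write locked. */
+ if ((ret = __bam_search(dbc, PGNO_INVALID,
+ key, SR_WRPAIR, LEAFLEVEL, NULL, &exact)) != 0)
+ return (ret);
+
+ /* The cursor's csp[0] is now the leaf and csp[-1] its parent. */
+ return (__bam_stkrel(dbc, 0));
+}
+#endif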
+
+/*
+ * __bam_stkrel --
+ * Release all pages currently held in the stack.
+ *
+ * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
+ */
+int
+__bam_stkrel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ EPG *epg;
+ int ret, t_ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Release inner pages first.
+ *
+ * The caller must be sure that setting STK_NOLOCK will not affect
+ * either serializability or recoverability.
+ */
+ for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
+ cp->page = NULL;
+ LOCK_INIT(cp->lock);
+ }
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ epg->page = NULL;
+ }
+ /*
+ * We set this if we need to release our pins,
+ * but are not logically ready to have the pages
+ * visible.
+ */
+ if (LF_ISSET(STK_PGONLY))
+ continue;
+ if (LF_ISSET(STK_NOLOCK) &&
+ (epg->lock.mode == DB_LOCK_READ ||
+ atomic_read(&mpf->mfp->multiversion) == 0)) {
+ if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ } else
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* Clear the stack, all pages have been released. */
+ if (!LF_ISSET(STK_PGONLY))
+ BT_STK_CLR(cp);
+
+ return (ret);
+}
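+
+/*
+ * The flags above select how much state is released: STK_PGONLY drops
+ * only the buffer pins, STK_CLRDBC additionally clears the cursor's own
+ * reference to a page being released, and STK_NOLOCK discards read locks
+ * (and, on non-multiversion files, write locks) outright instead of
+ * handing them to __TLPUT.
+ */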
+
+/*
+ * __bam_stkgrow --
+ * Grow the stack.
+ *
+ * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
+ */
+int
+__bam_stkgrow(env, cp)
+ ENV *env;
+ BTREE_CURSOR *cp;
+{
+ EPG *p;
+ size_t entries;
+ int ret;
+
+ entries = cp->esp - cp->sp;
+
+ if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0)
+ return (ret);
+ memcpy(p, cp->sp, entries * sizeof(EPG));
+ if (cp->sp != cp->stack)
+ __os_free(env, cp->sp);
+ cp->sp = p;
+ cp->csp = p + entries;
+ cp->esp = p + entries * 2;
+ return (0);
+}
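+
+/*
+ * __bam_stkgrow assumes cp->csp has reached cp->esp: it doubles the
+ * array, copies the old entries, and leaves cp->csp pointing at the
+ * first free slot of the new, twice-as-large stack.
+ */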
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
new file mode 100644
index 00000000..8299c69a
--- /dev/null
+++ b/src/btree/bt_split.c
@@ -0,0 +1,1332 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/btree.h"
+
+static int __bam_page __P((DBC *, EPG *, EPG *));
+static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
+static int __bam_root __P((DBC *, EPG *));
+
+/*
+ * __bam_split --
+ * Split a page.
+ *
+ * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *));
+ */
+int
+__bam_split(dbc, arg, root_pgnop)
+ DBC *dbc;
+ void *arg;
+ db_pgno_t *root_pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB_LOCK metalock, next_lock;
+ enum { UP, DOWN } dir;
+ db_pgno_t pgno, next_pgno, root_pgno;
+ int exact, level, ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ LOCK_INIT(next_lock);
+ next_pgno = PGNO_INVALID;
+
+ /*
+ * First get a lock on the metadata page, we will have to allocate
+ * pages and cannot get a lock while we have the search tree pinned.
+ */
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ root_pgno = BAM_ROOT_PGNO(dbc);
+
+ /*
+ * The locking protocol we use to avoid deadlock is to acquire locks by
+ * walking down the tree, but we do it as lazily as possible, locking
+ * the root only as a last resort. We expect all stack pages to have
+ * been discarded before we're called; we discard all short-term locks.
+ *
+ * When __bam_split is first called, we know that a leaf page was too
+ * full for an insert. We don't know what leaf page it was, but we
+ * have the key/recno that caused the problem. We call XX_search to
+ * reacquire the leaf page, but this time get both the leaf page and
+ * its parent, locked. We then split the leaf page and see if the new
+ * internal key will fit into the parent page. If it will, we're done.
+ *
+ * If it won't, we discard our current locks and repeat the process,
+ * only this time acquiring the parent page and its parent, locked.
+ * This process repeats until we succeed in the split, splitting the
+ * root page as the final resort. The entire process then repeats,
+ * as necessary, until we split a leaf page.
+ *
+ * XXX
+ * A traditional method of speeding this up is to maintain a stack of
+ * the pages traversed in the original search. You can detect if the
+ * stack is correct by storing the page's LSN when it was searched and
+ * comparing that LSN with the current one when it's locked during the
+ * split. This would be an easy change for this code, but I have no
+ * numbers that indicate it's worthwhile.
+ */
+ for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
+ /*
+ * Acquire a page and its parent, locked.
+ */
+retry: if ((ret = (dbc->dbtype == DB_BTREE ?
+ __bam_search(dbc, PGNO_INVALID,
+ arg, SR_WRPAIR, level, NULL, &exact) :
+ __bam_rsearch(dbc,
+ (db_recno_t *)arg, SR_WRPAIR, level, &exact))) != 0)
+ break;
+
+ if (cp->csp[0].page->pgno == root_pgno) {
+ /* We can overshoot the top of the tree. */
+ level = cp->csp[0].page->level;
+ if (root_pgnop != NULL)
+ *root_pgnop = root_pgno;
+ } else if (root_pgnop != NULL)
+ *root_pgnop = cp->csp[-1].page->pgno;
+
+ /*
+ * Split the page if it still needs it (it's possible another
+ * thread of control has already split the page). If we are
+ * guaranteed that two items will fit on the page, the split
+ * is no longer necessary.
+ */
+ if (2 * B_MAXSIZEONPAGE(cp->ovflsize)
+ <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) {
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ goto no_split;
+ }
+
+ /*
+ * We need to try to lock the next page so we can update
+ * its PREV.
+ */
+ if (ISLEAF(cp->csp->page) &&
+ (pgno = NEXT_PGNO(cp->csp->page)) != PGNO_INVALID) {
+ TRY_LOCK(dbc, pgno,
+ next_pgno, next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ ret = cp->csp[0].page->pgno == root_pgno ?
+ __bam_root(dbc, &cp->csp[0]) :
+ __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
+ BT_STK_CLR(cp);
+
+ switch (ret) {
+ case 0:
+no_split: /* Once we've split the leaf page, we're done. */
+ if (level == LEAFLEVEL)
+ goto done;
+
+ /* Switch directions. */
+ if (dir == UP)
+ dir = DOWN;
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * It's possible to fail to split repeatedly, as other
+ * threads may be modifying the tree, or the page usage
+ * is sufficiently bad that we don't get enough space
+ * the first time.
+ */
+ if (dir == DOWN)
+ dir = UP;
+ break;
+ default:
+ goto err;
+ }
+ }
+
+ if (root_pgnop != NULL)
+ *root_pgnop = BAM_ROOT_PGNO(dbc);
+err:
+done: (void)__LPUT(dbc, metalock);
+ (void)__TLPUT(dbc, next_lock);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ return (ret);
+}
+
+/*
+ * __bam_root --
+ * Split the root page of a btree.
+ */
+static int
+__bam_root(dbc, cp)
+ DBC *dbc;
+ EPG *cp;
+{
+ DB *dbp;
+ DBT log_dbt, rootent[2];
+ DB_LOCK llock, rlock;
+ DB_LSN log_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ lp = rp = NULL;
+ LOCK_INIT(llock);
+ LOCK_INIT(rlock);
+ COMPQUIET(log_dbt.data, NULL);
+
+ /* Yeah, right. */
+ if (cp->page->level >= MAXBTREELEVEL) {
+ __db_errx(dbp->env, DB_STR_A("1021",
+ "Too many btree levels: %d", "%d"), cp->page->level);
+ return (ENOSPC);
+ }
+
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Create new left and right pages for the split. */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &llock, &lp)) != 0 ||
+ (ret = __db_new(dbc, TYPE(cp->page), &rlock, &rp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, lp->pgno,
+ PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+ cp->page->level, TYPE(cp->page));
+ P_INIT(rp, dbp->pgsize, rp->pgno,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
+ cp->page->level, TYPE(cp->page));
+
+ PERFMON5(env, alloc, btree_split,
+ dbp->fname, dbp->dname, lp->pgno, cp->page->pgno, lp->level);
+
+ /* Split the page. */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ if ((ret =
+ __os_malloc(dbp->env, dbp->pgsize, &log_dbt.data)) != 0)
+ goto err;
+ log_dbt.size = dbp->pgsize;
+ memcpy(log_dbt.data, cp->page, dbp->pgsize);
+ }
+
+ /* Clean up the new root page. */
+ if ((ret = (dbc->dbtype == DB_RECNO ?
+ __ram_root(dbc, cp->page, lp, rp) :
+ __bam_broot(dbc, cp->page, split, lp, rp))) != 0) {
+ if (DBC_LOGGING(dbc))
+ __os_free(dbp->env, log_dbt.data);
+ goto err;
+ }
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(rootent, 0, sizeof(rootent));
+ rootent[0].data = GET_BINTERNAL(dbp, cp->page, 0);
+ rootent[1].data = GET_BINTERNAL(dbp, cp->page, 1);
+ if (dbc->dbtype == DB_RECNO)
+ rootent[0].size = rootent[1].size = RINTERNAL_SIZE;
+ else {
+ rootent[0].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[0].data)->len);
+ rootent[1].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[1].data)->len);
+ }
+ ZERO_LSN(log_lsn);
+ opflags = F_ISSET(
+ (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO)
+ opflags |= SPL_RECNO;
+ ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ OP_SET(opflags, cp->page), PGNO(lp), &LSN(lp),
+ PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
+ PGNO_INVALID, &log_lsn, PGNO(cp->page),
+ &LSN(cp->page), 0, &log_dbt, &rootent[0], &rootent[1]);
+
+ /* On failure, restore the page. */
+ if (ret != 0)
+ memcpy(cp->page, log_dbt.data, dbp->pgsize);
+ __os_free(dbp->env, log_dbt.data);
+
+ if (ret != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+
+ /* Adjust any cursors. */
+ ret = __bam_ca_split(dbc, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
+
+ /* Success or error: release pages and locks. */
+err: if (cp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+
+ /*
+ * We are done. Put or downgrade all our locks and release
+ * the pages.
+ */
+ if ((t_ret = __TLPUT(dbc, llock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rlock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, lp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_page --
+ * Split the non-root page of a btree.
+ */
+static int
+__bam_page(dbc, pp, cp)
+ DBC *dbc;
+ EPG *pp, *cp;
+{
+ BTREE_CURSOR *bc;
+ DB *dbp;
+ DBT log_dbt, rentry;
+ DB_LOCK rplock;
+ DB_LSN log_lsn;
+ DB_LSN save_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp, *alloc_rp, *tp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ alloc_rp = lp = rp = tp = NULL;
+ LOCK_INIT(rplock);
+ ret = -1;
+
+ /*
+ * Create a new left page for the split, and fill in everything
+ * except its LSN and next-page page number.
+ *
+ * Create a new right page for the split, and fill in everything
+ * except its LSN and page number.
+ *
+ * We malloc space for both the left and right pages, so we don't get
+ * a new page from the underlying buffer pool until we know the split
+ * is going to succeed. The reason is that we can't release locks
+ * acquired during the get-a-new-page process because metadata page
+ * locks can't be discarded on failure since we may have modified the
+ * free list. So, if you assume that we're holding a write lock on the
+ * leaf page which ran out of space and started this split (e.g., we
+ * have already written records to the page, or we retrieved a record
+ * from it with the DB_RMW flag set), failing in a split with both a
+ * leaf page locked and the metadata page locked can potentially lock
+ * up the tree badly, because we've violated the rule of always locking
+ * down the tree, and never up.
+ */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize * 2, &lp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : 0,
+ cp->page->level, TYPE(cp->page));
+
+ rp = (PAGE *)((u_int8_t *)lp + dbp->pgsize);
+ P_INIT(rp, dbp->pgsize, 0,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page),
+ cp->page->level, TYPE(cp->page));
+
+ /*
+ * Split right.
+ *
+ * Only the indices are sorted on the page, i.e., the key/data pairs
+ * aren't, so it's simpler to copy the data from the split page onto
+ * two new pages instead of copying half the data to a new right page
+ * and compacting the left page in place. Since the left page can't
+ * change, we swap the original and the allocated left page after the
+ * split.
+ */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ /*
+ * Test to see if we are going to be able to insert the new pages into
+ * the parent page. The interesting failure here is that the parent
+ * page can't hold the new keys, and has to be split in turn, in which
+ * case we want to release all the locks we can.
+ */
+ if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_SPACEONLY)) != 0)
+ goto err;
+
+ /*
+ * We've got everything locked down we need, and we know the split
+ * is going to succeed. Go and get the additional page we'll need.
+ */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &rplock, &alloc_rp)) != 0)
+ goto err;
+
+ /*
+ * Prepare to fix up the previous pointer of any leaf page following
+ * the split page. Our caller has already write locked the page so
+ * we can get it without deadlocking on the parent latch.
+ */
+ if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID &&
+ (ret = __memp_fget(mpf, &NEXT_PGNO(cp->page),
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &tp)) != 0)
+ goto err;
+
+ PERFMON5(env, alloc, btree_split, dbp->fname,
+ dbp->dname, cp->page->pgno, pp->page->pgno, cp->page->level);
+
+ /*
+ * Fix up the page numbers we didn't have before. We have to do this
+ * before calling __bam_pinsert because it may copy a page number onto
+ * the parent page and it takes the page number from its page argument.
+ */
+ PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp);
+
+ DB_ASSERT(dbp->env, IS_DIRTY(cp->page));
+ DB_ASSERT(dbp->env, IS_DIRTY(pp->page));
+
+ bc = (BTREE_CURSOR *)dbc->internal;
+
+ /* Actually update the parent page. */
+ if ((ret = __bam_pinsert(dbc,
+ pp, split, lp, rp, F_ISSET(bc, C_RECNUM) ? 0 : BPI_NOLOGGING)) != 0)
+ goto err;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ log_dbt.data = cp->page;
+ log_dbt.size = dbp->pgsize;
+ memset(&rentry, 0, sizeof(rentry));
+ rentry.data = GET_BINTERNAL(dbp, pp->page, pp->indx + 1);
+ opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO) {
+ opflags |= SPL_RECNO;
+ rentry.size = RINTERNAL_SIZE;
+ } else
+ rentry.size =
+ BINTERNAL_SIZE(((BINTERNAL *)rentry.data)->len);
+ if (tp == NULL)
+ ZERO_LSN(log_lsn);
+ if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, OP_SET(opflags, pp->page), PGNO(cp->page),
+ &LSN(cp->page), PGNO(alloc_rp), &LSN(alloc_rp),
+ (u_int32_t)NUM_ENT(lp), tp == NULL ? 0 : PGNO(tp),
+ tp == NULL ? &log_lsn : &LSN(tp), PGNO(pp->page),
+ &LSN(pp->page), pp->indx, &log_dbt, NULL, &rentry)) != 0) {
+ /*
+ * If this is not RECNO then undo the update
+ * to the parent page, which has not been
+ * logged yet. This must succeed. Recno
+ * database trees are locked and therefore
+ * the parent can be logged independently.
+ */
+ if (F_ISSET(bc, C_RECNUM) == 0) {
+ t_ret = __db_ditem_nolog(dbc, pp->page,
+ pp->indx + 1, rentry.size);
+ DB_ASSERT(dbp->env, t_ret == 0);
+ }
+
+ goto err;
+ }
+
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Update the LSNs for all involved pages. */
+ LSN(alloc_rp) = LSN(cp->page);
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+ LSN(pp->page) = LSN(cp->page);
+ if (tp != NULL) {
+ /* Log record has been written; so safe to update next page. */
+ PREV_PGNO(tp) = PGNO(rp);
+ LSN(tp) = LSN(cp->page);
+ }
+
+ /*
+ * Copy the left and right pages into place. There are two paths
+ * through here. Either we are logging and we set the LSNs in the
+ * logging path. However, if we are not logging, then we do not
+ * have valid LSNs on lp or rp. The correct LSNs to use are the
+ * ones on the page we got from __db_new or the one that was
+ * originally on cp->page. In both cases, we save the LSN from the
+ * real database page (not a malloc'd one) and reapply it after we
+ * do the copy.
+ */
+ save_lsn = alloc_rp->lsn;
+ memcpy(alloc_rp, rp, LOFFSET(dbp, rp));
+ memcpy((u_int8_t *)alloc_rp + HOFFSET(rp),
+ (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp));
+ alloc_rp->lsn = save_lsn;
+
+ save_lsn = cp->page->lsn;
+ memcpy(cp->page, lp, LOFFSET(dbp, lp));
+ memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+ (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+ cp->page->lsn = save_lsn;
+
+ /* Adjust any cursors. */
+ if ((ret = __bam_ca_split(dbc,
+ PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0)
+ goto err;
+
+ __os_free(dbp->env, lp);
+
+ /*
+ * Success -- write the real pages back to the store.
+ */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rplock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (tp != NULL) {
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, tp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+err: if (lp != NULL)
+ __os_free(dbp->env, lp);
+ if (alloc_rp != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority);
+ if (tp != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, tp, dbc->priority);
+
+ if (pp->page != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pp->page, dbc->priority);
+
+ if (ret == DB_NEEDSPLIT && atomic_read(&mpf->mfp->multiversion) == 0)
+ (void)__LPUT(dbc, pp->lock);
+ else
+ (void)__TLPUT(dbc, pp->lock);
+
+ (void)__memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+
+ /*
+ * We don't drop the left and right page locks. If we are doing dirty
+ * reads then we need to hold the locks until we abort the transaction.
+ * If we are not transactional, we are hosed anyway as the tree
+ * is trashed. It may be better not to leak the locks.
+ */
+
+ if (dbc->txn == NULL)
+ (void)__LPUT(dbc, rplock);
+
+ if (dbc->txn == NULL || ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, cp->lock);
+
+ return (ret);
+}
+
+/*
+ * __bam_broot --
+ * Fix up the btree root page after it has been split.
+ * PUBLIC: int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *));
+ */
+int
+__bam_broot(dbc, rootp, split, lp, rp)
+ DBC *dbc;
+ u_int32_t split;
+ PAGE *rootp, *lp, *rp;
+{
+ BINTERNAL bi, bi0, *child_bi;
+ BKEYDATA *child_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, hdr0, data;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ child_bo = NULL;
+ data.data = NULL;
+ memset(&bi, 0, sizeof(bi));
+
+ switch (TYPE(rootp)) {
+ case P_IBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bi = GET_BINTERNAL(dbp, rootp, split);
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bi->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bi->data, child_bi->len);
+ data.size = child_bi->len;
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bk = GET_BKEYDATA(dbp, rootp, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ bi.len = child_bk->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bk->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bk->data, child_bk->len);
+ data.size = child_bk->len;
+ break;
+ case B_OVERFLOW:
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, rp->pgno));
+ }
+ /*
+ * If the root page was a leaf page, change it into an internal page.
+ * We copy the key we split on (but not the key's data, in the case of
+ * a leaf page) to the new root page.
+ */
+ root_pgno = BAM_ROOT_PGNO(dbc);
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
+
+ /*
+ * The btree comparison code guarantees that the left-most key on any
+ * internal btree page is never used, so it doesn't need to be filled
+ * in. Set the record count if necessary.
+ */
+ memset(&bi0, 0, sizeof(bi0));
+ B_TSET(bi0.type, B_KEYDATA);
+ bi0.pgno = lp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi0.nrecs = __bam_total(dbp, lp);
+ RE_NREC_SET(rootp, bi0.nrecs);
+ bi.nrecs = __bam_total(dbp, rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
+ }
+ DB_SET_DBT(hdr0, &bi0, SSZA(BINTERNAL, data));
+ if ((ret = __db_pitem_nolog(dbc, rootp,
+ 0, BINTERNAL_SIZE(0), &hdr0, NULL)) != 0)
+ goto err;
+ ret = __db_pitem_nolog(dbc, rootp, 1,
+ BINTERNAL_SIZE(data.size), &hdr, &data);
+
+err: if (data.data != NULL && child_bo == NULL)
+ __os_free(dbp->env, data.data);
+ return (ret);
+}
+
+/*
+ * __ram_root --
+ * Fix up the recno root page after it has been split.
+ * PUBLIC: int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+ */
+int
+__ram_root(dbc, rootp, lp, rp)
+ DBC *dbc;
+ PAGE *rootp, *lp, *rp;
+{
+ DB *dbp;
+ DBT hdr;
+ RINTERNAL ri;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ root_pgno = BAM_ROOT_PGNO(dbc);
+
+ /* Initialize the page. */
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
+
+ /* Initialize the header. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+
+ /* Insert the left and right keys, set the header information. */
+ ri.pgno = lp->pgno;
+ ri.nrecs = __bam_total(dbp, lp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_SET(rootp, ri.nrecs);
+ ri.pgno = rp->pgno;
+ ri.nrecs = __bam_total(dbp, rp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_ADJ(rootp, ri.nrecs);
+ return (0);
+}
+
+/*
+ * __bam_pinsert --
+ * Insert a new key into a parent page, completing the split.
+ *
+ * PUBLIC: int __bam_pinsert
+ * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
+ */
+int
+__bam_pinsert(dbc, parent, split, lchild, rchild, flags)
+ DBC *dbc;
+ EPG *parent;
+ u_int32_t split;
+ PAGE *lchild, *rchild;
+ int flags;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk, *tmp_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b, hdr, data;
+ EPG *child;
+ PAGE *ppage;
+ RINTERNAL ri;
+ db_indx_t off;
+ db_recno_t nrecs;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+ int (*pitem) __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ u_int32_t n, nbytes, nksize, oldsize, size;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ppage = parent->page;
+ child = parent + 1;
+
+ /* If handling record numbers, count records split to the right page. */
+ nrecs = F_ISSET(cp, C_RECNUM) &&
+ !LF_ISSET(BPI_SPACEONLY) ? __bam_total(dbp, rchild) : 0;
+
+ /*
+ * Now we insert the new page's first key into the parent page, which
+ * completes the split. The parent points to a PAGE and a page index
+ * offset, where the new key goes ONE AFTER the index, because we split
+ * to the right.
+ *
+ * XXX
+ * Some btree algorithms replace the key for the old page as well as
+ * the new page. We don't, as there's no reason to believe that the
+ * first key on the old page is any better than the key we have, and,
+ * in the case of a key being placed at index 0 causing the split, the
+ * key is unavailable.
+ */
+ off = parent->indx + O_INDX;
+ if (LF_ISSET(BPI_REPLACE))
+ oldsize = TYPE(ppage) == P_IRECNO ? RINTERNAL_PSIZE :
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp, ppage, off)->len);
+ else
+ oldsize = 0;
+
+ /*
+ * Calculate the space needed on the parent page.
+ *
+ * Prefix trees: space hack used when inserting into BINTERNAL pages.
+ * Retain only what's needed to distinguish between the new entry and
+ * the LAST entry on the page to its left. If the keys compare equal,
+ * retain the entire key. We ignore overflow keys, and the entire key
+ * must be retained for the next-to-leftmost key on the leftmost page
+ * of each level, or the search will fail. Applicable ONLY to internal
+ * pages that have leaf pages as children. Further reduction of the
+ * key between pairs of internal pages loses too much information.
+ */
+ switch (TYPE(child->page)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(dbp, child->page, split);
+ nbytes = BINTERNAL_PSIZE(child_bi->len);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ /* Add a new record for the right page. */
+ memset(&bi, 0, sizeof(bi));
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bi->data, child_bi->len);
+ size = BINTERNAL_SIZE(child_bi->len);
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ child_bk = GET_BKEYDATA(dbp, child->page, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ nbytes = BINTERNAL_PSIZE(child_bk->len);
+ nksize = child_bk->len;
+
+ /*
+ * Prefix compression:
+ * We set t->bt_prefix to NULL if we have a comparison
+ * callback but no prefix compression callback. But,
+ * if we're splitting in an off-page duplicates tree,
+ * we still have to do some checking. If using the
+ * default off-page duplicates comparison routine we
+ * can use the default prefix compression callback. If
+ * not using the default off-page duplicates comparison
+ * routine, we can't do any kind of prefix compression
+ * as there's no way for an application to specify a
+ * prefix compression callback that corresponds to its
+ * comparison callback.
+ *
+ * No prefix compression if we don't have a compression
+ * function, or the key we'd compress isn't a normal
+ * key (for example, it references an overflow page).
+ *
+ * Generate a parent page key for the right child page
+ * from a comparison of the last key on the left child
+ * page and the first key on the right child page.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) -
+ (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX));
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ DB_INIT_DBT(a, tmp_bk->data, tmp_bk->len);
+ DB_INIT_DBT(b, child_bk->data, child_bk->len);
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
+ nbytes = n;
+ else
+ nksize = child_bk->len;
+
+noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = nksize;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bk->data, nksize);
+ size = BINTERNAL_SIZE(nksize);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ nbytes = RINTERNAL_PSIZE;
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Add a new record for the right page. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+ ri.pgno = rchild->pgno;
+ ri.nrecs = nrecs;
+ size = RINTERNAL_SIZE;
+ data.size = 0;
+ /*
+ * For now, we are locking internal recno nodes so
+ * use two steps.
+ */
+ if (LF_ISSET(BPI_REPLACE)) {
+ if ((ret = __bam_ditem(dbc, ppage, off)) != 0)
+ return (ret);
+ LF_CLR(BPI_REPLACE);
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, PGNO(child->page)));
+ }
+
+ if (LF_ISSET(BPI_REPLACE)) {
+ DB_ASSERT(dbp->env, !LF_ISSET(BPI_NOLOGGING));
+ if ((ret = __bam_irep(dbc, ppage, off, &hdr, &data)) != 0)
+ return (ret);
+ } else {
+ if (LF_ISSET(BPI_NOLOGGING))
+ pitem = __db_pitem_nolog;
+ else
+ pitem = __db_pitem;
+
+ if ((ret = pitem(dbc, ppage,
+ off, size, &hdr, data.size != 0 ? &data : NULL)) != 0)
+ return (ret);
+ }
+
+	/*
+	 * If this is a Recno page, a Btree page with record numbers, or
+	 * an off-page duplicates tree, adjust the parent page's left page
+	 * record count.
+	 */
+ if (F_ISSET(cp, C_RECNUM) && !LF_ISSET(BPI_NORECNUM)) {
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(ppage), 0, PGNO(ppage), &LSN(ppage),
+ parent->indx, -(int32_t)nrecs, 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(ppage));
+
+ /* Update the left page count. */
+ if (dbc->dbtype == DB_RECNO)
+ GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ else
+ GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_psplit --
+ * Do the real work of splitting the page.
+ */
+static int
+__bam_psplit(dbc, cp, lp, rp, splitret)
+ DBC *dbc;
+ EPG *cp;
+ PAGE *lp, *rp;
+ db_indx_t *splitret;
+{
+ DB *dbp;
+ PAGE *pp;
+ db_indx_t half, *inp, nbytes, off, splitp, top;
+ int adjust, cnt, iflag, isbigkey, ret;
+
+ dbp = dbc->dbp;
+ pp = cp->page;
+ inp = P_INP(dbp, pp);
+ adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
+
+ /*
+ * If we're splitting the first (last) page on a level because we're
+ * inserting (appending) a key to it, it's likely that the data is
+ * sorted. Moving a single item to the new page is less work and can
+	 * push the fill factor higher than normal. This is trivial when
+	 * we are splitting a new page before the beginning of the tree,
+	 * where all of the interesting tests are against values of 0.
+ *
+ * Catching appends to the tree is harder. In a simple append, we're
+ * inserting an item that sorts past the end of the tree; the cursor
+ * will point past the last element on the page. But, in trees with
+ * duplicates, the cursor may point to the last entry on the page --
+ * in this case, the entry will also be the last element of a duplicate
+ * set (the last because the search call specified the SR_DUPLAST flag).
+ * The only way to differentiate between an insert immediately before
+ * the last item in a tree or an append after a duplicate set which is
+ * also the last item in the tree is to call the comparison function.
+ * When splitting internal pages during an append, the search code
+ * guarantees the cursor always points to the largest page item less
+ * than the new internal entry. To summarize, we want to catch three
+ * possible index values:
+ *
+ * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree
+ * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT
+ * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set
+ * of duplicates
+ *
+	 * two of which (NUM_ENT(page) - O_INDX or P_INDX) might be an
+	 * insert near the end of the tree, and not past the end of the
+	 * tree at all. We settle for a simple test that might be wrong,
+	 * because calling the comparison functions is expensive.
+	 * Regardless, it's not a big deal if we're wrong; we'll just do
+	 * the split the right way next time.
+ */
+ off = 0;
+ if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust)
+ off = NUM_ENT(pp) - adjust;
+ else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+ off = adjust;
+ if (off != 0)
+ goto sort;
+
+ /*
+ * Split the data to the left and right pages. Try not to split on
+ * an overflow key. (Overflow keys on internal pages will slow down
+ * searches.) Refuse to split in the middle of a set of duplicates.
+ *
+ * First, find the optimum place to split.
+ *
+ * It's possible to try and split past the last record on the page if
+ * there's a very large record at the end of the page. Make sure this
+ * doesn't happen by bounding the check at the next-to-last entry on
+ * the page.
+ *
+ * Note, we try and split half the data present on the page. This is
+ * because another process may have already split the page and left
+ * it half empty. We don't try and skip the split -- we don't know
+ * how much space we're going to need on the page, and we may need up
+ * to half the page for a big item, so there's no easy test to decide
+ * if we need to split or not. Besides, if two threads are inserting
+ * data into the same place in the database, we're probably going to
+ * need more space soon anyway.
+ */
+ top = NUM_ENT(pp) - adjust;
+ half = (dbp->pgsize - HOFFSET(pp)) / 2;
+ for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA)
+ nbytes += BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, off)->len);
+ else
+ nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+
+ ++off;
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes += RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+sort: splitp = off;
+
+ /*
+ * Splitp is either at or just past the optimum split point. If the
+ * tree type is such that we're going to promote a key to an internal
+ * page, and our current choice is an overflow key, look for something
+ * close by that's smaller.
+ */
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ iflag = 1;
+ isbigkey =
+ B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ iflag = 0;
+ isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) !=
+ B_KEYDATA;
+ break;
+ default:
+ iflag = isbigkey = 0;
+ }
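+	/*
+	 * Probe up to three entries on either side of the optimum split
+	 * point (trying the later entry first at each distance), and
+	 * split at the first normal, non-overflow key found.
+	 */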
+ if (isbigkey)
+ for (cnt = 1; cnt <= 3; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < (db_indx_t)NUM_ENT(pp) &&
+ ((iflag && B_TYPE(
+ GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) ||
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (iflag ? B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA :
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA) {
+ splitp = off;
+ break;
+ }
+ }
+
+ /*
+	 * We can't split in the middle of a set of duplicates. We know that
+ * no duplicate set can take up more than about 25% of the page,
+ * because that's the point where we push it off onto a duplicate
+ * page set. So, this loop can't be unbounded.
+ */
+ if (TYPE(pp) == P_LBTREE &&
+ inp[splitp] == inp[splitp - adjust])
+ for (cnt = 1;; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < NUM_ENT(pp) &&
+ inp[splitp] != inp[off]) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (inp[splitp] != inp[off]) {
+ splitp = off + adjust;
+ break;
+ }
+ }
+
+ /* We're going to split at splitp. */
+ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
+ return (ret);
+ if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
+ return (ret);
+
+ *splitret = splitp;
+ return (0);
+}
+
+/*
+ * __bam_copy --
+ * Copy a set of records from one page to another.
+ *
+ * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_copy(dbp, pp, cp, nxt, stop)
+ DB *dbp;
+ PAGE *pp, *cp;
+ u_int32_t nxt, stop;
+{
+ BINTERNAL internal;
+ db_indx_t *cinp, nbytes, off, *pinp;
+
+ cinp = P_INP(dbp, cp);
+ pinp = P_INP(dbp, pp);
+	/*
+	 * Nxt is the index of the next record to be copied from the source
+	 * page; off is where it will be placed on the target page.
+	 */
+ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
+ switch (TYPE(pp)) {
+ case P_IBTREE:
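+			/*
+			 * The first entry copied to a new internal page
+			 * becomes its leftmost entry.  The leftmost key is
+			 * never compared, so we need only room for a
+			 * zero-length key; the entry itself is rebuilt
+			 * below.
+			 */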
+ if (off == 0 && nxt != 0)
+ nbytes = BINTERNAL_SIZE(0);
+ else if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA)
+ nbytes = BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, nxt)->len);
+ else
+ nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ /*
+ * If we're on a key and it's a duplicate, just copy
+ * the offset.
+ */
+ if (off != 0 && (nxt % P_INDX) == 0 &&
+ pinp[nxt] == pinp[nxt - P_INDX]) {
+ cinp[off] = cinp[off - P_INDX];
+ continue;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==
+ B_KEYDATA)
+ nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, nxt)->len);
+ else
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+ cinp[off] = HOFFSET(cp) -= nbytes;
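+		/*
+		 * Rebuild the leftmost entry of a new internal page in
+		 * place, keeping the child page number and record count
+		 * but discarding the key.
+		 */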
+ if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) {
+ internal.len = 0;
+ UMRW_SET(internal.unused);
+ internal.type = B_KEYDATA;
+ internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno;
+ internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs;
+ memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes);
+		} else
+			memcpy(P_ENTRY(dbp, cp, off),
+			    P_ENTRY(dbp, pp, nxt), nbytes);
+ }
+ return (0);
+}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
new file mode 100644
index 00000000..668c4fdb
--- /dev/null
+++ b/src/btree/bt_stat.c
@@ -0,0 +1,669 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __bam_stat --
+ * Gather/print the btree statistics
+ *
+ * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__bam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret, write_meta;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ sp = NULL;
+ LOCK_INIT(metalock);
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = write_meta = 0;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ /* Get the metadata page for the entire database. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (flags == DB_FAST_STAT)
+ goto meta_only;
+
+ /* Walk the metadata free list, counting pages. */
+ for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) {
+ ++sp->bt_free;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ pgno = h->next_pgno;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /* Get the root page. */
+ BAM_GET_ROOT(dbc, pgno, h, 0, DB_LOCK_READ, lock, ret);
+ if (ret != 0)
+ goto err;
+ DB_ASSERT(env, h != NULL);
+
+ /* Get the levels from the root page. */
+ sp->bt_levels = h->level;
+
+ /* Discard the root page. */
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /* Discard the metadata page. */
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /* Walk the tree. */
+ if ((ret = __bam_traverse(dbc,
+ DB_LOCK_READ, PGNO_INVALID, __bam_stat_callback, sp)) != 0)
+ goto err;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && (ret = __bam_compress_count(dbc,
+ &sp->bt_nkeys, &sp->bt_ndata)) != 0)
+ goto err;
+#endif
+
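+	/*
+	 * Only update the cached key and record counts on the metadata
+	 * page if the database is writable and, for a multiversion
+	 * database, only from within a transaction.
+	 */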
+	write_meta = !F_ISSET(dbp, DB_AM_RDONLY) &&
+	    (!MULTIVERSION(dbp) || dbc->txn != NULL);
+meta_only:
+	/*
+	 * Get the subdatabase metadata page if it's not the same as the
+	 * one we already have.
+	 */
+ if (meta == NULL || t->bt_meta != PGNO_BASE_MD || write_meta) {
+ if (meta != NULL) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ if ((ret = __db_lget(dbc,
+ 0, t->bt_meta, write_meta ? DB_LOCK_WRITE : DB_LOCK_READ,
+ 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->bt_meta,
+ dbc->thread_info, dbc->txn,
+ write_meta ? DB_MPOOL_DIRTY : 0, &meta)) != 0)
+ goto err;
+ }
+ if (flags == DB_FAST_STAT) {
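+		/*
+		 * For fast statistics, use the cached counts from the
+		 * metadata page, except for Recno and record-number
+		 * Btrees, where the record count in the root page is
+		 * authoritative.
+		 */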
+ if (dbp->type == DB_RECNO ||
+ (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) {
+ BAM_GET_ROOT(dbc, pgno, h, 0, DB_LOCK_READ, lock, ret);
+ if (ret != 0)
+ goto err;
+
+ sp->bt_nkeys = RE_NREC(h);
+ } else
+ sp->bt_nkeys = meta->dbmeta.key_count;
+
+ sp->bt_ndata = dbp->type == DB_RECNO ?
+ sp->bt_nkeys : meta->dbmeta.record_count;
+ }
+
+ /* Get metadata page statistics. */
+ sp->bt_metaflags = meta->dbmeta.flags;
+ sp->bt_minkey = meta->minkey;
+ sp->bt_re_len = meta->re_len;
+ sp->bt_re_pad = meta->re_pad;
+ /*
+ * Don't take the page number from the meta-data page -- that value is
+	 * only maintained in the primary database; we may have been called on
+ * a subdatabase. (Yes, I read the primary database meta-data page
+ * earlier in this function, but I'm asking the underlying cache so the
+ * code for the Hash and Btree methods is the same.)
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0)
+ goto err;
+ sp->bt_pagecnt = pgno + 1;
+ sp->bt_pagesize = meta->dbmeta.pagesize;
+ sp->bt_magic = meta->dbmeta.magic;
+ sp->bt_version = meta->dbmeta.version;
+
+ if (write_meta != 0) {
+ meta->dbmeta.key_count = sp->bt_nkeys;
+ meta->dbmeta.record_count = sp->bt_ndata;
+ }
+
+ *(DB_BTREE_STAT **)spp = sp;
+
+err: /* Discard the second page. */
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the metadata page. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0 && sp != NULL) {
+ __os_ufree(env, sp);
+ *(DB_BTREE_STAT **)spp = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_stat_print --
+ * Display btree/recno statistics.
+ *
+ * PUBLIC: int __bam_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__bam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { BTM_DUP, "duplicates" },
+ { BTM_RECNO, "recno" },
+ { BTM_RECNUM, "record-numbers" },
+ { BTM_FIXEDLEN, "fixed-length" },
+ { BTM_RENUMBER, "renumber" },
+ { BTM_SUBDB, "multiple-databases" },
+ { BTM_DUPSORT, "sorted duplicates" },
+ { BTM_COMPRESS, "compressed" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ ENV *env;
+ int lorder, ret;
+ const char *s;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp)) {
+ if ((ret = __partition_stat(dbc, &sp, flags)) != 0)
+ return (ret);
+ } else
+#endif
+ if ((ret = __bam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Btree/Recno database information:");
+ }
+
+ __db_msg(env, "%lx\tBtree magic number", (u_long)sp->bt_magic);
+ __db_msg(env, "%lu\tBtree version number", (u_long)sp->bt_version);
+
+ (void)__db_get_lorder(dbp, &lorder);
+ switch (lorder) {
+ case 1234:
+ s = "Little-endian";
+ break;
+ case 4321:
+ s = "Big-endian";
+ break;
+ default:
+ s = "Unrecognized byte order";
+ break;
+ }
+ __db_msg(env, "%s\tByte order", s);
+ __db_prflags(env, NULL, sp->bt_metaflags, fn, NULL, "\tFlags");
+ if (dbp->type == DB_BTREE)
+ __db_dl(env, "Minimum keys per-page", (u_long)sp->bt_minkey);
+ if (dbp->type == DB_RECNO) {
+ __db_dl(env,
+ "Fixed-length record size", (u_long)sp->bt_re_len);
+ __db_msg(env,
+ "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad);
+ }
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->bt_pagesize);
+ if (dbp->type == DB_BTREE)
+ __db_dl(env, "Overflow key/data size",
+ ((BTREE_CURSOR *)dbc->internal)->ovflsize);
+ __db_dl(env, "Number of levels in the tree", (u_long)sp->bt_levels);
+ __db_dl(env, dbp->type == DB_BTREE ?
+ "Number of unique keys in the tree" :
+ "Number of records in the tree", (u_long)sp->bt_nkeys);
+ __db_dl(env,
+ "Number of data items in the tree", (u_long)sp->bt_ndata);
+
+ __db_dl(env,
+ "Number of tree internal pages", (u_long)sp->bt_int_pg);
+ __db_dl_pct(env,
+ "Number of bytes free in tree internal pages",
+ (u_long)sp->bt_int_pgfree,
+ DB_PCT_PG(sp->bt_int_pgfree, sp->bt_int_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree leaf pages", (u_long)sp->bt_leaf_pg);
+ __db_dl_pct(env, "Number of bytes free in tree leaf pages",
+ (u_long)sp->bt_leaf_pgfree, DB_PCT_PG(
+ sp->bt_leaf_pgfree, sp->bt_leaf_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree duplicate pages", (u_long)sp->bt_dup_pg);
+ __db_dl_pct(env,
+ "Number of bytes free in tree duplicate pages",
+ (u_long)sp->bt_dup_pgfree,
+ DB_PCT_PG(sp->bt_dup_pgfree, sp->bt_dup_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree overflow pages", (u_long)sp->bt_over_pg);
+ __db_dl_pct(env, "Number of bytes free in tree overflow pages",
+ (u_long)sp->bt_over_pgfree, DB_PCT_PG(
+ sp->bt_over_pgfree, sp->bt_over_pg, sp->bt_pagesize), "ff");
+ __db_dl(env, "Number of empty pages", (u_long)sp->bt_empty_pg);
+
+ __db_dl(env, "Number of pages on the free list", (u_long)sp->bt_free);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __bam_stat_callback --
+ * Statistics callback.
+ *
+ * PUBLIC: int __bam_stat_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__bam_stat_callback(dbc, h, cookie, putp)
+ DBC *dbc;
+ PAGE *h;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ db_indx_t indx, *inp, top;
+ u_int8_t type;
+
+ dbp = dbc->dbp;
+ sp = cookie;
+ *putp = 0;
+ top = NUM_ENT(h);
+ inp = P_INP(dbp, h);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ ++sp->bt_int_pg;
+ sp->bt_int_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_LBTREE:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type;
+ /* Ignore deleted items. */
+ if (B_DISSET(type))
+ continue;
+
+ /* Ignore duplicate keys. */
+ if (indx + P_INDX >= top ||
+ inp[indx] != inp[indx + P_INDX])
+ ++sp->bt_nkeys;
+
+ /* Ignore off-page duplicates. */
+ if (B_TYPE(type) != B_DUPLICATE)
+ ++sp->bt_ndata;
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_LRECNO:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /*
+ * If walking a recno tree, then each of these items is a key.
+ * Otherwise, we're walking an off-page duplicate set.
+ */
+ if (dbp->type == DB_RECNO) {
+ /*
+ * Correct for deleted items in non-renumbering Recno
+ * databases.
+ */
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ sp->bt_nkeys += top;
+ sp->bt_ndata += top;
+ } else
+ for (indx = 0; indx < top; indx += O_INDX) {
+ type = GET_BKEYDATA(dbp, h, indx)->type;
+ if (!B_DISSET(type)) {
+ ++sp->bt_ndata;
+ ++sp->bt_nkeys;
+ }
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
+ } else {
+ sp->bt_ndata += top;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
+ }
+ break;
+ case P_LDUP:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /* Correct for deleted items. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
+ ++sp->bt_ndata;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_OVERFLOW:
+ ++sp->bt_over_pg;
+ sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+ return (0);
+}
+
+/*
+ * __bam_print_cursor --
+ * Display the current internal cursor.
+ *
+ * PUBLIC: void __bam_print_cursor __P((DBC *));
+ */
+void
+__bam_print_cursor(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { C_DELETED, "C_DELETED" },
+ { C_RECNUM, "C_RECNUM" },
+ { C_RENUMBER, "C_RENUMBER" },
+ { 0, NULL }
+ };
+ ENV *env;
+ BTREE_CURSOR *cp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ STAT_ULONG("Overflow size", cp->ovflsize);
+ if (dbc->dbtype == DB_RECNO)
+ STAT_ULONG("Recno", cp->recno);
+ STAT_ULONG("Order", cp->order);
+ __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags");
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__bam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+
+int
+__bam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+/*
+ * __bam_key_range --
+ *	Return the proportion of keys relative to the given key. The
+ *	numbers are slightly skewed due to on-page duplicates.
+ *
+ * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__bam_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ EPG *sp;
+ double factor;
+ int exact, ret;
+
+ COMPQUIET(flags, 0);
+
+ if ((ret = __bam_search(dbc, PGNO_INVALID,
+ dbt, SR_STK_ONLY, 1, NULL, &exact)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ kp->less = kp->greater = 0.0;
+
+ factor = 1.0;
+
+	/*
+	 * Correct the leaf page: leaf entries are key/data pairs, so
+	 * halve the count and index to get key positions.
+	 */
+ cp->csp->entries /= 2;
+ cp->csp->indx /= 2;
+ for (sp = cp->sp; sp <= cp->csp; ++sp) {
+ /*
+ * At each level we know that pages greater than indx contain
+ * keys greater than what we are looking for and those less
+ * than indx are less than. The one pointed to by indx may
+ * have some less, some greater or even equal. If indx is
+ * equal to the number of entries, then the key is out of range
+ * and everything is less.
+ */
+ if (sp->indx == 0)
+ kp->greater += factor * (sp->entries - 1)/sp->entries;
+ else if (sp->indx == sp->entries)
+ kp->less += factor;
+ else {
+ kp->less += factor * sp->indx / sp->entries;
+ kp->greater += factor *
+ ((sp->entries - sp->indx) - 1) / sp->entries;
+ }
+ factor *= 1.0/sp->entries;
+ }
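+	/*
+	 * Illustrative example: in a two-level tree with 100 entries per
+	 * page after the leaf correction, a key found at indx 50 on both
+	 * levels accumulates less = 0.5 + 0.01 * 0.5 = 0.505, and factor
+	 * ends up at 0.0001, the weight of a single leaf entry.
+	 */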
+
+ /*
+ * If there was an exact match then assign 1 n'th to the key itself.
+ * Otherwise that factor belongs to those greater than the key, unless
+ * the key was out of range.
+ */
+ if (exact)
+ kp->equal = factor;
+ else {
+ if (kp->less != 1)
+ kp->greater += factor;
+ kp->equal = 0;
+ }
+
+ if ((ret = __bam_stkrel(dbc, 0)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_traverse --
+ * Walk a Btree database.
+ *
+ * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t,
+ * PUBLIC: db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__bam_traverse(dbc, mode, root_pgno, callback, cookie)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t root_pgno;
+ int (*callback)__P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t indx, *inp;
+ int already_put, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ already_put = 0;
+ LOCK_INIT(lock);
+
+ COMPQUIET(h, NULL);
+ BAM_GET_ROOT(dbc, root_pgno, h, 0, mode, lock, ret);
+ if (ret != 0)
+ goto err1;
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ ((BOVERFLOW *)bi->data)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if ((ret = __bam_traverse(
+ dbc, mode, bi->pgno, callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ if ((ret = __bam_traverse(
+ dbc, mode, ri->pgno, callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LBTREE:
+ inp = P_INP(dbp, h);
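+		/*
+		 * An overflow key shared by several on-page duplicates is
+		 * traversed only at its last occurrence; off-page duplicate
+		 * trees and overflow data items are chased per entry.
+		 */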
+ for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (indx + P_INDX >= NUM_ENT(h) ||
+ inp[indx] != inp[indx + P_INDX])) {
+ if ((ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
+ if (B_TYPE(bk->type) == B_DUPLICATE &&
+ (ret = __bam_traverse(dbc, mode,
+ GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LDUP:
+ case P_LRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+
+ ret = callback(dbc, h, cookie, &already_put);
+
+err: if (!already_put && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+err1: if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
new file mode 100644
index 00000000..c9123351
--- /dev/null
+++ b/src/btree/bt_upgrade.c
@@ -0,0 +1,153 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_upgrade.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_30_btreemeta --
+ * Upgrade the metadata pages from version 6 to version 7.
+ *
+ * PUBLIC: int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__bam_30_btreemeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ BTMETA2X *oldmeta;
+ BTMETA30 *newmeta;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ newmeta = (BTMETA30 *)buf;
+ oldmeta = (BTMETA2X *)buf;
+
+	/*
+	 * Move fields from the end of the structure up, so we do not
+	 * overwrite anything we still need. We are going to create a
+	 * new uid, so the fields at the end of the structure can be
+	 * moved first, overwriting the old uid.
+	 */
+
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ newmeta->dbmeta.free = oldmeta->free;
+ newmeta->dbmeta.flags = oldmeta->flags;
+ newmeta->dbmeta.type = P_BTREEMETA;
+
+ newmeta->dbmeta.version = 7;
+ /* Replace the unique ID. */
+ if ((ret = __os_fileid(env, real_name, 1, buf + 36)) != 0)
+ return (ret);
+
+ newmeta->root = 1;
+
+ return (0);
+}
+
+/*
+ * __bam_31_btreemeta --
+ * Upgrade the database from version 7 to version 8.
+ *
+ * PUBLIC: int __bam_31_btreemeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BTMETA30 *oldmeta;
+ BTMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (BTMETA31 *)h;
+ oldmeta = (BTMETA30 *)h;
+
+ /*
+	 * Copy the affected fields down the page.
+	 * The fields may overlap each other, so we
+	 * start at the bottom and use memmove.
+ */
+ newmeta->root = oldmeta->root;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Set the version number. */
+ newmeta->dbmeta.version = 8;
+
+ /* Upgrade the flags. */
+ if (LF_ISSET(DB_DUPSORT))
+ F_SET(&newmeta->dbmeta, BTM_DUPSORT);
+
+ *dirtyp = 1;
+ return (0);
+}
+
+/*
+ * __bam_31_lbtree --
+ * Upgrade the database btree leaf pages.
+ *
+ * PUBLIC: int __bam_31_lbtree
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BKEYDATA *bk;
+ db_pgno_t pgno;
+ db_indx_t indx;
+ int ret;
+
+ ret = 0;
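+	/*
+	 * Walk the data items (the odd inp entries) and convert any
+	 * off-page duplicate trees they reference, updating the on-page
+	 * reference if the subtree's root page moved.
+	 */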
+ for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ pgno = GET_BOVERFLOW(dbp, h, indx)->pgno;
+ if ((ret = __db_31_offdup(dbp, real_name, fhp,
+ LF_ISSET(DB_DUPSORT) ? 1 : 0, &pgno)) != 0)
+ break;
+ if (pgno != GET_BOVERFLOW(dbp, h, indx)->pgno) {
+ *dirtyp = 1;
+ GET_BOVERFLOW(dbp, h, indx)->pgno = pgno;
+ }
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/btree/bt_verify.c b/src/btree/bt_verify.c
new file mode 100644
index 00000000..99354a58
--- /dev/null
+++ b/src/btree/bt_verify.c
@@ -0,0 +1,2805 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
+ PAGE *, u_int32_t, int, DBT *, int *));
+static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
+ BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
+ u_int32_t));
+static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+
+/*
+ * __bam_vrfy_meta --
+ * Verify the btree-specific part of a metadata page.
+ *
+ * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int isbad, t_ret, ret;
+ db_indx_t ovflsize;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /*
+ * If we came through __db_vrfy_pagezero, we have already checked the
+	 * common fields. However, we used the on-disk metadata page, which
+	 * may have been stale. We now have the page from mpool, so check that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* bt_minkey: must be >= 2; must produce sensible ovflsize */
+
+ /* avoid division by zero */
+ ovflsize = meta->minkey > 0 ?
+ B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0;
+
+ if (meta->minkey < 2 ||
+ ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
+ pip->bt_minkey = 0;
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1034",
+ "Page %lu: nonsensical bt_minkey value %lu on metadata page",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->minkey));
+ } else
+ pip->bt_minkey = meta->minkey;
+
+	/* re_len: no constraint (may be zero or huge); just record it. */
+ pip->re_pad = meta->re_pad;
+ pip->re_len = meta->re_len;
+
+ /*
+	 * The root must not be the current page or 0, and it must be
+	 * within the database. If this metadata page is the master
+	 * metadata page of the file, then the root page had better be
+	 * page 1.
+ */
+ pip->root = 0;
+ if (meta->root == PGNO_INVALID ||
+ meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
+ (pgno == PGNO_BASE_MD && meta->root != 1)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1035",
+ "Page %lu: nonsensical root page %lu on metadata page",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->root));
+ } else
+ pip->root = meta->root;
+
+ /* Flags. */
+ if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
+ F_SET(pip, VRFY_IS_RRECNO);
+
+ if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
+ /*
+ * If this is a master db meta page, it had better not have
+ * duplicates.
+ */
+ if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1036",
+"Page %lu: Btree metadata page has both duplicates and multiple databases",
+ "%lu"), (u_long)pgno));
+ }
+ F_SET(pip, VRFY_HAS_SUBDBS);
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
+ F_SET(pip, VRFY_HAS_RECNUMS);
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1037",
+ "Page %lu: Btree metadata page illegally has both recnums and dups",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
+ F_SET(pip, VRFY_IS_RECNO);
+ dbp->type = DB_RECNO;
+ } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1038",
+ "Page %lu: metadata page has renumber flag set but is not recno",
+ "%lu"), (u_long)pgno));
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(&meta->dbmeta, BTM_COMPRESS)) {
+ F_SET(pip, VRFY_HAS_COMPRESS);
+ if (!DB_IS_COMPRESSED(dbp)) {
+ ((BTREE *)dbp->bt_internal)->bt_compress =
+ __bam_defcompress;
+ ((BTREE *)dbp->bt_internal)->bt_decompress =
+ __bam_defdecompress;
+ }
+ /*
+ * Copy dup_compare to compress_dup_compare, and use the
+ * compression duplicate compare.
+ */
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ if (((BTREE *)dbp->bt_internal)->compress_dup_compare
+ == NULL) {
+ ((BTREE *)dbp->bt_internal)->
+ compress_dup_compare = dbp->dup_compare;
+ dbp->dup_compare = __bam_compress_dupcmp;
+ }
+ }
+ }
+
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_COMPRESS)) {
+ EPRINT((env, DB_STR_A("1039",
+ "Page %lu: Btree metadata page illegally has both recnums and compression",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ if (F_ISSET(pip, VRFY_HAS_DUPS) && !F_ISSET(pip, VRFY_HAS_DUPSORT) &&
+ F_ISSET(pip, VRFY_HAS_COMPRESS)) {
+ EPRINT((env, DB_STR_A("1040",
+ "Page %lu: Btree metadata page illegally has both "
+ "unsorted duplicates and compression",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+#endif
+
+ if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1041",
+ "Page %lu: recno metadata page specifies duplicates",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
+ F_SET(pip, VRFY_IS_FIXEDLEN);
+ else if (pip->re_len > 0) {
+ /*
+ * It's wrong to have an re_len if it's not a fixed-length
+ * database
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1042",
+ "Page %lu: re_len of %lu in non-fixed-length database",
+ "%lu %lu"), (u_long)pgno, (u_long)pip->re_len));
+ }
+
+ /*
+ * We do not check that the rest of the page is 0, because it may
+ * not be and may still be correct.
+ */
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_leaf --
+ * Verify a recno leaf page.
+ *
+ * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t re_len_guess, len;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_LRECNO) {
+ ret = __db_unknown_path(env, "__ram_vrfy_leaf");
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD;
+ * further checks are dangerous.
+ */
+ if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1043",
+ "Page %lu: Recno database has dups",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Walk through inp and see if the lengths of all the records are the
+ * same--if so, this may be a fixed-length database, and we want to
+ * save off this value. We know inp to be safe if we've gotten this
+ * far.
+ */
+ re_len_guess = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ bk = GET_BKEYDATA(dbp, h, i);
+ /* KEYEMPTY. Go on. */
+ if (B_DISSET(bk->type))
+ continue;
+ if (bk->type == B_OVERFLOW)
+ len = ((BOVERFLOW *)bk)->tlen;
+ else if (bk->type == B_KEYDATA)
+ len = bk->len;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1044",
+ "Page %lu: nonsensical type for item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ continue;
+ }
+ if (re_len_guess == 0)
+ re_len_guess = len;
+
+ /*
+ * Is this item's len the same as the last one's? If not,
+ * reset to 0 and break--we don't have a single re_len.
+ * Otherwise, go on to the next item.
+ */
+ if (re_len_guess != len) {
+ re_len_guess = 0;
+ break;
+ }
+ }
+ pip->re_len = re_len_guess;
+
+ /* Save off record count. */
+ pip->rec_cnt = NUM_ENT(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy --
+ * Verify a btree leaf or internal page.
+ *
+ * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ break;
+ default:
+ ret = __db_unknown_path(env, "__bam_vrfy");
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * The record count is, on internal pages, stored in an overloaded
+ * next_pgno field. Save it off; we'll verify it when we check
+ * overall database structure. We could overload the field
+ * in VRFY_PAGEINFO, too, but this seems gross, and space
+ * is not at such a premium.
+ */
+ pip->rec_cnt = RE_NREC(h);
+
+ /*
+ * Verify inp[].
+ */
+ if (TYPE(h) == P_IRECNO) {
+ if ((ret = __ram_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+ } else if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ EPRINT((env, DB_STR_A("1045",
+ "Page %lu: item order check unsafe: skipping",
+ "%lu"), (u_long)pgno));
+ } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
+ __bam_vrfy_itemorder(dbp,
+ vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) {
+ /*
+ * We know that the elements of inp are reasonable.
+ *
+ * Check that elements fall in the proper order.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_inp --
+ * Verify that all entries in a P_IRECNO inp[] array are reasonable,
+ * and count them. Note that P_LRECNO uses __bam_vrfy_inp;
+ * P_IRECNOs are a special, and simpler, case, since they have
+ * RINTERNALs rather than BKEYDATA/BINTERNALs.
+ */
+static int
+__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ ENV *env;
+ RINTERNAL *ri;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+ u_int32_t himark, i, offset, nentries;
+ db_indx_t *inp;
+ u_int8_t *pagelayout, *p;
+
+ env = dbp->env;
+ isbad = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ nentries = 0;
+ pagelayout = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_IRECNO) {
+ ret = __db_unknown_path(env, "__ram_vrfy_inp");
+ goto err;
+ }
+
+ himark = dbp->pgsize;
+ if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ inp = P_INP(dbp, h);
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) {
+ EPRINT((env, DB_STR_A("1046",
+ "Page %lu: entries listing %lu overlaps data",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ offset = inp[i];
+ /*
+ * Check that the item offset is reasonable: it points
+ * somewhere after the inp array and before the end of the
+ * page.
+ */
+ if (offset <= (u_int32_t)((u_int8_t *)inp + i -
+ (u_int8_t *)h) ||
+ offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1047",
+ "Page %lu: bad offset %lu at index %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)offset,
+ (u_long)i));
+ continue;
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < himark)
+ himark = offset;
+
+ nentries++;
+
+ /* Make sure this RINTERNAL is not multiply referenced. */
+ ri = GET_RINTERNAL(dbp, h, i);
+ if (pagelayout[offset] == 0) {
+ pagelayout[offset] = 1;
+ child.pgno = ri->pgno;
+ child.type = V_RECNO;
+ child.nrecs = ri->nrecs;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ } else {
+ EPRINT((env, DB_STR_A("1048",
+ "Page %lu: RINTERNAL structure at offset %lu referenced twice",
+ "%lu %lu"), (u_long)pgno, (u_long)offset));
+ isbad = 1;
+ }
+ }
+
+ for (p = pagelayout + himark;
+ p < pagelayout + dbp->pgsize;
+ p += RINTERNAL_SIZE)
+ if (*p != 1) {
+ EPRINT((env, DB_STR_A("1049",
+ "Page %lu: gap between items at offset %lu",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)(p - pagelayout)));
+ isbad = 1;
+ }
+
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((env, DB_STR_A("1050",
+ "Page %lu: bad HOFFSET %lu, appears to be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)(HOFFSET(h)),
+ (u_long)himark));
+ isbad = 1;
+ }
+
+ *nentriesp = nentries;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pagelayout != NULL)
+ __os_free(env, pagelayout);
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM;
+
+/*
+ * __bam_vrfy_inp --
+ * Verify that all entries in inp[] array are reasonable;
+ * count them.
+ */
+static int
+__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ ENV *env;
+ VRFY_CHILDINFO child;
+ VRFY_ITEM *pagelayout;
+ VRFY_PAGEINFO *pip;
+ u_int32_t himark, offset; /*
+ * These would be db_indx_ts
+ * but for alignment.
+ */
+ u_int32_t i, endoff, nentries;
+ int isbad, initem, isdupitem, ret, t_ret;
+
+ env = dbp->env;
+ isbad = isdupitem = 0;
+ nentries = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ break;
+ default:
+ /*
+ * In the salvager, we might call this from a page which
+ * we merely suspect is a btree page. Otherwise, it
+ * shouldn't get called--if it is, that's a verifier bug.
+ */
+ if (LF_ISSET(DB_SALVAGE))
+ break;
+ ret = __db_unknown_path(env, "__bam_vrfy_inp");
+ goto err;
+ }
+
+ /*
+ * Loop through inp[], the array of items, until we either
+ * run out of entries or collide with the data. Keep track
+ * of h_offset in himark.
+ *
+ * For each element in inp[i], make sure it references a region
+ * that starts after the end of the inp array (as defined by
+ * NUM_ENT(h)), ends before the beginning of the page, doesn't
+ * overlap any other regions, and doesn't have a gap between
+ * it and the region immediately after it.
+ */
+ himark = dbp->pgsize;
+ if ((ret = __os_calloc(
+ env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0)
+ goto err;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, &offset)) {
+ case 0:
+ break;
+ case DB_VERIFY_BAD:
+ isbad = 1;
+ continue;
+ case DB_VERIFY_FATAL:
+ isbad = 1;
+ goto err;
+ default:
+ DB_ASSERT(env, ret != 0);
+ break;
+ }
+
+ /*
+ * We now have a plausible beginning for the item, and we know
+ * its length is safe.
+ *
+ * Mark the beginning and end in pagelayout so we can make sure
+ * items have no overlaps or gaps.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (pagelayout[offset] == VRFY_ITEM_NOTSET)
+ pagelayout[offset] = VRFY_ITEM_BEGIN;
+ else if (pagelayout[offset] == VRFY_ITEM_BEGIN) {
+ /*
+ * Having two inp entries that point at the same patch
+ * of page is legal if and only if the page is
+ * a btree leaf and they're onpage duplicate keys--
+ * that is, if (i % P_INDX) == 0.
+ */
+ if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
+ /* Flag for later. */
+ F_SET(pip, VRFY_HAS_DUPS);
+
+ /* Bump up nentries so we don't undercount. */
+ nentries++;
+
+ /*
+ * We'll check to make sure the end is
+ * equal, too.
+ */
+ isdupitem = 1;
+ } else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1051",
+ "Page %lu: duplicated item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ }
+ }
+
+ /*
+ * Mark the end. Its location varies with the page type
+ * and the item type.
+ *
+ * If the end already has a sign other than 0, do nothing--
+ * it's an overlap that we'll catch later.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ if (TYPE(h) == P_IBTREE)
+ /* It's a BINTERNAL. */
+ endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
+ else
+ endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
+ break;
+ case B_DUPLICATE:
+ /*
+ * Flag that we have dups; we'll check whether
+ * that's okay during the structure check.
+ */
+ F_SET(pip, VRFY_HAS_DUPS);
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ /*
+ * Overflow entries on internal pages are stored
+ * as the _data_ of a BINTERNAL; overflow entries
+ * on leaf pages are stored as the entire entry.
+ */
+ endoff = offset +
+ ((TYPE(h) == P_IBTREE) ?
+ BINTERNAL_SIZE(BOVERFLOW_SIZE) :
+ BOVERFLOW_SIZE) - 1;
+ break;
+ default:
+ /*
+ * We'll complain later; for now, just mark
+ * a minimum.
+ */
+ endoff = offset + BKEYDATA_SIZE(0) - 1;
+ break;
+ }
+
+ /*
+ * If this is an onpage duplicate key we've seen before,
+ * the end had better coincide too.
+ */
+ if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) {
+ EPRINT((env, DB_STR_A("1052",
+ "Page %lu: duplicated item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ isbad = 1;
+ } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET)
+ pagelayout[endoff] = VRFY_ITEM_END;
+ isdupitem = 0;
+
+ /*
+ * There should be no deleted items in a quiescent tree,
+ * except in recno.
+ */
+ if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1053",
+ "Page %lu: item %lu marked deleted", "%lu %lu"),
+ (u_long)pgno, (u_long)i));
+ }
+
+ /*
+ * Check the type and such of bk--make sure it's reasonable
+ * for the pagetype.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ /*
+ * This is a normal, non-overflow BKEYDATA or BINTERNAL.
+ * The only thing to check is the len, and that's
+ * already been done.
+ */
+ break;
+ case B_DUPLICATE:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1054",
+ "Page %lu: duplicate page referenced by internal btree page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1055",
+ "Page %lu: duplicate page referenced by recno page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ bo = (TYPE(h) == P_IBTREE) ?
+ (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
+ (BOVERFLOW *)bk;
+
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ /* Make sure tlen is reasonable. */
+ if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1056",
+ "Page %lu: impossible tlen %lu, item %lu",
+ "%lu %lu %lu"), (u_long)pgno,
+ (u_long)bo->tlen, (u_long)i));
+ /* Don't save as a child. */
+ break;
+ }
+
+ if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
+ bo->pgno == PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1057",
+ "Page %lu: offpage item %lu has bad pgno %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)i,
+ (u_long)bo->pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ child.pgno = bo->pgno;
+ child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
+ V_OVERFLOW : V_DUPLICATE);
+ child.tlen = bo->tlen;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ break;
+ default:
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1058",
+ "Page %lu: item %lu of invalid type %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)i,
+ (u_long)B_TYPE(bk->type)));
+ break;
+ }
+ }
+
+ /*
+ * Now, loop through and make sure the items are contiguous and
+ * non-overlapping.
+ */
+ initem = 0;
+ for (i = himark; i < dbp->pgsize; i++)
+ if (initem == 0)
+ switch (pagelayout[i]) {
+ case VRFY_ITEM_NOTSET:
+ /* May be just for alignment. */
+ if (i != DB_ALIGN(i, sizeof(u_int32_t)))
+ continue;
+
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1059",
+ "Page %lu: gap between items at offset %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ /* Find the end of the gap */
+				for (; (size_t)(i + 1) < dbp->pgsize &&
+				    pagelayout[i + 1] == VRFY_ITEM_NOTSET; i++)
+ ;
+ break;
+ case VRFY_ITEM_BEGIN:
+ /* We've found an item. Check its alignment. */
+ if (i != DB_ALIGN(i, sizeof(u_int32_t))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1060",
+ "Page %lu: offset %lu unaligned",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)i));
+ }
+ initem = 1;
+ nentries++;
+ break;
+ case VRFY_ITEM_END:
+ /*
+ * We've hit the end of an item even though
+ * we don't think we're in one; must
+ * be an overlap.
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1061",
+ "Page %lu: overlapping items at offset %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ else
+ switch (pagelayout[i]) {
+ case VRFY_ITEM_NOTSET:
+ /* In the middle of an item somewhere. Okay. */
+ break;
+ case VRFY_ITEM_END:
+ /* End of an item; switch to out-of-item mode.*/
+ initem = 0;
+ break;
+ case VRFY_ITEM_BEGIN:
+ /*
+ * Hit a second item beginning without an
+ * end. Overlap.
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1062",
+ "Page %lu: overlapping items at offset %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+
+ __os_free(env, pagelayout);
+
+ /* Verify HOFFSET. */
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((env, DB_STR_A("1063",
+ "Page %lu: bad HOFFSET %lu, appears to be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)HOFFSET(h),
+ (u_long)himark));
+ isbad = 1;
+ }
+
+err: if (nentriesp != NULL)
+ *nentriesp = nentries;
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_itemorder --
+ * Make sure the items on a page sort correctly.
+ *
+ * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
+ * reasonable; be sure that __bam_vrfy_inp has been called first.
+ *
+ * If ovflok is set, it also assumes that overflow page chains
+ * hanging off the current page have been sanity-checked, and so we
+ * can use __bam_cmp to verify their ordering. If it is not set,
+ * and we run into an overflow page, carp and return DB_VERIFY_BAD;
+ * we shouldn't be called if any exist.
+ *
+ * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *,
+ * PUBLIC: PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
+ */
+int
+__bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t nentries;
+ int ovflok, hasdups;
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE *bt;
+ DB_MPOOLFILE *mpf;
+ DBC *dbc;
+ DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp;
+ ENV *env;
+ PAGE *child;
+ db_pgno_t cpgno;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i, *inp;
+ int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ void *buf1, *buf2, *tmpbuf;
+
+ /*
+ * We need to work in the ORDERCHKONLY environment where we might
+ * not have a pip, but we also may need to work in contexts where
+ * NUM_ENT isn't safe.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ nentries = pip->entries;
+ } else
+ pip = NULL;
+
+ env = dbp->env;
+ ret = isbad = 0;
+ bo = NULL; /* Shut up compiler. */
+
+ memset(&dbta, 0, sizeof(DBT));
+ F_SET(&dbta, DB_DBT_REALLOC);
+
+ memset(&dbtb, 0, sizeof(DBT));
+ F_SET(&dbtb, DB_DBT_REALLOC);
+
+ buf1 = buf2 = NULL;
+
+ DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK));
+
+ dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+ if (TYPE(h) == P_LDUP)
+ func = dupfunc;
+ else {
+ func = __bam_defcmp;
+ if (dbp->bt_internal != NULL) {
+ bt = (BTREE *)dbp->bt_internal;
+ if (TYPE(h) == P_IBTREE && (bt->bt_compare != NULL ||
+ dupfunc != __bam_defcmp)) {
+ /*
+ * The problem here is that we cannot
+ * tell the difference between an off
+ * page duplicate internal page and
+ * a main database internal page.
+ * Walk down the tree to figure it out.
+ */
+ mpf = dbp->mpf;
+ child = h;
+ while (TYPE(child) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, child, 0);
+ cpgno = bi->pgno;
+ if (child != h &&
+ (ret = __memp_fput(mpf,
+ vdp->thread_info, child,
+ DB_PRIORITY_UNCHANGED)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf,
+ &cpgno, vdp->thread_info,
+ NULL, 0, &child)) != 0)
+ goto err;
+ }
+ if (TYPE(child) == P_LDUP)
+ func = dupfunc;
+ else if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ if ((ret = __memp_fput(mpf, vdp->thread_info,
+ child, DB_PRIORITY_UNCHANGED)) != 0)
+ goto err;
+ } else if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ }
+ }
+
+ /*
+ * We alternate our use of dbta and dbtb so that we can walk
+ * through the page key-by-key without copying a dbt twice.
+ * p1 is always the dbt for index i - 1, and p2 for index i.
+ * Reset the data pointers in case we are retrying.
+ */
+retry: p1 = &dbta;
+ p1->data = NULL;
+ p2 = &dbtb;
+ p2->data = NULL;
+
+ /*
+ * Loop through the entries. nentries ought to contain the
+ * actual count, and so is a safe way to terminate the loop; whether
+ * we inc. by one or two depends on whether we're a leaf page--
+ * on a leaf page, we care only about keys. On internal pages
+ * and LDUP pages, we want to check the order of all entries.
+ *
+ * Note that on IBTREE pages or the index page of a partitioned
+ * database, we start with item 1, since item 0 doesn't get looked
+ * at by __bam_cmp.
+ */
+ inp = P_INP(dbp, h);
+ adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX;
+ for (i = (TYPE(h) == P_IBTREE || dbp->p_internal != NULL) ? adj : 0;
+ i < nentries; i += adj) {
+ /*
+ * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
+ */
+ tmp = p1;
+ p1 = p2;
+ p2 = tmp;
+ tmpbuf = buf1;
+ buf1 = buf2;
+ buf2 = tmpbuf;
+
+ /*
+ * Get key i into p2.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, i);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ goto overflow;
+ } else {
+ p2->data = bi->data;
+ p2->size = bi->len;
+ }
+
+ /*
+ * The leftmost key on an internal page must be
+ * len 0, since it's just a placeholder and
+ * automatically sorts less than all keys.
+ *
+ * XXX
+ * This criterion does not currently hold!
+ * See todo list item #1686. Meanwhile, it's harmless
+ * to just not check for it.
+ */
+#if 0
+ if (i == 0 && bi->len != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1064",
+ "Page %lu: lowest key on internal page of nonzero length",
+ "%lu"), (u_long)pgno));
+ }
+#endif
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ goto overflow;
+ } else {
+ p2->data = bk->data;
+ p2->size = bk->len;
+ }
+ break;
+ default:
+ /*
+ * This means our caller screwed up and sent us
+ * an inappropriate page.
+ */
+ ret = __db_unknown_path(env, "__bam_vrfy_itemorder");
+ goto err;
+ }
+
+ if (0) {
+ /*
+ * If ovflok != 1, we can't safely go chasing
+ * overflow pages with the normal routines now;
+ * they might be unsafe or nonexistent. Mark this
+ * page as incomplete and return.
+ *
+ * Note that we don't need to worry about freeing
+ * buffers, since they can't have been allocated
+ * if overflow items are unsafe.
+ */
+overflow: if (!ovflok) {
+ if (pip != NULL)
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * Overflow items are safe to chase. Do so.
+ * Fetch the overflow item into p2->data,
+ * NULLing it or reallocing it as appropriate.
+ *
+ * (We set p2->data to buf2 before the call
+ * so we're sure to realloc if we can and if p2
+ * was just pointing at a non-overflow item.)
+ */
+ p2->data = buf2;
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ goto err;
+ if ((ret = __db_goff(dbc,
+ p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1065",
+ "Page %lu: error %lu in fetching overflow item %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)ret,
+ (u_long)i));
+ }
+ /* In case it got realloc'ed and thus changed. */
+ buf2 = p2->data;
+ }
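+
+ /*
+ * (The "if (0)" wrapper above makes the overflow block
+ * reachable only through the goto, so non-overflow items
+ * bypass it entirely.)
+ */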
+
+ /* Compare with the last key. */
+ if (p1->data != NULL && p2->data != NULL) {
+ cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
+
+ /* comparison succeeded */
+ if (cmp > 0) {
+ /*
+ * If we are looking at an internal page, we
+ * don't know whether it is part of the main
+ * database or in an off-page-duplicate tree.
+ * If the main comparator fails, retry with
+ * the duplicate comparator.
+ */
+ if (TYPE(h) == P_IBTREE && func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1066",
+ "Page %lu: out-of-order key at entry %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ /* proceed */
+ } else if (cmp == 0) {
+ if (inp[i] != inp[i - adj]) {
+ /* See above. */
+ if (TYPE(h) == P_IBTREE &&
+ func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1067",
+ "Page %lu: non-dup dup key at entry %lu",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)i));
+ }
+ /*
+ * If they compared equally, this
+ * had better be a (sub)database with dups.
+ * Mark it so we can check during the
+ * structure check.
+ */
+ if (pip != NULL)
+ F_SET(pip, VRFY_HAS_DUPS);
+ else if (hasdups == 0) {
+ /* See above. */
+ if (TYPE(h) == P_IBTREE &&
+ func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1068",
+ "Page %lu: database with no duplicates has duplicated keys",
+ "%lu"), (u_long)pgno));
+ }
+
+ /*
+ * If we're a btree leaf, check to see
+ * if the data items of these on-page dups are
+ * in sorted order. If not, flag this, so
+ * that we can make sure during the
+ * structure checks that the DUPSORT flag
+ * is unset.
+ *
+ * At this point i points to a duplicate key.
+ * Compare the datum before it (same key)
+ * to the datum after it, i.e. i-1 to i+1.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Unsafe; continue and we'll pick
+ * up the bogus nentries later.
+ */
+ if (i + 1 >= (db_indx_t)nentries)
+ continue;
+
+ /*
+ * We don't bother with clever memory
+ * management with on-page dups,
+ * as it's only really a big win
+ * in the overflow case, and overflow
+ * dups are probably (?) rare.
+ */
+ if (((ret = __bam_safe_getdata(dbp,
+ ip, h, i - 1, ovflok,
+ &dup_1, &freedup_1)) != 0) ||
+ ((ret = __bam_safe_getdata(dbp,
+ ip, h, i + 1, ovflok,
+ &dup_2, &freedup_2)) != 0))
+ goto err;
+
+ /*
+ * If either of the data are NULL,
+ * it's because they're overflows and
+ * it's not safe to chase them now.
+ * Mark an incomplete and return.
+ */
+ if (dup_1.data == NULL ||
+ dup_2.data == NULL) {
+ DB_ASSERT(env, !ovflok);
+ if (pip != NULL)
+ F_SET(pip,
+ VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * If the dups are out of order,
+ * flag this. It's not an error
+ * until we do the structure check
+ * and see whether DUPSORT is set.
+ */
+ if (dupfunc(dbp, &dup_1, &dup_2) > 0 &&
+ pip != NULL)
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+
+ if (freedup_1)
+ __os_ufree(env, dup_1.data);
+ if (freedup_2)
+ __os_ufree(env, dup_2.data);
+ }
+ }
+ }
+ }
+
+err: if (pip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+
+ if (buf1 != NULL)
+ __os_ufree(env, buf1);
+ if (buf2 != NULL)
+ __os_ufree(env, buf2);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_structure --
+ * Verify the tree structure of a btree database (including the master
+ * database containing subdbs).
+ *
+ * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__bam_vrfy_structure(dbp, vdp, meta_pgno, lp, rp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *mip, *rip;
+ db_pgno_t root, p;
+ int t_ret, ret;
+ u_int32_t nrecs, level, relen, stflags;
+
+ env = dbp->env;
+ mip = rip = NULL;
+ pgset = vdp->pgset;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, meta_pgno, (int *)&p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((env, DB_STR_A("1069",
+ "Page %lu: btree metadata page observed twice",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, meta_pgno)) != 0)
+ goto err;
+
+ root = mip->root;
+
+ if (root == 0) {
+ EPRINT((env, DB_STR_A("1070",
+ "Page %lu: btree metadata page has no root",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
+ goto err;
+
+ switch (rip->type) {
+ case P_IBTREE:
+ case P_LBTREE:
+ stflags = flags | DB_ST_TOPLEVEL;
+ if (F_ISSET(mip, VRFY_HAS_DUPS))
+ stflags |= DB_ST_DUPOK;
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT))
+ stflags |= DB_ST_DUPSORT;
+ if (F_ISSET(mip, VRFY_HAS_RECNUMS))
+ stflags |= DB_ST_RECNUM;
+ ret = __bam_vrfy_subtree(dbp,
+ vdp, root, lp, rp, stflags, NULL, NULL, NULL);
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ stflags =
+ flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL;
+ if (mip->re_len > 0)
+ stflags |= DB_ST_RELEN;
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
+ goto err;
+ /*
+ * Even if mip->re_len > 0, re_len may come back zero if the
+ * tree is empty. It should be okay to just skip the check in
+ * that case: relen can only come back zero when the tree holds
+ * no non-deleted keys at all.
+ */
+ if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
+ EPRINT((env, DB_STR_A("1071",
+ "Page %lu: recno database has bad re_len %lu",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)relen));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ ret = 0;
+ break;
+ case P_LDUP:
+ EPRINT((env, DB_STR_A("1072",
+ "Page %lu: duplicate tree referenced from metadata page",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ default:
+ EPRINT((env, DB_STR_A("1073",
+ "Page %lu: btree root of incorrect type %lu on metadata page",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)rip->type));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (mip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
+ ret = t_ret;
+ if (rip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
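+
+/*
+ * Note on the DB_ST_* flags assembled above: they encode what the metadata
+ * page claims about the tree (duplicate support, sorted duplicates, record
+ * numbering, fixed re_len), so that __bam_vrfy_subtree can flag any page
+ * whose contents contradict the claim.
+ */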
+
+/*
+ * __bam_vrfy_subtree --
+ * Verify a subtree (or entire) btree with specified root.
+ *
+ * Note that this is public because it must be called to verify
+ * offpage dup trees, including from hash.
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ void *l, *r;
+ u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+ BINTERNAL *li, *ri;
+ DB *pgset;
+ DBC *cc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ db_pgno_t next_pgno, prev_pgno;
+ db_recno_t child_nrecs, nrecs;
+ u_int32_t child_level, child_relen, j, level, relen, stflags;
+ u_int8_t leaf_type;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ int isbad, p, ret, t_ret, toplevel;
+
+ if (levelp != NULL) /* Don't leave uninitialized on error. */
+ *levelp = 0;
+ if (nrecsp != NULL)
+ *nrecsp = 0;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ next_pgno = prev_pgno = PGNO_INVALID;
+ nrecs = 0;
+ relen = 0;
+ leaf_type = P_INVALID;
+ isbad = ret = 0;
+
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ cc = NULL;
+ level = pip->bt_level;
+
+ toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
+ LF_CLR(DB_ST_TOPLEVEL);
+
+ /*
+ * If this is the root, initialize the vdp's prev- and next-pgno
+ * accounting.
+ *
+ * For each leaf page we hit, we'll want to make sure that
+ * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
+ * our page number. Then, we'll set vdp->next_pgno to pip->next_pgno
+ * and vdp->prev_pgno to our page number, and the next leaf page in
+ * line should be able to do the same verification.
+ */
+ if (toplevel) {
+ /*
+ * Cache the values stored in the vdp so that if we're an
+ * auxiliary tree such as an off-page duplicate set, our
+ * caller's leaf page chain doesn't get lost.
+ */
+ prev_pgno = vdp->prev_pgno;
+ next_pgno = vdp->next_pgno;
+ leaf_type = vdp->leaf_type;
+ vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
+ vdp->leaf_type = P_INVALID;
+ }
+
+ /*
+ * We are recursively descending a btree, starting from the root
+ * and working our way out to the leaves.
+ *
+ * There are five cases we need to deal with:
+ * 1. pgno is a recno leaf page. Any children are overflows.
+ * 2. pgno is a duplicate leaf page. Any children
+ * are overflow pages; traverse them, and then return
+ * level and nrecs.
+ * 3. pgno is an ordinary leaf page. Check whether dups are
+ * allowed, and if so, traverse any off-page dups or
+ * overflows. Then return nrecs and level.
+ * 4. pgno is a recno internal page. Recursively check any
+ * child pages, making sure their levels are one lower
+ * and their nrecs sum to ours.
+ * 5. pgno is a btree internal page. Same as #4, plus we
+ * must verify that for each pair of BINTERNAL entries
+ * N and N+1, the leftmost item on N's child sorts
+ * greater than N, and the rightmost item on N's child
+ * sorts less than N+1.
+ *
+ * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+ * we need to verify the internal sort order is correct if,
+ * due to overflow items, we were not able to do so earlier.
+ */
+ switch (pip->type) {
+ case P_LRECNO:
+ case P_LDUP:
+ case P_LBTREE:
+ /*
+ * Cases 1, 2 and 3.
+ *
+ * We're some sort of leaf page; verify
+ * that our linked list of leaves is consistent.
+ */
+ if (vdp->leaf_type == P_INVALID) {
+ /*
+ * First leaf page. Set the type that all its
+ * successors should be, and verify that our prev_pgno
+ * is PGNO_INVALID.
+ */
+ vdp->leaf_type = pip->type;
+ if (pip->prev_pgno != PGNO_INVALID)
+ goto bad_prev;
+ } else {
+ /*
+ * Successor leaf page. Check our type, the previous
+ * page's next_pgno, and our prev_pgno.
+ */
+ if (pip->type != vdp->leaf_type) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1074",
+ "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)",
+ "%lu %lu %lu"), (u_long)pip->pgno,
+ (u_long)pip->type,
+ (u_long)vdp->leaf_type));
+ }
+
+ /*
+ * Don't do the prev/next_pgno checks if we've lost
+ * leaf pages due to another corruption.
+ */
+ if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
+ if (pip->pgno != vdp->next_pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1075",
+ "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)",
+ "%lu %lu %lu"),
+ (u_long)vdp->prev_pgno,
+ (u_long)vdp->next_pgno,
+ (u_long)pip->pgno));
+ }
+ if (pip->prev_pgno != vdp->prev_pgno) {
+bad_prev: isbad = 1;
+ EPRINT((env, DB_STR_A("1076",
+ "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)",
+ "%lu %lu %lu"),
+ (u_long)pip->pgno,
+ (u_long)pip->prev_pgno,
+ (u_long)vdp->prev_pgno));
+ }
+ }
+ }
+ vdp->prev_pgno = pip->pgno;
+ vdp->next_pgno = pip->next_pgno;
+ F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
+
+ /*
+ * Overflow pages are common to all three leaf types;
+ * traverse the child list, looking for overflows.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | DB_ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* Case 1 */
+ if (pip->type == P_LRECNO) {
+ if (!LF_ISSET(DB_ST_IS_RECNO) &&
+ !(LF_ISSET(DB_ST_DUPOK) &&
+ !LF_ISSET(DB_ST_DUPSORT))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1077",
+ "Page %lu: recno leaf page non-recno tree",
+ "%lu"), (u_long)pgno));
+ goto done;
+ }
+ goto leaf;
+ } else if (LF_ISSET(DB_ST_IS_RECNO)) {
+ /*
+ * It's a non-recno leaf. Had better not be a recno
+ * subtree.
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1078",
+ "Page %lu: non-recno leaf page in recno tree",
+ "%lu"), (u_long)pgno));
+ goto done;
+ }
+
+ /* Case 2--no more work. */
+ if (pip->type == P_LDUP)
+ goto leaf;
+
+ /* Case 3 */
+
+ /* Check if we have any dups. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ /* If dups aren't allowed in this btree, trouble. */
+ if (!LF_ISSET(DB_ST_DUPOK)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1079",
+ "Page %lu: duplicates in non-dup btree",
+ "%lu"), (u_long)pgno));
+ } else {
+ /*
+ * We correctly have dups. If any are off-page,
+ * traverse those btrees recursively.
+ */
+ if ((ret =
+ __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child);
+ ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child)) {
+ stflags =
+ flags | DB_ST_RECNUM | DB_ST_DUPSET;
+ /* Skip any overflow entries. */
+ if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(
+ dbp, vdp, child->pgno,
+ stflags)) != 0) {
+ isbad = 1;
+ /* Next child. */
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(
+ dbp, vdp, child->pgno,
+ NULL, NULL,
+ stflags | DB_ST_TOPLEVEL,
+ NULL, NULL, NULL)) != 0) {
+ if (ret ==
+ DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /*
+ * If VRFY_DUPS_UNSORTED is set,
+ * DB_ST_DUPSORT had better not be.
+ */
+ if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
+ LF_ISSET(DB_ST_DUPSORT)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1080",
+ "Page %lu: unsorted duplicate set in sorted-dup database",
+ "%lu"), (u_long)pgno));
+ }
+ }
+ }
+ goto leaf;
+ case P_IBTREE:
+ case P_IRECNO:
+ /* We handle these below. */
+ break;
+ default:
+ /*
+ * If a P_IBTREE or P_IRECNO contains a reference to an
+ * invalid page, we'll wind up here; handle it gracefully.
+ * Note that the code at the "done" label assumes that the
+ * current page is a btree/recno one of some sort; this
+ * is not the case here, so we goto err.
+ *
+ * If the page is entirely zeroed, its pip->type will be a lie
+ * (we assumed it was a hash page, as they're allowed to be
+ * zeroed); handle this case specially.
+ */
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ ZEROPG_ERR_PRINT(env, pgno, DB_STR_P(
+ "btree or recno page"));
+ else
+ EPRINT((env, DB_STR_A("1081",
+ "Page %lu: btree or recno page is of inappropriate type %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)pip->type));
+
+ /*
+ * We probably lost a leaf page (or more if this was an
+ * internal page) from our prev/next_pgno chain. Flag
+ * that this is expected; we don't want or need to
+ * spew error messages about erroneous prev/next_pgnos,
+ * since that's probably not the real problem.
+ */
+ F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
+
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Cases 4 & 5: This is a btree or recno internal page. For each child,
+ * recurse, keeping a running count of nrecs and making sure the level
+ * is always reasonable.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_RECNO) {
+ if (pip->type != P_IRECNO) {
+ ret = __db_unknown_path(
+ env, "__bam_vrfy_subtree");
+ goto err;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
+ NULL, NULL, flags, &child_level, &child_nrecs,
+ &child_relen)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if (LF_ISSET(DB_ST_RELEN)) {
+ if (relen == 0)
+ relen = child_relen;
+ /*
+ * child_relen may be zero if the child subtree
+ * is empty.
+ */
+ else if (child_relen > 0 &&
+ relen != child_relen) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1082",
+ "Page %lu: recno page returned bad re_len %lu",
+ "%lu %lu"), (u_long)child->pgno,
+ (u_long)child_relen));
+ }
+ if (relenp)
+ *relenp = relen;
+ }
+ if (LF_ISSET(DB_ST_RECNUM)) {
+ if (child->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1083",
+ "Page %lu: record count incorrect: actual %lu, in record %lu",
+ "%lu %lu %lu"),
+ (u_long)child->pgno,
+ (u_long)child_nrecs,
+ (u_long)child->nrecs));
+ }
+ nrecs += child_nrecs;
+ }
+ if (isbad == 0 && level != child_level + 1) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1084",
+ "Page %lu: recno level incorrect: got %lu, expected %lu",
+ "%lu %lu %lu"),
+ (u_long)child->pgno, (u_long)child_level,
+ (u_long)(level - 1)));
+ }
+ } else if (child->type == V_OVERFLOW) {
+ /*
+ * It is possible for one internal page to reference
+ * a single overflow page twice, if all the items
+ * in the subtree referenced by slot 0 are deleted,
+ * then a similar number of items are put back
+ * before the key that formerly had been in slot 1.
+ *
+ * (Btree doesn't look at the key in slot 0, so the
+ * fact that the key formerly at slot 1 is the "wrong"
+ * parent of the stuff in the slot 0 subtree isn't
+ * really incorrect.)
+ *
+ * __db_vrfy_ovfl_structure is designed to be
+ * efficiently called multiple times for multiple
+ * references; call it here as many times as is
+ * appropriate.
+ */
+
+ /* Otherwise, __db_vrfy_childput would be broken. */
+ DB_ASSERT(env, child->refcnt >= 1);
+
+ /*
+ * An overflow referenced more than twice here
+ * shouldn't happen.
+ */
+ if (child->refcnt > 2) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1085",
+ "Page %lu: overflow page %lu referenced more than twice from internal page",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)child->pgno));
+ } else
+ for (j = 0; j < child->refcnt; j++)
+ if ((ret = __db_vrfy_ovfl_structure(dbp,
+ vdp, child->pgno, child->tlen,
+ flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* We're done with case 4. */
+ if (pip->type == P_IRECNO)
+ goto done;
+
+ /*
+ * Case 5. Btree internal pages.
+ * As described above, we need to iterate through all the
+ * items on the page and make sure that our children sort appropriately
+ * with respect to them.
+ *
+ * For each entry, li will be the "left-hand" key for the entry
+ * itself, which must sort lower than all entries on its child;
+ * ri will be the key to its right, which must sort greater.
+ */
+ if (h == NULL &&
+ (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ for (i = 0; i < pip->entries; i += O_INDX) {
+ li = GET_BINTERNAL(dbp, h, i);
+ ri = (i + O_INDX < pip->entries) ?
+ GET_BINTERNAL(dbp, h, i + O_INDX) : r;
+
+ /*
+ * The leftmost key is forcibly sorted less than all entries,
+ * so don't bother passing it.
+ */
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
+ i == 0 ? NULL : li, ri, flags, &child_level,
+ &child_nrecs, NULL)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if (LF_ISSET(DB_ST_RECNUM)) {
+ /*
+ * Keep a running tally on the actual record count so
+ * we can return it to our parent (if we have one) or
+ * compare it to the NRECS field if we're a root page.
+ */
+ nrecs += child_nrecs;
+
+ /*
+ * Make sure the actual record count of the child
+ * is equal to the value in the BINTERNAL structure.
+ */
+ if (li->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1086",
+ "Page %lu: item %lu has incorrect record count of %lu, should be %lu",
+ "%lu %lu %lu %lu"), (u_long)pgno,
+ (u_long)i, (u_long)li->nrecs,
+ (u_long)child_nrecs));
+ }
+ }
+
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1087",
+ "Page %lu: Btree level incorrect: got %lu, expected %lu",
+ "%lu %lu %lu"), (u_long)li->pgno,
+ (u_long)child_level, (u_long)(level - 1)));
+ }
+ }
+
+ if (0) {
+leaf: level = LEAFLEVEL;
+ if (LF_ISSET(DB_ST_RECNUM))
+ nrecs = pip->rec_cnt;
+
+ /* XXX
+ * We should verify that the record count on a leaf page
+ * is the sum of the number of keys and the number of
+ * records in its off-page dups. This requires looking
+ * at the page again, however, and it may all be changing
+ * soon, so for now we don't bother.
+ */
+
+ if (LF_ISSET(DB_ST_RELEN) && relenp)
+ *relenp = pip->re_len;
+ }
+done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
+ /*
+ * During the page-by-page pass, item order verification was
+ * not finished due to the presence of overflow items. If
+ * isbad == 0, though, it's now safe to do so, as we've
+ * traversed any child overflow pages. Do it.
+ */
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp,
+ vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0)
+ goto err;
+ F_CLR(pip, VRFY_INCOMPLETE);
+ }
+
+ /*
+ * It's possible to get to this point with a page that has no
+ * items, but without having detected any sort of failure yet.
+ * Having zero items is legal if it's a leaf--it may be the
+ * root page in an empty tree, or the tree may have been
+ * modified with the DB_REVSPLITOFF flag set (there's no way
+ * to tell from what's on disk). For an internal page,
+ * though, having no items is a problem (all internal pages
+ * must have children).
+ */
+ if (isbad == 0 && ret == 0) {
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ if (NUM_ENT(h) == 0 && ISINTERNAL(h)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1088",
+ "Page %lu: internal page is empty and should not be",
+ "%lu"), (u_long)pgno));
+ goto err;
+ }
+ }
+
+ /*
+ * Our parent has sent us BINTERNAL pointers to parent records
+ * so that we can verify our place with respect to them. If it's
+ * appropriate--we have a default sort function--verify this.
+ */
+ if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) &&
+ pip->type != P_IRECNO && pip->type != P_LRECNO) {
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ /*
+ * __bam_vrfy_treeorder needs to know what comparison function
+ * to use. If DB_ST_DUPSET is set, we're in a duplicate tree
+ * and we use the duplicate comparison function; otherwise,
+ * use the btree one. If unset, use the default, of course.
+ */
+ func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare :
+ ((BTREE *)dbp->bt_internal)->bt_compare;
+ if (func == NULL)
+ func = __bam_defcmp;
+
+ if ((ret = __bam_vrfy_treeorder(dbp,
+ vdp->thread_info, h, l, r, func, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+
+ /*
+ * For leaf pages nrecs was taken directly from pip->rec_cnt above,
+ * so this check trivially succeeds there; no harm done.
+ *
+ * Internal pages below the top level do not store their own
+ * record numbers, so we skip them.
+ */
+ if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1089",
+ "Page %lu: bad record count: has %lu records, claims %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)nrecs,
+ (u_long)pip->rec_cnt));
+ }
+
+ if (levelp)
+ *levelp = level;
+ if (nrecsp)
+ *nrecsp = nrecs;
+
+ pgset = vdp->pgset;
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1090",
+ "Page %lu: linked twice", "%lu"), (u_long)pgno));
+ } else if ((ret =
+ __db_vrfy_pgset_inc(pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+
+ if (toplevel)
+ /*
+ * The last page's next_pgno in the leaf chain should have been
+ * PGNO_INVALID.
+ */
+ if (vdp->next_pgno != PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1091",
+ "Page %lu: unterminated leaf chain",
+ "%lu"), (u_long)vdp->prev_pgno));
+ }
+
+err: if (toplevel) {
+ /* Restore our caller's settings. */
+ vdp->next_pgno = next_pgno;
+ vdp->prev_pgno = prev_pgno;
+ vdp->leaf_type = leaf_type;
+ }
+
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
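+
+/*
+ * Leaf-chain accounting used above, illustrated with hypothetical page
+ * numbers:
+ *
+ *	leaf 3 <-> leaf 7 <-> leaf 5
+ *
+ * When we visit leaf 7 we require vdp->next_pgno == 7 and
+ * pip->prev_pgno == vdp->prev_pgno (3); we then set vdp->prev_pgno to 7
+ * and vdp->next_pgno to pip->next_pgno (5), so the next leaf in line can
+ * repeat the check.
+ */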
+
+/*
+ * __bam_vrfy_treeorder --
+ * Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ * If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys. No need to check it.
+ *
+ * If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ BINTERNAL *lp, *rp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t flags;
+{
+ BOVERFLOW *bo;
+ DBC *dbc;
+ DBT dbt;
+ ENV *env;
+ db_indx_t last;
+ int ret, cmp;
+
+ env = dbp->env;
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_MALLOC);
+ ret = 0;
+
+ /*
+ * Empty pages are sorted correctly by definition. We check
+ * to see whether they ought to be empty elsewhere; leaf
+ * pages legally may be.
+ */
+ if (NUM_ENT(h) == 0)
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LDUP:
+ last = NUM_ENT(h) - O_INDX;
+ break;
+ case P_LBTREE:
+ last = NUM_ENT(h) - P_INDX;
+ break;
+ default:
+ return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
+ }
+
+ /* Populate a dummy cursor. */
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ /*
+ * The key on page h, the child page, is more likely to be
+ * an overflow page, so we pass its offset, rather than lp/rp's,
+ * into __bam_cmp. This will take advantage of __db_moff.
+ */
+
+ /*
+ * Skip first-item check if we're an internal page--the first
+ * entry on an internal page is treated specially by __bam_cmp,
+ * so what's on the page shouldn't matter. (Plus, since we're passing
+ * our page and item 0 to __bam_cmp, we'll sort before our
+ * parent and falsely report a failure.)
+ */
+ if (lp != NULL && TYPE(h) != P_IBTREE) {
+ if (lp->type == B_KEYDATA) {
+ dbt.data = lp->data;
+ dbt.size = lp->len;
+ } else if (lp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)lp->data;
+ if ((ret = __db_goff(dbc, &dbt,
+ bo->tlen, bo->pgno, NULL, NULL)) != 0)
+ return (ret);
+ } else
+ return (
+ __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
+ if (cmp > 0) {
+ EPRINT((env, DB_STR_A("1092",
+ "Page %lu: first item on page sorted greater than parent entry",
+ "%lu"), (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((env, DB_STR_A("1093",
+ "Page %lu: first item on page had comparison error",
+ "%lu"), (u_long)PGNO(h)));
+
+ if (dbt.data != lp->data)
+ __os_ufree(env, dbt.data);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (rp != NULL) {
+ if (rp->type == B_KEYDATA) {
+ dbt.data = rp->data;
+ dbt.size = rp->len;
+ } else if (rp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)rp->data;
+ if ((ret = __db_goff(dbc, &dbt,
+ bo->tlen, bo->pgno, NULL, NULL)) != 0)
+ return (ret);
+ } else
+ return (
+ __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
+ if (cmp < 0) {
+ EPRINT((env, DB_STR_A("1094",
+ "Page %lu: last item on page sorted greater than parent entry",
+ "%lu"), (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((env, DB_STR_A("1095",
+ "Page %lu: last item on page had comparison error",
+ "%lu"), (u_long)PGNO(h)));
+
+ if (dbt.data != rp->data)
+ __os_ufree(env, dbt.data);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_salvage --
+ * Safely dump out anything that looks like a key on an alleged
+ * btree leaf page, also mark overflow pages as seen. For internal btree
+ * pages, just mark any overflow pages as seen.
+ *
+ * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: db_pgno_t, u_int32_t, PAGE *, void *,
+ * PUBLIC: int (*)(void *, const void *), DBT *, u_int32_t));
+ */
+int
+__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBT dbt, repldbt, unknown_key, unknown_data;
+ ENV *env;
+ VRFY_ITEM *pgmap;
+ db_indx_t i, last, beg, end, *inp;
+ db_pgno_t ovflpg;
+ u_int32_t himark, ovfl_bufsz;
+ void *ovflbuf;
+ int adj, ret, t_ret, t2_ret;
+#ifdef HAVE_COMPRESSION
+ DBT kcpy, *last_key;
+ int unknown_dup_key;
+#endif
+
+ env = dbp->env;
+ ovflbuf = pgmap = NULL;
+ inp = P_INP(dbp, h);
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+ memset(&repldbt, 0, sizeof(DBT));
+
+#ifdef HAVE_COMPRESSION
+ memset(&kcpy, 0, sizeof(DBT));
+ unknown_dup_key = LF_ISSET(DB_SA_UNKNOWNKEY);
+ last_key = unknown_dup_key ? NULL : key;
+#endif
+ LF_CLR(DB_SA_UNKNOWNKEY);
+
+ DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1);
+ DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+ goto err;
+ ovfl_bufsz = dbp->pgsize;
+
+ if (LF_ISSET(DB_AGGRESSIVE) && (ret =
+ __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0)
+ goto err;
+
+ /*
+ * Loop through the inp array, spitting out key/data pairs.
+ *
+ * If we're salvaging normally, loop from 0 through NUM_ENT(h). If
+ * we're being aggressive, loop until we hit the end of the page --
+ * NUM_ENT() may be bogus.
+ */
+ himark = dbp->pgsize;
+ for (i = 0, last = UINT16_MAX;; i += O_INDX) {
+ /*
+ * If we're not aggressive, or if we're on an internal page,
+ * break when we hit NUM_ENT(h).
+ */
+ if ((!LF_ISSET(DB_AGGRESSIVE) ||
+ pgtype == P_IBTREE) && i >= NUM_ENT(h))
+ break;
+
+ /* Verify the current item. */
+ t_ret =
+ __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL);
+
+ if (t_ret != 0) {
+ /*
+ * If this is a btree leaf and we've printed out a key
+ * but not its associated data item, fix this imbalance
+ * by printing an "UNKNOWN_DATA".
+ */
+ if (pgtype == P_LBTREE && i % P_INDX == 1 &&
+ last == i - 1 && (t2_ret = __db_vrfy_prdbt(
+ &unknown_data,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t2_ret;
+ goto err;
+ }
+
+ /*
+ * Don't return DB_VERIFY_FATAL; it's private and means
+ * only that we can't go on with this page, not with
+ * the whole database. It's not even an error if we've
+ * run into it after NUM_ENT(h).
+ */
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (i < NUM_ENT(h) && ret == 0)
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * If this returned 0, it's safe to print or (carefully)
+ * try to fetch.
+ *
+ * We only print deleted items if DB_AGGRESSIVE is set.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
+ continue;
+
+ /*
+ * If this is a btree leaf and we're about to print out a data
+ * item for which we didn't print out a key, fix this imbalance
+ * by printing an "UNKNOWN_KEY".
+ */
+ if (pgtype == P_LBTREE && i % P_INDX == 1 && last != i - 1) {
+#ifdef HAVE_COMPRESSION
+ last_key = NULL;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(&unknown_key,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ }
+ last = i;
+
+ /*
+ * We're going to go try to print the next item. If key is
+ * non-NULL, we're a dup page, so we've got to print the key
+ * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the
+ * first entry.
+ */
+ if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) {
+#ifdef HAVE_COMPRESSION
+ last_key = unknown_dup_key ? NULL : key;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(key,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ }
+
+ beg = end = inp[i];
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ if (pgtype == P_IBTREE)
+ break;
+
+ end = beg + BOVERFLOW_SIZE - 1;
+ /*
+ * If we're not on a normal btree leaf page, there
+ * shouldn't be off-page dup sets. Something's
+ * confused; just drop it, and the code to pick up
+ * unlinked offpage dup sets will print it out
+ * with key "UNKNOWN" later.
+ */
+ if (pgtype != P_LBTREE)
+ break;
+
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * If the page number is unreasonable, or if this is
+ * supposed to be a key item, output "UNKNOWN_KEY" --
+ * the best we can do is run into the data items in
+ * the unlinked offpage dup pass.
+ */
+ if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) {
+ /* Not much to do on failure. */
+#ifdef HAVE_COMPRESSION
+ if (key == NULL && i % P_INDX == 0)
+ last_key = NULL;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(
+ i % P_INDX == 0 ? &unknown_key : &unknown_data,
+ 0, " ", handle, callback, 0, 0,vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ break;
+ }
+
+ /* Don't stop on error. */
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bo->pgno, &dbt, handle, callback,
+ flags | DB_SA_SKIPFIRSTKEY
+#ifdef HAVE_COMPRESSION
+ | (last_key == NULL ? DB_SA_UNKNOWNKEY : 0)
+#endif
+ )) != 0 && ret == 0)
+ ret = t_ret;
+
+ break;
+ case B_KEYDATA:
+ if (pgtype == P_IBTREE)
+ break;
+
+ end = (db_indx_t)DB_ALIGN(
+ beg + bk->len, sizeof(u_int32_t)) - 1;
+
+ dbt.data = bk->data;
+ dbt.size = bk->len;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && last_key != NULL &&
+ (key != NULL || (i % P_INDX == 1))) {
+ /*
+ * Decompress the key/data pair: the key is in
+ * last_key and the data is in dbt.
+ */
+ if ((t_ret = __bam_compress_salvage(dbp, vdp,
+ handle, callback, last_key, &dbt)) != 0) {
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (ret == 0)
+ ret = DB_VERIFY_BAD;
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ goto err;
+ } else if (ret == 0) {
+ ret = t_ret;
+ goto err;
+ }
+ }
+ } else {
+ if (key == NULL && i % P_INDX == 0) {
+ if ((ret = __os_realloc(
+ env, dbt.size, &kcpy.data)) != 0)
+ goto err;
+ memcpy(kcpy.data, dbt.data, dbt.size);
+ kcpy.size = dbt.size;
+ last_key = &kcpy;
+ }
+#endif
+
+ if ((t_ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+#ifdef HAVE_COMPRESSION
+ }
+#endif
+ break;
+ case B_OVERFLOW:
+ if (pgtype != P_IBTREE)
+ end = beg + BOVERFLOW_SIZE - 1;
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * Check for replicated overflow keys, so that we only
+ * call __db_safe_goff once per overflow page. If we
+ * get the same offset as the previous key just re-use
+ * the previous dbt.
+ *
+ * P_IBTREE pages will never have replicated overflow
+ * keys.
+ */
+ adj = pgtype == P_IBTREE ? O_INDX : P_INDX;
+ if (pgtype == P_IBTREE) {
+ /*
+ * If we're looking at a P_IBTREE, we just want
+ * to mark the overflow page as seen.
+ *
+ * Note that this call to __db_safe_goff differs
+ * from the non-P_IBTREE call.
+ *
+ * Only call __db_safe_goff if the overflow page
+ * hasn't been seen.
+ */
+ ovflpg = ((BOVERFLOW *)
+ ((BINTERNAL *)bk)->data)->pgno;
+ if (__db_salvage_isdone(vdp, ovflpg) == 0 &&
+ (t_ret =__db_safe_goff(dbp, vdp, ovflpg,
+ &dbt, &ovflbuf,
+ &ovfl_bufsz, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ } else if (i > adj - 1 &&
+ i % adj == 0 && inp[i] == inp[i - adj])
+ dbt = repldbt;
+ else {
+ /* Don't stop on error. */
+ if ((t_ret = __db_safe_goff(dbp, vdp,
+ bo->pgno, &dbt, &ovflbuf,
+ &ovfl_bufsz, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If this is a key, save it in case the next
+ * key is a replicated overflow, so we don't
+ * call __db_safe_goff again. Copy out dbt.data
+ * in case that pointer gets realloc'd when
+ * getting a data item.
+ */
+ if (i % P_INDX == 0) {
+ if (t_ret == 0) {
+ if ((t_ret = __os_realloc(env,
+ dbt.size,
+ &repldbt.data)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ memcpy(repldbt.data,
+ dbt.data, dbt.size);
+ repldbt.size = dbt.size;
+ } else {
+ if (__os_realloc(env,
+ unknown_key.size,
+ &repldbt.data) != 0)
+ goto err;
+ memcpy(repldbt.data,
+ unknown_key.data,
+ unknown_key.size);
+ repldbt.size = unknown_key.size;
+ }
+ }
+
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && last_key && t_ret == 0 &&
+ (key != NULL || (i % P_INDX == 1))) {
+ /*
+ * Decompress the key/data pair: the key is in
+ * last_key and the data is in dbt.
+ */
+ if ((t_ret = __bam_compress_salvage(dbp, vdp,
+ handle, callback, last_key, &dbt)) != 0) {
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (ret == 0)
+ ret = DB_VERIFY_BAD;
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ goto err;
+ } else if (ret == 0) {
+ ret = t_ret;
+ goto err;
+ }
+ }
+ } else {
+ if (key == NULL && i % P_INDX == 0) {
+ if (t_ret == 0) {
+ if ((ret = __os_realloc(env,
+ dbt.size, &kcpy.data)) != 0)
+ goto err;
+ memcpy(kcpy.data, dbt.data,
+ dbt.size);
+ kcpy.size = dbt.size;
+ last_key = &kcpy;
+ } else
+ last_key = NULL;
+ }
+#endif
+
+ if ((t_ret = __db_vrfy_prdbt(
+ t_ret == 0 ? &dbt : &unknown_key,
+ 0, " ", handle, callback, 0, 0, vdp))
+ != 0 && ret == 0)
+ ret = t_ret;
+#ifdef HAVE_COMPRESSION
+ }
+#endif
+ break;
+ default:
+ /*
+ * We should never get here; __db_vrfy_inpitem should
+ * not be returning 0 if bk->type is unrecognizable.
+ */
+ t_ret = __db_unknown_path(env, "__bam_salvage");
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+
+ /*
+ * If we're being aggressive, mark the beginning and end of
+ * the item; we'll come back and print whatever "junk" is in
+ * the gaps in case we had any bogus inp elements and thereby
+ * missed stuff.
+ */
+ if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) {
+ pgmap[beg] = VRFY_ITEM_BEGIN;
+ pgmap[end] = VRFY_ITEM_END;
+ }
+ }
+
+err: if (pgmap != NULL)
+ __os_free(env, pgmap);
+ if (ovflbuf != NULL)
+ __os_free(env, ovflbuf);
+ if (repldbt.data != NULL)
+ __os_free(env, repldbt.data);
+#ifdef HAVE_COMPRESSION
+ if (kcpy.data != NULL)
+ __os_free(env, kcpy.data);
+#endif
+
+ /* Mark this page as done. */
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
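+
+/*
+ * A note on the salvage output above: key and data lines are emitted in
+ * pairs through __db_vrfy_prdbt, and when one half of a pair cannot be
+ * recovered the literal placeholders "UNKNOWN_KEY"/"UNKNOWN_DATA" keep
+ * the dump pairwise aligned, e.g. (illustrative only):
+ *
+ *	mykey
+ *	UNKNOWN_DATA
+ *	UNKNOWN_KEY
+ *	mydata
+ */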
+
+/*
+ * __bam_salvage_walkdupint --
+ * Walk a known-good btree or recno internal page which is part of
+ * a dup tree, calling __db_salvage_duptree on each child page.
+ *
+ * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ ENV *env;
+ RINTERNAL *ri;
+ int ret, t_ret;
+ db_indx_t i;
+
+ env = dbp->env;
+ ret = 0;
+
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bi->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ case P_IRECNO:
+ ri = GET_RINTERNAL(dbp, h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, ri->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ default:
+ return (__db_unknown_path(
+ env, "__bam_salvage_walkdupint"));
+ }
+ /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
+ flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_meta2pgset --
+ * Given a known-good meta page, return in pgsetp a 0-terminated list of
+ * db_pgno_t's corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly: we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones), then we
+ * consolidate our scratch array into a nice list, and return. This
+ * avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *btmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ BINTERNAL *bi;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_pgno_t current, p;
+ int err_ret, ret;
+
+ DB_ASSERT(dbp->env, pgset != NULL);
+
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = err_ret = 0;
+
+ for (current = btmeta->root;;) {
+ if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __memp_fget(mpf, &current,
+ vdp->thread_info, NULL, 0, &h)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, h, 0);
+ current = bi->pgno;
+ } else { /* P_IRECNO */
+ ri = GET_RINTERNAL(dbp, h, 0);
+ current = ri->pgno;
+ }
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ goto traverse;
+ default:
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+ /*
+ * At this point, current is the pgno of leaf page h, the 0th in the
+ * tree we're concerned with.
+ */
+traverse:
+ while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+ if (h == NULL && (ret = __memp_fget(mpf,
+ &current, vdp->thread_info, NULL, 0, &h)) != 0) {
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, current, (int *)&p)) != 0)
+ goto err;
+
+ if (p != 0) {
+ /*
+ * We've found a cycle. Return success anyway--
+ * our caller may as well use however much of
+ * the pgset we've come up with.
+ */
+ break;
+ }
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, current)) != 0)
+ goto err;
+
+ current = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
+
+ return (ret == 0 ? err_ret : ret);
+}
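+
+/*
+ * The shape of the walk above, with hypothetical pages:
+ *
+ *	meta -> root -> internal -> ... -> leftmost leaf
+ *	                                        |
+ *	                                        leaf -> leaf -> ... (NEXT_PGNO)
+ *
+ * Internal pages are visited only to locate the leftmost leaf; the pgset
+ * itself is filled from the leaf chain.
+ */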
+
+/*
+ * __bam_safe_getdata --
+ *
+ * Utility function for __bam_vrfy_itemorder. Safely gets the datum at
+ * index i on page h and sticks it in DBT dbt. If ovflok is 1 and item i is
+ * an overflow item, we do a __db_goff to get the item and signal that we
+ * need to free dbt->data; if ovflok is 0, we leave the DBT zeroed.
+ */
+static int
+__bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ u_int32_t i;
+ int ovflok;
+ DBT *dbt;
+ int *freedbtp;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBC *dbc;
+ int ret;
+
+ memset(dbt, 0, sizeof(DBT));
+ *freedbtp = 0;
+
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (!ovflok)
+ return (0);
+
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ bo = (BOVERFLOW *)bk;
+ F_SET(dbt, DB_DBT_MALLOC);
+
+ *freedbtp = 1;
+ return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, NULL, NULL));
+ } else {
+ dbt->data = bk->data;
+ dbt->size = bk->len;
+ }
+
+ return (0);
+}
diff --git a/src/btree/btree.src b/src/btree/btree.src
new file mode 100644
index 00000000..08e5a206
--- /dev/null
+++ b/src/btree/btree.src
@@ -0,0 +1,290 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __bam
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/btree.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
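+/*
+ * A rough guide to the record-description grammar used below, inferred
+ * from the entries themselves:
+ *
+ *	BEGIN <name> <version> <id>: start a log record type; the version
+ *	    appears to be the release that introduced the format (e.g. 50
+ *	    for 5.0). BEGIN_COMPAT describes the layout written by an
+ *	    older release, kept so old logs can still be recovered.
+ *	DB/OP/ARG/POINTER/DBT/PGDBT/HDR/DATA <field> <type> <fmt>:
+ *	    declare the record's fields in log order.
+ *	END: close the record.
+ */
+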
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number.
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * ppgno: the parent page number.
+ * plsn: the parent page's original LSN.
+ * pg: the split page's contents before the split.
+ * opflags: SPL_NRECS: if splitting a tree that maintains a record count.
+ * pindx: index of new record in parent page.
+ */
+BEGIN split 50 62
+DB fileid int32_t ld
+OP opflags u_int32_t lu
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG ppgno db_pgno_t lu
+POINTER plsn DB_LSN * lu
+ARG pindx u_int32_t lu
+PGDBT pg DBT s
+HDR pentry DBT s
+HDR rentry DBT s
+END
+
+BEGIN_COMPAT split 48 62
+DB fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG ppgno db_pgno_t lu
+POINTER plsn DB_LSN * lu
+ARG pindx u_int32_t lu
+PGDBT pg DBT s
+DBT pentry DBT s
+DBT rentry DBT s
+ARG opflags u_int32_t lu
+END
+
+BEGIN_COMPAT split 42 62
+DB fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG root_pgno db_pgno_t lu
+PGDBT pg DBT s
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse-split.
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied onto the root page.
+ * root_pgno: the root page number.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+BEGIN rsplit 42 63
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT pgdbt DBT s
+ARG root_pgno db_pgno_t lu
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-adj: used to log the adjustment of an index.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index adjusted.
+ * indx_copy: the index to copy if inserting.
+ * is_insert: 0 if a delete, 1 if an insert.
+ */
+BEGIN adj 42 55
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG indx_copy u_int32_t lu
+ARG is_insert u_int32_t lu
+END
+
+/*
+ * BTREE-cadjust: used to adjust the count change in an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be adjusted.
+ * adjust: the signed adjustment.
+ * opflags: CAD_UPDATEROOT: if root page count was adjusted.
+ */
+BEGIN cadjust 42 56
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG adjust int32_t ld
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-cdel: used to log the intent-to-delete of a cursor record.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be deleted.
+ */
+BEGIN cdel 42 57
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+END
+
+/*
+ * BTREE-repl: used to log the replacement of an item.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be replaced.
+ * isdeleted: set if the record was previously deleted.
+ * orig: the original data.
+ * repl: the replacement data.
+ * prefix: the prefix of the replacement that matches the original.
+ * suffix: the suffix of the replacement that matches the original.
+ */
+BEGIN repl 42 58
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG isdeleted u_int32_t lu
+DBT orig DBT s
+DBT repl DBT s
+ARG prefix u_int32_t lu
+ARG suffix u_int32_t lu
+END
+
+/*
+ * BTREE-irep: used to log the replacement of an item on an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be replaced.
+ * ptype: type of the page.
+ * hdr: header of the record.
+ * data: data of the record.
+ */
+BEGIN irep 50 67
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+OP ptype u_int32_t lu
+HDR hdr DBT s
+DATA data DBT s
+HDR old DBT s
+END
+
+/*
+ * BTREE-root: log the assignment of a root btree page.
+ */
+BEGIN root 42 59
+DB fileid int32_t ld
+ARG meta_pgno db_pgno_t lu
+ARG root_pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+END
+
+/*
+ * BTREE-curadj: undo cursor adjustments on txn abort.
+ * Should only be processed during DB_TXN_ABORT.
+ * NOTE: the first_indx field gets used to hold the
+ * signed index adjustment in one case;
+ * care should be taken if its size is changed.
+ */
+BEGIN curadj 42 64
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* Which adjustment. */
+ARG mode db_ca_mode ld
+/* Page entry is from. */
+ARG from_pgno db_pgno_t lu
+/* Page entry went to. */
+ARG to_pgno db_pgno_t lu
+/* Left page of root split. */
+ARG left_pgno db_pgno_t lu
+/* First index of dup set. Also used as adjustment. */
+ARG first_indx u_int32_t lu
+/* Index entry is from. */
+ARG from_indx u_int32_t lu
+/* Index where entry went. */
+ARG to_indx u_int32_t lu
+END
+
+/*
+ * BTREE-rcuradj: undo cursor adjustments on txn abort in
+ * renumbering recno trees.
+ * Should only be processed during DB_TXN_ABORT.
+ */
+BEGIN rcuradj 42 65
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* Which adjustment. */
+ARG mode ca_recno_arg ld
+/* Root page number. */
+ARG root db_pgno_t ld
+/* Recno of the adjustment. */
+ARG recno db_recno_t ld
+/* Order number of the adjustment. */
+ARG order u_int32_t lu
+END
+
+/*
+ * BTREE-relink -- Handles relinking around a deleted leaf page.
+ * Current routine moved to __db_relink.
+ *
+ */
+BEGIN_COMPAT relink 43 147
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* The page being removed. */
+ARG pgno db_pgno_t lu
+/* The page's original lsn. */
+POINTER lsn DB_LSN * lu
+/* The previous page. */
+ARG prev db_pgno_t lu
+/* The previous page's original lsn. */
+POINTER lsn_prev DB_LSN * lu
+/* The next page. */
+ARG next db_pgno_t lu
+ /* The next page's original lsn. */
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * BTREE-merge -- Handles merging of pages during a compaction.
+ * Current routine moved to __db_merge.
+ */
+BEGIN_COMPAT merge 44 148
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+DBT hdr DBT s
+DBT data DBT s
+DBT ind DBT s
+END
diff --git a/src/btree/btree_auto.c b/src/btree/btree_auto.c
new file mode 100644
index 00000000..e5e148c5
--- /dev/null
+++ b/src/btree/btree_auto.c
@@ -0,0 +1,207 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
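+/*
+ * Each DB_LOG_RECSPEC array below mirrors one record description in
+ * btree.src: {field kind, offset of the field within the args struct
+ * (via SSZ), field name, printf format}, terminated by the LOGREC_Done
+ * sentinel. The generic log print and read code walks these tables.
+ */
+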
+DB_LOG_RECSPEC __bam_split_desc[] = {
+ {LOGREC_DB, SSZ(__bam_split_args, fileid), "fileid", ""},
+ {LOGREC_OP, SSZ(__bam_split_args, opflags), "opflags", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_args, left), "left", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, llsn), "llsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, right), "right", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, rlsn), "rlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, nlsn), "nlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, ppgno), "ppgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, plsn), "plsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, pindx), "pindx", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_split_args, pg), "pg", ""},
+ {LOGREC_HDR, SSZ(__bam_split_args, pentry), "pentry", ""},
+ {LOGREC_HDR, SSZ(__bam_split_args, rentry), "rentry", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_split_48_desc[] = {
+ {LOGREC_DB, SSZ(__bam_split_48_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, left), "left", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, llsn), "llsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, right), "right", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, rlsn), "rlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, nlsn), "nlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, ppgno), "ppgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, plsn), "plsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, pindx), "pindx", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_split_48_args, pg), "pg", ""},
+ {LOGREC_DBT, SSZ(__bam_split_48_args, pentry), "pentry", ""},
+ {LOGREC_DBT, SSZ(__bam_split_48_args, rentry), "rentry", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, opflags), "opflags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_split_42_desc[] = {
+ {LOGREC_DB, SSZ(__bam_split_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, left), "left", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_42_args, llsn), "llsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, right), "right", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_42_args, rlsn), "rlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_42_args, nlsn), "nlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, root_pgno), "root_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_split_42_args, pg), "pg", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, opflags), "opflags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_rsplit_desc[] = {
+ {LOGREC_DB, SSZ(__bam_rsplit_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_rsplit_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_rsplit_args, pgdbt), "pgdbt", ""},
+ {LOGREC_ARG, SSZ(__bam_rsplit_args, root_pgno), "root_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_rsplit_args, nrec), "nrec", "%lu"},
+ {LOGREC_DBT, SSZ(__bam_rsplit_args, rootent), "rootent", ""},
+ {LOGREC_POINTER, SSZ(__bam_rsplit_args, rootlsn), "rootlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_adj_desc[] = {
+ {LOGREC_DB, SSZ(__bam_adj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_adj_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_adj_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_adj_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_adj_args, indx_copy), "indx_copy", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_adj_args, is_insert), "is_insert", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_cadjust_desc[] = {
+ {LOGREC_DB, SSZ(__bam_cadjust_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_cadjust_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, adjust), "adjust", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, opflags), "opflags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_cdel_desc[] = {
+ {LOGREC_DB, SSZ(__bam_cdel_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_cdel_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_cdel_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_cdel_args, indx), "indx", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_repl_desc[] = {
+ {LOGREC_DB, SSZ(__bam_repl_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_repl_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_repl_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_repl_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_repl_args, isdeleted), "isdeleted", "%lu"},
+ {LOGREC_DBT, SSZ(__bam_repl_args, orig), "orig", ""},
+ {LOGREC_DBT, SSZ(__bam_repl_args, repl), "repl", ""},
+ {LOGREC_ARG, SSZ(__bam_repl_args, prefix), "prefix", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_repl_args, suffix), "suffix", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_irep_desc[] = {
+ {LOGREC_DB, SSZ(__bam_irep_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_irep_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_irep_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_irep_args, indx), "indx", "%lu"},
+ {LOGREC_OP, SSZ(__bam_irep_args, ptype), "ptype", "%lu"},
+ {LOGREC_HDR, SSZ(__bam_irep_args, hdr), "hdr", ""},
+ {LOGREC_DATA, SSZ(__bam_irep_args, data), "data", ""},
+ {LOGREC_HDR, SSZ(__bam_irep_args, old), "old", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_root_desc[] = {
+ {LOGREC_DB, SSZ(__bam_root_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_root_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_root_args, root_pgno), "root_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_root_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_curadj_desc[] = {
+ {LOGREC_DB, SSZ(__bam_curadj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, mode), "mode", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, from_pgno), "from_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, to_pgno), "to_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, left_pgno), "left_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, first_indx), "first_indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, from_indx), "from_indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, to_indx), "to_indx", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_rcuradj_desc[] = {
+ {LOGREC_DB, SSZ(__bam_rcuradj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, mode), "mode", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, root), "root", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, recno), "recno", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, order), "order", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_relink_43_desc[] = {
+ {LOGREC_DB, SSZ(__bam_relink_43_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_relink_43_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_relink_43_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_relink_43_args, prev), "prev", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_relink_43_args, lsn_prev), "lsn_prev", ""},
+ {LOGREC_ARG, SSZ(__bam_relink_43_args, next), "next", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_relink_43_args, lsn_next), "lsn_next", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_merge_44_desc[] = {
+ {LOGREC_DB, SSZ(__bam_merge_44_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_merge_44_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_merge_44_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_merge_44_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_merge_44_args, nlsn), "nlsn", ""},
+ {LOGREC_DBT, SSZ(__bam_merge_44_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__bam_merge_44_args, data), "data", ""},
+ {LOGREC_DBT, SSZ(__bam_merge_44_args, ind), "ind", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __bam_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_recover, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_recover, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_recover, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_recover, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_recover, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_recover, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_irep_recover, DB___bam_irep)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_recover, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_recover, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_recover, DB___bam_rcuradj)) != 0)
+ return (ret);
+ return (0);
+}
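
The table-driven registration above is the entire recovery hookup for the btree access method: one recovery function per log-record type. As a minimal sketch of the dispatch pattern those calls build (the struct layout and names below are assumptions for illustration, not the real DB_DISTAB internals):

        typedef int (*recover_fn)(ENV *, DBT *, DB_LSN *, db_recops, void *);

        struct hypothetical_distab {            /* assumed layout */
                recover_fn *dispatch;           /* indexed by record type */
                size_t size;                    /* number of slots */
        };

        static int
        dispatch_record(struct hypothetical_distab *dtab, u_int32_t rectype,
            ENV *env, DBT *rec, DB_LSN *lsnp, db_recops op, void *info)
        {
                /* An unregistered record type is an error in recovery. */
                if (rectype >= dtab->size || dtab->dispatch[rectype] == NULL)
                        return (EINVAL);
                return (dtab->dispatch[rectype](env, rec, lsnp, op, info));
        }
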
diff --git a/src/btree/btree_autop.c b/src/btree/btree_autop.c
new file mode 100644
index 00000000..d2bee7d0
--- /dev/null
+++ b/src/btree/btree_autop.c
@@ -0,0 +1,291 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __bam_split_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_split", __bam_split_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_split_48_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_48_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_split_48", __bam_split_48_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_split_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_split_42", __bam_split_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rsplit_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_rsplit", __bam_rsplit_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_adj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_adj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_adj", __bam_adj_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cadjust_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_cadjust", __bam_cadjust_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_cdel_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cdel_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_cdel", __bam_cdel_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_repl_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_repl_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_repl", __bam_repl_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_irep_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_irep_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_irep", __bam_irep_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_root_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_root_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_root", __bam_root_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_curadj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_curadj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_curadj", __bam_curadj_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rcuradj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_rcuradj", __bam_rcuradj_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_relink_43_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_relink_43_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_relink_43", __bam_relink_43_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_merge_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_merge_44_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_merge_44", __bam_merge_44_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_print, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_print, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_print, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_print, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_print, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_print, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_irep_print, DB___bam_irep)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_print, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_print, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_print, DB___bam_rcuradj)) != 0)
+ return (ret);
+ return (0);
+}
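
Every *_print function above is the same three lines: it hands its DB_LOG_RECSPEC table to __log_print_record, which walks the table to format the decoded record. A sketch of that table-walking idea (the recspec field names and the uniform 32-bit decode are assumptions; real records mix field widths):

        static void
        print_by_spec(const DB_LOG_RECSPEC *sp, const void *argp)
        {
                /* The *_desc arrays end with a LOGREC_Done sentinel. */
                for (; sp->type != LOGREC_Done; ++sp)
                        if (sp->type == LOGREC_ARG) {
                                /* Numeric field at sp->offset, printed
                                 * with the table's format string.
                                 */
                                printf("\t%s: ", sp->name);
                                printf(sp->fmt, (u_long)*(const u_int32_t *)
                                    ((const u_int8_t *)argp + sp->offset));
                                printf("\n");
                        } else
                                /* DBTs, LSNs and page images need
                                 * type-specific formatting.
                                 */
                                printf("\t%s: <binary>\n", sp->name);
        }
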
diff --git a/src/clib/atoi.c b/src/clib/atoi.c
new file mode 100644
index 00000000..d064ffb0
--- /dev/null
+++ b/src/clib/atoi.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * atoi --
+ *
+ * PUBLIC: #ifndef HAVE_ATOI
+ * PUBLIC: int atoi __P((const char *));
+ * PUBLIC: #endif
+ */
+int
+atoi(str)
+ const char *str;
+{
+ return (int)strtol(str, (char **)NULL, 10);
+}
diff --git a/src/clib/atol.c b/src/clib/atol.c
new file mode 100644
index 00000000..9aefcd5a
--- /dev/null
+++ b/src/clib/atol.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * atol --
+ *
+ * PUBLIC: #ifndef HAVE_ATOL
+ * PUBLIC: long atol __P((const char *));
+ * PUBLIC: #endif
+ */
+long
+atol(str)
+ const char *str;
+{
+ return strtol(str, (char **)NULL, 10);
+}
diff --git a/src/clib/bsearch.c b/src/clib/bsearch.c
new file mode 100644
index 00000000..3e55009a
--- /dev/null
+++ b/src/clib/bsearch.c
@@ -0,0 +1,38 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * bsearch --
+ *
+ * PUBLIC: #ifndef HAVE_BSEARCH
+ * PUBLIC: void *bsearch __P((const void *, const void *, size_t,
+ * PUBLIC: size_t, int (*)(const void *, const void *)));
+ * PUBLIC: #endif
+ */
+void *
+bsearch(key, base, nmemb, size, cmp)
+ const void *key;
+ const void *base;
+ size_t nmemb;
+ size_t size;
+ int (*cmp) __P((const void *, const void *));
+{
+ size_t i;
+
+	/* Not a binary search: a linear scan, which is correct though O(n). */
+	for (i = 0; i < nmemb; i++) {
+ if ((*cmp)(key, (const void *)((char *)base + i * size)) == 0)
+ return ((void *)((char *)base + i * size));
+ }
+
+ return (NULL);
+}
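
A usage sketch for the replacement (hypothetical caller):

        static int
        cmp_int(const void *a, const void *b)
        {
                return (*(const int *)a - *(const int *)b);
        }

        static int *
        find_key(int *keys, size_t n, int key)
        {
                /* Returns a pointer to the match, or NULL.  Because the
                 * scan is linear, the array need not even be sorted.
                 */
                return (bsearch(&key, keys, n, sizeof(*keys), cmp_int));
        }
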
diff --git a/src/clib/getcwd.c b/src/clib/getcwd.c
new file mode 100644
index 00000000..83e8b62d
--- /dev/null
+++ b/src/clib/getcwd.c
@@ -0,0 +1,261 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+# include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+# include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+# include <ndir.h>
+# endif
+#endif
+#endif
+
+#define ISDOT(dp) \
+ (dp->d_name[0] == '.' && (dp->d_name[1] == '\0' || \
+ (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
+
+#ifndef dirfd
+#define dirfd(dirp) ((dirp)->dd_fd)
+#endif
+
+/*
+ * getcwd --
+ * Get the current working directory.
+ *
+ * PUBLIC: #ifndef HAVE_GETCWD
+ * PUBLIC: char *getcwd __P((char *, size_t));
+ * PUBLIC: #endif
+ */
+char *
+getcwd(pt, size)
+ char *pt;
+ size_t size;
+{
+ register struct dirent *dp;
+ register DIR *dir;
+ register dev_t dev;
+ register ino_t ino;
+ register int first;
+ register char *bpt, *bup;
+ struct stat s;
+ dev_t root_dev;
+ ino_t root_ino;
+ size_t ptsize, upsize;
+ int ret, save_errno;
+ char *ept, *eup, *up;
+
+ /*
+ * If no buffer specified by the user, allocate one as necessary.
+ * If a buffer is specified, the size has to be non-zero. The path
+ * is built from the end of the buffer backwards.
+ */
+ if (pt) {
+ ptsize = 0;
+ if (!size) {
+ __os_set_errno(EINVAL);
+ return (NULL);
+ }
+ if (size == 1) {
+ __os_set_errno(ERANGE);
+ return (NULL);
+ }
+ ept = pt + size;
+ } else {
+ if ((ret =
+ __os_malloc(NULL, ptsize = 1024 - 4, &pt)) != 0) {
+ __os_set_errno(ret);
+ return (NULL);
+ }
+ ept = pt + ptsize;
+ }
+ bpt = ept - 1;
+ *bpt = '\0';
+
+ /*
+ * Allocate bytes (1024 - malloc space) for the string of "../"'s.
+ * Should always be enough (it's 340 levels). If it's not, allocate
+ * as necessary. Special case the first stat, it's ".", not "..".
+ */
+ if ((ret = __os_malloc(NULL, upsize = 1024 - 4, &up)) != 0)
+ goto err;
+	eup = up + upsize;
+ bup = up;
+ up[0] = '.';
+ up[1] = '\0';
+
+ /* Save root values, so know when to stop. */
+ if (stat("/", &s))
+ goto err;
+ root_dev = s.st_dev;
+ root_ino = s.st_ino;
+
+ __os_set_errno(0); /* XXX readdir has no error return. */
+
+ for (first = 1;; first = 0) {
+ /* Stat the current level. */
+ if (lstat(up, &s))
+ goto err;
+
+ /* Save current node values. */
+ ino = s.st_ino;
+ dev = s.st_dev;
+
+ /* Check for reaching root. */
+ if (root_dev == dev && root_ino == ino) {
+ *--bpt = PATH_SEPARATOR[0];
+ /*
+ * It's unclear that it's a requirement to copy the
+ * path to the beginning of the buffer, but it's always
+ * been that way and stuff would probably break.
+ */
+ bcopy(bpt, pt, ept - bpt);
+ __os_free(NULL, up);
+ return (pt);
+ }
+
+ /*
+ * Build pointer to the parent directory, allocating memory
+ * as necessary. Max length is 3 for "../", the largest
+ * possible component name, plus a trailing NULL.
+ */
+ if (bup + 3 + MAXNAMLEN + 1 >= eup) {
+ if (__os_realloc(NULL, upsize *= 2, &up) != 0)
+ goto err;
+ bup = up;
+ eup = up + upsize;
+ }
+ *bup++ = '.';
+ *bup++ = '.';
+ *bup = '\0';
+
+ /* Open and stat parent directory. */
+ if (!(dir = opendir(up)) || fstat(dirfd(dir), &s))
+ goto err;
+
+ /* Add trailing slash for next directory. */
+ *bup++ = PATH_SEPARATOR[0];
+
+ /*
+ * If it's a mount point, have to stat each element because
+ * the inode number in the directory is for the entry in the
+ * parent directory, not the inode number of the mounted file.
+ */
+ save_errno = 0;
+ if (s.st_dev == dev) {
+ for (;;) {
+ if (!(dp = readdir(dir)))
+ goto notfound;
+ if (dp->d_fileno == ino)
+ break;
+ }
+ } else
+ for (;;) {
+ if (!(dp = readdir(dir)))
+ goto notfound;
+ if (ISDOT(dp))
+ continue;
+ bcopy(dp->d_name, bup, dp->d_namlen + 1);
+
+ /* Save the first error for later. */
+ if (lstat(up, &s)) {
+ if (save_errno == 0)
+ save_errno = __os_get_errno();
+ __os_set_errno(0);
+ continue;
+ }
+ if (s.st_dev == dev && s.st_ino == ino)
+ break;
+ }
+
+ /*
+ * Check for length of the current name, preceding slash,
+ * leading slash.
+ */
+ if (bpt - pt < dp->d_namlen + (first ? 1 : 2)) {
+ size_t len, off;
+
+ if (!ptsize) {
+ __os_set_errno(ERANGE);
+ goto err;
+ }
+ off = bpt - pt;
+ len = ept - bpt;
+ if (__os_realloc(NULL, ptsize *= 2, &pt) != 0)
+ goto err;
+ bpt = pt + off;
+ ept = pt + ptsize;
+ bcopy(bpt, ept - len, len);
+ bpt = ept - len;
+ }
+ if (!first)
+ *--bpt = PATH_SEPARATOR[0];
+ bpt -= dp->d_namlen;
+ bcopy(dp->d_name, bpt, dp->d_namlen);
+ (void)closedir(dir);
+
+ /* Truncate any file name. */
+ *bup = '\0';
+ }
+
+notfound:
+ /*
+ * If readdir set errno, use it, not any saved error; otherwise,
+ * didn't find the current directory in its parent directory, set
+ * errno to ENOENT.
+ */
+ if (__os_get_errno_ret_zero() == 0)
+ __os_set_errno(save_errno == 0 ? ENOENT : save_errno);
+ /* FALLTHROUGH */
+err:
+ if (ptsize)
+ __os_free(NULL, pt);
+ __os_free(NULL, up);
+ return (NULL);
+}
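
A usage sketch (assuming <stdio.h>; note that with a NULL buffer this replacement allocates the result with __os_malloc, which would then need to be released with __os_free):

        static void
        show_cwd(void)
        {
                char buf[1024];

                if (getcwd(buf, sizeof(buf)) == NULL)
                        perror("getcwd");   /* EINVAL, ERANGE, ENOENT, ... */
                else
                        (void)printf("%s\n", buf);
        }
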
diff --git a/src/clib/getopt.c b/src/clib/getopt.c
new file mode 100644
index 00000000..ca98e7f1
--- /dev/null
+++ b/src/clib/getopt.c
@@ -0,0 +1,153 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1987, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * Avoid including internal header files, since this
+ * file is also used by example code.
+ *
+ * Unconditional inclusion of stdio and string is
+ * OK in this file; it works on all platforms
+ * for which this file is used.
+ */
+extern char *__db_rpath(const char *);
+#include <stdio.h>
+#include <string.h>
+
+int __db_getopt_reset; /* global reset for VxWorks. */
+
+int opterr = 1, /* if error message should be printed */
+ optind = 1, /* index into parent argv vector */
+ optopt, /* character checked for validity */
+ optreset; /* reset getopt */
+char *optarg; /* argument associated with option */
+
+#undef BADCH
+#define BADCH (int)'?'
+#undef BADARG
+#define BADARG (int)':'
+#undef EMSG
+#define EMSG ""
+
+/*
+ * getopt --
+ * Parse argc/argv argument vector.
+ *
+ * PUBLIC: #ifndef HAVE_GETOPT
+ * PUBLIC: int getopt __P((int, char * const *, const char *));
+ * PUBLIC: #endif
+ */
+int
+getopt(nargc, nargv, ostr)
+ int nargc;
+ char * const *nargv;
+ const char *ostr;
+{
+ static char *progname;
+ static char *place = EMSG; /* option letter processing */
+ char *oli; /* option letter list index */
+
+ /*
+ * VxWorks needs to be able to repeatedly call getopt from multiple
+ * programs within its global name space.
+ */
+ if (__db_getopt_reset) {
+ __db_getopt_reset = 0;
+
+ opterr = optind = 1;
+ optopt = optreset = 0;
+ optarg = NULL;
+ progname = NULL;
+ place = EMSG;
+ }
+ if (!progname) {
+ if ((progname = __db_rpath(*nargv)) == NULL)
+ progname = *nargv;
+ else
+ ++progname;
+ }
+
+ if (optreset || !*place) { /* update scanning pointer */
+ optreset = 0;
+ if (optind >= nargc || *(place = nargv[optind]) != '-') {
+ place = EMSG;
+ return (EOF);
+ }
+ if (place[1] && *++place == '-') { /* found "--" */
+ ++optind;
+ place = EMSG;
+ return (EOF);
+ }
+ } /* option letter okay? */
+ if ((optopt = (int)*place++) == (int)':' ||
+ !(oli = strchr(ostr, optopt))) {
+ /*
+ * if the user didn't specify '-' as an option,
+ * assume it means EOF.
+ */
+ if (optopt == (int)'-')
+ return (EOF);
+ if (!*place)
+ ++optind;
+ if (opterr && *ostr != ':')
+ (void)fprintf(stderr,
+ "%s: illegal option -- %c\n", progname, optopt);
+ return (BADCH);
+ }
+ if (*++oli != ':') { /* don't need argument */
+ optarg = NULL;
+ if (!*place)
+ ++optind;
+ }
+ else { /* need an argument */
+ if (*place) /* no white space */
+ optarg = place;
+ else if (nargc <= ++optind) { /* no arg */
+ place = EMSG;
+ if (*ostr == ':')
+ return (BADARG);
+ if (opterr)
+ (void)fprintf(stderr,
+ "%s: option requires an argument -- %c\n",
+ progname, optopt);
+ return (BADCH);
+ }
+ else /* white space */
+ optarg = nargv[optind];
+ place = EMSG;
+ ++optind;
+ }
+ return (optopt); /* dump back option letter */
+}
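
A usage sketch (hypothetical program; in "ab:", -a is a flag and -b takes an argument):

        int
        main(int argc, char *argv[])
        {
                int ch;

                __db_getopt_reset = 1;  /* allow reparsing on VxWorks */
                while ((ch = getopt(argc, argv, "ab:")) != EOF)
                        switch (ch) {
                        case 'a':       /* simple flag */
                                break;
                        case 'b':       /* optarg points at -b's argument */
                                break;
                        default:        /* getopt printed the complaint */
                                return (1);
                        }
                /* argv[optind] is the first non-option argument. */
                return (0);
        }
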
diff --git a/src/clib/isalpha.c b/src/clib/isalpha.c
new file mode 100644
index 00000000..6bf1ffb7
--- /dev/null
+++ b/src/clib/isalpha.c
@@ -0,0 +1,28 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isalpha --
+ *
+ * PUBLIC: #ifndef HAVE_ISALPHA
+ * PUBLIC: int isalpha __P((int));
+ * PUBLIC: #endif
+ */
+int
+isalpha(c)
+ int c;
+{
+ /*
+ * Depends on ASCII-like character ordering.
+ */
+ return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ? 1 : 0);
+}
diff --git a/src/clib/isdigit.c b/src/clib/isdigit.c
new file mode 100644
index 00000000..d1b2a65e
--- /dev/null
+++ b/src/clib/isdigit.c
@@ -0,0 +1,28 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isdigit --
+ *
+ * PUBLIC: #ifndef HAVE_ISDIGIT
+ * PUBLIC: int isdigit __P((int));
+ * PUBLIC: #endif
+ */
+int
+isdigit(c)
+ int c;
+{
+ /*
+ * Depends on ASCII-like character ordering.
+ */
+ return (c >= '0' && c <= '9' ? 1 : 0);
+}
diff --git a/src/clib/isprint.c b/src/clib/isprint.c
new file mode 100644
index 00000000..685e20ea
--- /dev/null
+++ b/src/clib/isprint.c
@@ -0,0 +1,28 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isprint --
+ *
+ * PUBLIC: #ifndef HAVE_ISPRINT
+ * PUBLIC: int isprint __P((int));
+ * PUBLIC: #endif
+ */
+int
+isprint(c)
+ int c;
+{
+ /*
+ * Depends on ASCII character values.
+ */
+ return ((c >= ' ' && c <= '~') ? 1 : 0);
+}
diff --git a/src/clib/isspace.c b/src/clib/isspace.c
new file mode 100644
index 00000000..df450d3b
--- /dev/null
+++ b/src/clib/isspace.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isspace --
+ *
+ * PUBLIC: #ifndef HAVE_ISSPACE
+ * PUBLIC: int isspace __P((int));
+ * PUBLIC: #endif
+ */
+int
+isspace(c)
+ int c;
+{
+ return (c == '\t' || c == '\n' ||
+ c == '\v' || c == '\f' || c == '\r' || c == ' ' ? 1 : 0);
+}
diff --git a/src/clib/memcmp.c b/src/clib/memcmp.c
new file mode 100644
index 00000000..7fec827c
--- /dev/null
+++ b/src/clib/memcmp.c
@@ -0,0 +1,62 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * memcmp --
+ *
+ * PUBLIC: #ifndef HAVE_MEMCMP
+ * PUBLIC: int memcmp __P((const void *, const void *, size_t));
+ * PUBLIC: #endif
+ */
+int
+memcmp(s1, s2, n)
+ const void *s1, *s2;
+ size_t n;
+{
+ if (n != 0) {
+		const unsigned char *p1 = (const unsigned char *)s1,
+			*p2 = (const unsigned char *)s2;
+ do {
+ if (*p1++ != *p2++)
+ return (*--p1 - *--p2);
+ } while (--n != 0);
+ }
+ return (0);
+}
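
The return value is the difference of the first mismatching bytes compared as unsigned chars, so bytes above 0x7f sort high; a small check (assuming <assert.h> and <string.h>):

        static void
        memcmp_demo(void)
        {
                assert(memcmp("abc", "abc", 3) == 0);
                assert(memcmp("abc", "abd", 3) < 0);    /* 'c' < 'd' */
                assert(memcmp("\xff", "\x01", 1) > 0);  /* unsigned bytes */
        }
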
diff --git a/src/clib/memmove.c b/src/clib/memmove.c
new file mode 100644
index 00000000..34a181cc
--- /dev/null
+++ b/src/clib/memmove.c
@@ -0,0 +1,150 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * sizeof(word) MUST BE A POWER OF TWO
+ * SO THAT wmask BELOW IS ALL ONES
+ */
+typedef int word; /* "word" used for optimal copy speed */
+
+#undef wsize
+#define wsize sizeof(word)
+#undef wmask
+#define wmask (wsize - 1)
+
+/*
+ * Copy a block of memory, handling overlap.
+ * This is the routine that actually implements
+ * (the portable versions of) bcopy, memcpy, and memmove.
+ */
+#ifdef MEMCOPY
+/*
+ * PUBLIC: #ifndef HAVE_MEMCPY
+ * PUBLIC: void *memcpy __P((void *, const void *, size_t));
+ * PUBLIC: #endif
+ */
+void *
+memcpy(dst0, src0, length)
+#else
+#ifdef MEMMOVE
+/*
+ * PUBLIC: #ifndef HAVE_MEMMOVE
+ * PUBLIC: void *memmove __P((void *, const void *, size_t));
+ * PUBLIC: #endif
+ */
+void *
+memmove(dst0, src0, length)
+#else
+void
+bcopy(src0, dst0, length)
+#endif
+#endif
+ void *dst0;
+ const void *src0;
+ register size_t length;
+{
+ register char *dst = dst0;
+ register const char *src = src0;
+ register size_t t;
+
+ if (length == 0 || dst == src) /* nothing to do */
+ goto done;
+
+ /*
+ * Macros: loop-t-times; and loop-t-times, t>0
+ */
+#undef TLOOP
+#define TLOOP(s) if (t) TLOOP1(s)
+#undef TLOOP1
+#define TLOOP1(s) do { s; } while (--t)
+
+ if ((unsigned long)dst < (unsigned long)src) {
+ /*
+ * Copy forward.
+ */
+ t = (size_t)src; /* only need low bits */
+ if ((t | (size_t)dst) & wmask) {
+ /*
+ * Try to align operands. This cannot be done
+ * unless the low bits match.
+ */
+ if ((t ^ (size_t)dst) & wmask || length < wsize)
+ t = length;
+ else
+ t = wsize - (t & wmask);
+ length -= t;
+ TLOOP1(*dst++ = *src++);
+ }
+ /*
+ * Copy whole words, then mop up any trailing bytes.
+ */
+ t = length / wsize;
+ TLOOP(*(word *)dst = *(word *)src; src += wsize; dst += wsize);
+ t = length & wmask;
+ TLOOP(*dst++ = *src++);
+ } else {
+ /*
+ * Copy backwards. Otherwise essentially the same.
+ * Alignment works as before, except that it takes
+ * (t&wmask) bytes to align, not wsize-(t&wmask).
+ */
+ src += length;
+ dst += length;
+ t = (size_t)src;
+ if ((t | (size_t)dst) & wmask) {
+ if ((t ^ (size_t)dst) & wmask || length <= wsize)
+ t = length;
+ else
+ t &= wmask;
+ length -= t;
+ TLOOP1(*--dst = *--src);
+ }
+ t = length / wsize;
+ TLOOP(src -= wsize; dst -= wsize; *(word *)dst = *(word *)src);
+ t = length & wmask;
+ TLOOP(*--dst = *--src);
+ }
+done:
+#if defined(MEMCOPY) || defined(MEMMOVE)
+ return (dst0);
+#else
+ return;
+#endif
+}
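
Because the routine copies backwards when the destination overlaps the end of the source, in-place shifts are safe; a small demo (hypothetical caller):

        static void
        overlap_demo(void)
        {
                char buf[] = "abcdef";

                /* dst > src, so the backward-copy branch is taken. */
                memmove(buf + 1, buf, 5);       /* buf is now "aabcde" */
        }
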
diff --git a/src/clib/printf.c b/src/clib/printf.c
new file mode 100644
index 00000000..a2c01296
--- /dev/null
+++ b/src/clib/printf.c
@@ -0,0 +1,116 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * printf --
+ *
+ * PUBLIC: #ifndef HAVE_PRINTF
+ * PUBLIC: int printf __P((const char *, ...));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_PRINTF
+int
+#ifdef STDC_HEADERS
+printf(const char *fmt, ...)
+#else
+printf(fmt, va_alist)
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ size_t len;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ /*
+ * We implement printf/fprintf with fwrite, because Berkeley DB uses
+ * fwrite in other places.
+ */
+ return (fwrite(
+ buf, sizeof(char), (size_t)len, stdout) == len ? (int)len: -1);
+}
+#endif /* HAVE_PRINTF */
+
+/*
+ * fprintf --
+ *
+ * PUBLIC: #ifndef HAVE_PRINTF
+ * PUBLIC: int fprintf __P((FILE *, const char *, ...));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_PRINTF
+int
+#ifdef STDC_HEADERS
+fprintf(FILE *fp, const char *fmt, ...)
+#else
+fprintf(fp, fmt, va_alist)
+ FILE *fp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ size_t len;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+	len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ /*
+ * We implement printf/fprintf with fwrite, because Berkeley DB uses
+ * fwrite in other places.
+ */
+ return (fwrite(
+ buf, sizeof(char), (size_t)len, fp) == len ? (int)len: -1);
+}
+#endif /* HAVE_PRINTF */
+
+/*
+ * vfprintf --
+ *
+ * PUBLIC: #ifndef HAVE_PRINTF
+ * PUBLIC: int vfprintf __P((FILE *, const char *, va_list));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_PRINTF
+int
+vfprintf(fp, fmt, ap)
+ FILE *fp;
+ const char *fmt;
+ va_list ap;
+{
+ size_t len;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+	len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ /*
+ * We implement printf/fprintf with fwrite, because Berkeley DB uses
+ * fwrite in other places.
+ */
+ return (fwrite(
+ buf, sizeof(char), (size_t)len, fp) == len ? (int)len: -1);
+}
+#endif /* HAVE_PRINTF */
diff --git a/src/clib/qsort.c b/src/clib/qsort.c
new file mode 100644
index 00000000..cec6288c
--- /dev/null
+++ b/src/clib/qsort.c
@@ -0,0 +1,181 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * static char sccsid[] = "@(#)qsort.c 8.1 (Berkeley) 6/4/93";
+ * Id: qsort.c,v 1.4 1996/04/19 18:40:20 bde
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static char *med3 __P((char *,
+ char *, char *, int (*)(const void *, const void *)));
+static void swapfunc __P((char *, char *, int, int));
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+#define swapcode(TYPE, parmi, parmj, n) { \
+ long i = (n) / sizeof(TYPE); \
+ register TYPE *pi = (TYPE *) (parmi); \
+ register TYPE *pj = (TYPE *) (parmj); \
+ do { \
+ register TYPE t = *pi; \
+ *pi++ = *pj; \
+ *pj++ = t; \
+ } while (--i > 0); \
+}
+
+#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+ es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+
+static inline void
+swapfunc(a, b, n, swaptype)
+ char *a, *b;
+ int n, swaptype;
+{
+ if (swaptype <= 1)
+ swapcode(long, a, b, n)
+ else
+ swapcode(char, a, b, n)
+}
+
+#define swap(a, b) \
+ if (swaptype == 0) { \
+ long t = *(long *)(a); \
+ *(long *)(a) = *(long *)(b); \
+ *(long *)(b) = t; \
+ } else \
+ swapfunc(a, b, es, swaptype)
+
+#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)
+
+static inline char *
+med3(a, b, c, cmp)
+ char *a, *b, *c;
+ int (*cmp)(const void *, const void *);
+{
+ return cmp(a, b) < 0 ?
+ (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a ))
+ :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_QSORT
+ * PUBLIC: void qsort __P((void *,
+ * PUBLIC: size_t, size_t, int(*)(const void *, const void *)));
+ * PUBLIC: #endif
+ */
+void
+qsort(a, n, es, cmp)
+ void *a;
+ size_t n, es;
+ int (*cmp) __P((const void *, const void *));
+{
+ char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+ int d, r, swaptype, swap_cnt;
+
+loop: SWAPINIT(a, es);
+ swap_cnt = 0;
+ if (n < 7) {
+ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+ for (pl = pm; pl > (char *)a && cmp(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+ pm = (char *)a + (n / 2) * es;
+ if (n > 7) {
+ pl = a;
+ pn = (char *)a + (n - 1) * es;
+ if (n > 40) {
+ d = (n / 8) * es;
+ pl = med3(pl, pl + d, pl + 2 * d, cmp);
+ pm = med3(pm - d, pm, pm + d, cmp);
+ pn = med3(pn - 2 * d, pn - d, pn, cmp);
+ }
+ pm = med3(pl, pm, pn, cmp);
+ }
+ swap(a, pm);
+ pa = pb = (char *)a + es;
+
+ pc = pd = (char *)a + (n - 1) * es;
+ for (;;) {
+ while (pb <= pc && (r = cmp(pb, a)) <= 0) {
+ if (r == 0) {
+ swap_cnt = 1;
+ swap(pa, pb);
+ pa += es;
+ }
+ pb += es;
+ }
+ while (pb <= pc && (r = cmp(pc, a)) >= 0) {
+ if (r == 0) {
+ swap_cnt = 1;
+ swap(pc, pd);
+ pd -= es;
+ }
+ pc -= es;
+ }
+ if (pb > pc)
+ break;
+ swap(pb, pc);
+ swap_cnt = 1;
+ pb += es;
+ pc -= es;
+ }
+ if (swap_cnt == 0) { /* Switch to insertion sort */
+ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+ for (pl = pm; pl > (char *)a && cmp(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+
+ pn = (char *)a + n * es;
+ r = min(pa - (char *)a, pb - pa);
+ vecswap(a, pb - r, r);
+ r = min((int)(pd - pc), (int)(pn - pd - es));
+ vecswap(pb, pn - r, r);
+ if ((r = (int)(pb - pa)) > (int)es)
+ qsort(a, r / es, es, cmp);
+ if ((r = (int)(pd - pc)) > (int)es) {
+ /* Iterate rather than recurse to save stack space */
+ a = pn - r;
+ n = r / es;
+ goto loop;
+ }
+/* qsort(pn - r, r / es, es, cmp);*/
+}
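
A usage sketch for the replacement qsort (hypothetical caller; same interface as qsort(3)):

        static int
        cmp_str(const void *a, const void *b)
        {
                return (strcmp(
                    *(const char * const *)a, *(const char * const *)b));
        }

        static void
        sort_demo(void)
        {
                const char *v[] = { "split", "adj", "cdel", "repl" };

                qsort(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), cmp_str);
                /* v is now { "adj", "cdel", "repl", "split" } */
        }
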
diff --git a/src/clib/raise.c b/src/clib/raise.c
new file mode 100644
index 00000000..ad0e567f
--- /dev/null
+++ b/src/clib/raise.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * raise --
+ * Send a signal to the current process.
+ *
+ * PUBLIC: #ifndef HAVE_RAISE
+ * PUBLIC: int raise __P((int));
+ * PUBLIC: #endif
+ */
+int
+raise(s)
+ int s;
+{
+ return (kill(getpid(), s));
+}
diff --git a/src/clib/rand.c b/src/clib/rand.c
new file mode 100644
index 00000000..6b810060
--- /dev/null
+++ b/src/clib/rand.c
@@ -0,0 +1,25 @@
+/*
+ * Copied from the ANSI C standard 4.10.2.2.
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * rand, srand --
+ *
+ * PUBLIC: #ifndef HAVE_RAND
+ * PUBLIC: int rand __P((void));
+ * PUBLIC: void srand __P((unsigned int));
+ * PUBLIC: #endif
+ */
+int rand(void) /* RAND_MAX assumed to be 32767 */
+{
+ DB_GLOBAL(rand_next) = DB_GLOBAL(rand_next) * 1103515245 + 12345;
+ return (unsigned int) (DB_GLOBAL(rand_next)/65536) % 32768;
+}
+
+void srand(unsigned int seed)
+{
+ DB_GLOBAL(rand_next) = seed;
+}
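
The constants are the C standard's sample implementation; the output is bits 16..30 of rand_next, so the sequence does not depend on the (unsigned) width of rand_next. A check of the well-known first values (assuming <assert.h> and unsigned wraparound):

        static void
        rand_demo(void)
        {
                srand(1);
                /* rand_next = 1 * 1103515245 + 12345 = 1103527590,
                 * and 1103527590 / 65536 = 16838.
                 */
                assert(rand() == 16838);
                assert(rand() == 5758);
        }
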
diff --git a/src/clib/snprintf.c b/src/clib/snprintf.c
new file mode 100644
index 00000000..6b31d850
--- /dev/null
+++ b/src/clib/snprintf.c
@@ -0,0 +1,149 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#if !defined(HAVE_SNPRINTF) || !defined(HAVE_VSNPRINTF)
+static void sprintf_overflow __P((void));
+static int sprintf_retcharpnt __P((void));
+#endif
+
+/*
+ * snprintf --
+ * Bounded version of sprintf.
+ *
+ * PUBLIC: #ifndef HAVE_SNPRINTF
+ * PUBLIC: int snprintf __P((char *, size_t, const char *, ...));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_SNPRINTF
+int
+#ifdef STDC_HEADERS
+snprintf(char *str, size_t n, const char *fmt, ...)
+#else
+snprintf(str, n, fmt, va_alist)
+ char *str;
+ size_t n;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ static int ret_charpnt = -1;
+ va_list ap;
+ size_t len;
+
+ if (ret_charpnt == -1)
+ ret_charpnt = sprintf_retcharpnt();
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ len = (size_t)vsprintf(str, fmt, ap);
+ if (ret_charpnt)
+ len = strlen(str);
+
+ va_end(ap);
+
+ if (len >= n) {
+ sprintf_overflow();
+ /* NOTREACHED */
+ }
+ return ((int)len);
+}
+#endif
+
+/*
+ * vsnprintf --
+ * Bounded version of vsprintf.
+ *
+ * PUBLIC: #ifndef HAVE_VSNPRINTF
+ * PUBLIC: int vsnprintf __P((char *, size_t, const char *, va_list));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_VSNPRINTF
+int
+vsnprintf(str, n, fmt, ap)
+ char *str;
+ size_t n;
+ const char *fmt;
+ va_list ap;
+{
+ static int ret_charpnt = -1;
+ size_t len;
+
+ if (ret_charpnt == -1)
+ ret_charpnt = sprintf_retcharpnt();
+
+ len = (size_t)vsprintf(str, fmt, ap);
+ if (ret_charpnt)
+ len = strlen(str);
+
+ if (len >= n) {
+ sprintf_overflow();
+ /* NOTREACHED */
+ }
+ return ((int)len);
+}
+#endif
+
+#if !defined(HAVE_SNPRINTF) || !defined(HAVE_VSNPRINTF)
+static void
+sprintf_overflow()
+{
+ /*
+ * !!!
+ * We're potentially manipulating strings handed us by the application,
+ * and on systems without a real snprintf() the sprintf() calls could
+ * have overflowed the buffer. We can't do anything about it now, but
+ * we don't want to return control to the application, we might have
+ * overwritten the stack with a Trojan horse. We're not trying to do
+ * anything recoverable here because systems without snprintf support
+ * are pretty rare anymore.
+ */
+#define OVERFLOW_ERROR "internal buffer overflow, process ended\n"
+#ifndef STDERR_FILENO
+#define STDERR_FILENO 2
+#endif
+ (void)write(STDERR_FILENO, OVERFLOW_ERROR, sizeof(OVERFLOW_ERROR) - 1);
+
+ /* Be polite. */
+ exit(1);
+
+ /* But firm. */
+ __os_abort(NULL);
+
+ /* NOTREACHED */
+}
+
+static int
+sprintf_retcharpnt()
+{
+ int ret_charpnt;
+ char buf[10];
+
+ /*
+ * Some old versions of sprintf return a pointer to the first argument
+ * instead of a character count. Assume the return value of snprintf,
+ * vsprintf, etc. will be the same as sprintf, and check the easy one.
+ *
+ * We do this test at run-time because it's not a test we can do in a
+ * cross-compilation environment.
+ */
+
+ ret_charpnt =
+ (int)sprintf(buf, "123") != 3 ||
+ (int)sprintf(buf, "123456789") != 9 ||
+ (int)sprintf(buf, "1234") != 4;
+
+ return (ret_charpnt);
+}
+#endif
diff --git a/src/clib/strcasecmp.c b/src/clib/strcasecmp.c
new file mode 100644
index 00000000..287895ce
--- /dev/null
+++ b/src/clib/strcasecmp.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 1987, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strcasecmp --
+ * Do strcmp(3) in a case-insensitive manner.
+ *
+ * PUBLIC: #ifndef HAVE_STRCASECMP
+ * PUBLIC: int strcasecmp __P((const char *, const char *));
+ * PUBLIC: #endif
+ */
+int
+strcasecmp(s1, s2)
+ const char *s1, *s2;
+{
+ u_char s1ch, s2ch;
+
+ for (;;) {
+ s1ch = *s1++;
+ s2ch = *s2++;
+ if (s1ch >= 'A' && s1ch <= 'Z') /* tolower() */
+ s1ch += 32;
+ if (s2ch >= 'A' && s2ch <= 'Z') /* tolower() */
+ s2ch += 32;
+ if (s1ch != s2ch)
+ return (s1ch - s2ch);
+ if (s1ch == '\0')
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * strncasecmp --
+ * Do strncmp(3) in a case-insensitive manner.
+ *
+ * PUBLIC: #ifndef HAVE_STRCASECMP
+ * PUBLIC: int strncasecmp __P((const char *, const char *, size_t));
+ * PUBLIC: #endif
+ */
+int
+strncasecmp(s1, s2, n)
+ const char *s1, *s2;
+ register size_t n;
+{
+ u_char s1ch, s2ch;
+
+ for (; n != 0; --n) {
+ s1ch = *s1++;
+ s2ch = *s2++;
+ if (s1ch >= 'A' && s1ch <= 'Z') /* tolower() */
+ s1ch += 32;
+ if (s2ch >= 'A' && s2ch <= 'Z') /* tolower() */
+ s2ch += 32;
+ if (s1ch != s2ch)
+ return (s1ch - s2ch);
+ if (s1ch == '\0')
+ return (0);
+ }
+ return (0);
+}
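
The folding is ASCII-only ('A'..'Z' shifted down by 32), which is all Berkeley DB needs; a small check (assuming <assert.h>):

        static void
        casecmp_demo(void)
        {
                assert(strcasecmp("BTree", "btree") == 0);
                assert(strncasecmp("btree_autop", "BTREE_AUTOx", 10) == 0);
        }
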
diff --git a/src/clib/strcat.c b/src/clib/strcat.c
new file mode 100644
index 00000000..d99c9070
--- /dev/null
+++ b/src/clib/strcat.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strcat --
+ *
+ * PUBLIC: #ifndef HAVE_STRCAT
+ * PUBLIC: char *strcat __P((char *, const char *));
+ * PUBLIC: #endif
+ */
+char *
+strcat(char *s, const char *append)
+{
+ char *save = s;
+
+ for (; *s; ++s);
+ while ((*s++ = *append++));
+ return (save);
+}
diff --git a/src/clib/strchr.c b/src/clib/strchr.c
new file mode 100644
index 00000000..a8ac4ce0
--- /dev/null
+++ b/src/clib/strchr.c
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strchr --
+ *	Return a pointer to the first occurrence of ch in p, or NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRCHR
+ * PUBLIC: char *strchr __P((const char *, int));
+ * PUBLIC: #endif
+ */
+char *
+strchr(const char *p, int ch)
+{
+ char c;
+
+ c = ch;
+ for (;; ++p) {
+ if (*p == c)
+ return ((char *)p);
+ if (*p == '\0')
+ return (NULL);
+ }
+ /* NOTREACHED */
+}
diff --git a/src/clib/strdup.c b/src/clib/strdup.c
new file mode 100644
index 00000000..5863340c
--- /dev/null
+++ b/src/clib/strdup.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strdup --
+ *	Return a malloc'd copy of a NUL-terminated string, or NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRDUP
+ * PUBLIC: char *strdup __P((const char *));
+ * PUBLIC: #endif
+ */
+char *
+strdup(str)
+ const char *str;
+{
+ size_t len;
+ char *copy;
+
+ len = strlen(str) + 1;
+ if (!(copy = malloc((u_int)len)))
+ return (NULL);
+ memcpy(copy, str, len);
+ return (copy);
+}
diff --git a/src/clib/strerror.c b/src/clib/strerror.c
new file mode 100644
index 00000000..62bd7dd5
--- /dev/null
+++ b/src/clib/strerror.c
@@ -0,0 +1,225 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * Copyright (c) 1982, 1985, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("FreeBSD: /repoman/r/ncvs/src/lib/libc/gen/errlst.c,v 1.8 2005/04/02 12:33:28 das Exp $");
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strerror --
+ * Return the string associated with an errno.
+ *
+ * PUBLIC: #ifndef HAVE_STRERROR
+ * PUBLIC: char *strerror __P((int));
+ * PUBLIC: #endif
+ */
+char *
+strerror(num)
+ int num;
+{
+#define ERRSTR(v, s) do { \
+ if (num == (v)) \
+ return (s); \
+} while (0)
+ ERRSTR(0, "Undefined error: 0");
+ ERRSTR(EPERM, "Operation not permitted");
+ ERRSTR(ENOENT, "No such file or directory");
+ ERRSTR(ESRCH, "No such process");
+ ERRSTR(EINTR, "Interrupted system call");
+ ERRSTR(EIO, "Input/output error");
+ ERRSTR(ENXIO, "Device not configured");
+ ERRSTR(E2BIG, "Argument list too long");
+ ERRSTR(ENOEXEC, "Exec format error");
+ ERRSTR(EBADF, "Bad file descriptor");
+ ERRSTR(ECHILD, "No child processes");
+ ERRSTR(EDEADLK, "Resource deadlock avoided");
+ ERRSTR(ENOMEM, "Cannot allocate memory");
+ ERRSTR(EACCES, "Permission denied");
+ ERRSTR(EFAULT, "Bad address");
+ ERRSTR(ENOTBLK, "Block device required");
+ ERRSTR(EBUSY, "Device busy");
+ ERRSTR(EEXIST, "File exists");
+ ERRSTR(EXDEV, "Cross-device link");
+ ERRSTR(ENODEV, "Operation not supported by device");
+ ERRSTR(ENOTDIR, "Not a directory");
+ ERRSTR(EISDIR, "Is a directory");
+ ERRSTR(EINVAL, "Invalid argument");
+ ERRSTR(ENFILE, "Too many open files in system");
+ ERRSTR(EMFILE, "Too many open files");
+ ERRSTR(ENOTTY, "Inappropriate ioctl for device");
+ ERRSTR(ETXTBSY, "Text file busy");
+ ERRSTR(EFBIG, "File too large");
+ ERRSTR(ENOSPC, "No space left on device");
+ ERRSTR(ESPIPE, "Illegal seek");
+ ERRSTR(EROFS, "Read-only file system");
+ ERRSTR(EMLINK, "Too many links");
+ ERRSTR(EPIPE, "Broken pipe");
+
+/* math software */
+ ERRSTR(EDOM, "Numerical argument out of domain");
+ ERRSTR(ERANGE, "Result too large");
+
+/* non-blocking and interrupt i/o */
+ ERRSTR(EAGAIN, "Resource temporarily unavailable");
+ ERRSTR(EWOULDBLOCK, "Resource temporarily unavailable");
+ ERRSTR(EINPROGRESS, "Operation now in progress");
+ ERRSTR(EALREADY, "Operation already in progress");
+
+/* ipc/network software -- argument errors */
+ ERRSTR(ENOTSOCK, "Socket operation on non-socket");
+ ERRSTR(EDESTADDRREQ, "Destination address required");
+ ERRSTR(EMSGSIZE, "Message too long");
+ ERRSTR(EPROTOTYPE, "Protocol wrong type for socket");
+ ERRSTR(ENOPROTOOPT, "Protocol not available");
+ ERRSTR(EPROTONOSUPPORT, "Protocol not supported");
+ ERRSTR(ESOCKTNOSUPPORT, "Socket type not supported");
+ ERRSTR(EOPNOTSUPP, "Operation not supported");
+ ERRSTR(EPFNOSUPPORT, "Protocol family not supported");
+ ERRSTR(EAFNOSUPPORT, "Address family not supported by protocol family");
+ ERRSTR(EADDRINUSE, "Address already in use");
+ ERRSTR(EADDRNOTAVAIL, "Can't assign requested address");
+
+/* ipc/network software -- operational errors */
+ ERRSTR(ENETDOWN, "Network is down");
+ ERRSTR(ENETUNREACH, "Network is unreachable");
+ ERRSTR(ENETRESET, "Network dropped connection on reset");
+ ERRSTR(ECONNABORTED, "Software caused connection abort");
+ ERRSTR(ECONNRESET, "Connection reset by peer");
+ ERRSTR(ENOBUFS, "No buffer space available");
+ ERRSTR(EISCONN, "Socket is already connected");
+ ERRSTR(ENOTCONN, "Socket is not connected");
+ ERRSTR(ESHUTDOWN, "Can't send after socket shutdown");
+ ERRSTR(ETOOMANYREFS, "Too many references: can't splice");
+ ERRSTR(ETIMEDOUT, "Operation timed out");
+ ERRSTR(ECONNREFUSED, "Connection refused");
+
+ ERRSTR(ELOOP, "Too many levels of symbolic links");
+ ERRSTR(ENAMETOOLONG, "File name too long");
+
+/* should be rearranged */
+ ERRSTR(EHOSTDOWN, "Host is down");
+ ERRSTR(EHOSTUNREACH, "No route to host");
+ ERRSTR(ENOTEMPTY, "Directory not empty");
+
+/* quotas & mush */
+ ERRSTR(EPROCLIM, "Too many processes");
+ ERRSTR(EUSERS, "Too many users");
+ ERRSTR(EDQUOT, "Disc quota exceeded");
+
+/* Network File System */
+ ERRSTR(ESTALE, "Stale NFS file handle");
+ ERRSTR(EREMOTE, "Too many levels of remote in path");
+ ERRSTR(EBADRPC, "RPC struct is bad");
+ ERRSTR(ERPCMISMATCH, "RPC version wrong");
+ ERRSTR(EPROGUNAVAIL, "RPC prog. not avail");
+ ERRSTR(EPROGMISMATCH, "Program version wrong");
+ ERRSTR(EPROCUNAVAIL, "Bad procedure for program");
+
+ ERRSTR(ENOLCK, "No locks available");
+ ERRSTR(ENOSYS, "Function not implemented");
+ ERRSTR(EFTYPE, "Inappropriate file type or format");
+#ifdef EAUTH
+ ERRSTR(EAUTH, "Authentication error");
+#endif
+#ifdef ENEEDAUTH
+ ERRSTR(ENEEDAUTH, "Need authenticator");
+#endif
+ ERRSTR(EIDRM, "Identifier removed");
+ ERRSTR(ENOMSG, "No message of desired type");
+#ifdef EOVERFLOW
+ ERRSTR(EOVERFLOW, "Value too large to be stored in data type");
+#endif
+ ERRSTR(ECANCELED, "Operation canceled");
+ ERRSTR(EILSEQ, "Illegal byte sequence");
+#ifdef ENOATTR
+ ERRSTR(ENOATTR, "Attribute not found");
+#endif
+
+/* General */
+#ifdef EDOOFUS
+ ERRSTR(EDOOFUS, "Programming error");
+#endif
+
+#ifdef EBADMSG
+ ERRSTR(EBADMSG, "Bad message");
+#endif
+#ifdef EMULTIHOP
+ ERRSTR(EMULTIHOP, "Multihop attempted");
+#endif
+#ifdef ENOLINK
+ ERRSTR(ENOLINK, "Link has been severed");
+#endif
+#ifdef EPROTO
+ ERRSTR(EPROTO, "Protocol error");
+#endif
+
+ return (__db_unknown_error(num));
+}
diff --git a/src/clib/strncat.c b/src/clib/strncat.c
new file mode 100644
index 00000000..ce8273a4
--- /dev/null
+++ b/src/clib/strncat.c
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strncat --
+ *	Append at most n bytes of src to dst.
+ *
+ * PUBLIC: #ifndef HAVE_STRNCAT
+ * PUBLIC: char *strncat __P((char *, const char *, size_t));
+ * PUBLIC: #endif
+ */
+/*
+ * Concatenate src on the end of dst. At most strlen(dst)+n+1 bytes
+ * are written at dst (at most n+1 bytes being appended). Return dst.
+ */
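+/*
+ * Illustrative example: with dst containing "ab" (and room for 5 bytes),
+ * strncat(dst, "cdef", 2) appends 'c', 'd' and a NUL, leaving "abcd" --
+ * exactly the strlen(dst) + n + 1 bound of 5 bytes touched at dst.
+ */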
+char *
+strncat(char *dst, const char *src, size_t n)
+{
+ if (n != 0) {
+ char *d = dst;
+ const char *s = src;
+
+ while (*d != 0)
+ d++;
+ do {
+ if ((*d = *s++) == 0)
+ break;
+ d++;
+ } while (--n != 0);
+ *d = 0;
+ }
+ return (dst);
+}
diff --git a/src/clib/strncmp.c b/src/clib/strncmp.c
new file mode 100644
index 00000000..9738b5b2
--- /dev/null
+++ b/src/clib/strncmp.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strncmp --
+ *	Compare at most n bytes of two strings.
+ *
+ * PUBLIC: #ifndef HAVE_STRNCMP
+ * PUBLIC: int strncmp __P((const char *, const char *, size_t));
+ * PUBLIC: #endif
+ */
+int
+strncmp(s1, s2, n)
+ const char *s1, *s2;
+ size_t n;
+{
+ if (n == 0)
+ return (0);
+ do {
+ if (*s1 != *s2++)
+ return (*(const unsigned char *)s1 -
+ *(const unsigned char *)(s2 - 1));
+ if (*s1++ == 0)
+ break;
+ } while (--n != 0);
+ return (0);
+}
diff --git a/src/clib/strrchr.c b/src/clib/strrchr.c
new file mode 100644
index 00000000..8753e943
--- /dev/null
+++ b/src/clib/strrchr.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strrchr --
+ *	Return a pointer to the last occurrence of ch in p, or NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRRCHR
+ * PUBLIC: char *strrchr __P((const char *, int));
+ * PUBLIC: #endif
+ */
+char *
+strrchr(const char *p, int ch)
+{
+ char *save;
+ char c;
+
+ c = ch;
+ for (save = NULL;; ++p) {
+ if (*p == c)
+ save = (char *)p;
+ if (*p == '\0')
+ return (save);
+ }
+ /* NOTREACHED */
+}
diff --git a/src/clib/strsep.c b/src/clib/strsep.c
new file mode 100644
index 00000000..f79d0f5c
--- /dev/null
+++ b/src/clib/strsep.c
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Get next token from string *stringp, where tokens are possibly-empty
+ * strings separated by characters from delim.
+ *
+ * Writes NULs into the string at *stringp to end tokens.
+ * delim need not remain constant from call to call.
+ * On return, *stringp points past the last NUL written (if there might
+ * be further tokens), or is NULL (if there are definitely no more tokens).
+ *
+ * If *stringp is NULL, strsep returns NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRSEP
+ * PUBLIC: char *strsep __P((char **, const char *));
+ * PUBLIC: #endif
+ */
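+/*
+ * Illustrative example: given char buf[] = "a,,b" and char *p = buf,
+ * successive strsep(&p, ",") calls return "a", "" and "b", then NULL,
+ * overwriting each comma with a NUL as they go.
+ */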
+char *
+strsep(stringp, delim)
+ char **stringp;
+ const char *delim;
+{
+ char *s;
+ const char *spanp;
+ int c, sc;
+ char *tok;
+
+ if ((s = *stringp) == NULL)
+ return (NULL);
+ for (tok = s;;) {
+ c = *s++;
+ spanp = delim;
+ do {
+ if ((sc = *spanp++) == c) {
+ if (c == 0)
+ s = NULL;
+ else
+ s[-1] = 0;
+ *stringp = s;
+ return (tok);
+ }
+ } while (sc != 0);
+ }
+ /* NOTREACHED */
+}
diff --git a/src/clib/strtol.c b/src/clib/strtol.c
new file mode 100644
index 00000000..eb76b8f4
--- /dev/null
+++ b/src/clib/strtol.c
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Convert a string to a long integer.
+ *
+ * Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ *
+ * PUBLIC: #ifndef HAVE_STRTOL
+ * PUBLIC: long strtol __P((const char *, char **, int));
+ * PUBLIC: #endif
+ */
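+/*
+ * Illustrative example: strtol("0x1a", &end, 0) recognizes the leading
+ * "0x", converts in base 16 and returns 26, leaving end pointing at the
+ * terminating NUL.
+ */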
+long
+strtol(nptr, endptr, base)
+	const char *nptr;
+	char **endptr;
+ int base;
+{
+ const char *s;
+ unsigned long acc;
+ char c;
+ unsigned long cutoff;
+ int neg, any, cutlim;
+
+ /*
+ * Skip white space and pick up leading +/- sign if any.
+ * If base is 0, allow 0x for hex and 0 for octal, else
+ * assume decimal; if base is already 16, allow 0x.
+ */
+ s = nptr;
+ do {
+ c = *s++;
+ } while (isspace((unsigned char)c));
+ if (c == '-') {
+ neg = 1;
+ c = *s++;
+ } else {
+ neg = 0;
+ if (c == '+')
+ c = *s++;
+ }
+ if ((base == 0 || base == 16) &&
+ c == '0' && (*s == 'x' || *s == 'X')) {
+ c = s[1];
+ s += 2;
+ base = 16;
+ }
+ if (base == 0)
+ base = c == '0' ? 8 : 10;
+ acc = any = 0;
+ if (base < 2 || base > 36)
+ goto noconv;
+
+ /*
+ * Compute the cutoff value between legal numbers and illegal
+ * numbers. That is the largest legal value, divided by the
+ * base. An input number that is greater than this value, if
+ * followed by a legal input character, is too big. One that
+ * is equal to this value may be valid or not; the limit
+ * between valid and invalid numbers is then based on the last
+ * digit. For instance, if the range for longs is
+ * [-2147483648..2147483647] and the input base is 10,
+ * cutoff will be set to 214748364 and cutlim to either
+ * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
+ * a value > 214748364, or equal but the next digit is > 7 (or 8),
+ * the number is too big, and we will return a range error.
+ *
+ * Set 'any' if any `digits' consumed; make it negative to indicate
+ * overflow.
+ */
+ cutoff = neg ? (unsigned long)-(LONG_MIN + LONG_MAX) + LONG_MAX
+ : LONG_MAX;
+ cutlim = cutoff % base;
+ cutoff /= base;
+ for ( ; ; c = *s++) {
+ if (c >= '0' && c <= '9')
+ c -= '0';
+ else if (c >= 'A' && c <= 'Z')
+ c -= 'A' - 10;
+ else if (c >= 'a' && c <= 'z')
+ c -= 'a' - 10;
+ else
+ break;
+ if (c >= base)
+ break;
+ if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+ any = -1;
+ else {
+ any = 1;
+ acc *= base;
+ acc += c;
+ }
+ }
+ if (any < 0) {
+ acc = neg ? LONG_MIN : LONG_MAX;
+ errno = ERANGE;
+ } else if (!any) {
+noconv:
+ errno = EINVAL;
+ } else if (neg)
+ acc = -(long)acc;
+ if (endptr != NULL)
+ *endptr = (char *)(any ? s - 1 : nptr);
+ return (acc);
+}
diff --git a/src/clib/strtoul.c b/src/clib/strtoul.c
new file mode 100644
index 00000000..d0495a33
--- /dev/null
+++ b/src/clib/strtoul.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Convert a string to an unsigned long integer.
+ *
+ * Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ *
+ * PUBLIC: #ifndef HAVE_STRTOUL
+ * PUBLIC: unsigned long strtoul __P((const char *, char **, int));
+ * PUBLIC: #endif
+ */
+unsigned long
+strtoul(nptr, endptr, base)
+	const char *nptr;
+	char **endptr;
+ int base;
+{
+ const char *s;
+ unsigned long acc;
+ char c;
+ unsigned long cutoff;
+ int neg, any, cutlim;
+
+ /*
+ * See strtol for comments as to the logic used.
+ */
+ s = nptr;
+ do {
+ c = *s++;
+ } while (isspace((unsigned char)c));
+ if (c == '-') {
+ neg = 1;
+ c = *s++;
+ } else {
+ neg = 0;
+ if (c == '+')
+ c = *s++;
+ }
+ if ((base == 0 || base == 16) &&
+ c == '0' && (*s == 'x' || *s == 'X')) {
+ c = s[1];
+ s += 2;
+ base = 16;
+ }
+ if (base == 0)
+ base = c == '0' ? 8 : 10;
+ acc = any = 0;
+ if (base < 2 || base > 36)
+ goto noconv;
+
+ cutoff = ULONG_MAX / base;
+ cutlim = ULONG_MAX % base;
+ for ( ; ; c = *s++) {
+ if (c >= '0' && c <= '9')
+ c -= '0';
+ else if (c >= 'A' && c <= 'Z')
+ c -= 'A' - 10;
+ else if (c >= 'a' && c <= 'z')
+ c -= 'a' - 10;
+ else
+ break;
+ if (c >= base)
+ break;
+ if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+ any = -1;
+ else {
+ any = 1;
+ acc *= base;
+ acc += c;
+ }
+ }
+ if (any < 0) {
+ acc = ULONG_MAX;
+ errno = ERANGE;
+ } else if (!any) {
+noconv:
+ errno = EINVAL;
+ } else if (neg)
+ acc = -acc;
+ if (endptr != NULL)
+ *endptr = (char *)(any ? s - 1 : nptr);
+ return (acc);
+}
diff --git a/src/clib/time.c b/src/clib/time.c
new file mode 100644
index 00000000..abc2ab2d
--- /dev/null
+++ b/src/clib/time.c
@@ -0,0 +1,34 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * time --
+ *	Return the current time in seconds, as time(3) does.
+ *
+ * PUBLIC: #ifndef HAVE_TIME
+ * PUBLIC: time_t time __P((time_t *));
+ * PUBLIC: #endif
+ */
+time_t
+time(nowp)
+ time_t *nowp;
+{
+ db_timespec t;
+ time_t res;
+
+ __os_gettime(NULL, &t, 0);
+
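+	/*
+	 * tv_nsec is always less than NS_PER_SEC, so the division below
+	 * truncates to zero: only whole seconds are returned.
+	 */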
+ res = t.tv_sec + t.tv_nsec / NS_PER_SEC;
+
+ if (nowp != NULL)
+ *nowp = res;
+ return (res);
+}
diff --git a/src/common/clock.c b/src/common/clock.c
new file mode 100644
index 00000000..e1f917af
--- /dev/null
+++ b/src/common/clock.c
@@ -0,0 +1,57 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __clock_set_expires --
+ * Set the expire time given the time to live.
+ *
+ * PUBLIC: void __clock_set_expires __P((ENV *, db_timespec *, db_timeout_t));
+ */
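+/*
+ * Illustrative example: a timeout of 1500000 (microseconds) adds
+ * { 1 sec, 500000000 nsec } to "now" in *timespecp.
+ */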
+void
+__clock_set_expires(env, timespecp, timeout)
+ ENV *env;
+ db_timespec *timespecp;
+ db_timeout_t timeout;
+{
+ db_timespec v;
+
+ /*
+ * If timespecp is set then it contains "now". This avoids repeated
+ * system calls to get the time.
+ */
+ if (!timespecisset(timespecp))
+ __os_gettime(env, timespecp, 1);
+
+ /* Convert the microsecond timeout argument to a timespec. */
+ DB_TIMEOUT_TO_TIMESPEC(timeout, &v);
+
+ /* Add the timeout to "now". */
+ timespecadd(timespecp, &v);
+}
+
+/*
+ * __clock_expired --
+ *	Determine if a timeout has expired.
+ *
+ * PUBLIC: int __clock_expired __P((ENV *, db_timespec *, db_timespec *));
+ */
+int
+__clock_expired(env, now, timespecp)
+ ENV *env;
+ db_timespec *now, *timespecp;
+{
+ if (!timespecisset(timespecp))
+ return (0);
+
+ if (!timespecisset(now))
+ __os_gettime(env, now, 1);
+
+ return (timespeccmp(now, timespecp, >=));
+}
diff --git a/src/common/crypto_stub.c b/src/common/crypto_stub.c
new file mode 100644
index 00000000..95faebdb
--- /dev/null
+++ b/src/common/crypto_stub.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __crypto_region_init --
+ * Initialize crypto.
+ *
+ * !!!
+ * We don't put this stub file in the crypto/ directory of the distribution
+ * because that entire directory is removed for non-crypto distributions.
+ *
+ * PUBLIC: int __crypto_region_init __P((ENV *));
+ */
+int
+__crypto_region_init(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
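+	/*
+	 * A cipher region offset other than INVALID_ROFF means the
+	 * environment was created with encryption enabled.
+	 */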
+ ret = !(renv->cipher_off == INVALID_ROFF);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+
+ if (ret == 0)
+ return (0);
+
+ __db_errx(env, DB_STR("0040",
+"Encrypted environment: library build did not include cryptography support"));
+ return (DB_OPNOTSUP);
+}
diff --git a/src/common/db_byteorder.c b/src/common/db_byteorder.c
new file mode 100644
index 00000000..71428f0a
--- /dev/null
+++ b/src/common/db_byteorder.c
@@ -0,0 +1,63 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_isbigendian --
+ * Return 1 if big-endian (Motorola and Sparc), not little-endian
+ * (Intel and Vax). We do this work at run-time, rather than at
+ * configuration time so cross-compilation and general embedded
+ * system support is simpler.
+ *
+ * PUBLIC: int __db_isbigendian __P((void));
+ */
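+/*
+ * Illustration: with 4-byte longs, u.l = 1 is laid out in memory as
+ * 01 00 00 00 on a little-endian machine and 00 00 00 01 on a
+ * big-endian one, so the last byte is 1 only when big-endian.
+ */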
+int
+__db_isbigendian()
+{
+ union { /* From Harbison & Steele. */
+ long l;
+ char c[sizeof(long)];
+ } u;
+
+ u.l = 1;
+ return (u.c[sizeof(long) - 1] == 1);
+}
+
+/*
+ * __db_byteorder --
+ * Return if we need to do byte swapping, checking for illegal
+ * values.
+ *
+ * PUBLIC: int __db_byteorder __P((ENV *, int));
+ */
+int
+__db_byteorder(env, lorder)
+ ENV *env;
+ int lorder;
+{
+ switch (lorder) {
+ case 0:
+ break;
+ case 1234:
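+		/* Caller requested little-endian ("1234") order. */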
+ if (!F_ISSET(env, ENV_LITTLEENDIAN))
+ return (DB_SWAPBYTES);
+ break;
+ case 4321:
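+		/* Caller requested big-endian ("4321") order. */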
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ return (DB_SWAPBYTES);
+ break;
+ default:
+ __db_errx(env, DB_STR("0041",
+ "unsupported byte order, only big and little-endian supported"));
+ return (EINVAL);
+ }
+ return (0);
+}
diff --git a/src/common/db_compint.c b/src/common/db_compint.c
new file mode 100644
index 00000000..9f5ccf9a
--- /dev/null
+++ b/src/common/db_compint.c
@@ -0,0 +1,555 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_COMPRESSION
+
+/*
+ * Integer compression
+ *
+ * First byte  | Next | Maximum
+ * byte        | bytes| value
+ * ------------+------+---------------------------------------------------------
+ * [0 xxxxxxx] | 0    | 2^7 - 1
+ * [10 xxxxxx] | 1    | 2^14 + 2^7 - 1
+ * [110 xxxxx] | 2    | 2^21 + 2^14 + 2^7 - 1
+ * [1110 xxxx] | 3    | 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11110 xxx] | 4    | 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 000] | 5    | 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 001] | 6    | 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 010] | 7    | 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 011] | 8    | 2^64 + 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 +
+ *             |      | 2^7 - 1
+ *
+ * NOTE: this compression algorithm depends on big-endian order, so swap
+ * if necessary.
+ *
+ */
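+/*
+ * Worked example (illustrative): 200 exceeds the one-byte maximum of
+ * 2^7 - 1 = 127, so it takes the two-byte form.  200 - 128 = 72 =
+ * 0x0048; the high six bits land in the first byte under the "10" tag
+ * and the low eight in the second, giving { 0x80, 0x48 }.  Decoding
+ * reverses this: ((0x80 & 0x3F) << 8 | 0x48) + 128 = 72 + 128 = 200.
+ */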
+
+#define CMP_INT_1BYTE_MAX 0x7F
+#define CMP_INT_2BYTE_MAX 0x407F
+#define CMP_INT_3BYTE_MAX 0x20407F
+#define CMP_INT_4BYTE_MAX 0x1020407F
+
+#if defined(_MSC_VER) && _MSC_VER < 1300
+#define CMP_INT_5BYTE_MAX 0x081020407Fi64
+#define CMP_INT_6BYTE_MAX 0x01081020407Fi64
+#define CMP_INT_7BYTE_MAX 0x0101081020407Fi64
+#define CMP_INT_8BYTE_MAX 0x010101081020407Fi64
+#else
+#define CMP_INT_5BYTE_MAX 0x081020407FLL
+#define CMP_INT_6BYTE_MAX 0x01081020407FLL
+#define CMP_INT_7BYTE_MAX 0x0101081020407FLL
+#define CMP_INT_8BYTE_MAX 0x010101081020407FLL
+#endif
+
+#define CMP_INT_2BYTE_VAL 0x80
+#define CMP_INT_3BYTE_VAL 0xC0
+#define CMP_INT_4BYTE_VAL 0xE0
+#define CMP_INT_5BYTE_VAL 0xF0
+#define CMP_INT_6BYTE_VAL 0xF8
+#define CMP_INT_7BYTE_VAL 0xF9
+#define CMP_INT_8BYTE_VAL 0xFA
+#define CMP_INT_9BYTE_VAL 0xFB
+/* CMP_INT_SPARE_VAL is defined in db_int.h */
+
+#define CMP_INT_2BYTE_MASK 0x3F
+#define CMP_INT_3BYTE_MASK 0x1F
+#define CMP_INT_4BYTE_MASK 0x0F
+#define CMP_INT_5BYTE_MASK 0x07
+
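+/*
+ * Decoded length of a compressed integer, indexed by its first byte:
+ * 0x00-0x7F is 1 byte, 0x80-0xBF is 2, 0xC0-0xDF is 3, 0xE0-0xEF is 4,
+ * 0xF0-0xF7 is 5, and 0xF8-0xFB are 6 through 9; 0xFC-0xFF are unused
+ * and map to 0xFF.
+ */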
+static const u_int8_t __db_marshaled_int_size[] = {
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x06, 0x07, 0x08, 0x09, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/*
+ * __db_compress_count_int --
+ * Return the number of bytes that the compressed version
+ * of the argument will occupy.
+ *
+ * PUBLIC: u_int32_t __db_compress_count_int __P((u_int64_t));
+ */
+u_int32_t
+__db_compress_count_int(i)
+ u_int64_t i;
+{
+ if (i <= CMP_INT_1BYTE_MAX)
+ return 1;
+ else if (i <= CMP_INT_2BYTE_MAX)
+ return 2;
+ else if (i <= CMP_INT_3BYTE_MAX)
+ return 3;
+ else if (i <= CMP_INT_4BYTE_MAX)
+ return 4;
+ else if (i <= CMP_INT_5BYTE_MAX)
+ return 5;
+ else if (i <= CMP_INT_6BYTE_MAX)
+ return 6;
+ else if (i <= CMP_INT_7BYTE_MAX)
+ return 7;
+ else if (i <= CMP_INT_8BYTE_MAX)
+ return 8;
+ else
+ return 9;
+}
+
+/*
+ * __db_compress_int --
+ * Compresses the integer into the buffer, returning the number of
+ * bytes occupied.
+ *
+ * PUBLIC: int __db_compress_int __P((u_int8_t *, u_int64_t));
+ */
+int
+__db_compress_int(buf, i)
+ u_int8_t *buf;
+ u_int64_t i;
+{
+ if (i <= CMP_INT_1BYTE_MAX) {
+ /* no swapping for one byte value */
+ buf[0] = (u_int8_t)i;
+ return 1;
+ } else {
+ u_int8_t *p = (u_int8_t*)&i;
+ if (i <= CMP_INT_2BYTE_MAX) {
+ i -= CMP_INT_1BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[6] | CMP_INT_2BYTE_VAL;
+ buf[1] = p[7];
+ } else {
+ buf[0] = p[1] | CMP_INT_2BYTE_VAL;
+ buf[1] = p[0];
+ }
+ return 2;
+ } else if (i <= CMP_INT_3BYTE_MAX) {
+ i -= CMP_INT_2BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[5] | CMP_INT_3BYTE_VAL;
+ buf[1] = p[6];
+ buf[2] = p[7];
+ } else {
+ buf[0] = p[2] | CMP_INT_3BYTE_VAL;
+ buf[1] = p[1];
+ buf[2] = p[0];
+ }
+ return 3;
+ } else if (i <= CMP_INT_4BYTE_MAX) {
+ i -= CMP_INT_3BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[4] | CMP_INT_4BYTE_VAL;
+ buf[1] = p[5];
+ buf[2] = p[6];
+ buf[3] = p[7];
+ } else {
+ buf[0] = p[3] | CMP_INT_4BYTE_VAL;
+ buf[1] = p[2];
+ buf[2] = p[1];
+ buf[3] = p[0];
+ }
+ return 4;
+ } else if (i <= CMP_INT_5BYTE_MAX) {
+ i -= CMP_INT_4BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[3] | CMP_INT_5BYTE_VAL;
+ buf[1] = p[4];
+ buf[2] = p[5];
+ buf[3] = p[6];
+ buf[4] = p[7];
+ } else {
+ buf[0] = p[4] | CMP_INT_5BYTE_VAL;
+ buf[1] = p[3];
+ buf[2] = p[2];
+ buf[3] = p[1];
+ buf[4] = p[0];
+ }
+ return 5;
+ } else if (i <= CMP_INT_6BYTE_MAX) {
+ i -= CMP_INT_5BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_6BYTE_VAL;
+ buf[1] = p[3];
+ buf[2] = p[4];
+ buf[3] = p[5];
+ buf[4] = p[6];
+ buf[5] = p[7];
+ } else {
+ buf[0] = CMP_INT_6BYTE_VAL;
+ buf[1] = p[4];
+ buf[2] = p[3];
+ buf[3] = p[2];
+ buf[4] = p[1];
+ buf[5] = p[0];
+ }
+ return 6;
+ } else if (i <= CMP_INT_7BYTE_MAX) {
+ i -= CMP_INT_6BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_7BYTE_VAL;
+ buf[1] = p[2];
+ buf[2] = p[3];
+ buf[3] = p[4];
+ buf[4] = p[5];
+ buf[5] = p[6];
+ buf[6] = p[7];
+ } else {
+ buf[0] = CMP_INT_7BYTE_VAL;
+ buf[1] = p[5];
+ buf[2] = p[4];
+ buf[3] = p[3];
+ buf[4] = p[2];
+ buf[5] = p[1];
+ buf[6] = p[0];
+ }
+ return 7;
+ } else if (i <= CMP_INT_8BYTE_MAX) {
+ i -= CMP_INT_7BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_8BYTE_VAL;
+ buf[1] = p[1];
+ buf[2] = p[2];
+ buf[3] = p[3];
+ buf[4] = p[4];
+ buf[5] = p[5];
+ buf[6] = p[6];
+ buf[7] = p[7];
+ } else {
+ buf[0] = CMP_INT_8BYTE_VAL;
+ buf[1] = p[6];
+ buf[2] = p[5];
+ buf[3] = p[4];
+ buf[4] = p[3];
+ buf[5] = p[2];
+ buf[6] = p[1];
+ buf[7] = p[0];
+ }
+ return 8;
+ } else {
+ i -= CMP_INT_8BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_9BYTE_VAL;
+ buf[1] = p[0];
+ buf[2] = p[1];
+ buf[3] = p[2];
+ buf[4] = p[3];
+ buf[5] = p[4];
+ buf[6] = p[5];
+ buf[7] = p[6];
+ buf[8] = p[7];
+ } else {
+ buf[0] = CMP_INT_9BYTE_VAL;
+ buf[1] = p[7];
+ buf[2] = p[6];
+ buf[3] = p[5];
+ buf[4] = p[4];
+ buf[5] = p[3];
+ buf[6] = p[2];
+ buf[7] = p[1];
+ buf[8] = p[0];
+ }
+ return 9;
+ }
+ }
+}
+
+/*
+ * __db_decompress_count_int --
+ * Return the number of bytes occupied by the compressed
+ * integer pointed to by buf.
+ *
+ * PUBLIC: u_int32_t __db_decompress_count_int __P((const u_int8_t *));
+ */
+u_int32_t
+__db_decompress_count_int(buf)
+ const u_int8_t *buf;
+{
+ return __db_marshaled_int_size[*buf];
+}
+
+/*
+ * __db_decompress_int --
+ *	Decompresses the compressed integer pointed to by buf into i,
+ * returning the number of bytes read.
+ *
+ * PUBLIC: int __db_decompress_int __P((const u_int8_t *, u_int64_t *));
+ */
+int
+__db_decompress_int(buf, i)
+ const u_int8_t *buf;
+ u_int64_t *i;
+{
+ int len;
+ u_int64_t tmp;
+ u_int8_t *p;
+ u_int8_t c;
+
+ tmp = 0;
+ p = (u_int8_t*)&tmp;
+ c = buf[0];
+ len = __db_marshaled_int_size[c];
+
+ switch (len) {
+ case 1:
+ *i = c;
+ return 1;
+ case 2:
+ if (__db_isbigendian() != 0) {
+ p[6] = (c & CMP_INT_2BYTE_MASK);
+ p[7] = buf[1];
+ } else {
+ p[1] = (c & CMP_INT_2BYTE_MASK);
+ p[0] = buf[1];
+ }
+ tmp += CMP_INT_1BYTE_MAX + 1;
+ break;
+ case 3:
+ if (__db_isbigendian() != 0) {
+ p[5] = (c & CMP_INT_3BYTE_MASK);
+ p[6] = buf[1];
+ p[7] = buf[2];
+ } else {
+ p[2] = (c & CMP_INT_3BYTE_MASK);
+ p[1] = buf[1];
+ p[0] = buf[2];
+ }
+ tmp += CMP_INT_2BYTE_MAX + 1;
+ break;
+ case 4:
+ if (__db_isbigendian() != 0) {
+ p[4] = (c & CMP_INT_4BYTE_MASK);
+ p[5] = buf[1];
+ p[6] = buf[2];
+ p[7] = buf[3];
+ } else {
+ p[3] = (c & CMP_INT_4BYTE_MASK);
+ p[2] = buf[1];
+ p[1] = buf[2];
+ p[0] = buf[3];
+ }
+ tmp += CMP_INT_3BYTE_MAX + 1;
+ break;
+ case 5:
+ if (__db_isbigendian() != 0) {
+ p[3] = (c & CMP_INT_5BYTE_MASK);
+ p[4] = buf[1];
+ p[5] = buf[2];
+ p[6] = buf[3];
+ p[7] = buf[4];
+ } else {
+ p[4] = (c & CMP_INT_5BYTE_MASK);
+ p[3] = buf[1];
+ p[2] = buf[2];
+ p[1] = buf[3];
+ p[0] = buf[4];
+ }
+ tmp += CMP_INT_4BYTE_MAX + 1;
+ break;
+ case 6:
+ if (__db_isbigendian() != 0) {
+ p[3] = buf[1];
+ p[4] = buf[2];
+ p[5] = buf[3];
+ p[6] = buf[4];
+ p[7] = buf[5];
+ } else {
+ p[4] = buf[1];
+ p[3] = buf[2];
+ p[2] = buf[3];
+ p[1] = buf[4];
+ p[0] = buf[5];
+ }
+ tmp += CMP_INT_5BYTE_MAX + 1;
+ break;
+ case 7:
+ if (__db_isbigendian() != 0) {
+ p[2] = buf[1];
+ p[3] = buf[2];
+ p[4] = buf[3];
+ p[5] = buf[4];
+ p[6] = buf[5];
+ p[7] = buf[6];
+ } else {
+ p[5] = buf[1];
+ p[4] = buf[2];
+ p[3] = buf[3];
+ p[2] = buf[4];
+ p[1] = buf[5];
+ p[0] = buf[6];
+ }
+ tmp += CMP_INT_6BYTE_MAX + 1;
+ break;
+ case 8:
+ if (__db_isbigendian() != 0) {
+ p[1] = buf[1];
+ p[2] = buf[2];
+ p[3] = buf[3];
+ p[4] = buf[4];
+ p[5] = buf[5];
+ p[6] = buf[6];
+ p[7] = buf[7];
+ } else {
+ p[6] = buf[1];
+ p[5] = buf[2];
+ p[4] = buf[3];
+ p[3] = buf[4];
+ p[2] = buf[5];
+ p[1] = buf[6];
+ p[0] = buf[7];
+ }
+ tmp += CMP_INT_7BYTE_MAX + 1;
+ break;
+ case 9:
+ if (__db_isbigendian() != 0) {
+ p[0] = buf[1];
+ p[1] = buf[2];
+ p[2] = buf[3];
+ p[3] = buf[4];
+ p[4] = buf[5];
+ p[5] = buf[6];
+ p[6] = buf[7];
+ p[7] = buf[8];
+ } else {
+ p[7] = buf[1];
+ p[6] = buf[2];
+ p[5] = buf[3];
+ p[4] = buf[4];
+ p[3] = buf[5];
+ p[2] = buf[6];
+ p[1] = buf[7];
+ p[0] = buf[8];
+ }
+ tmp += CMP_INT_8BYTE_MAX + 1;
+ break;
+ default:
+ break;
+ }
+
+ *i = tmp;
+ return len;
+}
+
+/*
+ * __db_decompress_int32 --
+ *	Decompresses the compressed 32-bit integer pointed to by buf into i,
+ * returning the number of bytes read.
+ *
+ * PUBLIC: int __db_decompress_int32 __P((const u_int8_t *, u_int32_t *));
+ */
+int
+__db_decompress_int32(buf, i)
+ const u_int8_t *buf;
+ u_int32_t *i;
+{
+ int len;
+ u_int32_t tmp;
+ u_int8_t *p;
+ u_int8_t c;
+
+ tmp = 0;
+ p = (u_int8_t*)&tmp;
+ c = buf[0];
+ len = __db_marshaled_int_size[c];
+
+ switch (len) {
+ case 1:
+ *i = c;
+ return 1;
+ case 2:
+ if (__db_isbigendian() != 0) {
+ p[2] = (c & CMP_INT_2BYTE_MASK);
+ p[3] = buf[1];
+ } else {
+ p[1] = (c & CMP_INT_2BYTE_MASK);
+ p[0] = buf[1];
+ }
+ tmp += CMP_INT_1BYTE_MAX + 1;
+ break;
+ case 3:
+ if (__db_isbigendian() != 0) {
+ p[1] = (c & CMP_INT_3BYTE_MASK);
+ p[2] = buf[1];
+ p[3] = buf[2];
+ } else {
+ p[2] = (c & CMP_INT_3BYTE_MASK);
+ p[1] = buf[1];
+ p[0] = buf[2];
+ }
+ tmp += CMP_INT_2BYTE_MAX + 1;
+ break;
+ case 4:
+ if (__db_isbigendian() != 0) {
+ p[0] = (c & CMP_INT_4BYTE_MASK);
+ p[1] = buf[1];
+ p[2] = buf[2];
+ p[3] = buf[3];
+ } else {
+ p[3] = (c & CMP_INT_4BYTE_MASK);
+ p[2] = buf[1];
+ p[1] = buf[2];
+ p[0] = buf[3];
+ }
+ tmp += CMP_INT_3BYTE_MAX + 1;
+ break;
+ case 5:
+ if (__db_isbigendian() != 0) {
+ p[0] = buf[1];
+ p[1] = buf[2];
+ p[2] = buf[3];
+ p[3] = buf[4];
+ } else {
+ p[3] = buf[1];
+ p[2] = buf[2];
+ p[1] = buf[3];
+ p[0] = buf[4];
+ }
+ tmp += CMP_INT_4BYTE_MAX + 1;
+ break;
+ default:
+ break;
+ }
+
+ *i = tmp;
+ return len;
+}
+
+#endif
diff --git a/src/common/db_err.c b/src/common/db_err.c
new file mode 100644
index 00000000..6edc37b6
--- /dev/null
+++ b/src/common/db_err.c
@@ -0,0 +1,1118 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static void __db_msgcall __P((const DB_ENV *, const char *, va_list));
+static void __db_msgfile __P((const DB_ENV *, const char *, va_list));
+
+/*
+ * __db_fchk --
+ * General flags checking routine.
+ *
+ * PUBLIC: int __db_fchk __P((ENV *, const char *, u_int32_t, u_int32_t));
+ */
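+/*
+ * Illustrative example: __db_fchk(env, "DB->open", flags,
+ * DB_CREATE | DB_EXCL) returns EINVAL via __db_ferr() whenever flags
+ * contains any bit outside those two.
+ */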
+int
+__db_fchk(env, name, flags, ok_flags)
+ ENV *env;
+ const char *name;
+ u_int32_t flags, ok_flags;
+{
+ return (LF_ISSET(~ok_flags) ? __db_ferr(env, name, 0) : 0);
+}
+
+/*
+ * __db_fcchk --
+ * General combination flags checking routine.
+ *
+ * PUBLIC: int __db_fcchk
+ * PUBLIC: __P((ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__db_fcchk(env, name, flags, flag1, flag2)
+ ENV *env;
+ const char *name;
+ u_int32_t flags, flag1, flag2;
+{
+ return (LF_ISSET(flag1) &&
+ LF_ISSET(flag2) ? __db_ferr(env, name, 1) : 0);
+}
+
+/*
+ * __db_ferr --
+ * Common flag errors.
+ *
+ * PUBLIC: int __db_ferr __P((const ENV *, const char *, int));
+ */
+int
+__db_ferr(env, name, iscombo)
+ const ENV *env;
+ const char *name;
+ int iscombo;
+{
+ if (iscombo)
+ __db_errx(env, DB_STR_A("0054",
+ "illegal flag combination specified to %s", "%s"), name);
+ else
+ __db_errx(env, DB_STR_A("0055",
+ "illegal flag specified to %s", "%s"), name);
+
+ return (EINVAL);
+}
+
+/*
+ * __db_fnl --
+ * Common flag-needs-locking message.
+ *
+ * PUBLIC: int __db_fnl __P((const ENV *, const char *));
+ */
+int
+__db_fnl(env, name)
+ const ENV *env;
+ const char *name;
+{
+ __db_errx(env, DB_STR_A("0056",
+ "%s: DB_READ_COMMITTED, DB_READ_UNCOMMITTED and DB_RMW require locking",
+ "%s"), name);
+ return (EINVAL);
+}
+
+/*
+ * __db_pgerr --
+ * Error when unable to retrieve a specified page.
+ *
+ * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t, int));
+ */
+int
+__db_pgerr(dbp, pgno, errval)
+ DB *dbp;
+ db_pgno_t pgno;
+ int errval;
+{
+ /*
+ * Three things are certain:
+ * Death, taxes, and lost data.
+ * Guess which has occurred.
+ */
+ __db_errx(dbp->env, DB_STR_A("0057",
+ "unable to create/retrieve page %lu", "%lu"), (u_long)pgno);
+ return (__env_panic(dbp->env, errval));
+}
+
+/*
+ * __db_pgfmt --
+ * Error when a page has the wrong format.
+ *
+ * PUBLIC: int __db_pgfmt __P((ENV *, db_pgno_t));
+ */
+int
+__db_pgfmt(env, pgno)
+ ENV *env;
+ db_pgno_t pgno;
+{
+ __db_errx(env, DB_STR_A("0058",
+ "page %lu: illegal page type or format", "%lu"), (u_long)pgno);
+ return (__env_panic(env, EINVAL));
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_assert --
+ * Error when an assertion fails. Only checked if #DIAGNOSTIC defined.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __db_assert __P((ENV *, const char *, const char *, int));
+ * PUBLIC: #endif
+ */
+void
+__db_assert(env, e, file, line)
+ ENV *env;
+ const char *e, *file;
+ int line;
+{
+ if (DB_GLOBAL(j_assert) != NULL)
+ DB_GLOBAL(j_assert)(e, file, line);
+ else {
+ __db_errx(env, DB_STR_A("0059",
+ "assert failure: %s/%d: \"%s\"",
+ "%s %d %s"), file, line, e);
+
+ __os_abort(env);
+ /* NOTREACHED */
+ }
+}
+#endif
+
+/*
+ * __env_panic_msg --
+ *	Just report that someone else panicked.
+ *
+ * PUBLIC: int __env_panic_msg __P((ENV *));
+ */
+int
+__env_panic_msg(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ ret = DB_RUNRECOVERY;
+
+ __db_errx(env, DB_STR("0060",
+ "PANIC: fatal region error detected; run recovery"));
+
+ if (dbenv->db_paniccall != NULL) /* Deprecated */
+ dbenv->db_paniccall(dbenv, ret);
+
+	/*
+	 * Must check for DB_EVENT_REG_PANIC first because it is never
+	 * set by itself.  If set, the panic came from DB_REGISTER code
+	 * only; otherwise it could be from many possible places in the
+	 * code.
+	 */
+ if ((env->reginfo != NULL) &&
+ (((REGENV *)env->reginfo->primary)->reg_panic))
+ DB_EVENT(env, DB_EVENT_REG_PANIC, &ret);
+ else
+ DB_EVENT(env, DB_EVENT_PANIC, &ret);
+
+ return (ret);
+}
+
+/*
+ * __env_panic --
+ * Lock out the database environment due to unrecoverable error.
+ *
+ * PUBLIC: int __env_panic __P((ENV *, int));
+ */
+int
+__env_panic(env, errval)
+ ENV *env;
+ int errval;
+{
+	DB_ENV *dbenv;
+
+	if (env != NULL) {
+		dbenv = env->dbenv;
+
+		__env_panic_set(env, 1);
+
+ __db_err(env, errval, DB_STR("0061", "PANIC"));
+
+ if (dbenv->db_paniccall != NULL) /* Deprecated */
+ dbenv->db_paniccall(dbenv, errval);
+
+		/*
+		 * Must check for DB_EVENT_REG_PANIC first because it is
+		 * never set by itself.  If set, the panic came from
+		 * DB_REGISTER code only; otherwise it could be from many
+		 * possible places in the code.
+		 */
+ if ((env->reginfo != NULL) &&
+ (((REGENV *)env->reginfo->primary)->reg_panic))
+ DB_EVENT(env, DB_EVENT_REG_PANIC, &errval);
+ else
+ DB_EVENT(env, DB_EVENT_PANIC, &errval);
+ }
+
+#if defined(DIAGNOSTIC) && !defined(CONFIG_TEST)
+ /*
+ * We want a stack trace of how this could possibly happen.
+ *
+ * Don't drop core if it's the test suite -- it's reasonable for the
+ * test suite to check to make sure that DB_RUNRECOVERY is returned
+ * under certain conditions.
+ */
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+
+ /*
+ * Chaos reigns within.
+ * Reflect, repent, and reboot.
+ * Order shall return.
+ */
+ return (DB_RUNRECOVERY);
+}
+
+/*
+ * db_strerror --
+ * ANSI C strerror(3) for DB.
+ *
+ * EXTERN: char *db_strerror __P((int));
+ */
+char *
+db_strerror(error)
+ int error;
+{
+ char *p;
+
+ if (error == 0)
+ return (DB_STR("0062", "Successful return: 0"));
+ if (error > 0) {
+ if ((p = strerror(error)) != NULL)
+ return (p);
+ return (__db_unknown_error(error));
+ }
+
+ /*
+ * !!!
+ * The Tcl API requires that some of these return strings be compared
+ * against strings stored in application scripts. So, any of these
+ * errors that do not invariably result in a Tcl exception may not be
+ * altered.
+ */
+ switch (error) {
+ case DB_BUFFER_SMALL:
+ return (DB_STR("0063",
+ "DB_BUFFER_SMALL: User memory too small for return value"));
+ case DB_DONOTINDEX:
+ return (DB_STR("0064",
+ "DB_DONOTINDEX: Secondary index callback returns null"));
+ case DB_FOREIGN_CONFLICT:
+ return (DB_STR("0065",
+ "DB_FOREIGN_CONFLICT: A foreign database constraint has been violated"));
+ case DB_HEAP_FULL:
+ return (DB_STR("0208","DB_HEAP_FULL: no free space in db"));
+ case DB_KEYEMPTY:
+ return (DB_STR("0066",
+ "DB_KEYEMPTY: Non-existent key/data pair"));
+ case DB_KEYEXIST:
+ return (DB_STR("0067",
+ "DB_KEYEXIST: Key/data pair already exists"));
+ case DB_LOCK_DEADLOCK:
+ return (DB_STR("0068",
+ "DB_LOCK_DEADLOCK: Locker killed to resolve a deadlock"));
+ case DB_LOCK_NOTGRANTED:
+ return (DB_STR("0069", "DB_LOCK_NOTGRANTED: Lock not granted"));
+ case DB_LOG_BUFFER_FULL:
+ return (DB_STR("0070",
+ "DB_LOG_BUFFER_FULL: In-memory log buffer is full"));
+ case DB_LOG_VERIFY_BAD:
+ return (DB_STR("0071",
+ "DB_LOG_VERIFY_BAD: Log verification failed"));
+ case DB_NOSERVER:
+ return (DB_STR("0072",
+ "DB_NOSERVER: No message dispatch call-back function has been configured"));
+ case DB_NOTFOUND:
+ return (DB_STR("0073",
+ "DB_NOTFOUND: No matching key/data pair found"));
+ case DB_OLD_VERSION:
+ return (DB_STR("0074",
+ "DB_OLDVERSION: Database requires a version upgrade"));
+ case DB_PAGE_NOTFOUND:
+ return (DB_STR("0075",
+ "DB_PAGE_NOTFOUND: Requested page not found"));
+ case DB_REP_DUPMASTER:
+ return (DB_STR("0076",
+ "DB_REP_DUPMASTER: A second master site appeared"));
+ case DB_REP_HANDLE_DEAD:
+ return (DB_STR("0077",
+ "DB_REP_HANDLE_DEAD: Handle is no longer valid"));
+ case DB_REP_HOLDELECTION:
+ return (DB_STR("0078",
+ "DB_REP_HOLDELECTION: Need to hold an election"));
+ case DB_REP_IGNORE:
+ return (DB_STR("0079",
+ "DB_REP_IGNORE: Replication record/operation ignored"));
+ case DB_REP_ISPERM:
+ return (DB_STR("0080",
+ "DB_REP_ISPERM: Permanent record written"));
+ case DB_REP_JOIN_FAILURE:
+ return (DB_STR("0081",
+ "DB_REP_JOIN_FAILURE: Unable to join replication group"));
+ case DB_REP_LEASE_EXPIRED:
+ return (DB_STR("0082",
+ "DB_REP_LEASE_EXPIRED: Replication leases have expired"));
+ case DB_REP_LOCKOUT:
+ return (DB_STR("0083",
+ "DB_REP_LOCKOUT: Waiting for replication recovery to complete"));
+ case DB_REP_NEWSITE:
+ return (DB_STR("0084",
+ "DB_REP_NEWSITE: A new site has entered the system"));
+ case DB_REP_NOTPERM:
+ return (DB_STR("0085",
+ "DB_REP_NOTPERM: Permanent log record not written"));
+ case DB_REP_UNAVAIL:
+ return (DB_STR("0086",
+ "DB_REP_UNAVAIL: Too few remote sites to complete operation"));
+ case DB_REP_WOULDROLLBACK: /* Undocumented; C API only. */
+ return (DB_STR("0207",
+ "DB_REP_WOULDROLLBACK: Client data has diverged"));
+ case DB_RUNRECOVERY:
+ return (DB_STR("0087",
+ "DB_RUNRECOVERY: Fatal error, run database recovery"));
+ case DB_SECONDARY_BAD:
+ return (DB_STR("0088",
+ "DB_SECONDARY_BAD: Secondary index inconsistent with primary"));
+ case DB_TIMEOUT:
+ return (DB_STR("0089", "DB_TIMEOUT: Operation timed out"));
+ case DB_VERIFY_BAD:
+ return (DB_STR("0090",
+ "DB_VERIFY_BAD: Database verification failed"));
+ case DB_VERSION_MISMATCH:
+ return (DB_STR("0091",
+ "DB_VERSION_MISMATCH: Database environment version mismatch"));
+ default:
+ break;
+ }
+
+ return (__db_unknown_error(error));
+}
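+
+/*
+ * Illustrative sketch (not part of the library source): any Berkeley DB
+ * return code, system errno value or DB-private code alike, can be
+ * turned into text with db_strerror().  The handle and DBT names below
+ * are assumptions:
+ *
+ *	int ret;
+ *
+ *	if ((ret = dbp->put(dbp, NULL, &key, &data, 0)) != 0)
+ *		fprintf(stderr, "put failed: %s\n", db_strerror(ret));
+ */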
+
+/*
+ * __db_unknown_error --
+ * Format an unknown error value into a static buffer.
+ *
+ * PUBLIC: char *__db_unknown_error __P((int));
+ */
+char *
+__db_unknown_error(error)
+ int error;
+{
+ /*
+ * !!!
+ * Room for a 64-bit number + slop. This buffer is only used
+ * if we're given an unknown error number, which should never
+ * happen.
+ *
+ * We're no longer thread-safe if it does happen, but the worst
+ * result is a corrupted error string because there will always
+ * be a trailing nul byte since the error buffer is nul filled
+ * and longer than any error message.
+ */
+ (void)snprintf(DB_GLOBAL(error_buf),
+ sizeof(DB_GLOBAL(error_buf)), DB_STR_A("0092",
+ "Unknown error: %d", "%d"), error);
+ return (DB_GLOBAL(error_buf));
+}
+
+/*
+ * __db_syserr --
+ * Standard error routine.
+ *
+ * PUBLIC: void __db_syserr __P((const ENV *, int, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_syserr(const ENV *env, int error, const char *fmt, ...)
+#else
+__db_syserr(env, error, fmt, va_alist)
+ const ENV *env;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * The same as DB->err, except we don't default to writing to stderr
+ * after any output channel has been configured, and we use a system-
+ * specific function to translate errors to strings.
+ */
+ DB_REAL_ERR(dbenv, error, DB_ERROR_SYSTEM, 0, fmt);
+}
+
+/*
+ * __db_err --
+ * Standard error routine.
+ *
+ * PUBLIC: void __db_err __P((const ENV *, int, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_err(const ENV *env, int error, const char *fmt, ...)
+#else
+__db_err(env, error, fmt, va_alist)
+ const ENV *env;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * The same as DB->err, except we don't default to writing to stderr
+ * once an output channel has been configured.
+ */
+ DB_REAL_ERR(dbenv, error, DB_ERROR_SET, 0, fmt);
+}
+
+/*
+ * __db_errx --
+ * Standard error routine.
+ *
+ * PUBLIC: void __db_errx __P((const ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_errx(const ENV *env, const char *fmt, ...)
+#else
+__db_errx(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * The same as DB->errx, except we don't default to writing to stderr
+ * once an output channel has been configured.
+ */
+ DB_REAL_ERR(dbenv, 0, DB_ERROR_NOT_SET, 0, fmt);
+}
+
+/*
+ * __db_errcall --
+ * Do the error message work for callback functions.
+ *
+ * PUBLIC: void __db_errcall
+ * PUBLIC: __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+ */
+void
+__db_errcall(dbenv, error, error_set, fmt, ap)
+ const DB_ENV *dbenv;
+ int error;
+ db_error_set_t error_set;
+ const char *fmt;
+ va_list ap;
+{
+ char *p;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ p = buf;
+ if (fmt != NULL)
+ p += vsnprintf(buf, sizeof(buf), fmt, ap);
+ if (error_set != DB_ERROR_NOT_SET)
+ p += snprintf(p,
+ sizeof(buf) - (size_t)(p - buf), ": %s",
+ error_set == DB_ERROR_SET ? db_strerror(error) :
+ __os_strerror(error, sysbuf, sizeof(sysbuf)));
+
+ dbenv->db_errcall(dbenv, dbenv->db_errpfx, buf);
+}
+
+/*
+ * __db_errfile --
+ * Do the error message work for FILE *s.
+ *
+ * PUBLIC: void __db_errfile
+ * PUBLIC: __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+ */
+void
+__db_errfile(dbenv, error, error_set, fmt, ap)
+ const DB_ENV *dbenv;
+ int error;
+ db_error_set_t error_set;
+ const char *fmt;
+ va_list ap;
+{
+ FILE *fp;
+ int need_sep;
+ char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ fp = dbenv == NULL ||
+ dbenv->db_errfile == NULL ? stderr : dbenv->db_errfile;
+ need_sep = 0;
+
+ if (dbenv != NULL && dbenv->db_errpfx != NULL) {
+ (void)fprintf(fp, "%s", dbenv->db_errpfx);
+ need_sep = 1;
+ }
+ if (fmt != NULL && fmt[0] != '\0') {
+ if (need_sep)
+ (void)fprintf(fp, ": ");
+ need_sep = 1;
+ (void)vfprintf(fp, fmt, ap);
+ }
+ if (error_set != DB_ERROR_NOT_SET)
+ (void)fprintf(fp, "%s%s",
+ need_sep ? ": " : "",
+ error_set == DB_ERROR_SET ? db_strerror(error) :
+ __os_strerror(error, sysbuf, sizeof(sysbuf)));
+ (void)fprintf(fp, "\n");
+ (void)fflush(fp);
+}
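+
+/*
+ * Illustrative sketch (not part of the library source): the prefix and
+ * separator handling above is what an application sees after configuring
+ * an error channel.  Assuming a DB_ENV handle named "dbenv":
+ *
+ *	dbenv->set_errpfx(dbenv, "myapp");
+ *	dbenv->set_errfile(dbenv, stderr);
+ *
+ * a failed operation then prints a line of the form:
+ *
+ *	myapp: open: No such file or directory
+ */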
+
+/*
+ * __db_msgadd --
+ * Aggregate a set of strings into a buffer for the callback API.
+ *
+ * PUBLIC: void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_msgadd(ENV *env, DB_MSGBUF *mbp, const char *fmt, ...)
+#else
+__db_msgadd(env, mbp, fmt, va_alist)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ __db_msgadd_ap(env, mbp, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __db_msgadd_ap --
+ * Aggregate a set of strings into a buffer for the callback API.
+ *
+ * PUBLIC: void __db_msgadd_ap
+ * PUBLIC: __P((ENV *, DB_MSGBUF *, const char *, va_list));
+ */
+void
+__db_msgadd_ap(env, mbp, fmt, ap)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ const char *fmt;
+ va_list ap;
+{
+ size_t len, olen;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ /*
+ * There's a heap buffer in the ENV handle we use to aggregate the
+ * message chunks. We maintain a pointer to the buffer, the next slot
+	 * to be filled in the buffer, and a total buffer length.
+ */
+ olen = (size_t)(mbp->cur - mbp->buf);
+ if (olen + len >= mbp->len) {
+ if (__os_realloc(env, mbp->len + len + 256, &mbp->buf))
+ return;
+ mbp->len += (len + 256);
+ mbp->cur = mbp->buf + olen;
+ }
+
+ memcpy(mbp->cur, buf, len + 1);
+ mbp->cur += len;
+}
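+
+/*
+ * Illustrative sketch (not part of the library source): the same
+ * grow-on-demand append pattern without the ENV internals.  Assumes
+ * <stdlib.h>, <string.h> and <errno.h>; all names are hypothetical.
+ *
+ *	struct msgbuf { char *buf, *cur; size_t len; };
+ *
+ *	static int
+ *	msgbuf_add(struct msgbuf *m, const char *s)
+ *	{
+ *		size_t slen, olen;
+ *		char *p;
+ *
+ *		slen = strlen(s);
+ *		olen = (size_t)(m->cur - m->buf);
+ *		if (olen + slen >= m->len) {	-- grow, with slack
+ *			if ((p =
+ *			    realloc(m->buf, m->len + slen + 256)) == NULL)
+ *				return (ENOMEM);
+ *			m->buf = p;
+ *			m->len += slen + 256;
+ *			m->cur = m->buf + olen;
+ *		}
+ *		memcpy(m->cur, s, slen + 1);	-- include the trailing nul
+ *		m->cur += slen;
+ *		return (0);
+ *	}
+ */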
+
+/*
+ * __db_msg --
+ * Standard DB stat message routine.
+ *
+ * PUBLIC: void __db_msg __P((const ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_msg(const ENV *env, const char *fmt, ...)
+#else
+__db_msg(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_REAL_MSG(dbenv, fmt);
+}
+
+/*
+ * __db_repmsg --
+ * Replication system message routine.
+ *
+ * PUBLIC: void __db_repmsg __P((const ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_repmsg(const ENV *env, const char *fmt, ...)
+#else
+__db_repmsg(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ __rep_msg(env, buf);
+ va_end(ap);
+}
+
+/*
+ * __db_msgcall --
+ * Do the message work for callback functions.
+ */
+static void
+__db_msgcall(dbenv, fmt, ap)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_list ap;
+{
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ dbenv->db_msgcall(dbenv, buf);
+}
+
+/*
+ * __db_msgfile --
+ * Do the message work for FILE *s.
+ */
+static void
+__db_msgfile(dbenv, fmt, ap)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_list ap;
+{
+ FILE *fp;
+
+ fp = dbenv == NULL ||
+ dbenv->db_msgfile == NULL ? stdout : dbenv->db_msgfile;
+ (void)vfprintf(fp, fmt, ap);
+
+ (void)fprintf(fp, "\n");
+ (void)fflush(fp);
+}
+
+/*
+ * __db_unknown_flag -- report internal error
+ *
+ * PUBLIC: int __db_unknown_flag __P((ENV *, char *, u_int32_t));
+ */
+int
+__db_unknown_flag(env, routine, flag)
+ ENV *env;
+ char *routine;
+ u_int32_t flag;
+{
+ __db_errx(env, DB_STR_A("0093", "%s: Unknown flag: %#x", "%s %#x"),
+ routine, (u_int)flag);
+
+#ifdef DIAGNOSTIC
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+ return (EINVAL);
+}
+
+/*
+ * __db_unknown_type -- report internal database type error
+ *
+ * PUBLIC: int __db_unknown_type __P((ENV *, char *, DBTYPE));
+ */
+int
+__db_unknown_type(env, routine, type)
+ ENV *env;
+ char *routine;
+ DBTYPE type;
+{
+ __db_errx(env, DB_STR_A("0094", "%s: Unexpected database type: %s",
+ "%s %s"), routine, __db_dbtype_to_string(type));
+
+#ifdef DIAGNOSTIC
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+ return (EINVAL);
+}
+
+/*
+ * __db_unknown_path -- report unexpected database code path error.
+ *
+ * PUBLIC: int __db_unknown_path __P((ENV *, char *));
+ */
+int
+__db_unknown_path(env, routine)
+ ENV *env;
+ char *routine;
+{
+ __db_errx(env, DB_STR_A("0095", "%s: Unexpected code path error",
+ "%s"), routine);
+
+#ifdef DIAGNOSTIC
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+ return (EINVAL);
+}
+
+/*
+ * __db_check_txn --
+ * Check for common transaction errors.
+ *
+ * PUBLIC: int __db_check_txn __P((DB *, DB_TXN *, DB_LOCKER *, int));
+ */
+int
+__db_check_txn(dbp, txn, assoc_locker, read_op)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_LOCKER *assoc_locker;
+ int read_op;
+{
+ ENV *env;
+ int related, ret;
+
+ env = dbp->env;
+
+ /*
+	 * If we are in recovery or aborting a transaction, we don't
+	 * need to enforce the rules barring transactional operations
+	 * on non-transactional dbps and vice versa.  This happens all
+	 * the time: the dbp during an abort may be transactional, but
+	 * we undo operations outside a transaction since we're
+	 * aborting.
+ */
+ if (IS_RECOVERING(env) || F_ISSET(dbp, DB_AM_RECOVER))
+ return (0);
+
+ /*
+ * Check for common transaction errors:
+ * an operation on a handle whose open commit hasn't completed.
+ * a transaction handle in a non-transactional environment
+ * a transaction handle for a non-transactional database
+ */
+ if (!read_op && txn != NULL && F_ISSET(txn, TXN_READONLY)) {
+ __db_errx(env, DB_STR("0096",
+ "Read-only transaction cannot be used for an update"));
+ return (EINVAL);
+ } else if (txn == NULL || F_ISSET(txn, TXN_PRIVATE)) {
+ if (dbp->cur_locker != NULL &&
+ dbp->cur_locker->id >= TXN_MINIMUM)
+ goto open_err;
+
+ if (!read_op && F_ISSET(dbp, DB_AM_TXN)) {
+ __db_errx(env, DB_STR("0097",
+ "Transaction not specified for a transactional database"));
+ return (EINVAL);
+ }
+ } else if (F_ISSET(txn, TXN_FAMILY)) {
+ /*
+ * Family transaction handles can be passed to any method,
+ * since they only determine locker IDs.
+ */
+ return (0);
+ } else {
+ if (!TXN_ON(env))
+ return (__db_not_txn_env(env));
+
+ if (!F_ISSET(dbp, DB_AM_TXN)) {
+ __db_errx(env, DB_STR("0098",
+ "Transaction specified for a non-transactional database"));
+ return (EINVAL);
+ }
+
+ if (F_ISSET(txn, TXN_DEADLOCK))
+ return (__db_txn_deadlock_err(env, txn));
+
+ if (dbp->cur_locker != NULL &&
+ dbp->cur_locker->id >= TXN_MINIMUM &&
+ dbp->cur_locker->id != txn->txnid) {
+ if ((ret = __lock_locker_same_family(env,
+ dbp->cur_locker, txn->locker, &related)) != 0)
+ return (ret);
+ if (!related)
+ goto open_err;
+ }
+ }
+
+ /*
+ * If dbp->associate_locker is not NULL, that means we're in
+ * the middle of a DB->associate with DB_CREATE (i.e., a secondary index
+ * creation).
+ *
+ * In addition to the usual transaction rules, we need to lock out
+ * non-transactional updates that aren't part of the associate (and
+ * thus are using some other locker ID).
+ *
+ * Transactional updates should simply block; from the time we
+ * decide to build the secondary until commit, we'll hold a write
+ * lock on all of its pages, so it should be safe to attempt to update
+ * the secondary in another transaction (presumably by updating the
+ * primary).
+ */
+ if (!read_op && dbp->associate_locker != NULL &&
+ txn != NULL && dbp->associate_locker != assoc_locker) {
+ __db_errx(env, DB_STR("0099",
+ "Operation forbidden while secondary index is being created"));
+ return (EINVAL);
+ }
+
+ /*
+ * Check the txn and dbp are from the same env.
+ */
+ if (txn != NULL && env != txn->mgrp->env) {
+ __db_errx(env, DB_STR("0100",
+ "Transaction and database from different environments"));
+ return (EINVAL);
+ }
+
+ return (0);
+open_err:
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ __db_errx(env, DB_STR("0209",
+"Exclusive database handles can only have one active transaction at a time."));
+ else
+ __db_errx(env, DB_STR("0101",
+ "Transaction that opened the DB handle is still active"));
+ return (EINVAL);
+}
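+
+/*
+ * Illustrative sketch (not part of the library source): the most common
+ * mistake the routine above catches.  Assuming a transactional
+ * environment "dbenv" and a handle "dbp" opened inside a transaction:
+ *
+ *	DB_TXN *txn;
+ *
+ *	dbenv->txn_begin(dbenv, NULL, &txn, 0);
+ *	dbp->open(dbp, txn, "a.db", NULL, DB_BTREE, DB_CREATE, 0644);
+ *
+ * calling dbp->put(dbp, NULL, &key, &data, 0) before txn commits fails
+ * with EINVAL and the "Transaction that opened the DB handle is still
+ * active" message; after txn->commit(txn, 0) the put is legal.
+ */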
+
+/*
+ * __db_txn_deadlock_err --
+ *	Transaction has already been deadlocked.
+ *
+ * PUBLIC: int __db_txn_deadlock_err __P((ENV *, DB_TXN *));
+ */
+int
+__db_txn_deadlock_err(env, txn)
+ ENV *env;
+ DB_TXN *txn;
+{
+ const char *name;
+
+ name = NULL;
+ (void)__txn_get_name(txn, &name);
+
+ __db_errx(env, DB_STR_A("0102",
+ "%s%sprevious transaction deadlock return not resolved",
+ "%s %s"), name == NULL ? "" : name, name == NULL ? "" : ": ");
+
+ return (EINVAL);
+}
+
+/*
+ * __db_not_txn_env --
+ * DB handle must be in an environment that supports transactions.
+ *
+ * PUBLIC: int __db_not_txn_env __P((ENV *));
+ */
+int
+__db_not_txn_env(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0103",
+ "DB environment not configured for transactions"));
+ return (EINVAL);
+}
+
+/*
+ * __db_rec_toobig --
+ * Fixed record length exceeded error message.
+ *
+ * PUBLIC: int __db_rec_toobig __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__db_rec_toobig(env, data_len, fixed_rec_len)
+ ENV *env;
+ u_int32_t data_len, fixed_rec_len;
+{
+ __db_errx(env, DB_STR_A("0104",
+ "%lu larger than database's maximum record length %lu",
+ "%lu %lu"), (u_long)data_len, (u_long)fixed_rec_len);
+ return (EINVAL);
+}
+
+/*
+ * __db_rec_repl --
+ * Fixed record replacement length error message.
+ *
+ * PUBLIC: int __db_rec_repl __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__db_rec_repl(env, data_size, data_dlen)
+ ENV *env;
+ u_int32_t data_size, data_dlen;
+{
+ __db_errx(env, DB_STR_A("0105",
+ "Record length error: "
+ "replacement length %lu differs from replaced length %lu",
+ "%lu %lu"), (u_long)data_size, (u_long)data_dlen);
+ return (EINVAL);
+}
+
+#if defined(DIAGNOSTIC) || defined(DEBUG_ROP) || defined(DEBUG_WOP)
+/*
+ * __dbc_logging --
+ * In DIAGNOSTIC mode, check for bad replication combinations.
+ *
+ * PUBLIC: int __dbc_logging __P((DBC *));
+ */
+int
+__dbc_logging(dbc)
+ DBC *dbc;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+ db_rep = env->rep_handle;
+
+ ret = LOGGING_ON(env) &&
+ !F_ISSET(dbc, DBC_RECOVER) && !IS_REP_CLIENT(env);
+
+ /*
+ * If we're not using replication or running recovery, return.
+ */
+ if (db_rep == NULL || F_ISSET(dbc, DBC_RECOVER))
+ return (ret);
+
+#ifndef DEBUG_ROP
+ /*
+ * Only check when DEBUG_ROP is not configured. People often do
+ * non-transactional reads, and debug_rop is going to write
+ * a log record.
+ */
+ {
+ REP *rep;
+
+ rep = db_rep->region;
+
+ /*
+	 * If we're a client updating a durable database, error.
+ */
+ if (IS_REP_CLIENT(env) && !F_ISSET(dbc->dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0106",
+ "dbc_logging: Client update"));
+ goto err;
+ }
+
+#ifndef DEBUG_WOP
+ /*
+ * If DEBUG_WOP is enabled, then we'll generate debugging log records
+ * that are non-transactional. This is OK.
+ */
+ if (IS_REP_MASTER(env) &&
+ dbc->txn == NULL && !F_ISSET(dbc->dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0107",
+			    "dbc_logging: Master non-txn update"));
+ goto err;
+ }
+#endif
+
+ if (0) {
+err: __db_errx(env, DB_STR_A("0108", "Rep: flags 0x%lx msg_th %lu",
+ "%lx %lu"), (u_long)rep->flags, (u_long)rep->msg_th);
+ __db_errx(env, DB_STR_A("0109", "Rep: handle %lu, opcnt %lu",
+ "%lu %lu"), (u_long)rep->handle_cnt, (u_long)rep->op_cnt);
+ __os_abort(env);
+ /* NOTREACHED */
+ }
+ }
+#endif
+ return (ret);
+}
+#endif
+
+/*
+ * __db_check_lsn --
+ * Display the log sequence error message.
+ *
+ * PUBLIC: int __db_check_lsn __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__db_check_lsn(env, lsn, prev)
+ ENV *env;
+ DB_LSN *lsn, *prev;
+{
+ __db_errx(env, DB_STR_A("0110",
+ "Log sequence error: page LSN %lu %lu; previous LSN %lu %lu",
+ "%lu %lu %lu %lu"), (u_long)(lsn)->file,
+ (u_long)(lsn)->offset, (u_long)(prev)->file,
+ (u_long)(prev)->offset);
+ return (EINVAL);
+}
+
+/*
+ * __db_rdonly --
+ * Common readonly message.
+ * PUBLIC: int __db_rdonly __P((const ENV *, const char *));
+ */
+int
+__db_rdonly(env, name)
+ const ENV *env;
+ const char *name;
+{
+ __db_errx(env, DB_STR_A("0111",
+ "%s: attempt to modify a read-only database", "%s"), name);
+ return (EACCES);
+}
+
+/*
+ * __db_space_err --
+ * Common out of space message.
+ * PUBLIC: int __db_space_err __P((const DB *));
+ */
+int
+__db_space_err(dbp)
+ const DB *dbp;
+{
+ __db_errx(dbp->env, DB_STR_A("0112",
+ "%s: file limited to %lu pages", "%s %lu"),
+ dbp->fname, (u_long)dbp->mpf->mfp->maxpgno);
+ return (ENOSPC);
+}
+
+/*
+ * __db_failed --
+ * Common failed thread message.
+ *
+ * PUBLIC: int __db_failed __P((const ENV *,
+ * PUBLIC: const char *, pid_t, db_threadid_t));
+ */
+int
+__db_failed(env, msg, pid, tid)
+ const ENV *env;
+ const char *msg;
+ pid_t pid;
+ db_threadid_t tid;
+{
+ DB_ENV *dbenv;
+ char buf[DB_THREADID_STRLEN];
+
+ dbenv = env->dbenv;
+
+ __db_errx(env, DB_STR_A("0113", "Thread/process %s failed: %s",
+ "%s %s"), dbenv->thread_id_string(dbenv, pid, tid, buf), msg);
+ return (DB_RUNRECOVERY);
+}
diff --git a/src/common/db_getlong.c b/src/common/db_getlong.c
new file mode 100644
index 00000000..cac55a0e
--- /dev/null
+++ b/src/common/db_getlong.c
@@ -0,0 +1,146 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_getlong --
+ * Return a long value inside of basic parameters.
+ *
+ * PUBLIC: int __db_getlong
+ * PUBLIC: __P((DB_ENV *, const char *, char *, long, long, long *));
+ */
+int
+__db_getlong(dbenv, progname, p, min, max, storep)
+ DB_ENV *dbenv;
+ const char *progname;
+ char *p;
+ long min, max, *storep;
+{
+ long val;
+ char *end;
+
+ __os_set_errno(0);
+ val = strtol(p, &end, 10);
+ if ((val == LONG_MIN || val == LONG_MAX) &&
+ __os_get_errno() == ERANGE) {
+ if (dbenv == NULL)
+ fprintf(stderr, "%s: %s: %s\n",
+ progname, p, strerror(ERANGE));
+ else
+ dbenv->err(dbenv, ERANGE, "%s", p);
+ return (ERANGE);
+ }
+ if (p[0] == '\0' || (end[0] != '\0' && end[0] != '\n')) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0042",
+ "%s: %s: Invalid numeric argument\n",
+ "%s %s\n"), progname, p);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0043",
+ "%s: Invalid numeric argument", "%s"), p);
+ return (EINVAL);
+ }
+ if (val < min) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0044",
+ "%s: %s: Less than minimum value (%ld)\n",
+ "%s %s %ld\n"), progname, p, min);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0045",
+ "%s: Less than minimum value (%ld)",
+ "%s %ld"), p, min);
+ return (ERANGE);
+ }
+ if (val > max) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0046",
+ "%s: %s: Greater than maximum value (%ld)\n",
+ "%s %s %ld\n"), progname, p, max);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0047",
+ "%s: Greater than maximum value (%ld)",
+ "%s %ld"), p, max);
+ return (ERANGE);
+ }
+ *storep = val;
+ return (0);
+}
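+
+/*
+ * Illustrative sketch (not part of the library source): typical use when
+ * parsing a numeric command-line argument; passing a NULL DB_ENV sends
+ * complaints to stderr.  "progname" and "optarg" are assumptions:
+ *
+ *	long cachesize;
+ *
+ *	if (__db_getlong(NULL,
+ *	    progname, optarg, 1, LONG_MAX, &cachesize) != 0)
+ *		return (EXIT_FAILURE);
+ */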
+
+/*
+ * __db_getulong --
+ * Return an unsigned long value inside of basic parameters.
+ *
+ * PUBLIC: int __db_getulong
+ * PUBLIC: __P((DB_ENV *, const char *, char *, u_long, u_long, u_long *));
+ */
+int
+__db_getulong(dbenv, progname, p, min, max, storep)
+ DB_ENV *dbenv;
+ const char *progname;
+ char *p;
+ u_long min, max, *storep;
+{
+ u_long val;
+ char *end;
+
+ __os_set_errno(0);
+ val = strtoul(p, &end, 10);
+ if (val == ULONG_MAX && __os_get_errno() == ERANGE) {
+ if (dbenv == NULL)
+ fprintf(stderr, "%s: %s: %s\n",
+ progname, p, strerror(ERANGE));
+ else
+ dbenv->err(dbenv, ERANGE, "%s", p);
+ return (ERANGE);
+ }
+ if (p[0] == '\0' || (end[0] != '\0' && end[0] != '\n')) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0048",
+ "%s: %s: Invalid numeric argument\n",
+ "%s %s\n"), progname, p);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0049",
+ "%s: Invalid numeric argument",
+ "%s"), p);
+ return (EINVAL);
+ }
+ if (val < min) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0050",
+ "%s: %s: Less than minimum value (%lu)\n",
+ "%s %s %lu\n"), progname, p, min);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0051",
+ "%s: Less than minimum value (%lu)",
+ "%s %lu"), p, min);
+ return (ERANGE);
+ }
+
+ /*
+ * We allow a 0 to substitute as a max value for ULONG_MAX because
+ * 1) accepting only a 0 value is unlikely to be necessary, and 2)
+ * we don't want callers to have to use ULONG_MAX explicitly, as it
+ * may not exist on all platforms.
+ */
+ if (max != 0 && val > max) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0052",
+ "%s: %s: Greater than maximum value (%lu)\n",
+ "%s %s %lu\n"), progname, p, max);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0053",
+ "%s: Greater than maximum value (%lu)",
+ "%s %lu"), p, max);
+ return (ERANGE);
+ }
+ *storep = val;
+ return (0);
+}
diff --git a/src/common/db_idspace.c b/src/common/db_idspace.c
new file mode 100644
index 00000000..a9cbb1bf
--- /dev/null
+++ b/src/common/db_idspace.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __db_idcmp __P((const void *, const void *));
+
+static int
+__db_idcmp(a, b)
+ const void *a;
+ const void *b;
+{
+ u_int32_t i, j;
+
+ i = *(u_int32_t *)a;
+ j = *(u_int32_t *)b;
+
+ if (i < j)
+ return (-1);
+ else if (i > j)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * __db_idspace --
+ *
+ * On input, minp and maxp contain the minimum and maximum valid values for
+ * the name space and on return, they contain the minimum and maximum ids
+ * available (by finding the biggest gap). The minimum can be an inuse
+ * value, but the maximum cannot be.
+ *
+ * PUBLIC: void __db_idspace __P((u_int32_t *, int, u_int32_t *, u_int32_t *));
+ */
+void
+__db_idspace(inuse, n, minp, maxp)
+ u_int32_t *inuse;
+ int n;
+ u_int32_t *minp, *maxp;
+{
+ int i, low;
+ u_int32_t gap, t;
+
+ /* A single locker ID is a special case. */
+ if (n == 1) {
+ /*
+ * If the single item in use is the last one in the range,
+		 * we have to wrap; the min stays at the minimum ID, which
+		 * is what we came in with, so we leave it alone.
+ */
+ if (inuse[0] != *maxp)
+ *minp = inuse[0];
+ *maxp = inuse[0] - 1;
+ return;
+ }
+
+ gap = 0;
+ low = 0;
+ qsort(inuse, (size_t)n, sizeof(u_int32_t), __db_idcmp);
+ for (i = 0; i < n - 1; i++)
+ if ((t = (inuse[i + 1] - inuse[i])) > gap) {
+ gap = t;
+ low = i;
+ }
+
+ /* Check for largest gap at the end of the space. */
+ if ((*maxp - inuse[n - 1]) + (inuse[0] - *minp) > gap) {
+ /* Do same check as we do in the n == 1 case. */
+ if (inuse[n - 1] != *maxp)
+ *minp = inuse[n - 1];
+ *maxp = inuse[0] - 1;
+ } else {
+ *minp = inuse[low];
+ *maxp = inuse[low + 1] - 1;
+ }
+}
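+
+/*
+ * Worked example (not part of the library source): with *minp = 1,
+ * *maxp = 10 and inuse = {2, 3, 7}, the largest in-range gap is
+ * 3..7 (size 4) and the wrapped end gap is (10 - 7) + (2 - 1) = 4;
+ * the tie goes to the in-range gap, so the routine sets *minp = 3 and
+ * *maxp = 6 -- the minimum may be the in-use value 3, the maximum
+ * stops short of the in-use value 7.
+ */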
diff --git a/src/common/db_log2.c b/src/common/db_log2.c
new file mode 100644
index 00000000..9c929f84
--- /dev/null
+++ b/src/common/db_log2.c
@@ -0,0 +1,57 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * PUBLIC: u_int32_t __db_log2 __P((u_int32_t));
+ */
+u_int32_t
+__db_log2(num)
+ u_int32_t num;
+{
+ u_int32_t i, limit;
+
+ limit = 1;
+ for (i = 0; limit < num; limit = limit << 1)
+ ++i;
+ return (i);
+}
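+
+/*
+ * Worked example (not part of the library source): __db_log2() returns
+ * the ceiling of log2(num): __db_log2(1024) == 10, __db_log2(1025) == 11
+ * and __db_log2(1) == 0, which is handy for sizing power-of-two
+ * structures from a requested element count.
+ */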
diff --git a/src/common/db_shash.c b/src/common/db_shash.c
new file mode 100644
index 00000000..a056e4b1
--- /dev/null
+++ b/src/common/db_shash.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_tablesize --
+ * Choose a size for the hash table.
+ *
+ * PUBLIC: u_int32_t __db_tablesize __P((u_int32_t));
+ */
+u_int32_t
+__db_tablesize(n_buckets)
+ u_int32_t n_buckets;
+{
+ /*
+ * We try to be clever about how big we make the hash tables. Use a
+ * prime number close to the "suggested" number of elements that will
+ * be in the hash table. Use 32 as the minimum hash table size.
+ *
+ * Ref: Sedgewick, Algorithms in C, "Hash Functions"
+ *
+ * Up to ~250,000 buckets, we use powers of 2. After that, we slow
+ * the rate of increase by half. For each choice, we then use a
+ * nearby prime number as the hash value.
+ *
+ * If a terabyte is the maximum cache we'll see, and we assume there
+ * are 10 1K buckets on each hash chain, then 107374182 is the maximum
+ * number of buckets we'll ever need.
+ *
+ * We don't use the obvious static data structure because some C
+ * compilers (and I use the term loosely), can't handle them.
+ */
+#define HASH_SIZE(power, prime) { \
+ if ((power) >= n_buckets) \
+ return (prime); \
+}
+ HASH_SIZE(32, 37); /* 2^5 */
+ HASH_SIZE(64, 67); /* 2^6 */
+ HASH_SIZE(128, 131); /* 2^7 */
+ HASH_SIZE(256, 257); /* 2^8 */
+ HASH_SIZE(512, 521); /* 2^9 */
+ HASH_SIZE(1024, 1031); /* 2^10 */
+ HASH_SIZE(2048, 2053); /* 2^11 */
+ HASH_SIZE(4096, 4099); /* 2^12 */
+ HASH_SIZE(8192, 8191); /* 2^13 */
+ HASH_SIZE(16384, 16381); /* 2^14 */
+ HASH_SIZE(32768, 32771); /* 2^15 */
+ HASH_SIZE(65536, 65537); /* 2^16 */
+ HASH_SIZE(131072, 131071); /* 2^17 */
+ HASH_SIZE(262144, 262147); /* 2^18 */
+ HASH_SIZE(393216, 393209); /* 2^18 + 2^18/2 */
+ HASH_SIZE(524288, 524287); /* 2^19 */
+ HASH_SIZE(786432, 786431); /* 2^19 + 2^19/2 */
+ HASH_SIZE(1048576, 1048573); /* 2^20 */
+ HASH_SIZE(1572864, 1572869); /* 2^20 + 2^20/2 */
+ HASH_SIZE(2097152, 2097169); /* 2^21 */
+ HASH_SIZE(3145728, 3145721); /* 2^21 + 2^21/2 */
+ HASH_SIZE(4194304, 4194301); /* 2^22 */
+ HASH_SIZE(6291456, 6291449); /* 2^22 + 2^22/2 */
+ HASH_SIZE(8388608, 8388617); /* 2^23 */
+ HASH_SIZE(12582912, 12582917); /* 2^23 + 2^23/2 */
+ HASH_SIZE(16777216, 16777213); /* 2^24 */
+ HASH_SIZE(25165824, 25165813); /* 2^24 + 2^24/2 */
+ HASH_SIZE(33554432, 33554393); /* 2^25 */
+ HASH_SIZE(50331648, 50331653); /* 2^25 + 2^25/2 */
+ HASH_SIZE(67108864, 67108859); /* 2^26 */
+ HASH_SIZE(100663296, 100663291); /* 2^26 + 2^26/2 */
+ HASH_SIZE(134217728, 134217757); /* 2^27 */
+ HASH_SIZE(201326592, 201326611); /* 2^27 + 2^27/2 */
+ HASH_SIZE(268435456, 268435459); /* 2^28 */
+ HASH_SIZE(402653184, 402653189); /* 2^28 + 2^28/2 */
+ HASH_SIZE(536870912, 536870909); /* 2^29 */
+ HASH_SIZE(805306368, 805306357); /* 2^29 + 2^29/2 */
+ HASH_SIZE(1073741824, 1073741827); /* 2^30 */
+ return (1073741827);
+}
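+
+/*
+ * Worked example (not part of the library source): a request for 1,000
+ * buckets walks the table to HASH_SIZE(1024, 1031) and returns the
+ * nearby prime 1031; a request for 300,000 buckets lands in the
+ * slower-growth region and returns 393209.
+ */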
+
+/*
+ * __db_hashinit --
+ * Initialize a hash table that resides in shared memory.
+ *
+ * PUBLIC: void __db_hashinit __P((void *, u_int32_t));
+ */
+void
+__db_hashinit(begin, nelements)
+ void *begin;
+ u_int32_t nelements;
+{
+ u_int32_t i;
+ SH_TAILQ_HEAD(hash_head) *headp;
+
+ headp = (struct hash_head *)begin;
+
+ for (i = 0; i < nelements; i++, headp++)
+ SH_TAILQ_INIT(headp);
+}
diff --git a/src/common/dbt.c b/src/common/dbt.c
new file mode 100644
index 00000000..90409f2c
--- /dev/null
+++ b/src/common/dbt.c
@@ -0,0 +1,74 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __dbt_usercopy --
+ * Take a copy of the user's data, if a callback is supplied.
+ *
+ * PUBLIC: int __dbt_usercopy __P((ENV *, DBT *));
+ */
+int
+__dbt_usercopy(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ void *buf;
+ int ret;
+
+ if (dbt == NULL || !F_ISSET(dbt, DB_DBT_USERCOPY) || dbt->size == 0 ||
+ dbt->data != NULL)
+ return (0);
+
+ buf = NULL;
+ if ((ret = __os_umalloc(env, dbt->size, &buf)) != 0 ||
+ (ret = env->dbt_usercopy(dbt, 0, buf, dbt->size,
+ DB_USERCOPY_GETDATA)) != 0)
+ goto err;
+ dbt->data = buf;
+
+ return (0);
+
+err: if (buf != NULL) {
+ __os_ufree(env, buf);
+ dbt->data = NULL;
+ }
+
+ return (ret);
+}
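+
+/*
+ * Illustrative sketch (not part of the library source): a usercopy
+ * callback consistent with the call above, which asks the application
+ * to copy dbt->size bytes of its own data into a library buffer.  The
+ * use of the DBT's app_data field to find the application's storage is
+ * an assumption here, as is how the callback gets registered:
+ *
+ *	static int
+ *	my_usercopy(DBT *dbt, u_int32_t offset, void *buf,
+ *	    u_int32_t size, u_int32_t flags)
+ *	{
+ *		if (flags == DB_USERCOPY_GETDATA)
+ *			memcpy(buf,
+ *			    (u_int8_t *)dbt->app_data + offset, size);
+ *		else
+ *			memcpy((u_int8_t *)dbt->app_data + offset,
+ *			    buf, size);
+ *		return (0);
+ *	}
+ */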
+
+/*
+ * __dbt_userfree --
+ * Free a copy of the user's data, if necessary.
+ *
+ * PUBLIC: void __dbt_userfree __P((ENV *, DBT *, DBT *, DBT *));
+ */
+void
+__dbt_userfree(env, key, pkey, data)
+ ENV *env;
+ DBT *key, *pkey, *data;
+{
+ if (key != NULL &&
+ F_ISSET(key, DB_DBT_USERCOPY) && key->data != NULL) {
+ __os_ufree(env, key->data);
+ key->data = NULL;
+ }
+ if (pkey != NULL &&
+ F_ISSET(pkey, DB_DBT_USERCOPY) && pkey->data != NULL) {
+ __os_ufree(env, pkey->data);
+ pkey->data = NULL;
+ }
+ if (data != NULL &&
+ F_ISSET(data, DB_DBT_USERCOPY) && data->data != NULL) {
+ __os_ufree(env, data->data);
+ data->data = NULL;
+ }
+}
diff --git a/src/common/mkpath.c b/src/common/mkpath.c
new file mode 100644
index 00000000..c684692c
--- /dev/null
+++ b/src/common/mkpath.c
@@ -0,0 +1,68 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_mkpath --
+ * Create intermediate directories.
+ *
+ * PUBLIC: int __db_mkpath __P((ENV *, const char *));
+ */
+int
+__db_mkpath(env, name)
+ ENV *env;
+ const char *name;
+{
+ size_t len;
+ int ret;
+ char *p, *t, savech;
+
+ /*
+ * Get a copy so we can modify the string. It's a path and potentially
+ * quite long, so don't allocate the space on the stack.
+ */
+ len = strlen(name) + 1;
+ if ((ret = __os_malloc(env, len, &t)) != 0)
+ return (ret);
+ memcpy(t, name, len);
+
+ /*
+ * Cycle through the path, creating intermediate directories.
+ *
+ * Skip the first byte if it's a path separator, it's the start of an
+ * absolute pathname.
+ */
+ if (PATH_SEPARATOR[1] == '\0') {
+ for (p = t + 1; p[0] != '\0'; ++p)
+ if (p[0] == PATH_SEPARATOR[0]) {
+ savech = *p;
+ *p = '\0';
+ if (__os_exists(env, t, NULL) &&
+ (ret = __os_mkdir(
+ env, t, env->dir_mode)) != 0)
+ break;
+ *p = savech;
+ }
+ } else
+ for (p = t + 1; p[0] != '\0'; ++p)
+ if (strchr(PATH_SEPARATOR, p[0]) != NULL) {
+ savech = *p;
+ *p = '\0';
+ if (__os_exists(env, t, NULL) &&
+ (ret = __os_mkdir(
+ env, t, env->dir_mode)) != 0)
+ break;
+ *p = savech;
+ }
+
+ __os_free(env, t);
+ return (ret);
+}
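+
+/*
+ * Worked example (not part of the library source): __db_mkpath() is in
+ * effect "mkdir -p" for everything but the final component.  Given
+ * name = "a/b/c/file", the loop truncates the copy at each separator in
+ * turn -- "a", then "a/b", then "a/b/c" -- creating whichever components
+ * __os_exists() reports missing, restoring the separator each time.
+ */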
diff --git a/src/common/openflags.c b/src/common/openflags.c
new file mode 100644
index 00000000..cec1f081
--- /dev/null
+++ b/src/common/openflags.c
@@ -0,0 +1,51 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_openflags --
+ * Convert open(2) flags to DB flags.
+ *
+ * PUBLIC: u_int32_t __db_openflags __P((int));
+ */
+u_int32_t
+__db_openflags(oflags)
+ int oflags;
+{
+ u_int32_t dbflags;
+
+ dbflags = 0;
+
+ if (oflags & O_CREAT)
+ dbflags |= DB_CREATE;
+
+ if (oflags & O_TRUNC)
+ dbflags |= DB_TRUNCATE;
+
+ /*
+ * !!!
+ * Convert POSIX 1003.1 open(2) mode flags to DB flags. This isn't
+ * an exact science as few POSIX implementations have a flag value
+ * for O_RDONLY, it's simply the lack of a write flag.
+ */
+#ifndef O_ACCMODE
+#define O_ACCMODE (O_RDONLY | O_RDWR | O_WRONLY)
+#endif
+ switch (oflags & O_ACCMODE) {
+ case O_RDWR:
+ case O_WRONLY:
+ break;
+ default:
+ dbflags |= DB_RDONLY;
+ break;
+ }
+ return (dbflags);
+}
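+
+/*
+ * Worked example (not part of the library source):
+ * __db_openflags(O_CREAT | O_TRUNC | O_RDWR) returns
+ * DB_CREATE | DB_TRUNCATE, while __db_openflags(O_RDONLY) -- on most
+ * systems the absence of any write bit -- returns DB_RDONLY.
+ */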
diff --git a/src/common/os_method.c b/src/common/os_method.c
new file mode 100644
index 00000000..1ee06d7a
--- /dev/null
+++ b/src/common/os_method.c
@@ -0,0 +1,270 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * EXTERN: int db_env_set_func_assert
+ * EXTERN: __P((void (*)(const char *, const char *, int)));
+ */
+int
+db_env_set_func_assert(func_assert)
+ void (*func_assert) __P((const char *, const char *, int));
+{
+ DB_GLOBAL(j_assert) = func_assert;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_close __P((int (*)(int)));
+ */
+int
+db_env_set_func_close(func_close)
+ int (*func_close) __P((int));
+{
+ DB_GLOBAL(j_close) = func_close;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_dirfree __P((void (*)(char **, int)));
+ */
+int
+db_env_set_func_dirfree(func_dirfree)
+ void (*func_dirfree) __P((char **, int));
+{
+ DB_GLOBAL(j_dirfree) = func_dirfree;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_dirlist
+ * EXTERN: __P((int (*)(const char *, char ***, int *)));
+ */
+int
+db_env_set_func_dirlist(func_dirlist)
+ int (*func_dirlist) __P((const char *, char ***, int *));
+{
+ DB_GLOBAL(j_dirlist) = func_dirlist;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_exists __P((int (*)(const char *, int *)));
+ */
+int
+db_env_set_func_exists(func_exists)
+ int (*func_exists) __P((const char *, int *));
+{
+ DB_GLOBAL(j_exists) = func_exists;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_free __P((void (*)(void *)));
+ */
+int
+db_env_set_func_free(func_free)
+ void (*func_free) __P((void *));
+{
+ DB_GLOBAL(j_free) = func_free;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_fsync __P((int (*)(int)));
+ */
+int
+db_env_set_func_fsync(func_fsync)
+ int (*func_fsync) __P((int));
+{
+ DB_GLOBAL(j_fsync) = func_fsync;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_ftruncate __P((int (*)(int, off_t)));
+ */
+int
+db_env_set_func_ftruncate(func_ftruncate)
+ int (*func_ftruncate) __P((int, off_t));
+{
+ DB_GLOBAL(j_ftruncate) = func_ftruncate;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_ioinfo __P((int (*)(const char *,
+ * EXTERN: int, u_int32_t *, u_int32_t *, u_int32_t *)));
+ */
+int
+db_env_set_func_ioinfo(func_ioinfo)
+ int (*func_ioinfo)
+ __P((const char *, int, u_int32_t *, u_int32_t *, u_int32_t *));
+{
+ DB_GLOBAL(j_ioinfo) = func_ioinfo;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_malloc __P((void *(*)(size_t)));
+ */
+int
+db_env_set_func_malloc(func_malloc)
+ void *(*func_malloc) __P((size_t));
+{
+ DB_GLOBAL(j_malloc) = func_malloc;
+ return (0);
+}
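+
+/*
+ * Illustrative sketch (not part of the library source): the jump-table
+ * setters let an application interpose its own primitives, and should
+ * be called before any DB handles are created.  A counting allocator,
+ * with hypothetical names:
+ *
+ *	static size_t allocs;
+ *
+ *	static void *
+ *	counting_malloc(size_t n)
+ *	{
+ *		++allocs;
+ *		return (malloc(n));
+ *	}
+ *
+ *	(void)db_env_set_func_malloc(counting_malloc);
+ *	(void)db_env_set_func_free(free);
+ */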
+
+/*
+ * EXTERN: int db_env_set_func_file_map
+ * EXTERN: __P((int (*)(DB_ENV *, char *, size_t, int, void **),
+ * EXTERN: int (*)(DB_ENV *, void *)));
+ */
+int
+db_env_set_func_file_map(func_file_map, func_file_unmap)
+ int (*func_file_map) __P((DB_ENV *, char *, size_t, int, void **));
+ int (*func_file_unmap) __P((DB_ENV *, void *));
+{
+ DB_GLOBAL(j_file_map) = func_file_map;
+ DB_GLOBAL(j_file_unmap) = func_file_unmap;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_region_map
+ * EXTERN: __P((int (*)(DB_ENV *, char *, size_t, int *, void **),
+ * EXTERN: int (*)(DB_ENV *, void *)));
+ */
+int
+db_env_set_func_region_map(func_region_map, func_region_unmap)
+ int (*func_region_map) __P((DB_ENV *, char *, size_t, int *, void **));
+ int (*func_region_unmap) __P((DB_ENV *, void *));
+{
+ DB_GLOBAL(j_region_map) = func_region_map;
+ DB_GLOBAL(j_region_unmap) = func_region_unmap;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_pread
+ * EXTERN: __P((ssize_t (*)(int, void *, size_t, off_t)));
+ */
+int
+db_env_set_func_pread(func_pread)
+ ssize_t (*func_pread) __P((int, void *, size_t, off_t));
+{
+ DB_GLOBAL(j_pread) = func_pread;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_pwrite
+ * EXTERN: __P((ssize_t (*)(int, const void *, size_t, off_t)));
+ */
+int
+db_env_set_func_pwrite(func_pwrite)
+ ssize_t (*func_pwrite) __P((int, const void *, size_t, off_t));
+{
+ DB_GLOBAL(j_pwrite) = func_pwrite;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_open __P((int (*)(const char *, int, ...)));
+ */
+int
+db_env_set_func_open(func_open)
+ int (*func_open) __P((const char *, int, ...));
+{
+ DB_GLOBAL(j_open) = func_open;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_read __P((ssize_t (*)(int, void *, size_t)));
+ */
+int
+db_env_set_func_read(func_read)
+ ssize_t (*func_read) __P((int, void *, size_t));
+{
+ DB_GLOBAL(j_read) = func_read;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_realloc __P((void *(*)(void *, size_t)));
+ */
+int
+db_env_set_func_realloc(func_realloc)
+ void *(*func_realloc) __P((void *, size_t));
+{
+ DB_GLOBAL(j_realloc) = func_realloc;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_rename
+ * EXTERN: __P((int (*)(const char *, const char *)));
+ */
+int
+db_env_set_func_rename(func_rename)
+ int (*func_rename) __P((const char *, const char *));
+{
+ DB_GLOBAL(j_rename) = func_rename;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_seek
+ * EXTERN: __P((int (*)(int, off_t, int)));
+ */
+int
+db_env_set_func_seek(func_seek)
+ int (*func_seek) __P((int, off_t, int));
+{
+ DB_GLOBAL(j_seek) = func_seek;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_unlink __P((int (*)(const char *)));
+ */
+int
+db_env_set_func_unlink(func_unlink)
+ int (*func_unlink) __P((const char *));
+{
+ DB_GLOBAL(j_unlink) = func_unlink;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_write
+ * EXTERN: __P((ssize_t (*)(int, const void *, size_t)));
+ */
+int
+db_env_set_func_write(func_write)
+ ssize_t (*func_write) __P((int, const void *, size_t));
+{
+ DB_GLOBAL(j_write) = func_write;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_yield __P((int (*)(u_long, u_long)));
+ */
+int
+db_env_set_func_yield(func_yield)
+ int (*func_yield) __P((u_long, u_long));
+{
+ DB_GLOBAL(j_yield) = func_yield;
+ return (0);
+}
diff --git a/src/common/util_arg.c b/src/common/util_arg.c
new file mode 100644
index 00000000..73416cb7
--- /dev/null
+++ b/src/common/util_arg.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#if DB_VERSION_MAJOR < 4 || DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR < 5
+/*
+ * !!!
+ * We build this file in old versions of Berkeley DB when we're doing test
+ * runs using the test_micro tool. Without a prototype in place, we get
+ * warnings, and there's no simple workaround.
+ */
+char *strsep();
+#endif
+
+/*
+ * __db_util_arg --
+ * Convert a string into an argc/argv pair.
+ *
+ * PUBLIC: int __db_util_arg __P((char *, char *, int *, char ***));
+ */
+int
+__db_util_arg(arg0, str, argcp, argvp)
+ char *arg0, *str, ***argvp;
+ int *argcp;
+{
+ int n, ret;
+ char **ap, **argv;
+
+#define MAXARGS 25
+ if ((ret =
+ __os_malloc(NULL, (MAXARGS + 1) * sizeof(char **), &argv)) != 0)
+ return (ret);
+
+ ap = argv;
+ *ap++ = arg0;
+ for (n = 1; (*ap = strsep(&str, " \t")) != NULL;)
+ if (**ap != '\0') {
+ ++ap;
+ if (++n == MAXARGS)
+ break;
+ }
+ *ap = NULL;
+
+ *argcp = (int)(ap - argv);
+ *argvp = argv;
+
+ return (0);
+}
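+
+/*
+ * Worked example (not part of the library source): given arg0 = "prog"
+ * and str = "load  -h home", the strsep() loop splits on blanks and
+ * tabs, skips the empty token from the doubled space, and returns
+ * *argcp = 4 with argv = { "prog", "load", "-h", "home", NULL }.  Note
+ * that str itself is modified in place.
+ */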
diff --git a/src/common/util_cache.c b/src/common/util_cache.c
new file mode 100644
index 00000000..1206940b
--- /dev/null
+++ b/src/common/util_cache.c
@@ -0,0 +1,47 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_util_cache --
+ * Compute if we have enough cache.
+ *
+ * PUBLIC: int __db_util_cache __P((DB *, u_int32_t *, int *));
+ */
+int
+__db_util_cache(dbp, cachep, resizep)
+ DB *dbp;
+ u_int32_t *cachep;
+ int *resizep;
+{
+ u_int32_t pgsize;
+ int ret;
+
+ /* Get the current page size. */
+ if ((ret = dbp->get_pagesize(dbp, &pgsize)) != 0)
+ return (ret);
+
+ /*
+	 * The current cache size is in cachep.  If it's insufficient, set
+	 * the memory referenced by resizep to 1 and set cachep to the minimum
+ * size needed.
+ *
+ * Make sure our current cache is big enough. We want at least
+ * DB_MINPAGECACHE pages in the cache.
+ */
+ if ((*cachep / pgsize) < DB_MINPAGECACHE) {
+ *resizep = 1;
+ *cachep = pgsize * DB_MINPAGECACHE;
+ } else
+ *resizep = 0;
+
+ return (0);
+}
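+
+/*
+ * Worked example (not part of the library source): with an 8KB page
+ * size, a caller passing in *cachep = 64KB has only 8 pages of cache;
+ * if DB_MINPAGECACHE is larger than that, the routine sets *resizep to
+ * 1 and bumps *cachep to DB_MINPAGECACHE 8KB pages.
+ */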
diff --git a/src/common/util_log.c b/src/common/util_log.c
new file mode 100644
index 00000000..d158d3f0
--- /dev/null
+++ b/src/common/util_log.c
@@ -0,0 +1,45 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_util_logset --
+ * Log that we're running.
+ *
+ * PUBLIC: int __db_util_logset __P((const char *, char *));
+ */
+int
+__db_util_logset(progname, fname)
+ const char *progname;
+ char *fname;
+{
+ pid_t pid;
+ FILE *fp;
+ time_t now;
+ char time_buf[CTIME_BUFLEN];
+
+ if ((fp = fopen(fname, "w")) == NULL)
+ goto err;
+
+ (void)time(&now);
+
+ __os_id(NULL, &pid, NULL);
+ fprintf(fp,
+ "%s: %lu %s", progname, (u_long)pid, __os_ctime(&now, time_buf));
+
+ if (fclose(fp) == EOF)
+ goto err;
+
+ return (0);
+
+err: fprintf(stderr, "%s: %s: %s\n", progname, fname, strerror(errno));
+ return (1);
+}
diff --git a/src/common/util_sig.c b/src/common/util_sig.c
new file mode 100644
index 00000000..02a0fcb2
--- /dev/null
+++ b/src/common/util_sig.c
@@ -0,0 +1,110 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int interrupt;
+static void set_signal __P((int, int));
+static void signal_handler __P((int));
+
+/*
+ * signal_handler --
+ * Interrupt signal handler.
+ */
+static void
+signal_handler(signo)
+ int signo;
+{
+#ifndef HAVE_SIGACTION
+ /* Assume signal() is unreliable and reset it, first thing. */
+ set_signal(signo, 0);
+#endif
+ /* Some systems don't pass in the correct signal value -- check. */
+ if ((interrupt = signo) == 0)
+ interrupt = SIGINT;
+}
+
+/*
+ * set_signal
+ */
+static void
+set_signal(s, is_dflt)
+ int s, is_dflt;
+{
+ /*
+ * Use sigaction if it's available, otherwise use signal().
+ */
+#ifdef HAVE_SIGACTION
+ struct sigaction sa, osa;
+
+ sa.sa_handler = is_dflt ? SIG_DFL : signal_handler;
+ (void)sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ (void)sigaction(s, &sa, &osa);
+#else
+ (void)signal(s, is_dflt ? SIG_DFL : signal_handler);
+#endif
+}
+
+/*
+ * __db_util_siginit --
+ *
+ * PUBLIC: void __db_util_siginit __P((void));
+ */
+void
+__db_util_siginit()
+{
+ /*
+ * Initialize the set of signals for which we want to clean up.
+ * Generally, we try not to leave the shared regions locked if
+ * we can.
+ */
+#ifdef SIGHUP
+ set_signal(SIGHUP, 0);
+#endif
+#ifdef SIGINT
+ set_signal(SIGINT, 0);
+#endif
+#ifdef SIGPIPE
+ set_signal(SIGPIPE, 0);
+#endif
+#ifdef SIGTERM
+ set_signal(SIGTERM, 0);
+#endif
+}
+
+/*
+ * __db_util_interrupted --
+ * Return if interrupted.
+ *
+ * PUBLIC: int __db_util_interrupted __P((void));
+ */
+int
+__db_util_interrupted()
+{
+ return (interrupt != 0);
+}
+
+/*
+ * __db_util_sigresend --
+ *
+ * PUBLIC: void __db_util_sigresend __P((void));
+ */
+void
+__db_util_sigresend()
+{
+ /* Resend any caught signal. */
+ if (interrupt != 0) {
+ set_signal(interrupt, 1);
+
+ (void)raise(interrupt);
+ /* NOTREACHED */
+ }
+}
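+
+/*
+ * Illustrative sketch (not part of the library source): the utilities
+ * bracket their main work with these three calls so that a caught
+ * signal is re-delivered with default disposition only after cleanup.
+ * The loop helpers are hypothetical:
+ *
+ *	__db_util_siginit();
+ *	while (have_work()) {
+ *		if (__db_util_interrupted())
+ *			break;
+ *		do_one_unit();
+ *	}
+ *	cleanup();			-- e.g., release shared regions
+ *	__db_util_sigresend();		-- re-raise with SIG_DFL
+ */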
diff --git a/src/common/zerofill.c b/src/common/zerofill.c
new file mode 100644
index 00000000..37662ddc
--- /dev/null
+++ b/src/common/zerofill.c
@@ -0,0 +1,129 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_zero_fill --
+ * Zero out bytes in the file.
+ *
+ * On some systems, pages allocated by writing past end-of-file are not
+ * zeroed.  Recovery could theoretically be fooled by a page showing up
+ * that contained garbage, so we have to write the pages out to disk and
+ * flush them.  The flush matters because, without a sync, the allocation
+ * of a later page might reach the disk first; if we crashed at the wrong
+ * moment, this page -- implicitly allocated by the write of the page
+ * beyond it -- would be left containing garbage.
+ *
+ * PUBLIC: int __db_zero_fill __P((ENV *, DB_FH *));
+ */
+int
+__db_zero_fill(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ off_t stat_offset, write_offset;
+ size_t blen, nw;
+ u_int32_t bytes, mbytes;
+ int group_sync, ret;
+ u_int8_t *bp;
+
+ /* Calculate the byte offset of the next write. */
+ write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;
+
+ /* Stat the file. */
+ if ((ret = __os_ioinfo(env, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ stat_offset = (off_t)mbytes * MEGABYTE + bytes;
+
+ /* Check if the file is large enough. */
+ if (stat_offset >= write_offset)
+ return (0);
+
+ /* Get a large buffer if we're writing lots of data. */
+#undef ZF_LARGE_WRITE
+#define ZF_LARGE_WRITE (64 * 1024)
+ if ((ret = __os_calloc(env, 1, ZF_LARGE_WRITE, &bp)) != 0)
+ return (ret);
+ blen = ZF_LARGE_WRITE;
+
+ /* Seek to the current end of the file. */
+ if ((ret = __os_seek(env, fhp, mbytes, MEGABYTE, bytes)) != 0)
+ goto err;
+
+ /*
+ * Hash is the only access method that allocates groups of pages. Hash
+ * uses the existence of the last page in a group to signify the entire
+ * group is OK; so, write all the pages but the last one in the group,
+ * flush them to disk, then write the last one to disk and flush it.
+ */
+ for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
+ if (write_offset - stat_offset <= (off_t)blen) {
+ blen = (size_t)(write_offset - stat_offset);
+ if (group_sync && (ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+ }
+ if ((ret = __os_physwrite(env, fhp, bp, blen, &nw)) != 0)
+ goto err;
+ stat_offset += blen;
+ }
+ if ((ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+
+ /* Seek back to where we started. */
+ mbytes = (u_int32_t)(write_offset / MEGABYTE);
+ bytes = (u_int32_t)(write_offset % MEGABYTE);
+ ret = __os_seek(env, fhp, mbytes, MEGABYTE, bytes);
+
+err: __os_free(env, bp);
+ return (ret);
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(fhp, NULL);
+ return (0);
+#endif /* HAVE_FILESYSTEM_NOTZERO */
+}
+
+/*
+ * __db_zero_extend --
+ * Zero to the end of the file.
+ *
+ * PUBLIC: int __db_zero_extend __P((ENV *,
+ * PUBLIC: DB_FH *, db_pgno_t, db_pgno_t, u_int32_t));
+ */
+int
+__db_zero_extend(env, fhp, pgno, last_pgno, pgsize)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno, last_pgno;
+ u_int32_t pgsize;
+{
+ int ret;
+ size_t nwrote;
+ u_int8_t *buf;
+
+	/* __os_calloc returns zeroed memory; no separate memset is needed. */
+	if ((ret = __os_calloc(env, 1, pgsize, &buf)) != 0)
+		return (ret);
+	for (; pgno <= last_pgno; pgno++) {
+		if ((ret = __os_io(env, DB_IO_WRITE,
+		    fhp, pgno, pgsize, 0, pgsize, buf, &nwrote)) != 0)
+			goto err;
+		if (nwrote != pgsize) {
+			ret = EIO;	/* A short write: fail with EIO. */
+			goto err;
+		}
+	}
+
+err: __os_free(env, buf);
+ return (ret);
+}
diff --git a/src/crypto/aes_method.c b/src/crypto/aes_method.c
new file mode 100644
index 00000000..47193539
--- /dev/null
+++ b/src/crypto/aes_method.c
@@ -0,0 +1,357 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Some parts of this code originally written by Adam Stubblefield,
+ * -- astubble@rice.edu.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+
+#ifdef HAVE_CRYPTO_IPP
+#include <ippcp.h>
+#endif
+
+static void __aes_err __P((ENV *, int));
+static int __aes_derivekeys __P((ENV *, DB_CIPHER *, u_int8_t *, size_t));
+
+/*
+ * __aes_setup --
+ * Setup AES functions.
+ *
+ * PUBLIC: int __aes_setup __P((ENV *, DB_CIPHER *));
+ */
+int
+__aes_setup(env, db_cipher)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+{
+ AES_CIPHER *aes_cipher;
+ int ret;
+#ifdef HAVE_CRYPTO_IPP
+ int ctx_size = 0;
+ IppStatus ipp_ret;
+#endif
+
+ db_cipher->adj_size = __aes_adj_size;
+ db_cipher->close = __aes_close;
+ db_cipher->decrypt = __aes_decrypt;
+ db_cipher->encrypt = __aes_encrypt;
+ db_cipher->init = __aes_init;
+ if ((ret = __os_calloc(env, 1, sizeof(AES_CIPHER), &aes_cipher)) != 0)
+ return (ret);
+#ifdef HAVE_CRYPTO_IPP
+ /*
+ * IPP AES encryption context size can only be obtained through this
+ * function call, cannot directly declare IppsRijndael128Spec within
+ * AES_CIPHER struct.
+ */
+ if ((ipp_ret = ippsRijndael128GetSize(&ctx_size)) != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+ if ((ret = __os_malloc(env, ctx_size, &aes_cipher->ipp_ctx)) != 0) {
+ __os_free(env, aes_cipher);
+ return (ret);
+ }
+#endif
+ db_cipher->data = aes_cipher;
+ return (0);
+}
+
+/*
+ * __aes_adj_size --
+ * Given a size, return an addition amount needed to meet the
+ * "chunk" needs of the algorithm.
+ *
+ * PUBLIC: u_int __aes_adj_size __P((size_t));
+ */
+u_int
+__aes_adj_size(len)
+ size_t len;
+{
+ if (len % DB_AES_CHUNK == 0)
+ return (0);
+ return (DB_AES_CHUNK - (u_int)(len % DB_AES_CHUNK));
+}
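+
+/*
+ * Worked example (not part of the library source): with DB_AES_CHUNK
+ * being the 16-byte AES block size, __aes_adj_size(100) returns 12,
+ * padding 100 bytes up to the 112-byte block boundary, and
+ * __aes_adj_size(128) returns 0.
+ */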
+
+/*
+ * __aes_close --
+ * Destroy the AES encryption instantiation.
+ *
+ * PUBLIC: int __aes_close __P((ENV *, void *));
+ */
+int
+__aes_close(env, data)
+ ENV *env;
+ void *data;
+{
+#ifdef HAVE_CRYPTO_IPP
+ AES_CIPHER *aes_cipher = (AES_CIPHER *)data;
+ __os_free(env, aes_cipher->ipp_ctx);
+#endif
+ __os_free(env, data);
+ return (0);
+}
+
+/*
+ * __aes_decrypt --
+ * Decrypt data with AES.
+ *
+ * PUBLIC: int __aes_decrypt __P((ENV *, void *, void *,
+ * PUBLIC: u_int8_t *, size_t));
+ */
+int
+__aes_decrypt(env, aes_data, iv, cipher, cipher_len)
+ ENV *env;
+ void *aes_data;
+ void *iv;
+ u_int8_t *cipher;
+ size_t cipher_len;
+{
+ AES_CIPHER *aes;
+#ifdef HAVE_CRYPTO_IPP
+ IppStatus ipp_ret;
+#else
+ cipherInstance c;
+#endif
+ int ret;
+
+ aes = (AES_CIPHER *)aes_data;
+ if (iv == NULL || cipher == NULL)
+ return (EINVAL);
+ if ((cipher_len % DB_AES_CHUNK) != 0)
+ return (EINVAL);
+
+#ifdef HAVE_CRYPTO_IPP
+ if ((ipp_ret = ippsRijndael128DecryptCBC((const Ipp8u *)cipher,
+ (Ipp8u *)cipher, cipher_len, (IppsRijndael128Spec *)aes->ipp_ctx,
+ (const Ipp8u *)iv, 0)) != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+#else
+ /*
+ * Initialize the cipher
+ */
+ if ((ret = __db_cipherInit(&c, MODE_CBC, iv)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+
+ /* Do the decryption */
+ if ((ret = __db_blockDecrypt(&c, &aes->decrypt_ki, cipher,
+ cipher_len * 8, cipher)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * __aes_encrypt --
+ * Encrypt data with AES.
+ *
+ * PUBLIC: int __aes_encrypt __P((ENV *, void *, void *,
+ * PUBLIC: u_int8_t *, size_t));
+ */
+int
+__aes_encrypt(env, aes_data, iv, data, data_len)
+ ENV *env;
+ void *aes_data;
+ void *iv;
+ u_int8_t *data;
+ size_t data_len;
+{
+ AES_CIPHER *aes;
+#ifdef HAVE_CRYPTO_IPP
+ IppStatus ipp_ret;
+#else
+ cipherInstance c;
+#endif
+ u_int32_t tmp_iv[DB_IV_BYTES/4];
+ int ret;
+
+ aes = (AES_CIPHER *)aes_data;
+ if (aes == NULL || data == NULL)
+ return (EINVAL);
+ if ((data_len % DB_AES_CHUNK) != 0)
+ return (EINVAL);
+ /*
+	 * Generate the IV here.  We build it in a temporary buffer because
+	 * the IV may be stored within the data we are encrypting, so we
+	 * copy it to the given location only after encryption is done.
+	 * We generate it inside this routine, rather than in the caller,
+	 * because encryption algorithms someone might add later may not
+	 * use IVs, and we always want one here.
+ */
+ if ((ret = __db_generate_iv(env, tmp_iv)) != 0)
+ return (ret);
+
+#ifdef HAVE_CRYPTO_IPP
+ if ((ipp_ret = ippsRijndael128EncryptCBC((const Ipp8u *)data,
+ (Ipp8u *)data, data_len, (IppsRijndael128Spec *)aes->ipp_ctx,
+ (const Ipp8u *)tmp_iv, 0)) != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+#else
+ /*
+ * Initialize the cipher
+ */
+ if ((ret = __db_cipherInit(&c, MODE_CBC, (char *)tmp_iv)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+
+ /* Do the encryption */
+ if ((ret = __db_blockEncrypt(&c, &aes->encrypt_ki, data, data_len * 8,
+ data)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+#endif
+ memcpy(iv, tmp_iv, DB_IV_BYTES);
+ return (0);
+}
+
+/*
+ * __aes_init --
+ * Initialize the AES encryption instantiation.
+ *
+ * PUBLIC: int __aes_init __P((ENV *, DB_CIPHER *));
+ */
+int
+__aes_init(env, db_cipher)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ return (__aes_derivekeys(
+ env, db_cipher, (u_int8_t *)dbenv->passwd, dbenv->passwd_len));
+}
+
+static int
+__aes_derivekeys(env, db_cipher, passwd, plen)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+ u_int8_t *passwd;
+ size_t plen;
+{
+ AES_CIPHER *aes;
+ SHA1_CTX ctx;
+#ifdef HAVE_CRYPTO_IPP
+ IppStatus ipp_ret;
+#else
+ int ret;
+#endif
+ u_int32_t temp[DB_MAC_KEY/4];
+
+ if (passwd == NULL)
+ return (EINVAL);
+
+ aes = (AES_CIPHER *)db_cipher->data;
+
+ /* Derive the crypto keys */
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Update(&ctx, (u_int8_t *)DB_ENC_MAGIC, strlen(DB_ENC_MAGIC));
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Final((u_int8_t *)temp, &ctx);
+
+#ifdef HAVE_CRYPTO_IPP
+ if ((ipp_ret = ippsRijndael128Init((const Ipp8u *)temp,
+ IppsRijndaelKey128, (IppsRijndael128Spec *)aes->ipp_ctx))
+ != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+#else
+ if ((ret = __db_makeKey(&aes->encrypt_ki, DIR_ENCRYPT,
+ DB_AES_KEYLEN, (char *)temp)) != TRUE) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+ if ((ret = __db_makeKey(&aes->decrypt_ki, DIR_DECRYPT,
+ DB_AES_KEYLEN, (char *)temp)) != TRUE) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * __aes_err --
+ * Handle AES-specific errors. Codes and messages derived from
+ * rijndael/rijndael-api-fst.h.
+ */
+static void
+__aes_err(env, err)
+ ENV *env;
+ int err;
+{
+ char *errstr;
+
+ switch (err) {
+#ifdef HAVE_CRYPTO_IPP
+ case ippStsNullPtrErr:
+ errstr = DB_STR("0182", "IPP AES NULL pointer error");
+ break;
+ case ippStsLengthErr:
+ errstr = DB_STR("0183", "IPP AES length error");
+ break;
+ case ippStsContextMatchErr:
+ errstr = DB_STR("0184",
+ "IPP AES context does not match operation");
+ break;
+ case ippStsUnderRunErr:
+ errstr = DB_STR("0185", "IPP AES srclen size error");
+ break;
+#else
+ case BAD_KEY_DIR:
+ errstr = DB_STR("0186", "AES key direction is invalid");
+ break;
+ case BAD_KEY_MAT:
+ errstr = DB_STR("0187",
+ "AES key material not of correct length");
+ break;
+ case BAD_KEY_INSTANCE:
+ errstr = DB_STR("0188", "AES key passwd not valid");
+ break;
+ case BAD_CIPHER_MODE:
+ errstr = DB_STR("0189",
+ "AES cipher in wrong state (not initialized)");
+ break;
+ case BAD_BLOCK_LENGTH:
+ errstr = DB_STR("0190", "AES bad block length");
+ break;
+ case BAD_CIPHER_INSTANCE:
+ errstr = DB_STR("0191", "AES cipher instance is invalid");
+ break;
+ case BAD_DATA:
+ errstr = DB_STR("0192", "AES data contents are invalid");
+ break;
+ case BAD_OTHER:
+ errstr = DB_STR("0193", "AES unknown error");
+ break;
+#endif
+ default:
+ errstr = DB_STR("0194", "AES error unrecognized");
+ break;
+ }
+ __db_errx(env, "%s", errstr);
+ return;
+}
diff --git a/src/crypto/crypto.c b/src/crypto/crypto.c
new file mode 100644
index 00000000..b731496f
--- /dev/null
+++ b/src/crypto/crypto.c
@@ -0,0 +1,411 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Some parts of this code originally written by Adam Stubblefield
+ * -- astubble@rice.edu
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/crypto.h"
+
+/*
+ * __crypto_region_init --
+ * Initialize crypto.
+ */
+int
+__crypto_region_init(env)
+ ENV *env;
+{
+ CIPHER *cipher;
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
+ REGENV *renv;
+ REGINFO *infop;
+ char *sh_passwd;
+ int ret;
+
+ dbenv = env->dbenv;
+ infop = env->reginfo;
+ renv = infop->primary;
+ db_cipher = env->crypto_handle;
+ ret = 0;
+
+ if (renv->cipher_off == INVALID_ROFF) {
+ if (!CRYPTO_ON(env))
+ return (0);
+ if (!F_ISSET(infop, REGION_CREATE)) {
+ __db_errx(env, DB_STR("0172",
+ "Joining non-encrypted environment with encryption key"));
+ return (EINVAL);
+ }
+ if (F_ISSET(db_cipher, CIPHER_ANY)) {
+ __db_errx(env, DB_STR("0173",
+ "Encryption algorithm not supplied"));
+ return (EINVAL);
+ }
+ /*
+ * Must create the shared information. We need: Shared cipher
+ * information that contains the passwd. After we copy the
+ * passwd, we smash and free the one in the env.
+ */
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if ((ret = __env_alloc(infop, sizeof(CIPHER), &cipher)) != 0) {
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+ }
+ memset(cipher, 0, sizeof(*cipher));
+ if ((ret =
+ __env_alloc(infop, dbenv->passwd_len, &sh_passwd)) != 0) {
+ __env_alloc_free(infop, cipher);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ memset(sh_passwd, 0, dbenv->passwd_len);
+ cipher->passwd = R_OFFSET(infop, sh_passwd);
+ cipher->passwd_len = dbenv->passwd_len;
+ cipher->flags = db_cipher->alg;
+ memcpy(sh_passwd, dbenv->passwd, cipher->passwd_len);
+ renv->cipher_off = R_OFFSET(infop, cipher);
+ } else {
+ if (!CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0174",
+ "Encrypted environment: no encryption key supplied"));
+ return (EINVAL);
+ }
+ cipher = R_ADDR(infop, renv->cipher_off);
+ sh_passwd = R_ADDR(infop, cipher->passwd);
+ if ((cipher->passwd_len != dbenv->passwd_len) ||
+ memcmp(dbenv->passwd, sh_passwd, cipher->passwd_len) != 0) {
+ __db_errx(env, DB_STR("0175", "Invalid password"));
+ return (EPERM);
+ }
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ db_cipher->alg != cipher->flags) {
+ __db_errx(env, DB_STR("0176",
+ "Environment encrypted using a different algorithm"));
+ return (EINVAL);
+ }
+ if (F_ISSET(db_cipher, CIPHER_ANY))
+ /*
+ * We have CIPHER_ANY and we are joining the existing
+ * env. Setup our cipher structure for whatever
+ * algorithm this env has.
+ */
+ if ((ret = __crypto_algsetup(env, db_cipher,
+ cipher->flags, 0)) != 0)
+ return (ret);
+ }
+ ret = db_cipher->init(env, db_cipher);
+
+ /*
+ * On success, no matter if we allocated it or are using the already
+ * existing one, we are done with the passwd in the env. We smash
+ * N-1 bytes so that we don't overwrite the nul.
+ */
+ memset(dbenv->passwd, 0xff, dbenv->passwd_len-1);
+ __os_free(env, dbenv->passwd);
+ dbenv->passwd = NULL;
+ dbenv->passwd_len = 0;
+
+ return (ret);
+}
+
+/*
+ * __crypto_env_close --
+ * Crypto-specific destruction of ENV structure.
+ *
+ * PUBLIC: int __crypto_env_close __P((ENV *));
+ */
+int
+__crypto_env_close(env)
+ ENV *env;
+{
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ if (dbenv->passwd != NULL) {
+ memset(dbenv->passwd, 0xff, dbenv->passwd_len-1);
+ __os_free(env, dbenv->passwd);
+ dbenv->passwd = NULL;
+ }
+
+ if (!CRYPTO_ON(env))
+ return (0);
+
+ ret = 0;
+ db_cipher = env->crypto_handle;
+ if (!F_ISSET(db_cipher, CIPHER_ANY))
+ ret = db_cipher->close(env, db_cipher->data);
+ __os_free(env, db_cipher);
+
+ env->crypto_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __crypto_env_refresh --
+ *	Clean up after the crypto system on a close or failed open.
+ *
+ * PUBLIC: int __crypto_env_refresh __P((ENV *));
+ */
+int
+__crypto_env_refresh(env)
+ ENV *env;
+{
+ CIPHER *cipher;
+ REGENV *renv;
+ REGINFO *infop;
+
+ /*
+	 * If a private region, return the memory to the heap.  This is not
+	 * needed for filesystem-backed or system shared memory regions; that
+	 * memory isn't owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ if (renv->cipher_off != INVALID_ROFF) {
+ cipher = R_ADDR(infop, renv->cipher_off);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop, R_ADDR(infop, cipher->passwd));
+ __env_alloc_free(infop, cipher);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __crypto_algsetup --
+ * Given a db_cipher structure and a valid algorithm flag, call
+ * the specific algorithm setup function.
+ *
+ * PUBLIC: int __crypto_algsetup __P((ENV *, DB_CIPHER *, u_int32_t, int));
+ */
+int
+__crypto_algsetup(env, db_cipher, alg, do_init)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+ u_int32_t alg;
+ int do_init;
+{
+ int ret;
+
+ ret = 0;
+ if (!CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0177",
+ "No cipher structure given"));
+ return (EINVAL);
+ }
+ F_CLR(db_cipher, CIPHER_ANY);
+ switch (alg) {
+ case CIPHER_AES:
+ db_cipher->alg = CIPHER_AES;
+ ret = __aes_setup(env, db_cipher);
+ break;
+ default:
+ ret = __env_panic(env, EINVAL);
+ break;
+ }
+ if (ret == 0 && do_init)
+ ret = db_cipher->init(env, db_cipher);
+ return (ret);
+}
+
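+/*
+ * Illustrative sketch only: how a caller that already knows the
+ * algorithm would use the routine above.  "db_cipher" is assumed to be
+ * the handle allocated by DB_ENV->set_encrypt(); passing do_init == 1
+ * both installs the AES functions and derives the keys.
+ *
+ *	if ((ret = __crypto_algsetup(env, db_cipher, CIPHER_AES, 1)) != 0)
+ *		return (ret);
+ */
+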
+/*
+ * __crypto_decrypt_meta --
+ * Perform decryption on a metapage if needed.
+ *
+ * PUBLIC: int __crypto_decrypt_meta __P((ENV *, DB *, u_int8_t *, int));
+ */
+int
+__crypto_decrypt_meta(env, dbp, mbuf, do_metachk)
+ ENV *env;
+ DB *dbp;
+ u_int8_t *mbuf;
+ int do_metachk;
+{
+ DB dummydb;
+ DBMETA *meta;
+ DB_CIPHER *db_cipher;
+ size_t pg_off;
+ int ret;
+ u_int8_t *iv;
+
+ /*
+ * If we weren't given a dbp, we just want to decrypt the page on
+ * behalf of some internal subsystem, not on behalf of a user with
+ * a dbp. Therefore, set up a dummy dbp so that the call to
+ * P_OVERHEAD below works.
+ */
+ if (dbp == NULL) {
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ }
+
+ ret = 0;
+ meta = (DBMETA *)mbuf;
+
+ /*
+ * !!!
+ * We used an "unused" field in the meta-data page to flag whether or
+ * not the database is encrypted. Unfortunately, that unused field
+ * was used in Berkeley DB releases before 3.0 (for example, 2.7.7).
+	 * It would have been OK, except that encryption doesn't follow the
+	 * usual rule of "upgrade before doing anything else": we check
+	 * encryption before checking for old versions of the database.
+ *
+ * We don't have to check Btree databases -- before 3.0, the field of
+ * interest was the bt_maxkey field (which was never supported and has
+ * since been removed).
+ *
+ * Ugly check to jump out if this format is older than what we support.
+ * This works because we do not encrypt the page header.
+ */
+ if (meta->magic == DB_HASHMAGIC && meta->version <= 5)
+ return (0);
+
+ /*
+ * Meta-pages may be encrypted for DBMETASIZE bytes. If we have a
+ * non-zero IV (that is written after encryption) then we decrypt (or
+ * error if the user isn't set up for security). We guarantee that
+ * the IV space on non-encrypted pages will be zero and a zero-IV is
+ * illegal for encryption. Therefore any non-zero IV means an
+ * encrypted database. This basically checks the passwd on the file
+ * if we cannot find a good magic number. We walk through all the
+ * algorithms we know about attempting to decrypt (and possibly
+ * byteswap).
+ *
+ * !!!
+	 * All access-method meta pages have the IV and checksum at the
+	 * exact same location; since those fields are not in DBMETA, we
+	 * use BTMETA to address them.
+ */
+ if (meta->encrypt_alg != 0) {
+ db_cipher = env->crypto_handle;
+ if (!F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ if (!CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0178",
+ "Encrypted database: no encryption flag specified"));
+ return (EINVAL);
+ }
+ /*
+			 * The user has a correct, secure env and has
+			 * encountered a secure database in that env, but
+			 * didn't call dbp->set_flags.  Since the database
+			 * already exists, use encryption if it is already
+			 * set up that way.
+ */
+ F_SET(dbp, DB_AM_ENCRYPT|DB_AM_CHKSUM);
+ }
+ /*
+ * This was checked in set_flags when DB_AM_ENCRYPT was set.
+ * So it better still be true here.
+ */
+ DB_ASSERT(env, CRYPTO_ON(env));
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ meta->encrypt_alg != db_cipher->alg) {
+ __db_errx(env, DB_STR("0179",
+ "Database encrypted using a different algorithm"));
+ return (EINVAL);
+ }
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
+ iv = ((BTMETA *)mbuf)->iv;
+ /*
+ * For ALL pages, we do not encrypt the beginning of the page
+ * that contains overhead information. This is true of meta
+ * and all other pages.
+ */
+ pg_off = P_OVERHEAD(dbp);
+alg_retry:
+ /*
+ * If they asked for a specific algorithm, then
+ * use it. Otherwise walk through those we know.
+ */
+ if (!F_ISSET(db_cipher, CIPHER_ANY)) {
+ if (do_metachk && (ret = db_cipher->decrypt(env,
+ db_cipher->data, iv, mbuf + pg_off,
+ DBMETASIZE - pg_off)))
+ return (ret);
+ if (((BTMETA *)meta)->crypto_magic !=
+ meta->magic) {
+ __db_errx(env, DB_STR("0180",
+ "Invalid password"));
+ return (EINVAL);
+ }
+ /*
+ * Success here. The algorithm asked for and the one
+ * on the file match. We've just decrypted the meta
+ * page and checked the magic numbers. They match,
+ * indicating the password is right. All is right
+ * with the world.
+ */
+ return (0);
+ }
+ /*
+ * If we get here, CIPHER_ANY must be set.
+ */
+		if ((ret = __crypto_algsetup(env,
+		    db_cipher, meta->encrypt_alg, 1)) != 0)
+			return (ret);
+		goto alg_retry;
+ } else if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ /*
+ * They gave us a passwd, but the database is not encrypted.
+ * This is an error. We do NOT want to silently allow them
+ * to write data in the clear when the user set up and expects
+ * encrypted data.
+ *
+ * This covers at least the following scenario.
+ * 1. User creates and sets up an encrypted database.
+ * 2. Attacker cannot read the actual data in the database
+ * because it is encrypted, but can remove/replace the file
+ * with an empty, unencrypted database file.
+ * 3. User sets encryption and we get to this code now.
+ * If we allowed the file to be used in the clear since
+ * it is that way on disk, the user would unsuspectingly
+ * write sensitive data in the clear.
+ * 4. Attacker reads data that user thought was encrypted.
+ *
+ * Therefore, asking for encryption with a database that
+ * was not encrypted is an error.
+ */
+ __db_errx(env, DB_STR("0181",
+ "Unencrypted database with a supplied encryption key"));
+ return (EINVAL);
+ }
+ return (ret);
+}
+
+/*
+ * __crypto_set_passwd --
+ * Get the password from the shared region; and set it in a new
+ * environment handle. Use this to duplicate environment handles.
+ *
+ * PUBLIC: int __crypto_set_passwd __P((ENV *, ENV *));
+ */
+int
+__crypto_set_passwd(env_src, env_dest)
+ ENV *env_src, *env_dest;
+{
+ CIPHER *cipher;
+ REGENV *renv;
+ REGINFO *infop;
+ char *sh_passwd;
+
+ infop = env_src->reginfo;
+ renv = infop->primary;
+
+ DB_ASSERT(env_src, CRYPTO_ON(env_src));
+
+ cipher = R_ADDR(infop, renv->cipher_off);
+ sh_passwd = R_ADDR(infop, cipher->passwd);
+ return (__env_set_encrypt(env_dest->dbenv, sh_passwd, DB_ENCRYPT_AES));
+}
diff --git a/src/crypto/crypto.html b/src/crypto/crypto.html
new file mode 100644
index 00000000..1a2dc0c1
--- /dev/null
+++ b/src/crypto/crypto.html
@@ -0,0 +1,638 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+ <meta name="GENERATOR" content="Mozilla/4.76 [en] (X11; U; FreeBSD 4.3-RELEASE i386) [Netscape]">
+</head>
+<body>
+
+<center>
+<h1>
+&nbsp;Security Interface for Berkeley DB</h1></center>
+
+<center><i>Susan LoVerso</i>
+<br><i>Rev 1.6</i>
+<br><i>2002 Feb 26</i></center>
+
+<p>We provide an interface allowing secure access to Berkeley DB.&nbsp;&nbsp;
+Our goal is to allow users to have encrypted secure databases.&nbsp; In
+this document, the term <i>ciphering</i> means the act of encryption or
+decryption.&nbsp; They are equal but opposite actions, and the same issues
+apply to both, just in the opposite direction.
+<h3>
+Requirements</h3>
+The overriding requirement is to provide a simple mechanism to allow users
+to have a secure database.&nbsp; A secure database means that all of the
+pages of a database will be encrypted, and all of the log files will be
+encrypted.
+<p>Falling out from this work will be a simple mechanism to allow users
+to request that we checksum their data for additional error detection (without
+encryption/decryption).
+<p>We expect that data in process memory or stored in shared memory, potentially
+backed by disk, is not encrypted or secure.
+<h2>
+<a NAME="DB Modifications"></a>DB Method Interface Modifications</h2>
+With a logging environment, all database changes are recorded in the log
+files.&nbsp; Therefore, users requiring secure databases in such environments
+also require secure log files.
+<p>A prior thought had been to allow different passwords on the environment
+and the databases within.&nbsp; However, such a scheme then requires that
+the password be logged in order for recovery to be able to restore the
+database.&nbsp; Therefore, any application having the password for the
+log could get the password for any databases by reading the log.&nbsp;
+So having a different password on a database does not gain any additional
+security and it makes certain things harder and more complex.&nbsp; Some
+of those more complex things include the need to handle database and env
+passwords differently since they'd need to be stored and accessed from
+different places.&nbsp; Also resolving the issue of how <i>db_checkpoint</i>
+or <i>db_sync</i>, which flush database pages to disk, would find the passwords
+of various databases without any dbps was unsolved.&nbsp; The feature didn't
+gain anything and caused significant pain.&nbsp; Therefore the decision
+is that there will be a single password protecting an environment and all
+the logs and some databases within that environment.&nbsp; We do allow
+users to have a secure environment and clear databases.&nbsp; Users that
+want secure databases within a secure environment must set a flag.
+<p>Users wishing to enable encryption on a database in a secure environment
+or enable just checksumming on their database pages will use new flags
+to <a href="../docs/api_c/db_set_flags.html">DB->set_flags()</a>.&nbsp;
+Providing ciphering over an entire environment is accomplished by adding
+a single environment method: <a href="../docs/api_c/env_set_encrypt.html">DBENV->set_encrypt()</a>.&nbsp;
+Providing encryption for a database (not part of an environment) is accomplished
+by adding a new database method: <a href="../docs/api_c/db_set_encrypt.html">DB->set_encrypt()</a>.
+<p>Both of the <i>set_encrypt</i> methods must be called before their respective
+<i>open</i> calls.&nbsp; The environment method must be before the environment
+open because we must know about security before there is any possibility
+of writing any log records out.&nbsp; The database method must be before
+the database open in order to read the root page.&nbsp; The planned interfaces
+for these methods are:
+<pre>DBENV->set_encrypt(DBENV *dbenv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* DB_ENV structure */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; char *passwd&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int32_t flags);&nbsp;&nbsp;&nbsp;&nbsp; /* Flags */</pre>
+
+<pre>DB->set_encrypt(DB *dbp,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* DB structure */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; char *passwd&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int32_t flags);&nbsp;&nbsp;&nbsp;&nbsp; /* Flags */</pre>
+The flags accepted by these functions are:
+<pre>#define DB_ENCRYPT_AES&nbsp; 0x00000001&nbsp; /* Use the AES encryption algorithm */</pre>
+Passwords are nul-terminated strings.&nbsp; NULL or zero-length strings
+are illegal.&nbsp; These flags enable the checksumming and encryption using
+the particular algorithms we have chosen for this implementation.&nbsp;
+The flags are named such that there is a logical naming pattern if additional
+checksum or encryption algorithms are used. If a user gives a flag of zero,
+it will behave in a manner similar to DB_UNKNOWN. It will be illegal if
+they are creating the environment or database, as an algorithm must be
+specified. If they are joining an existing environment or opening an existing
+database, they will use whatever algorithm is in force at the time.&nbsp;
+Using DB_ENCRYPT_AES automatically implies SHA1 checksumming.
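+<p>For example, a minimal setup of a secure environment might look like
+the following sketch (error handling omitted; the home path and password
+are illustrative):
+<pre>DB_ENV *dbenv;
+
+db_env_create(&amp;dbenv, 0);
+dbenv->set_encrypt(dbenv, "my_passwd", DB_ENCRYPT_AES);
+dbenv->open(dbenv, "/env/home", DB_CREATE | DB_INIT_MPOOL | DB_INIT_LOG, 0);</pre>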
+<p>These functions will perform several initialization steps.&nbsp; We
+will allocate crypto_handle for our env handle and set up our function
+pointers.&nbsp; We will allocate space and copy the password into our env
+handle password area.&nbsp; Similar to <i>DB->set_cachesize</i>, calling
+<i>DB->set_encrypt</i>
+will actually reflect back into the local environment created by DB.
+<p>Lastly, we will add a new flag, DB_OVERWRITE, to the <a href="../docs/api_c/env_remove.html">DBENV->remove</a>
+method.&nbsp; The purpose of this flag is to force all of the memory used
+by the shared regions to be overwritten before removal.&nbsp; We will use
+<i>rm_overwrite</i>,
+a function that overwrites and syncs a file 3 times with varying bit patterns
+to really remove a file.&nbsp; Additionally, this flag will force a sync
+of the overwritten regions to disk, if the regions are backed by the file
+system.&nbsp; That way there is no residual information left in the clear
+in memory or freed disk blocks.&nbsp; Although we expect that this flag
+will be used primarily by customers using security, its action does not
+depend on passwords or a secure setup, and so it can be used by anyone.
+<h4>
+Initialization of the Environment</h4>
+The setup of the security subsystem will be similar to replication initialization
+since it is a sort of subsystem, but it does not have its own region.&nbsp;
+When the environment handle is created via <i>db_env_create</i>, we initialize
+our <i>set_encrypt</i> method to be the RPC or local version.&nbsp; Therefore
+the <i>DB_ENV</i> structure needs a new pointer:
+<pre>&nbsp;&nbsp;&nbsp; void&nbsp;&nbsp;&nbsp; *crypto_handle;&nbsp;&nbsp; /* Security handle */</pre>
+The crypto handle will really point to a new <i>__db_cipher</i> structure
+that will contain a set of functions and a pointer to the in-memory information
+needed by the specific encryption algorithm.&nbsp; It will look like:
+<pre>typedef struct __db_cipher {
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; (*init)__P((...));&nbsp;&nbsp;&nbsp; /* Alg-specific initialization function */
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; (*encrypt)__P((...)); /* Alg-specific encryption algorithm */
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; (*decrypt)__P((...)); /* Alg-specific decryption function */
+&nbsp;&nbsp;&nbsp; void&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *data;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Pointer to alg-specific information (AES_CIPHER) */
+&nbsp;&nbsp;&nbsp; u_int32_t flags;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Cipher flags */
+} DB_CIPHER;</pre>
+
+<pre>#define DB_MAC_KEY&nbsp;&nbsp;&nbsp; 20&nbsp;&nbsp;&nbsp; /* Size of the MAC key */
+typedef struct __aes_cipher {
+&nbsp;&nbsp;&nbsp; keyInstance&nbsp;&nbsp;&nbsp; encrypt_ki;&nbsp;&nbsp; /* Encrypt keyInstance temp. */
+&nbsp;&nbsp;&nbsp; keyInstance&nbsp;&nbsp;&nbsp; decrypt_ki;&nbsp;&nbsp; /* Decrypt keyInstance temp. */
+&nbsp;&nbsp;&nbsp; u_int8_t&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; mac_key[DB_MAC_KEY]; /* MAC key */
+&nbsp;&nbsp;&nbsp; u_int32_t&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; flags;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* AES-specific flags */
+} AES_CIPHER;</pre>
+It should be noted that none of these structures have their own mutex.&nbsp;
+We hold the environment region locked while we are creating this, but once
+this is set up, it is read-only forever.
+<p>During <a href="../docs/api_c/env_set_encrypt.html">dbenv->set_encrypt</a>,
+we set the encryption, decryption and checksumming methods to the appropriate
+functions based on the flags.&nbsp; This function will allocate us a crypto
+handle that we store in the <i>DB_ENV</i> structure just like all the
+other subsystems.&nbsp; For now, only AES ciphering functions and SHA1
+checksumming functions are supported.&nbsp; Also we will copy the password
+into the <i>DB_ENV</i> structure.&nbsp; We ultimately need to keep the
+password in the environment's shared memory region or compare this one
+against the one that is there, if we are joining an existing environment,
+but we do not have it yet because open has not yet been called.&nbsp; We
+will allocate a structure that will be used in initialization and set up
+the function pointers to point to the algorithm-specific functions.
+<p>In the&nbsp; <i>__env_open</i> path, in <i>__db_e_attach</i>, if we
+are creating the region and the <i>dbenv->passwd</i> field is set, we need
+to use the length of the password in the initial computation of the environment's
+size.&nbsp; This guarantees sufficient space for storing the password in
+shared memory.&nbsp; Then we will call a new function to initialize the
+security region, <i>__crypto_region_init</i> in <i>__env_open</i>.&nbsp;
+If we are the creator, we will allocate space in the shared region to store
+the password and copy the password into that space.&nbsp; Or, if we are
+not the creator we will compare the password stored in the dbenv with the
+one in shared memory.&nbsp;&nbsp; Additionally, we will compare the ciphering
+algorithm to the one stored in the shared region.&nbsp; If either does not
+match, we return an error; in any case we'll smash the dbenv password and
+free it.&nbsp;
+If we are the creator we store the offset into the REGENV structure.&nbsp;
+Then <i>__crypto_region_init&nbsp;</i> will call the initialization function
+set up earlier based on the ciphering algorithm specified.&nbsp; For now
+we will call <i>__aes_init</i>.&nbsp; Additionally this function will allocate
+and set up the per-process state vector for this encryption's IVs.&nbsp;
+See <a href="#Generating the Initialization Vector">Generating the Initialization
+Vector</a> for a detailed description of the IV and state vector.
+<p>In the AES-specific initialization function, <i>__aes_init</i>,&nbsp;
+we will initialize it by calling
+<i>__aes_derivekeys</i> in order to fill
+in the keyInstance and mac_key fields in that structure.&nbsp; The REGENV
+structure will have one additional item
+<pre>&nbsp;&nbsp; roff_t&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; passwd_off;&nbsp;&nbsp; /* Offset of passwd */</pre>
+
+<h4>
+Initializing a Database</h4>
+During <a href="../docs/api_c/db_set_encrypt.html">db->set_encrypt</a>,
+we set the encryption, decryption and checksumming methods to the appropriate
+functions based on the flags.&nbsp; Basically, we test that we are not
+in an existing environment and we haven't called open.&nbsp; Then we just
+call through the environment handle to set the password.
+<p>Also, we will need to add a flag in the database meta-data page that
+indicates that the database is encrypted and what its algorithm is.&nbsp;
+This will be used when the meta-page is read after reopening a file. We
+need this information on the meta-page in order to detect a user opening
+a secure database without a password.&nbsp; I propose using the first unused1
+byte (renaming it too) in the meta page for this purpose.
+<p>The first 64 bytes of every page will not be encrypted.&nbsp;
+Database meta-pages will be encrypted on the first 512 bytes only.&nbsp;
+All meta-page types will have an IV, a checksum and a crypto magic number
+added within those first 512 bytes.&nbsp; This will expand the
+size of the meta-page from 256 bytes to 512 bytes. The page in/out routines,
+<i>__db_pgin</i> and <i>__db_pgout</i>, know the page type and
+will apply the 512-byte ciphering to meta pages.&nbsp; In <i>__db_pgout</i>,
+if we have a crypto handle in our (private) environment, we will apply
+ciphering to either the entire page, or the first 512 bytes if it is a
+meta-page.&nbsp; In <i>__db_pgin</i>, we will decrypt the page if we have
+a crypto handle.
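+<p>A sketch of the page-out ciphering decision (IS_META here is an
+illustrative stand-in, not a real macro; the real code also handles the
+16-byte alignment):
+<pre>pg_off = P_OVERHEAD(dbp);          /* never cipher the page overhead */
+pg_len = IS_META(pagep) ? 512 : dbp->pgsize;
+ret = db_cipher->encrypt(env, db_cipher->data, iv,
+    (u_int8_t *)pagep + pg_off, pg_len - pg_off);</pre>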
+<p>When multiple processes share a database, all must use the same password
+as the database creator. Using an existing database requires several conditions
+to be true.&nbsp; First, if the creator of the database did not create
+with security, then opening later with security is an error.&nbsp; Second,
+if the creator did create it with security, then opening later without
+security is an error.&nbsp; Third, we need to be able to test and check
+that when another process opens a secure database, the password it
+provides is the same as the one in use by the creator.
+<p>When reading the meta-page, in <i>__db_file_setup</i>, we do not go
+through the paging functions, but directly read via <i>__os_read</i>.&nbsp;
+It is at this point that we will determine if the user is configured correctly.&nbsp;
+If the meta-page we read has an IV and checksum, they better have a crypto
+handle.&nbsp; If they have a crypto handle, then the meta-page must have
+an IV and checksum.&nbsp; If both of those are true, we test the password.&nbsp;
+We compare the unencrypted magic number to the newly-decrypted crypto magic
+number and if they are not the same, then we report that the user gave
+us a bad password.
+<p>On a mostly unrelated topic, even when we go to very large pagesizes,
+the meta information will still be within a disk sector.&nbsp; So, after
+talking it over with Keith and Margo, we determined that unencrypted meta-pages
+still will not need a checksum.
+<h3>
+Encryption and Checksum Routines</h3>
+These routines are provided to us by Adam Stubblefield at Rice University
+(astubble@rice.edu).&nbsp; The functional interfaces are:
+<pre>__aes_derivekeys(DB_ENV *dbenv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* dbenv */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *passwd,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t passwd_len,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Length of passwd */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *mac_key,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* 20 byte array to store MAC key */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *encrypt_key,&nbsp;&nbsp; /* Encryption key of passwd */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *decrypt_key);&nbsp; /* Decryption key of passwd */</pre>
+This is the only function requiring the textual user password.&nbsp; From
+the password, this function generates a key used in the checksum function,
+<i>__db_chksum</i>.&nbsp;
+It also fills in <i>keyInstance</i> structures which are then used in the
+encryption and decryption routines.&nbsp; The keyInstance structures must
+already be allocated.&nbsp; These will be stored in the AES_CIPHER structure.
+<pre>&nbsp;__db_chksum(u_int8_t *data,&nbsp;&nbsp;&nbsp; /* Data to checksum */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t data_len,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Length of data */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *mac_key,&nbsp;&nbsp;&nbsp; /* 20 byte array from __db_derive_keys */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *checksum);&nbsp; /* 20 byte array to store checksum */</pre>
+This function generates a checksum on the data given.&nbsp; This function
+will do double-duty for users that simply want error detection on their
+pages.&nbsp; When users are using encryption, the <i>mac_key </i>will contain
+the 20-byte key set up in <i>__aes_derivekeys</i>.&nbsp; If they just want
+checksumming, then <i>mac_key</i> will be NULL.&nbsp; According to Adam,
+we can safely use the first N-bytes of the checksum.&nbsp; So for seeding
+the generator for initialization vectors, we'll hash the time and then
+send in the first 4 bytes for the seed.&nbsp; I believe we can probably
+do the same thing for checksumming log records.&nbsp; We can only use 4
+bytes for the checksum in the non-secure case.&nbsp; So when we want to
+verify the log checksum we can compute the mac but just compare the first
+4 bytes to the one we read.&nbsp; All locations where we generate or check
+log record checksums that currently call <i>__ham_func4</i> will now call
+<i>__db_chksum</i>.&nbsp;
+I believe there are 5 such locations,
+<i>__log_put, __log_putr, __log_newfile,
+__log_rep_put
+</i>and<i> __txn_force_abort.</i>
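+<p>For instance, a sketch of verifying a non-secure log record checksum,
+comparing only the first 4 bytes of the computed MAC ("hdr" and its
+cksum field are illustrative names):
+<pre>u_int8_t mac[DB_MAC_KEY];
+
+__db_chksum(data, data_len, NULL, mac);   /* NULL key: checksum only */
+if (memcmp(mac, hdr.cksum, 4) != 0)
+    return (DB_RUNRECOVERY);</pre>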
+<pre>__aes_encrypt(DB_ENV *dbenv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* dbenv */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *key,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password key instance from __db_derive_keys */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *iv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Initialization vector */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *data,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Data to encrypt */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t data_len);&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Length of data to encrypt - 16 byte multiple */</pre>
+This is the function to encrypt data.&nbsp; It will be called to encrypt
+pages and log records.&nbsp; The <i>key</i> instance is initialized in
+<i>__aes_derivekeys</i>.&nbsp;
+The initialization vector, <i>iv</i>, is the 16 byte random value set up
+by the Mersenne Twister pseudo-random generator.&nbsp; Lastly, we pass
+in a pointer to the <i>data</i> to encrypt and its length in <i>data_len</i>.&nbsp;
+The <i>data_len</i> must be a multiple of 16 bytes. The encryption is done
+in-place so that when the encryption code returns our encrypted data is
+in the same location as the original data.
+<pre>__aes_decrypt(DB_ENV *dbenv,&nbsp;&nbsp;&nbsp; /* dbenv */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *key,&nbsp; /* Password key instance from __db_derive_keys */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *iv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Initialization vector */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *data,&nbsp;&nbsp;&nbsp; /* Data to decrypt */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t data_len);&nbsp; /* Length of data to decrypt - 16 byte multiple */</pre>
+This is the function to decrypt the data.&nbsp; It is exactly the same
+as the encryption function except for the action it performs.&nbsp; All
+of the args and issues are the same.&nbsp; It also decrypts in place.
+<h3>
+<a NAME="Generating the Initialization Vector"></a>Generating the Initialization
+Vector</h3>
+Internally, we need to provide a unique initialization vector (IV) of 16
+bytes every time we encrypt any data with the same password.&nbsp; For
+the IV we are planning on using mt19937, the Mersenne Twister, a random
+number generator that has a period of 2**19937-1. This package can be found
+at <a href="http://www.math.keio.ac.jp/~matumoto/emt.html">http://www.math.keio.ac.jp/~matumoto/emt.html</a>.&nbsp;
+Tests show that although it repeats a single integer every once in a while,
+after several million iterations it doesn't repeat any 4 integers
+that we'd be stuffing into our 16-byte IV.&nbsp; We plan on seeding this
+generator with the time (tv_sec) hashed through SHA1 when we create the
+environment.&nbsp; This package uses a global state vector that contains
+624 unsigned long integers.&nbsp; We do not allow a 16-byte IV of zero.&nbsp;
+It is simpler just to reject any 4-byte value of 0 and if we get one, just
+call the generator again and get a different number.&nbsp; We need to detect
+holes in files and if we read an IV of zero that is a simple indication
+that we need to check for an entire page of zero.&nbsp; The IVs are stored
+on the page after encryption and are not encrypted themselves so it is
+not possible for an entire encrypted page to be read as all zeroes, unless
+it was a hole in a file.&nbsp; See <a href="#Holes in Files">Holes in Files</a>
+for more details.
+<p>We will not be holding any locks when we need to generate our IV but
+we need to protect access to the state vector and the index.&nbsp; Calls
+to the MT code will come while encrypting some data in <i>__aes_encrypt.</i>&nbsp;&nbsp;
+The MT code will assume that all necessary locks are held in the caller.&nbsp;
+We will have per-process state vectors that are set up when a process begins.&nbsp;
+That way we minimize the contention and only multi-threaded processes need
+acquire locks for the IV.&nbsp; We will have the state vector in the environment
+handle in heap memory, as well as the index and there will be a mutex protecting
+it for threaded access.&nbsp; This will be added to the <i>DB_ENV</i>
+structure:
+<pre>&nbsp;&nbsp;&nbsp; DB_MUTEX&nbsp;&nbsp;&nbsp; *mt_mutexp;&nbsp;&nbsp; /* Mersenne Twister mutex */
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *mti;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* MT index */
+&nbsp;&nbsp;&nbsp; u_long&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *mt;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* MT state vector */</pre>
+This portion of the environment will be initialized at the end of <i>__dbenv_open</i>,
+right after we initialize the other mutex for the <i>dblist</i>. When we
+allocate the space, we will generate our initial state vector. If we are
+multi-threaded we'll allocate and initialize our mutex also.
+<p>We need to make changes to the MT code to make it work in our namespace
+and to take a pointer to the location of the state vector and
+the index.&nbsp; There will be a wrapper function <i>__db_generate_iv</i>
+that DB will call and it will call the appropriate MT function.&nbsp; I
+am also going to change the default seed to use a hashed time instead of
+a hard coded value.&nbsp; I have looked at other implementations of the
+MT code available on the web site.&nbsp; The C++ version does a hash on
+the current time.&nbsp; I will modify our MT code to seed with the hashed
+time as well.&nbsp; That way the code to seed is contained within the MT
+code and we can just write the wrapper to get an IV.&nbsp; We will not
+be changing the core computational code of MT.
+<h2>
+DB Internal Issues</h2>
+
+<h4>
+When do we Cipher?</h4>
+All of the page ciphering is done in the <i>__db_pgin/__db_pgout</i> functions.&nbsp;
+We will encrypt after the method-specific function on page-out and decrypt
+before the method-specific function on page-in.&nbsp; We do not hold any
+locks when entering these functions.&nbsp; We determine that we need to
+cipher based on the existence of the encryption flag in the dbp.
+<p>For ciphering log records, the encryption will be done as the first
+thing (or a new wrapper) in <i>__log_put.&nbsp; </i>See <a href="#Log Record Encryption">Log
+Record Encryption</a> for those details.
+<br>&nbsp;
+<h4>
+Page Changes</h4>
+The checksum and IV values will be stored prior to the first index of the
+page.&nbsp; We have a new P_INP macro that replaces use of inp[X] in the
+code. &nbsp;This macro takes a dbp as an argument and determines where
+our first index is based on whether we have DB_AM_CHKSUM and DB_AM_ENCRYPT
+set.&nbsp; If neither is set, then our first index is where it always was.
+&nbsp;If just checksumming is set, then we reserve a 4-byte checksum.&nbsp;
+If encryption is set, then we reserve 36 bytes for our checksum/IV as well
+as some space to get proper alignment to encrypt on a 16-byte boundary.
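+<p>The following sketch conveys the idea of the macro (the real macro
+also accounts for the alignment padding; SIZEOF_PAGE stands for the fixed
+page-header size):
+<pre>#define P_INP(dbp, pg)                                  \
+    ((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE +     \
+    (F_ISSET((dbp), DB_AM_ENCRYPT) ? 36 :               \
+    (F_ISSET((dbp), DB_AM_CHKSUM) ? 4 : 0))))</pre>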
+<p>Since several paging macros use inp[X] in them, those macros must now
+take a dbp.&nbsp; There are a lot of changes to make all the necessary
+paging macros take a dbp, although these changes are trivial in nature.
+<p>Also, there is a new function <i>__db_chk_meta</i> to perform checksumming
+and decryption checking on meta pages specifically.&nbsp; This function
+is where we check that the database algorithm matches what the user gave
+(or if they set DB_CIPHER_ANY then we set it), and where we perform other
+encryption-related tests for bad combinations of what is in the file
+versus what is in the user structures.
+<h4>
+Verification</h4>
+The verification code will also need to be updated to deal with secure
+pages.&nbsp; Basically when the verification code reads in the meta page
+it will call <i>__db_chk_meta</i> to perform any checksumming and decryption.
+<h4>
+<a NAME="Holes in Files"></a>Holes in Files</h4>
+Holes in files will be dealt with rather simply.&nbsp; We need to be able
+to distinguish reading a hole in a file from an encrypted page that happened
+to encrypt to all zero's.&nbsp; If we read a hole in a file, we do not
+want to send that empty page through the decryption routine.&nbsp; This
+can be determined simply without incurring the performance penalty of comparing
+every byte on a page on every read until we get a non-zero byte.
+<br>When we read a hole, the page handed to <i>__db_pgin</i> has the page
+type P_INVALID.&nbsp; So, if the page type, which is always unencrypted, is
+P_INVALID, then we do not perform any checksum verification or decryption.
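+<p>In code form, the check is simply (sketch):
+<pre>if (TYPE(pagep) == P_INVALID)      /* hole in the file */
+    return (0);                    /* no checksum check, no decryption */</pre>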
+<h4>
+Errors and Recovery</h4>
+Dealing with a checksum error is tricky.&nbsp; Ultimately, if a checksum
+error occurs it is extremely likely that the user must do catastrophic
+recovery.&nbsp; There is no other failure return other than&nbsp; DB_RUNRECOVERY
+for indicating that the user should run catastrophic recovery.&nbsp; We
+do not want to add a new error return for applications to check because
+a lot of applications already look for and deal with DB_RUNRECOVERY as
+an error condition and we want to fit ourselves into that application model.&nbsp;
+We already indicate to the user that when they get that error, then they
+need to run recovery.&nbsp; If recovery fails, then they need to run catastrophic
+recovery.&nbsp; We need to get ourselves to the point where users will
+run catastrophic recovery.
+<p>If we get a checksum error, then we need to log a message stating a
+checksum error occurred on page N.&nbsp; In <i>__db_pgin</i>, we can check
+if logging is on in the environment.&nbsp; If so, we want to log the message.
+<p>When the application gets the DB_RUNRECOVERY error, they'll have to
+shut down their application and run recovery.&nbsp; When the recovery encounters
+the record indicating checksum failure, then normal recovery will fail
+and the user will have to perform catastrophic recovery.&nbsp; When catastrophic
+recovery encounters that record, it will simply ignore it.
+<h4>
+<a NAME="Log Record Encryption"></a>Log Record Encryption</h4>
+Log records will be ciphered.&nbsp; It might make sense to wrap <i>__log_put</i>
+to encrypt the DBT we send down.&nbsp; The <i>__log_put </i>function is
+where the checksum is computed before acquiring the region lock.&nbsp;
+But also this function is where we call <i>__rep_send_message</i> to send
+the DBT to the replication clients.&nbsp; Therefore, we need the DBT to
+be encrypted prior to there.&nbsp; We also need it encrypted before checksumming.
+I think <i>__log_put </i>will become <i>__log_put_internal</i>, and the
+new <i>__log_put</i> will encrypt if needed and then call <i>__log_put_internal
+</i>(the
+function formerly known as <i>__log_put</i>).&nbsp; Log records are kept
+in a shared memory region buffer prior to going out to disk.&nbsp; Records
+in the buffer will be encrypted.&nbsp; No locks are held at the time we
+will need to encrypt.
+<p>On reading the log, via log cursors, the log code stores log records
+in the log buffer.&nbsp; Records in that buffer will be encrypted, so decryption
+will occur no matter whether we are returning records from the buffer or
+if we are returning log records directly from the disk. Current checksum
+checking is done in
+<i>__logc_get_int.</i>&nbsp; Decryption will be done
+after the checksum is checked.
+<p>There are currently two nasty issues with encrypted log records.&nbsp;
+The first is that <i>__txn_force_abort</i> overwrites a commit record in
+the log buffer with an abort record.&nbsp; Well, our log buffer will be
+encrypted.&nbsp; Therefore, <i>__txn_force_abort</i> is going to need to
+do encryption of its new record.&nbsp; This can be accomplished by sending
+in the dbenv handle to the function.&nbsp; It is available to us in <i>__log_flush_commit</i>
+and we can just pass it in.&nbsp; I don't like putting log encryption in
+the txn code, but the layering violation is already there.
+<p>The second issue is that the encryption code requires data that is a
+multiple of 16 bytes and log record lengths are variable.&nbsp; We will
+need to pad log records to meet the requirement.&nbsp; Since the callers
+of <i>__log_put</i> set up the given DBT it is a logical place to pad if
+necessary. We will modify the gen_rec.awk script to have all of the generated
+logging functions pad for us if we have a crypto handle. This padding will
+also expand the size of log files. Anyone calling <i>log_put</i> and using
+security from the application will have to pad on their own or it will
+return an error.
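+<p>A sketch of the padding computation the generated logging functions
+will perform (DB_AES_CHUNK is the 16-byte AES block):
+<pre>pad = DB_AES_CHUNK - (rec_len % DB_AES_CHUNK);
+if (pad != DB_AES_CHUNK)
+    rec_len += pad;                /* zero-fill the added bytes */</pre>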
+<p>When ciphering the log file, we will need a different header than the
+current one.&nbsp; The current header only has space for a 4 byte checksum.&nbsp;
+Our secure header will need space for the 16 byte IV and 20 byte checksum.&nbsp;
+This will blow up our log files when running securely since every single
+log record header will now consume 32 additional bytes.&nbsp; I believe
+that the log header does not need to be encrypted.&nbsp; It contains an
+offset, a length and our IV and checksum.&nbsp; Our IV and checksum are
+never encrypted.&nbsp; I don't believe there to be any risk in having the
+offset and length in the clear.
+<p>I would prefer not to have two types of log headers that are incompatible
+with each other.&nbsp; It is not acceptable to increase the log headers
+of all users from 12 bytes to 44 bytes.&nbsp; Such a change would also
+make log files incompatible with earlier releases.&nbsp; Worse even, is
+that the <i>cksum</i> field of the header is in between the offset and
+len.&nbsp; It would be really convenient if we could have just made a bigger
+cksum portion without affecting the location of the other fields.&nbsp;
+Oh well.&nbsp; Most customers will not be using encryption and we won't
+make them pay the price of the expanded header.&nbsp; Keith indicates that
+the log file format is changing with the next release so I will move the
+cksum field so it can at least be overlaid.
+<p>One method around this would be to have a single internal header that
+contains all the information both mechanisms need, but when we write out
+the header we choose which pieces to write.&nbsp; By appending the security
+information to the end of the existing structure, and adding a size field,
+we can modify a few places to use the size field to write out only the
+current first 12 bytes, or the entire security header needed.
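+<p>A sketch of that unified header (field names and order are
+illustrative; only "size" bytes are actually written):
+<pre>typedef struct __hdr {
+    u_int32_t prev;                /* Previous record offset */
+    u_int32_t len;                 /* Record length */
+    u_int8_t  chksum[20];          /* Only 4 bytes used in the clear case */
+    u_int8_t  iv[16];              /* IV, secure environments only */
+    u_int32_t size;                /* Bytes of header written to disk */
+} HDR;</pre>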
+<h4>
+Replication</h4>
+Replication clients are going to need to start all of their individual
+environment handles with the same password.&nbsp; The log records are going
+to be sent to the clients decrypted and the clients will have to encrypt
+them on their way to the client log files.&nbsp; We cannot send encrypted
+log records to clients. &nbsp;The reason is that the checksum and IV&nbsp;are
+stored in the log header and the master only sends the log record itself
+to the client. &nbsp;Therefore, the client has no way to decrypt a log
+record from the master. &nbsp;Therefore, anyone wanting to use truly secure
+replication is going to have to have a secure transport mechanism.&nbsp;
+By not encrypting records, clients can theoretically have different passwords
+and DB won't care.
+<p>On the master side we must copy the DBT sent in.&nbsp; We encrypt the
+original and send to clients the clear record.&nbsp; On the client side,
+support for encryption is added into <i>__log_rep_put</i>.
+<h4>
+Sharing the Environment</h4>
+When multiple processes join the environment, all must use the same password
+as the creator.
+<p>Joining an existing environment requires several conditions to be true.&nbsp;
+First, if the creator of the environment did not create with security,
+then joining later with security is an error.&nbsp; Second, if the creator
+did create it with security, then joining later without security is an
+error.&nbsp; Third, we need to be able to test and check that when another
+process joins a secure environment, the password it provides is the
+same as the one in use by the creator.
+<p>The first two scenarios should be fairly trivial to determine, if we
+aren't creating the environment, we can compare what is there with what
+we have.&nbsp; In the third case, the <i>__crypto_region_init</i> function
+will see that the environment region has a valid passwd_off and we'll then
+compare that password to the one we have in our dbenv handle.&nbsp; In
+any case we'll smash the dbenv handle's passwd and free that memory before
+returning whether we have a password match or not.
+<p>We need to store the passwords themselves in the region because multiple
+calls to the <i>__aes_derivekeys </i>function with the same password yields
+different keyInstance contents.&nbsp; Therefore we don't have any way to
+check passwords other than retaining and comparing the actual passwords.
+<h4>
+Other APIs</h4>
+All of the other APIs will need interface enhancements to support the new
+security methods.&nbsp; The Java and C++ interfaces will likely be done
+by Michael Cahill and Sue will implement the Tcl and RPC changes.&nbsp;
+Tcl will need the changes for testing purposes but the interface should
+be public, not test-only.&nbsp; RPC should fully support security.&nbsp;
+The biggest risk that I can see is that the client will send the password
+to the server in the clear.&nbsp; Anyone sniffing the wires or running
+tcpdump or other packet grabbing code could grab that.&nbsp; Someone really
+interested in using security over RPC probably ought to add authentication
+and other measures to the RPC server as well.
+<h4>
+<a NAME="Utilities"></a>Utilities</h4>
+All should take a -P flag to specify a password for the environment or
+database.&nbsp; Those that take an env and a database might need something
+more to distinguish between env passwds and db passwds. Here is what we
+do for each utility (see the example after this list):
+<ul>
+<li>
+berkeley_db_svc - Needs -P after each -h specified.</li>
+
+<li>
+db_archive - Needs -P if the env is encrypted.</li>
+
+<li>
+db_checkpoint - Needs -P if the env is encrypted.</li>
+
+<li>
+db_deadlock - No changes</li>
+
+<li>
+db_dump - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_load - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_printlog - Needs -P if the env is encrypted.</li>
+
+<li>
+db_recover - Needs -P if the env is encrypted.</li>
+
+<li>
+db_stat - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_upgrade - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_verify - Needs -P if the env or database is encrypted.</li>
+</ul>
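+<p>For example (sketch), dumping a database from an encrypted environment
+(paths and password illustrative):
+<pre>db_dump -P my_passwd -h /env/home encrypted.db > clear.dump</pre>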
+
+<h2>
+Testing</h2>
+All testing should be able to be accomplished via Tcl.&nbsp; The following
+tests (and probably others I haven't thought of yet) should be performed:
+<ul>
+<li>
+Basic functionality - basically a test001 but encrypted without an env</li>
+
+<li>
+Basic functionality, w/ env - like the previous test but with an env.</li>
+
+<li>
+Basic functionality, multiple processes - like first test, but make sure
+others can correctly join.</li>
+
+<li>
+Basic functionality, mult. processes - like above test, but initialize/close
+environment/database first so that the next test processes are all joiners
+of an existing env, but creator no longer exists and the shared region
+must be opened.</li>
+
+<li>
+Recovery test - Run recovery over an encrypted environment.</li>
+
+<li>
+Subdb test - Run with subdbs that are encrypted.</li>
+
+<li>
+Utility test - Verify the new options to all the utilities.</li>
+
+<li>
+Error handling - Test the basic setup errors for both env's and databases
+with multiple processes.&nbsp; They are:</li>
+
+<ol>
+<li>
+Attempt to set a NULL or zero-length passwd.</li>
+
+<li>
+Create Env w/ security and attempt to create database w/ its own password.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Proc2 joins without - should get an
+error.</li>
+
+<li>
+Env/DB creates without security.&nbsp; Proc2 joins with - should get an
+error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Proc2 joins with different password
+- should get an error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Closes.&nbsp; Proc2 reopens with different
+password - should get an error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Closes.&nbsp; Tcl overwrites a page
+of the database with garbage.&nbsp; Proc2 reopens with the correct password.&nbsp;
+Code should detect checksum error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Open a 2nd identical DB with a different
+password.&nbsp; Put the exact same data into both databases.&nbsp; Close.&nbsp;
+Overwrite the identical page of DB1 with the one from DB2.&nbsp; Reopen
+the database with correct DB1 password.&nbsp; Code should detect an encryption
+error on that page.</li>
+</ol>
+</ul>
+
+<h2>
+Risks</h2>
+There are several holes in this design.&nbsp; It is important to document
+them clearly.
+<p>The first is that all of the pages are stored in memory and possibly
+the file system in the clear.&nbsp; The password is stored in the shared
+data regions in the clear.&nbsp; Therefore if an attacker can read the
+process memory, they can do whatever they want.&nbsp; If the attacker can
+read system memory or swap they can access the data as well.&nbsp; Since
+everything in the shared data regions (with the exception of the buffered
+log) will be in the clear, it is important to realize that file backed
+regions will be written in the clear, including the portion of the regions
+containing passwords.&nbsp; We recommend to users that they use system
+memory instead of file backed shared memory.
+</body>
+</html>
diff --git a/src/crypto/mersenne/mt19937db.c b/src/crypto/mersenne/mt19937db.c
new file mode 100644
index 00000000..2d53c312
--- /dev/null
+++ b/src/crypto/mersenne/mt19937db.c
@@ -0,0 +1,187 @@
+/*
+ * $Id$
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+
+/* A C-program for MT19937: Integer version (1999/10/28) */
+/* genrand() generates one pseudorandom unsigned integer (32bit) */
+/* which is uniformly distributed among 0 to 2^32-1 for each */
+/* call. sgenrand(seed) sets initial values to the working area */
+/* of 624 words. Before genrand(), sgenrand(seed) must be */
+/* called once. (seed is any 32-bit integer.) */
+/* Coded by Takuji Nishimura, considering the suggestions by */
+/* Topher Cooper and Marc Rieffel in July-Aug. 1997. */
+
+/* This library is free software under the Artistic license: */
+/* see the file COPYING distributed together with this code. */
+/* For the verification of the code, its output sequence file */
+/* mt19937int.out is attached (2001/4/2) */
+
+/* Copyright (C) 1997, 1999 Makoto Matsumoto and Takuji Nishimura. */
+/* Any feedback is very welcome. For any question, comments, */
+/* see http://www.math.keio.ac.jp/matumoto/emt.html or email */
+/* matumoto@math.keio.ac.jp */
+
+/* REFERENCE */
+/* M. Matsumoto and T. Nishimura, */
+/* "Mersenne Twister: A 623-Dimensionally Equidistributed Uniform */
+/* Pseudo-Random Number Generator", */
+/* ACM Transactions on Modeling and Computer Simulation, */
+/* Vol. 8, No. 1, January 1998, pp 3--30. */
+
+/* Period parameters */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0df /* constant vector a */
+#define UPPER_MASK 0x80000000 /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffff /* least significant r bits */
+
+/* Tempering parameters */
+#define TEMPERING_MASK_B 0x9d2c5680
+#define TEMPERING_MASK_C 0xefc60000
+#define TEMPERING_SHIFT_U(y) (y >> 11)
+#define TEMPERING_SHIFT_S(y) (y << 7)
+#define TEMPERING_SHIFT_T(y) (y << 15)
+#define TEMPERING_SHIFT_L(y) (y >> 18)
+
+static void __db_sgenrand __P((unsigned long, unsigned long *, int *));
+#ifdef NOT_USED
+static void __db_lsgenrand __P((unsigned long *, unsigned long *, int *));
+#endif
+static unsigned long __db_genrand __P((ENV *));
+
+/*
+ * __db_generate_iv --
+ * Generate an initialization vector (IV)
+ *
+ * PUBLIC: int __db_generate_iv __P((ENV *, u_int32_t *));
+ */
+int
+__db_generate_iv(env, iv)
+ ENV *env;
+ u_int32_t *iv;
+{
+ int i, n, ret;
+
+ ret = 0;
+ n = DB_IV_BYTES / sizeof(u_int32_t);
+ MUTEX_LOCK(env, env->mtx_mt);
+ if (env->mt == NULL) {
+		if ((ret = __os_calloc(env, 1, N * sizeof(unsigned long),
+		    &env->mt)) != 0) {
+			MUTEX_UNLOCK(env, env->mtx_mt);
+			return (ret);
+		}
+ /* mti==N+1 means mt[N] is not initialized */
+ env->mti = N + 1;
+ }
+ for (i = 0; i < n; i++) {
+ /*
+ * We do not allow 0. If we get one just try again.
+ */
+ do {
+ iv[i] = (u_int32_t)__db_genrand(env);
+ } while (iv[i] == 0);
+ }
+
+ MUTEX_UNLOCK(env, env->mtx_mt);
+ return (0);
+}
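+
+/*
+ * Example (hypothetical caller, not part of this file): filling an IV
+ * before encrypting a page.  DB_IV_BYTES is assumed to be a multiple of
+ * sizeof(u_int32_t), as the computation of `n' above requires.
+ *
+ *	u_int32_t iv[DB_IV_BYTES / sizeof(u_int32_t)];
+ *
+ *	if ((ret = __db_generate_iv(env, iv)) != 0)
+ *		return (ret);
+ */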
+
+/* Initializing the array with a seed */
+static void
+__db_sgenrand(seed, mt, mtip)
+ unsigned long seed;
+ unsigned long mt[];
+ int *mtip;
+{
+ int i;
+
+ DB_ASSERT(NULL, seed != 0);
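+	/*
+	 * Fill each state word from two steps of a Knuth-style linear
+	 * congruential generator (multiplier 69069), keeping only the
+	 * high-order 16 bits of each step, which are the better
+	 * distributed ones.
+	 */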
+ for (i=0;i<N;i++) {
+ mt[i] = seed & 0xffff0000;
+ seed = 69069 * seed + 1;
+ mt[i] |= (seed & 0xffff0000) >> 16;
+ seed = 69069 * seed + 1;
+ }
+ *mtip = N;
+}
+
+#ifdef NOT_USED
+/* Initialization by "sgenrand()" is an example. Theoretically, */
+/* there are 2^19937-1 possible states as an initial state. */
+/* This function allows one to choose any of those 2^19937-1 states. */
+/* The essential bits in "seed_array[]" are the following 19937 bits: */
+/* (seed_array[0]&UPPER_MASK), seed_array[1], ..., seed_array[N-1]. */
+/* (seed_array[0]&LOWER_MASK) is discarded. */
+/* Theoretically, */
+/* (seed_array[0]&UPPER_MASK), seed_array[1], ..., seed_array[N-1] */
+/* can take any values except all zeros. */
+static void
+__db_lsgenrand(seed_array, mt, mtip)
+ unsigned long seed_array[];
+ unsigned long mt[];
+ int *mtip;
+ /* the length of seed_array[] must be at least N */
+{
+ int i;
+
+ for (i=0;i<N;i++)
+ mt[i] = seed_array[i];
+ *mtip=N;
+}
+#endif
+
+static unsigned long
+__db_genrand(env)
+ ENV *env;
+{
+ db_timespec ts;
+ unsigned long y;
+ static unsigned long mag01[2]={0x0, MATRIX_A};
+ /* mag01[x] = x * MATRIX_A for x=0,1 */
+ u_int32_t seed;
+
+ /*
+ * We are called with ENV->mtx_mt locked.
+ */
+ if (env->mti >= N) { /* generate N words at one time */
+ int kk;
+
+ if (env->mti == N+1) { /* if sgenrand() has not been called, */
+ /*
+			 * Seed the generator with the hashed time.  The
+			 * __db_chksum call below returns a 4-byte checksum
+			 * when no key is passed in.
+ */
+ do {
+ __os_gettime(env, &ts, 1);
+ __db_chksum(NULL, (u_int8_t *)&ts.tv_sec,
+ sizeof(ts.tv_sec), NULL, (u_int8_t *)&seed);
+ } while (seed == 0);
+ __db_sgenrand((unsigned long)seed, env->mt, &env->mti);
+ }
+
+ for (kk=0;kk<N-M;kk++) {
+ y = (env->mt[kk]&UPPER_MASK)|(env->mt[kk+1]&LOWER_MASK);
+ env->mt[kk] = env->mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1];
+ }
+ for (;kk<N-1;kk++) {
+ y = (env->mt[kk]&UPPER_MASK)|(env->mt[kk+1]&LOWER_MASK);
+ env->mt[kk] = env->mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1];
+ }
+ y = (env->mt[N-1]&UPPER_MASK)|(env->mt[0]&LOWER_MASK);
+ env->mt[N-1] = env->mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1];
+
+ env->mti = 0;
+ }
+
+ y = env->mt[env->mti++];
+ y ^= TEMPERING_SHIFT_U(y);
+ y ^= TEMPERING_SHIFT_S(y) & TEMPERING_MASK_B;
+ y ^= TEMPERING_SHIFT_T(y) & TEMPERING_MASK_C;
+ y ^= TEMPERING_SHIFT_L(y);
+
+	return (y);
+}
diff --git a/src/crypto/rijndael/rijndael-alg-fst.c b/src/crypto/rijndael/rijndael-alg-fst.c
new file mode 100644
index 00000000..322ad5ff
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-alg-fst.c
@@ -0,0 +1,1466 @@
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+
+#include "crypto/rijndael/rijndael-alg-fst.h"
+
+/*
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/
+
+static const u32 Te0[256] = {
+ (u_int)0xc66363a5, (u_int)0xf87c7c84, (u_int)0xee777799, (u_int)0xf67b7b8d,
+ (u_int)0xfff2f20d, (u_int)0xd66b6bbd, (u_int)0xde6f6fb1, (u_int)0x91c5c554,
+ (u_int)0x60303050, (u_int)0x02010103, (u_int)0xce6767a9, (u_int)0x562b2b7d,
+ (u_int)0xe7fefe19, (u_int)0xb5d7d762, (u_int)0x4dababe6, (u_int)0xec76769a,
+ (u_int)0x8fcaca45, (u_int)0x1f82829d, (u_int)0x89c9c940, (u_int)0xfa7d7d87,
+ (u_int)0xeffafa15, (u_int)0xb25959eb, (u_int)0x8e4747c9, (u_int)0xfbf0f00b,
+ (u_int)0x41adadec, (u_int)0xb3d4d467, (u_int)0x5fa2a2fd, (u_int)0x45afafea,
+ (u_int)0x239c9cbf, (u_int)0x53a4a4f7, (u_int)0xe4727296, (u_int)0x9bc0c05b,
+ (u_int)0x75b7b7c2, (u_int)0xe1fdfd1c, (u_int)0x3d9393ae, (u_int)0x4c26266a,
+ (u_int)0x6c36365a, (u_int)0x7e3f3f41, (u_int)0xf5f7f702, (u_int)0x83cccc4f,
+ (u_int)0x6834345c, (u_int)0x51a5a5f4, (u_int)0xd1e5e534, (u_int)0xf9f1f108,
+ (u_int)0xe2717193, (u_int)0xabd8d873, (u_int)0x62313153, (u_int)0x2a15153f,
+ (u_int)0x0804040c, (u_int)0x95c7c752, (u_int)0x46232365, (u_int)0x9dc3c35e,
+ (u_int)0x30181828, (u_int)0x379696a1, (u_int)0x0a05050f, (u_int)0x2f9a9ab5,
+ (u_int)0x0e070709, (u_int)0x24121236, (u_int)0x1b80809b, (u_int)0xdfe2e23d,
+ (u_int)0xcdebeb26, (u_int)0x4e272769, (u_int)0x7fb2b2cd, (u_int)0xea75759f,
+ (u_int)0x1209091b, (u_int)0x1d83839e, (u_int)0x582c2c74, (u_int)0x341a1a2e,
+ (u_int)0x361b1b2d, (u_int)0xdc6e6eb2, (u_int)0xb45a5aee, (u_int)0x5ba0a0fb,
+ (u_int)0xa45252f6, (u_int)0x763b3b4d, (u_int)0xb7d6d661, (u_int)0x7db3b3ce,
+ (u_int)0x5229297b, (u_int)0xdde3e33e, (u_int)0x5e2f2f71, (u_int)0x13848497,
+ (u_int)0xa65353f5, (u_int)0xb9d1d168, (u_int)0x00000000, (u_int)0xc1eded2c,
+ (u_int)0x40202060, (u_int)0xe3fcfc1f, (u_int)0x79b1b1c8, (u_int)0xb65b5bed,
+ (u_int)0xd46a6abe, (u_int)0x8dcbcb46, (u_int)0x67bebed9, (u_int)0x7239394b,
+ (u_int)0x944a4ade, (u_int)0x984c4cd4, (u_int)0xb05858e8, (u_int)0x85cfcf4a,
+ (u_int)0xbbd0d06b, (u_int)0xc5efef2a, (u_int)0x4faaaae5, (u_int)0xedfbfb16,
+ (u_int)0x864343c5, (u_int)0x9a4d4dd7, (u_int)0x66333355, (u_int)0x11858594,
+ (u_int)0x8a4545cf, (u_int)0xe9f9f910, (u_int)0x04020206, (u_int)0xfe7f7f81,
+ (u_int)0xa05050f0, (u_int)0x783c3c44, (u_int)0x259f9fba, (u_int)0x4ba8a8e3,
+ (u_int)0xa25151f3, (u_int)0x5da3a3fe, (u_int)0x804040c0, (u_int)0x058f8f8a,
+ (u_int)0x3f9292ad, (u_int)0x219d9dbc, (u_int)0x70383848, (u_int)0xf1f5f504,
+ (u_int)0x63bcbcdf, (u_int)0x77b6b6c1, (u_int)0xafdada75, (u_int)0x42212163,
+ (u_int)0x20101030, (u_int)0xe5ffff1a, (u_int)0xfdf3f30e, (u_int)0xbfd2d26d,
+ (u_int)0x81cdcd4c, (u_int)0x180c0c14, (u_int)0x26131335, (u_int)0xc3ecec2f,
+ (u_int)0xbe5f5fe1, (u_int)0x359797a2, (u_int)0x884444cc, (u_int)0x2e171739,
+ (u_int)0x93c4c457, (u_int)0x55a7a7f2, (u_int)0xfc7e7e82, (u_int)0x7a3d3d47,
+ (u_int)0xc86464ac, (u_int)0xba5d5de7, (u_int)0x3219192b, (u_int)0xe6737395,
+ (u_int)0xc06060a0, (u_int)0x19818198, (u_int)0x9e4f4fd1, (u_int)0xa3dcdc7f,
+ (u_int)0x44222266, (u_int)0x542a2a7e, (u_int)0x3b9090ab, (u_int)0x0b888883,
+ (u_int)0x8c4646ca, (u_int)0xc7eeee29, (u_int)0x6bb8b8d3, (u_int)0x2814143c,
+ (u_int)0xa7dede79, (u_int)0xbc5e5ee2, (u_int)0x160b0b1d, (u_int)0xaddbdb76,
+ (u_int)0xdbe0e03b, (u_int)0x64323256, (u_int)0x743a3a4e, (u_int)0x140a0a1e,
+ (u_int)0x924949db, (u_int)0x0c06060a, (u_int)0x4824246c, (u_int)0xb85c5ce4,
+ (u_int)0x9fc2c25d, (u_int)0xbdd3d36e, (u_int)0x43acacef, (u_int)0xc46262a6,
+ (u_int)0x399191a8, (u_int)0x319595a4, (u_int)0xd3e4e437, (u_int)0xf279798b,
+ (u_int)0xd5e7e732, (u_int)0x8bc8c843, (u_int)0x6e373759, (u_int)0xda6d6db7,
+ (u_int)0x018d8d8c, (u_int)0xb1d5d564, (u_int)0x9c4e4ed2, (u_int)0x49a9a9e0,
+ (u_int)0xd86c6cb4, (u_int)0xac5656fa, (u_int)0xf3f4f407, (u_int)0xcfeaea25,
+ (u_int)0xca6565af, (u_int)0xf47a7a8e, (u_int)0x47aeaee9, (u_int)0x10080818,
+ (u_int)0x6fbabad5, (u_int)0xf0787888, (u_int)0x4a25256f, (u_int)0x5c2e2e72,
+ (u_int)0x381c1c24, (u_int)0x57a6a6f1, (u_int)0x73b4b4c7, (u_int)0x97c6c651,
+ (u_int)0xcbe8e823, (u_int)0xa1dddd7c, (u_int)0xe874749c, (u_int)0x3e1f1f21,
+ (u_int)0x964b4bdd, (u_int)0x61bdbddc, (u_int)0x0d8b8b86, (u_int)0x0f8a8a85,
+ (u_int)0xe0707090, (u_int)0x7c3e3e42, (u_int)0x71b5b5c4, (u_int)0xcc6666aa,
+ (u_int)0x904848d8, (u_int)0x06030305, (u_int)0xf7f6f601, (u_int)0x1c0e0e12,
+ (u_int)0xc26161a3, (u_int)0x6a35355f, (u_int)0xae5757f9, (u_int)0x69b9b9d0,
+ (u_int)0x17868691, (u_int)0x99c1c158, (u_int)0x3a1d1d27, (u_int)0x279e9eb9,
+ (u_int)0xd9e1e138, (u_int)0xebf8f813, (u_int)0x2b9898b3, (u_int)0x22111133,
+ (u_int)0xd26969bb, (u_int)0xa9d9d970, (u_int)0x078e8e89, (u_int)0x339494a7,
+ (u_int)0x2d9b9bb6, (u_int)0x3c1e1e22, (u_int)0x15878792, (u_int)0xc9e9e920,
+ (u_int)0x87cece49, (u_int)0xaa5555ff, (u_int)0x50282878, (u_int)0xa5dfdf7a,
+ (u_int)0x038c8c8f, (u_int)0x59a1a1f8, (u_int)0x09898980, (u_int)0x1a0d0d17,
+ (u_int)0x65bfbfda, (u_int)0xd7e6e631, (u_int)0x844242c6, (u_int)0xd06868b8,
+ (u_int)0x824141c3, (u_int)0x299999b0, (u_int)0x5a2d2d77, (u_int)0x1e0f0f11,
+ (u_int)0x7bb0b0cb, (u_int)0xa85454fc, (u_int)0x6dbbbbd6, (u_int)0x2c16163a,
+};
+static const u32 Te1[256] = {
+ (u_int)0xa5c66363, (u_int)0x84f87c7c, (u_int)0x99ee7777, (u_int)0x8df67b7b,
+ (u_int)0x0dfff2f2, (u_int)0xbdd66b6b, (u_int)0xb1de6f6f, (u_int)0x5491c5c5,
+ (u_int)0x50603030, (u_int)0x03020101, (u_int)0xa9ce6767, (u_int)0x7d562b2b,
+ (u_int)0x19e7fefe, (u_int)0x62b5d7d7, (u_int)0xe64dabab, (u_int)0x9aec7676,
+ (u_int)0x458fcaca, (u_int)0x9d1f8282, (u_int)0x4089c9c9, (u_int)0x87fa7d7d,
+ (u_int)0x15effafa, (u_int)0xebb25959, (u_int)0xc98e4747, (u_int)0x0bfbf0f0,
+ (u_int)0xec41adad, (u_int)0x67b3d4d4, (u_int)0xfd5fa2a2, (u_int)0xea45afaf,
+ (u_int)0xbf239c9c, (u_int)0xf753a4a4, (u_int)0x96e47272, (u_int)0x5b9bc0c0,
+ (u_int)0xc275b7b7, (u_int)0x1ce1fdfd, (u_int)0xae3d9393, (u_int)0x6a4c2626,
+ (u_int)0x5a6c3636, (u_int)0x417e3f3f, (u_int)0x02f5f7f7, (u_int)0x4f83cccc,
+ (u_int)0x5c683434, (u_int)0xf451a5a5, (u_int)0x34d1e5e5, (u_int)0x08f9f1f1,
+ (u_int)0x93e27171, (u_int)0x73abd8d8, (u_int)0x53623131, (u_int)0x3f2a1515,
+ (u_int)0x0c080404, (u_int)0x5295c7c7, (u_int)0x65462323, (u_int)0x5e9dc3c3,
+ (u_int)0x28301818, (u_int)0xa1379696, (u_int)0x0f0a0505, (u_int)0xb52f9a9a,
+ (u_int)0x090e0707, (u_int)0x36241212, (u_int)0x9b1b8080, (u_int)0x3ddfe2e2,
+ (u_int)0x26cdebeb, (u_int)0x694e2727, (u_int)0xcd7fb2b2, (u_int)0x9fea7575,
+ (u_int)0x1b120909, (u_int)0x9e1d8383, (u_int)0x74582c2c, (u_int)0x2e341a1a,
+ (u_int)0x2d361b1b, (u_int)0xb2dc6e6e, (u_int)0xeeb45a5a, (u_int)0xfb5ba0a0,
+ (u_int)0xf6a45252, (u_int)0x4d763b3b, (u_int)0x61b7d6d6, (u_int)0xce7db3b3,
+ (u_int)0x7b522929, (u_int)0x3edde3e3, (u_int)0x715e2f2f, (u_int)0x97138484,
+ (u_int)0xf5a65353, (u_int)0x68b9d1d1, (u_int)0x00000000, (u_int)0x2cc1eded,
+ (u_int)0x60402020, (u_int)0x1fe3fcfc, (u_int)0xc879b1b1, (u_int)0xedb65b5b,
+ (u_int)0xbed46a6a, (u_int)0x468dcbcb, (u_int)0xd967bebe, (u_int)0x4b723939,
+ (u_int)0xde944a4a, (u_int)0xd4984c4c, (u_int)0xe8b05858, (u_int)0x4a85cfcf,
+ (u_int)0x6bbbd0d0, (u_int)0x2ac5efef, (u_int)0xe54faaaa, (u_int)0x16edfbfb,
+ (u_int)0xc5864343, (u_int)0xd79a4d4d, (u_int)0x55663333, (u_int)0x94118585,
+ (u_int)0xcf8a4545, (u_int)0x10e9f9f9, (u_int)0x06040202, (u_int)0x81fe7f7f,
+ (u_int)0xf0a05050, (u_int)0x44783c3c, (u_int)0xba259f9f, (u_int)0xe34ba8a8,
+ (u_int)0xf3a25151, (u_int)0xfe5da3a3, (u_int)0xc0804040, (u_int)0x8a058f8f,
+ (u_int)0xad3f9292, (u_int)0xbc219d9d, (u_int)0x48703838, (u_int)0x04f1f5f5,
+ (u_int)0xdf63bcbc, (u_int)0xc177b6b6, (u_int)0x75afdada, (u_int)0x63422121,
+ (u_int)0x30201010, (u_int)0x1ae5ffff, (u_int)0x0efdf3f3, (u_int)0x6dbfd2d2,
+ (u_int)0x4c81cdcd, (u_int)0x14180c0c, (u_int)0x35261313, (u_int)0x2fc3ecec,
+ (u_int)0xe1be5f5f, (u_int)0xa2359797, (u_int)0xcc884444, (u_int)0x392e1717,
+ (u_int)0x5793c4c4, (u_int)0xf255a7a7, (u_int)0x82fc7e7e, (u_int)0x477a3d3d,
+ (u_int)0xacc86464, (u_int)0xe7ba5d5d, (u_int)0x2b321919, (u_int)0x95e67373,
+ (u_int)0xa0c06060, (u_int)0x98198181, (u_int)0xd19e4f4f, (u_int)0x7fa3dcdc,
+ (u_int)0x66442222, (u_int)0x7e542a2a, (u_int)0xab3b9090, (u_int)0x830b8888,
+ (u_int)0xca8c4646, (u_int)0x29c7eeee, (u_int)0xd36bb8b8, (u_int)0x3c281414,
+ (u_int)0x79a7dede, (u_int)0xe2bc5e5e, (u_int)0x1d160b0b, (u_int)0x76addbdb,
+ (u_int)0x3bdbe0e0, (u_int)0x56643232, (u_int)0x4e743a3a, (u_int)0x1e140a0a,
+ (u_int)0xdb924949, (u_int)0x0a0c0606, (u_int)0x6c482424, (u_int)0xe4b85c5c,
+ (u_int)0x5d9fc2c2, (u_int)0x6ebdd3d3, (u_int)0xef43acac, (u_int)0xa6c46262,
+ (u_int)0xa8399191, (u_int)0xa4319595, (u_int)0x37d3e4e4, (u_int)0x8bf27979,
+ (u_int)0x32d5e7e7, (u_int)0x438bc8c8, (u_int)0x596e3737, (u_int)0xb7da6d6d,
+ (u_int)0x8c018d8d, (u_int)0x64b1d5d5, (u_int)0xd29c4e4e, (u_int)0xe049a9a9,
+ (u_int)0xb4d86c6c, (u_int)0xfaac5656, (u_int)0x07f3f4f4, (u_int)0x25cfeaea,
+ (u_int)0xafca6565, (u_int)0x8ef47a7a, (u_int)0xe947aeae, (u_int)0x18100808,
+ (u_int)0xd56fbaba, (u_int)0x88f07878, (u_int)0x6f4a2525, (u_int)0x725c2e2e,
+ (u_int)0x24381c1c, (u_int)0xf157a6a6, (u_int)0xc773b4b4, (u_int)0x5197c6c6,
+ (u_int)0x23cbe8e8, (u_int)0x7ca1dddd, (u_int)0x9ce87474, (u_int)0x213e1f1f,
+ (u_int)0xdd964b4b, (u_int)0xdc61bdbd, (u_int)0x860d8b8b, (u_int)0x850f8a8a,
+ (u_int)0x90e07070, (u_int)0x427c3e3e, (u_int)0xc471b5b5, (u_int)0xaacc6666,
+ (u_int)0xd8904848, (u_int)0x05060303, (u_int)0x01f7f6f6, (u_int)0x121c0e0e,
+ (u_int)0xa3c26161, (u_int)0x5f6a3535, (u_int)0xf9ae5757, (u_int)0xd069b9b9,
+ (u_int)0x91178686, (u_int)0x5899c1c1, (u_int)0x273a1d1d, (u_int)0xb9279e9e,
+ (u_int)0x38d9e1e1, (u_int)0x13ebf8f8, (u_int)0xb32b9898, (u_int)0x33221111,
+ (u_int)0xbbd26969, (u_int)0x70a9d9d9, (u_int)0x89078e8e, (u_int)0xa7339494,
+ (u_int)0xb62d9b9b, (u_int)0x223c1e1e, (u_int)0x92158787, (u_int)0x20c9e9e9,
+ (u_int)0x4987cece, (u_int)0xffaa5555, (u_int)0x78502828, (u_int)0x7aa5dfdf,
+ (u_int)0x8f038c8c, (u_int)0xf859a1a1, (u_int)0x80098989, (u_int)0x171a0d0d,
+ (u_int)0xda65bfbf, (u_int)0x31d7e6e6, (u_int)0xc6844242, (u_int)0xb8d06868,
+ (u_int)0xc3824141, (u_int)0xb0299999, (u_int)0x775a2d2d, (u_int)0x111e0f0f,
+ (u_int)0xcb7bb0b0, (u_int)0xfca85454, (u_int)0xd66dbbbb, (u_int)0x3a2c1616,
+};
+static const u32 Te2[256] = {
+ (u_int)0x63a5c663, (u_int)0x7c84f87c, (u_int)0x7799ee77, (u_int)0x7b8df67b,
+ (u_int)0xf20dfff2, (u_int)0x6bbdd66b, (u_int)0x6fb1de6f, (u_int)0xc55491c5,
+ (u_int)0x30506030, (u_int)0x01030201, (u_int)0x67a9ce67, (u_int)0x2b7d562b,
+ (u_int)0xfe19e7fe, (u_int)0xd762b5d7, (u_int)0xabe64dab, (u_int)0x769aec76,
+ (u_int)0xca458fca, (u_int)0x829d1f82, (u_int)0xc94089c9, (u_int)0x7d87fa7d,
+ (u_int)0xfa15effa, (u_int)0x59ebb259, (u_int)0x47c98e47, (u_int)0xf00bfbf0,
+ (u_int)0xadec41ad, (u_int)0xd467b3d4, (u_int)0xa2fd5fa2, (u_int)0xafea45af,
+ (u_int)0x9cbf239c, (u_int)0xa4f753a4, (u_int)0x7296e472, (u_int)0xc05b9bc0,
+ (u_int)0xb7c275b7, (u_int)0xfd1ce1fd, (u_int)0x93ae3d93, (u_int)0x266a4c26,
+ (u_int)0x365a6c36, (u_int)0x3f417e3f, (u_int)0xf702f5f7, (u_int)0xcc4f83cc,
+ (u_int)0x345c6834, (u_int)0xa5f451a5, (u_int)0xe534d1e5, (u_int)0xf108f9f1,
+ (u_int)0x7193e271, (u_int)0xd873abd8, (u_int)0x31536231, (u_int)0x153f2a15,
+ (u_int)0x040c0804, (u_int)0xc75295c7, (u_int)0x23654623, (u_int)0xc35e9dc3,
+ (u_int)0x18283018, (u_int)0x96a13796, (u_int)0x050f0a05, (u_int)0x9ab52f9a,
+ (u_int)0x07090e07, (u_int)0x12362412, (u_int)0x809b1b80, (u_int)0xe23ddfe2,
+ (u_int)0xeb26cdeb, (u_int)0x27694e27, (u_int)0xb2cd7fb2, (u_int)0x759fea75,
+ (u_int)0x091b1209, (u_int)0x839e1d83, (u_int)0x2c74582c, (u_int)0x1a2e341a,
+ (u_int)0x1b2d361b, (u_int)0x6eb2dc6e, (u_int)0x5aeeb45a, (u_int)0xa0fb5ba0,
+ (u_int)0x52f6a452, (u_int)0x3b4d763b, (u_int)0xd661b7d6, (u_int)0xb3ce7db3,
+ (u_int)0x297b5229, (u_int)0xe33edde3, (u_int)0x2f715e2f, (u_int)0x84971384,
+ (u_int)0x53f5a653, (u_int)0xd168b9d1, (u_int)0x00000000, (u_int)0xed2cc1ed,
+ (u_int)0x20604020, (u_int)0xfc1fe3fc, (u_int)0xb1c879b1, (u_int)0x5bedb65b,
+ (u_int)0x6abed46a, (u_int)0xcb468dcb, (u_int)0xbed967be, (u_int)0x394b7239,
+ (u_int)0x4ade944a, (u_int)0x4cd4984c, (u_int)0x58e8b058, (u_int)0xcf4a85cf,
+ (u_int)0xd06bbbd0, (u_int)0xef2ac5ef, (u_int)0xaae54faa, (u_int)0xfb16edfb,
+ (u_int)0x43c58643, (u_int)0x4dd79a4d, (u_int)0x33556633, (u_int)0x85941185,
+ (u_int)0x45cf8a45, (u_int)0xf910e9f9, (u_int)0x02060402, (u_int)0x7f81fe7f,
+ (u_int)0x50f0a050, (u_int)0x3c44783c, (u_int)0x9fba259f, (u_int)0xa8e34ba8,
+ (u_int)0x51f3a251, (u_int)0xa3fe5da3, (u_int)0x40c08040, (u_int)0x8f8a058f,
+ (u_int)0x92ad3f92, (u_int)0x9dbc219d, (u_int)0x38487038, (u_int)0xf504f1f5,
+ (u_int)0xbcdf63bc, (u_int)0xb6c177b6, (u_int)0xda75afda, (u_int)0x21634221,
+ (u_int)0x10302010, (u_int)0xff1ae5ff, (u_int)0xf30efdf3, (u_int)0xd26dbfd2,
+ (u_int)0xcd4c81cd, (u_int)0x0c14180c, (u_int)0x13352613, (u_int)0xec2fc3ec,
+ (u_int)0x5fe1be5f, (u_int)0x97a23597, (u_int)0x44cc8844, (u_int)0x17392e17,
+ (u_int)0xc45793c4, (u_int)0xa7f255a7, (u_int)0x7e82fc7e, (u_int)0x3d477a3d,
+ (u_int)0x64acc864, (u_int)0x5de7ba5d, (u_int)0x192b3219, (u_int)0x7395e673,
+ (u_int)0x60a0c060, (u_int)0x81981981, (u_int)0x4fd19e4f, (u_int)0xdc7fa3dc,
+ (u_int)0x22664422, (u_int)0x2a7e542a, (u_int)0x90ab3b90, (u_int)0x88830b88,
+ (u_int)0x46ca8c46, (u_int)0xee29c7ee, (u_int)0xb8d36bb8, (u_int)0x143c2814,
+ (u_int)0xde79a7de, (u_int)0x5ee2bc5e, (u_int)0x0b1d160b, (u_int)0xdb76addb,
+ (u_int)0xe03bdbe0, (u_int)0x32566432, (u_int)0x3a4e743a, (u_int)0x0a1e140a,
+ (u_int)0x49db9249, (u_int)0x060a0c06, (u_int)0x246c4824, (u_int)0x5ce4b85c,
+ (u_int)0xc25d9fc2, (u_int)0xd36ebdd3, (u_int)0xacef43ac, (u_int)0x62a6c462,
+ (u_int)0x91a83991, (u_int)0x95a43195, (u_int)0xe437d3e4, (u_int)0x798bf279,
+ (u_int)0xe732d5e7, (u_int)0xc8438bc8, (u_int)0x37596e37, (u_int)0x6db7da6d,
+ (u_int)0x8d8c018d, (u_int)0xd564b1d5, (u_int)0x4ed29c4e, (u_int)0xa9e049a9,
+ (u_int)0x6cb4d86c, (u_int)0x56faac56, (u_int)0xf407f3f4, (u_int)0xea25cfea,
+ (u_int)0x65afca65, (u_int)0x7a8ef47a, (u_int)0xaee947ae, (u_int)0x08181008,
+ (u_int)0xbad56fba, (u_int)0x7888f078, (u_int)0x256f4a25, (u_int)0x2e725c2e,
+ (u_int)0x1c24381c, (u_int)0xa6f157a6, (u_int)0xb4c773b4, (u_int)0xc65197c6,
+ (u_int)0xe823cbe8, (u_int)0xdd7ca1dd, (u_int)0x749ce874, (u_int)0x1f213e1f,
+ (u_int)0x4bdd964b, (u_int)0xbddc61bd, (u_int)0x8b860d8b, (u_int)0x8a850f8a,
+ (u_int)0x7090e070, (u_int)0x3e427c3e, (u_int)0xb5c471b5, (u_int)0x66aacc66,
+ (u_int)0x48d89048, (u_int)0x03050603, (u_int)0xf601f7f6, (u_int)0x0e121c0e,
+ (u_int)0x61a3c261, (u_int)0x355f6a35, (u_int)0x57f9ae57, (u_int)0xb9d069b9,
+ (u_int)0x86911786, (u_int)0xc15899c1, (u_int)0x1d273a1d, (u_int)0x9eb9279e,
+ (u_int)0xe138d9e1, (u_int)0xf813ebf8, (u_int)0x98b32b98, (u_int)0x11332211,
+ (u_int)0x69bbd269, (u_int)0xd970a9d9, (u_int)0x8e89078e, (u_int)0x94a73394,
+ (u_int)0x9bb62d9b, (u_int)0x1e223c1e, (u_int)0x87921587, (u_int)0xe920c9e9,
+ (u_int)0xce4987ce, (u_int)0x55ffaa55, (u_int)0x28785028, (u_int)0xdf7aa5df,
+ (u_int)0x8c8f038c, (u_int)0xa1f859a1, (u_int)0x89800989, (u_int)0x0d171a0d,
+ (u_int)0xbfda65bf, (u_int)0xe631d7e6, (u_int)0x42c68442, (u_int)0x68b8d068,
+ (u_int)0x41c38241, (u_int)0x99b02999, (u_int)0x2d775a2d, (u_int)0x0f111e0f,
+ (u_int)0xb0cb7bb0, (u_int)0x54fca854, (u_int)0xbbd66dbb, (u_int)0x163a2c16,
+};
+static const u32 Te3[256] = {
+ (u_int)0x6363a5c6, (u_int)0x7c7c84f8, (u_int)0x777799ee, (u_int)0x7b7b8df6,
+ (u_int)0xf2f20dff, (u_int)0x6b6bbdd6, (u_int)0x6f6fb1de, (u_int)0xc5c55491,
+ (u_int)0x30305060, (u_int)0x01010302, (u_int)0x6767a9ce, (u_int)0x2b2b7d56,
+ (u_int)0xfefe19e7, (u_int)0xd7d762b5, (u_int)0xababe64d, (u_int)0x76769aec,
+ (u_int)0xcaca458f, (u_int)0x82829d1f, (u_int)0xc9c94089, (u_int)0x7d7d87fa,
+ (u_int)0xfafa15ef, (u_int)0x5959ebb2, (u_int)0x4747c98e, (u_int)0xf0f00bfb,
+ (u_int)0xadadec41, (u_int)0xd4d467b3, (u_int)0xa2a2fd5f, (u_int)0xafafea45,
+ (u_int)0x9c9cbf23, (u_int)0xa4a4f753, (u_int)0x727296e4, (u_int)0xc0c05b9b,
+ (u_int)0xb7b7c275, (u_int)0xfdfd1ce1, (u_int)0x9393ae3d, (u_int)0x26266a4c,
+ (u_int)0x36365a6c, (u_int)0x3f3f417e, (u_int)0xf7f702f5, (u_int)0xcccc4f83,
+ (u_int)0x34345c68, (u_int)0xa5a5f451, (u_int)0xe5e534d1, (u_int)0xf1f108f9,
+ (u_int)0x717193e2, (u_int)0xd8d873ab, (u_int)0x31315362, (u_int)0x15153f2a,
+ (u_int)0x04040c08, (u_int)0xc7c75295, (u_int)0x23236546, (u_int)0xc3c35e9d,
+ (u_int)0x18182830, (u_int)0x9696a137, (u_int)0x05050f0a, (u_int)0x9a9ab52f,
+ (u_int)0x0707090e, (u_int)0x12123624, (u_int)0x80809b1b, (u_int)0xe2e23ddf,
+ (u_int)0xebeb26cd, (u_int)0x2727694e, (u_int)0xb2b2cd7f, (u_int)0x75759fea,
+ (u_int)0x09091b12, (u_int)0x83839e1d, (u_int)0x2c2c7458, (u_int)0x1a1a2e34,
+ (u_int)0x1b1b2d36, (u_int)0x6e6eb2dc, (u_int)0x5a5aeeb4, (u_int)0xa0a0fb5b,
+ (u_int)0x5252f6a4, (u_int)0x3b3b4d76, (u_int)0xd6d661b7, (u_int)0xb3b3ce7d,
+ (u_int)0x29297b52, (u_int)0xe3e33edd, (u_int)0x2f2f715e, (u_int)0x84849713,
+ (u_int)0x5353f5a6, (u_int)0xd1d168b9, (u_int)0x00000000, (u_int)0xeded2cc1,
+ (u_int)0x20206040, (u_int)0xfcfc1fe3, (u_int)0xb1b1c879, (u_int)0x5b5bedb6,
+ (u_int)0x6a6abed4, (u_int)0xcbcb468d, (u_int)0xbebed967, (u_int)0x39394b72,
+ (u_int)0x4a4ade94, (u_int)0x4c4cd498, (u_int)0x5858e8b0, (u_int)0xcfcf4a85,
+ (u_int)0xd0d06bbb, (u_int)0xefef2ac5, (u_int)0xaaaae54f, (u_int)0xfbfb16ed,
+ (u_int)0x4343c586, (u_int)0x4d4dd79a, (u_int)0x33335566, (u_int)0x85859411,
+ (u_int)0x4545cf8a, (u_int)0xf9f910e9, (u_int)0x02020604, (u_int)0x7f7f81fe,
+ (u_int)0x5050f0a0, (u_int)0x3c3c4478, (u_int)0x9f9fba25, (u_int)0xa8a8e34b,
+ (u_int)0x5151f3a2, (u_int)0xa3a3fe5d, (u_int)0x4040c080, (u_int)0x8f8f8a05,
+ (u_int)0x9292ad3f, (u_int)0x9d9dbc21, (u_int)0x38384870, (u_int)0xf5f504f1,
+ (u_int)0xbcbcdf63, (u_int)0xb6b6c177, (u_int)0xdada75af, (u_int)0x21216342,
+ (u_int)0x10103020, (u_int)0xffff1ae5, (u_int)0xf3f30efd, (u_int)0xd2d26dbf,
+ (u_int)0xcdcd4c81, (u_int)0x0c0c1418, (u_int)0x13133526, (u_int)0xecec2fc3,
+ (u_int)0x5f5fe1be, (u_int)0x9797a235, (u_int)0x4444cc88, (u_int)0x1717392e,
+ (u_int)0xc4c45793, (u_int)0xa7a7f255, (u_int)0x7e7e82fc, (u_int)0x3d3d477a,
+ (u_int)0x6464acc8, (u_int)0x5d5de7ba, (u_int)0x19192b32, (u_int)0x737395e6,
+ (u_int)0x6060a0c0, (u_int)0x81819819, (u_int)0x4f4fd19e, (u_int)0xdcdc7fa3,
+ (u_int)0x22226644, (u_int)0x2a2a7e54, (u_int)0x9090ab3b, (u_int)0x8888830b,
+ (u_int)0x4646ca8c, (u_int)0xeeee29c7, (u_int)0xb8b8d36b, (u_int)0x14143c28,
+ (u_int)0xdede79a7, (u_int)0x5e5ee2bc, (u_int)0x0b0b1d16, (u_int)0xdbdb76ad,
+ (u_int)0xe0e03bdb, (u_int)0x32325664, (u_int)0x3a3a4e74, (u_int)0x0a0a1e14,
+ (u_int)0x4949db92, (u_int)0x06060a0c, (u_int)0x24246c48, (u_int)0x5c5ce4b8,
+ (u_int)0xc2c25d9f, (u_int)0xd3d36ebd, (u_int)0xacacef43, (u_int)0x6262a6c4,
+ (u_int)0x9191a839, (u_int)0x9595a431, (u_int)0xe4e437d3, (u_int)0x79798bf2,
+ (u_int)0xe7e732d5, (u_int)0xc8c8438b, (u_int)0x3737596e, (u_int)0x6d6db7da,
+ (u_int)0x8d8d8c01, (u_int)0xd5d564b1, (u_int)0x4e4ed29c, (u_int)0xa9a9e049,
+ (u_int)0x6c6cb4d8, (u_int)0x5656faac, (u_int)0xf4f407f3, (u_int)0xeaea25cf,
+ (u_int)0x6565afca, (u_int)0x7a7a8ef4, (u_int)0xaeaee947, (u_int)0x08081810,
+ (u_int)0xbabad56f, (u_int)0x787888f0, (u_int)0x25256f4a, (u_int)0x2e2e725c,
+ (u_int)0x1c1c2438, (u_int)0xa6a6f157, (u_int)0xb4b4c773, (u_int)0xc6c65197,
+ (u_int)0xe8e823cb, (u_int)0xdddd7ca1, (u_int)0x74749ce8, (u_int)0x1f1f213e,
+ (u_int)0x4b4bdd96, (u_int)0xbdbddc61, (u_int)0x8b8b860d, (u_int)0x8a8a850f,
+ (u_int)0x707090e0, (u_int)0x3e3e427c, (u_int)0xb5b5c471, (u_int)0x6666aacc,
+ (u_int)0x4848d890, (u_int)0x03030506, (u_int)0xf6f601f7, (u_int)0x0e0e121c,
+ (u_int)0x6161a3c2, (u_int)0x35355f6a, (u_int)0x5757f9ae, (u_int)0xb9b9d069,
+ (u_int)0x86869117, (u_int)0xc1c15899, (u_int)0x1d1d273a, (u_int)0x9e9eb927,
+ (u_int)0xe1e138d9, (u_int)0xf8f813eb, (u_int)0x9898b32b, (u_int)0x11113322,
+ (u_int)0x6969bbd2, (u_int)0xd9d970a9, (u_int)0x8e8e8907, (u_int)0x9494a733,
+ (u_int)0x9b9bb62d, (u_int)0x1e1e223c, (u_int)0x87879215, (u_int)0xe9e920c9,
+ (u_int)0xcece4987, (u_int)0x5555ffaa, (u_int)0x28287850, (u_int)0xdfdf7aa5,
+ (u_int)0x8c8c8f03, (u_int)0xa1a1f859, (u_int)0x89898009, (u_int)0x0d0d171a,
+ (u_int)0xbfbfda65, (u_int)0xe6e631d7, (u_int)0x4242c684, (u_int)0x6868b8d0,
+ (u_int)0x4141c382, (u_int)0x9999b029, (u_int)0x2d2d775a, (u_int)0x0f0f111e,
+ (u_int)0xb0b0cb7b, (u_int)0x5454fca8, (u_int)0xbbbbd66d, (u_int)0x16163a2c,
+};
+static const u32 Te4[256] = {
+ (u_int)0x63636363, (u_int)0x7c7c7c7c, (u_int)0x77777777, (u_int)0x7b7b7b7b,
+ (u_int)0xf2f2f2f2, (u_int)0x6b6b6b6b, (u_int)0x6f6f6f6f, (u_int)0xc5c5c5c5,
+ (u_int)0x30303030, (u_int)0x01010101, (u_int)0x67676767, (u_int)0x2b2b2b2b,
+ (u_int)0xfefefefe, (u_int)0xd7d7d7d7, (u_int)0xabababab, (u_int)0x76767676,
+ (u_int)0xcacacaca, (u_int)0x82828282, (u_int)0xc9c9c9c9, (u_int)0x7d7d7d7d,
+ (u_int)0xfafafafa, (u_int)0x59595959, (u_int)0x47474747, (u_int)0xf0f0f0f0,
+ (u_int)0xadadadad, (u_int)0xd4d4d4d4, (u_int)0xa2a2a2a2, (u_int)0xafafafaf,
+ (u_int)0x9c9c9c9c, (u_int)0xa4a4a4a4, (u_int)0x72727272, (u_int)0xc0c0c0c0,
+ (u_int)0xb7b7b7b7, (u_int)0xfdfdfdfd, (u_int)0x93939393, (u_int)0x26262626,
+ (u_int)0x36363636, (u_int)0x3f3f3f3f, (u_int)0xf7f7f7f7, (u_int)0xcccccccc,
+ (u_int)0x34343434, (u_int)0xa5a5a5a5, (u_int)0xe5e5e5e5, (u_int)0xf1f1f1f1,
+ (u_int)0x71717171, (u_int)0xd8d8d8d8, (u_int)0x31313131, (u_int)0x15151515,
+ (u_int)0x04040404, (u_int)0xc7c7c7c7, (u_int)0x23232323, (u_int)0xc3c3c3c3,
+ (u_int)0x18181818, (u_int)0x96969696, (u_int)0x05050505, (u_int)0x9a9a9a9a,
+ (u_int)0x07070707, (u_int)0x12121212, (u_int)0x80808080, (u_int)0xe2e2e2e2,
+ (u_int)0xebebebeb, (u_int)0x27272727, (u_int)0xb2b2b2b2, (u_int)0x75757575,
+ (u_int)0x09090909, (u_int)0x83838383, (u_int)0x2c2c2c2c, (u_int)0x1a1a1a1a,
+ (u_int)0x1b1b1b1b, (u_int)0x6e6e6e6e, (u_int)0x5a5a5a5a, (u_int)0xa0a0a0a0,
+ (u_int)0x52525252, (u_int)0x3b3b3b3b, (u_int)0xd6d6d6d6, (u_int)0xb3b3b3b3,
+ (u_int)0x29292929, (u_int)0xe3e3e3e3, (u_int)0x2f2f2f2f, (u_int)0x84848484,
+ (u_int)0x53535353, (u_int)0xd1d1d1d1, (u_int)0x00000000, (u_int)0xedededed,
+ (u_int)0x20202020, (u_int)0xfcfcfcfc, (u_int)0xb1b1b1b1, (u_int)0x5b5b5b5b,
+ (u_int)0x6a6a6a6a, (u_int)0xcbcbcbcb, (u_int)0xbebebebe, (u_int)0x39393939,
+ (u_int)0x4a4a4a4a, (u_int)0x4c4c4c4c, (u_int)0x58585858, (u_int)0xcfcfcfcf,
+ (u_int)0xd0d0d0d0, (u_int)0xefefefef, (u_int)0xaaaaaaaa, (u_int)0xfbfbfbfb,
+ (u_int)0x43434343, (u_int)0x4d4d4d4d, (u_int)0x33333333, (u_int)0x85858585,
+ (u_int)0x45454545, (u_int)0xf9f9f9f9, (u_int)0x02020202, (u_int)0x7f7f7f7f,
+ (u_int)0x50505050, (u_int)0x3c3c3c3c, (u_int)0x9f9f9f9f, (u_int)0xa8a8a8a8,
+ (u_int)0x51515151, (u_int)0xa3a3a3a3, (u_int)0x40404040, (u_int)0x8f8f8f8f,
+ (u_int)0x92929292, (u_int)0x9d9d9d9d, (u_int)0x38383838, (u_int)0xf5f5f5f5,
+ (u_int)0xbcbcbcbc, (u_int)0xb6b6b6b6, (u_int)0xdadadada, (u_int)0x21212121,
+ (u_int)0x10101010, (u_int)0xffffffff, (u_int)0xf3f3f3f3, (u_int)0xd2d2d2d2,
+ (u_int)0xcdcdcdcd, (u_int)0x0c0c0c0c, (u_int)0x13131313, (u_int)0xecececec,
+ (u_int)0x5f5f5f5f, (u_int)0x97979797, (u_int)0x44444444, (u_int)0x17171717,
+ (u_int)0xc4c4c4c4, (u_int)0xa7a7a7a7, (u_int)0x7e7e7e7e, (u_int)0x3d3d3d3d,
+ (u_int)0x64646464, (u_int)0x5d5d5d5d, (u_int)0x19191919, (u_int)0x73737373,
+ (u_int)0x60606060, (u_int)0x81818181, (u_int)0x4f4f4f4f, (u_int)0xdcdcdcdc,
+ (u_int)0x22222222, (u_int)0x2a2a2a2a, (u_int)0x90909090, (u_int)0x88888888,
+ (u_int)0x46464646, (u_int)0xeeeeeeee, (u_int)0xb8b8b8b8, (u_int)0x14141414,
+ (u_int)0xdededede, (u_int)0x5e5e5e5e, (u_int)0x0b0b0b0b, (u_int)0xdbdbdbdb,
+ (u_int)0xe0e0e0e0, (u_int)0x32323232, (u_int)0x3a3a3a3a, (u_int)0x0a0a0a0a,
+ (u_int)0x49494949, (u_int)0x06060606, (u_int)0x24242424, (u_int)0x5c5c5c5c,
+ (u_int)0xc2c2c2c2, (u_int)0xd3d3d3d3, (u_int)0xacacacac, (u_int)0x62626262,
+ (u_int)0x91919191, (u_int)0x95959595, (u_int)0xe4e4e4e4, (u_int)0x79797979,
+ (u_int)0xe7e7e7e7, (u_int)0xc8c8c8c8, (u_int)0x37373737, (u_int)0x6d6d6d6d,
+ (u_int)0x8d8d8d8d, (u_int)0xd5d5d5d5, (u_int)0x4e4e4e4e, (u_int)0xa9a9a9a9,
+ (u_int)0x6c6c6c6c, (u_int)0x56565656, (u_int)0xf4f4f4f4, (u_int)0xeaeaeaea,
+ (u_int)0x65656565, (u_int)0x7a7a7a7a, (u_int)0xaeaeaeae, (u_int)0x08080808,
+ (u_int)0xbabababa, (u_int)0x78787878, (u_int)0x25252525, (u_int)0x2e2e2e2e,
+ (u_int)0x1c1c1c1c, (u_int)0xa6a6a6a6, (u_int)0xb4b4b4b4, (u_int)0xc6c6c6c6,
+ (u_int)0xe8e8e8e8, (u_int)0xdddddddd, (u_int)0x74747474, (u_int)0x1f1f1f1f,
+ (u_int)0x4b4b4b4b, (u_int)0xbdbdbdbd, (u_int)0x8b8b8b8b, (u_int)0x8a8a8a8a,
+ (u_int)0x70707070, (u_int)0x3e3e3e3e, (u_int)0xb5b5b5b5, (u_int)0x66666666,
+ (u_int)0x48484848, (u_int)0x03030303, (u_int)0xf6f6f6f6, (u_int)0x0e0e0e0e,
+ (u_int)0x61616161, (u_int)0x35353535, (u_int)0x57575757, (u_int)0xb9b9b9b9,
+ (u_int)0x86868686, (u_int)0xc1c1c1c1, (u_int)0x1d1d1d1d, (u_int)0x9e9e9e9e,
+ (u_int)0xe1e1e1e1, (u_int)0xf8f8f8f8, (u_int)0x98989898, (u_int)0x11111111,
+ (u_int)0x69696969, (u_int)0xd9d9d9d9, (u_int)0x8e8e8e8e, (u_int)0x94949494,
+ (u_int)0x9b9b9b9b, (u_int)0x1e1e1e1e, (u_int)0x87878787, (u_int)0xe9e9e9e9,
+ (u_int)0xcececece, (u_int)0x55555555, (u_int)0x28282828, (u_int)0xdfdfdfdf,
+ (u_int)0x8c8c8c8c, (u_int)0xa1a1a1a1, (u_int)0x89898989, (u_int)0x0d0d0d0d,
+ (u_int)0xbfbfbfbf, (u_int)0xe6e6e6e6, (u_int)0x42424242, (u_int)0x68686868,
+ (u_int)0x41414141, (u_int)0x99999999, (u_int)0x2d2d2d2d, (u_int)0x0f0f0f0f,
+ (u_int)0xb0b0b0b0, (u_int)0x54545454, (u_int)0xbbbbbbbb, (u_int)0x16161616,
+};
+static const u32 Td0[256] = {
+ (u_int)0x51f4a750, (u_int)0x7e416553, (u_int)0x1a17a4c3, (u_int)0x3a275e96,
+ (u_int)0x3bab6bcb, (u_int)0x1f9d45f1, (u_int)0xacfa58ab, (u_int)0x4be30393,
+ (u_int)0x2030fa55, (u_int)0xad766df6, (u_int)0x88cc7691, (u_int)0xf5024c25,
+ (u_int)0x4fe5d7fc, (u_int)0xc52acbd7, (u_int)0x26354480, (u_int)0xb562a38f,
+ (u_int)0xdeb15a49, (u_int)0x25ba1b67, (u_int)0x45ea0e98, (u_int)0x5dfec0e1,
+ (u_int)0xc32f7502, (u_int)0x814cf012, (u_int)0x8d4697a3, (u_int)0x6bd3f9c6,
+ (u_int)0x038f5fe7, (u_int)0x15929c95, (u_int)0xbf6d7aeb, (u_int)0x955259da,
+ (u_int)0xd4be832d, (u_int)0x587421d3, (u_int)0x49e06929, (u_int)0x8ec9c844,
+ (u_int)0x75c2896a, (u_int)0xf48e7978, (u_int)0x99583e6b, (u_int)0x27b971dd,
+ (u_int)0xbee14fb6, (u_int)0xf088ad17, (u_int)0xc920ac66, (u_int)0x7dce3ab4,
+ (u_int)0x63df4a18, (u_int)0xe51a3182, (u_int)0x97513360, (u_int)0x62537f45,
+ (u_int)0xb16477e0, (u_int)0xbb6bae84, (u_int)0xfe81a01c, (u_int)0xf9082b94,
+ (u_int)0x70486858, (u_int)0x8f45fd19, (u_int)0x94de6c87, (u_int)0x527bf8b7,
+ (u_int)0xab73d323, (u_int)0x724b02e2, (u_int)0xe31f8f57, (u_int)0x6655ab2a,
+ (u_int)0xb2eb2807, (u_int)0x2fb5c203, (u_int)0x86c57b9a, (u_int)0xd33708a5,
+ (u_int)0x302887f2, (u_int)0x23bfa5b2, (u_int)0x02036aba, (u_int)0xed16825c,
+ (u_int)0x8acf1c2b, (u_int)0xa779b492, (u_int)0xf307f2f0, (u_int)0x4e69e2a1,
+ (u_int)0x65daf4cd, (u_int)0x0605bed5, (u_int)0xd134621f, (u_int)0xc4a6fe8a,
+ (u_int)0x342e539d, (u_int)0xa2f355a0, (u_int)0x058ae132, (u_int)0xa4f6eb75,
+ (u_int)0x0b83ec39, (u_int)0x4060efaa, (u_int)0x5e719f06, (u_int)0xbd6e1051,
+ (u_int)0x3e218af9, (u_int)0x96dd063d, (u_int)0xdd3e05ae, (u_int)0x4de6bd46,
+ (u_int)0x91548db5, (u_int)0x71c45d05, (u_int)0x0406d46f, (u_int)0x605015ff,
+ (u_int)0x1998fb24, (u_int)0xd6bde997, (u_int)0x894043cc, (u_int)0x67d99e77,
+ (u_int)0xb0e842bd, (u_int)0x07898b88, (u_int)0xe7195b38, (u_int)0x79c8eedb,
+ (u_int)0xa17c0a47, (u_int)0x7c420fe9, (u_int)0xf8841ec9, (u_int)0x00000000,
+ (u_int)0x09808683, (u_int)0x322bed48, (u_int)0x1e1170ac, (u_int)0x6c5a724e,
+ (u_int)0xfd0efffb, (u_int)0x0f853856, (u_int)0x3daed51e, (u_int)0x362d3927,
+ (u_int)0x0a0fd964, (u_int)0x685ca621, (u_int)0x9b5b54d1, (u_int)0x24362e3a,
+ (u_int)0x0c0a67b1, (u_int)0x9357e70f, (u_int)0xb4ee96d2, (u_int)0x1b9b919e,
+ (u_int)0x80c0c54f, (u_int)0x61dc20a2, (u_int)0x5a774b69, (u_int)0x1c121a16,
+ (u_int)0xe293ba0a, (u_int)0xc0a02ae5, (u_int)0x3c22e043, (u_int)0x121b171d,
+ (u_int)0x0e090d0b, (u_int)0xf28bc7ad, (u_int)0x2db6a8b9, (u_int)0x141ea9c8,
+ (u_int)0x57f11985, (u_int)0xaf75074c, (u_int)0xee99ddbb, (u_int)0xa37f60fd,
+ (u_int)0xf701269f, (u_int)0x5c72f5bc, (u_int)0x44663bc5, (u_int)0x5bfb7e34,
+ (u_int)0x8b432976, (u_int)0xcb23c6dc, (u_int)0xb6edfc68, (u_int)0xb8e4f163,
+ (u_int)0xd731dcca, (u_int)0x42638510, (u_int)0x13972240, (u_int)0x84c61120,
+ (u_int)0x854a247d, (u_int)0xd2bb3df8, (u_int)0xaef93211, (u_int)0xc729a16d,
+ (u_int)0x1d9e2f4b, (u_int)0xdcb230f3, (u_int)0x0d8652ec, (u_int)0x77c1e3d0,
+ (u_int)0x2bb3166c, (u_int)0xa970b999, (u_int)0x119448fa, (u_int)0x47e96422,
+ (u_int)0xa8fc8cc4, (u_int)0xa0f03f1a, (u_int)0x567d2cd8, (u_int)0x223390ef,
+ (u_int)0x87494ec7, (u_int)0xd938d1c1, (u_int)0x8ccaa2fe, (u_int)0x98d40b36,
+ (u_int)0xa6f581cf, (u_int)0xa57ade28, (u_int)0xdab78e26, (u_int)0x3fadbfa4,
+ (u_int)0x2c3a9de4, (u_int)0x5078920d, (u_int)0x6a5fcc9b, (u_int)0x547e4662,
+ (u_int)0xf68d13c2, (u_int)0x90d8b8e8, (u_int)0x2e39f75e, (u_int)0x82c3aff5,
+ (u_int)0x9f5d80be, (u_int)0x69d0937c, (u_int)0x6fd52da9, (u_int)0xcf2512b3,
+ (u_int)0xc8ac993b, (u_int)0x10187da7, (u_int)0xe89c636e, (u_int)0xdb3bbb7b,
+ (u_int)0xcd267809, (u_int)0x6e5918f4, (u_int)0xec9ab701, (u_int)0x834f9aa8,
+ (u_int)0xe6956e65, (u_int)0xaaffe67e, (u_int)0x21bccf08, (u_int)0xef15e8e6,
+ (u_int)0xbae79bd9, (u_int)0x4a6f36ce, (u_int)0xea9f09d4, (u_int)0x29b07cd6,
+ (u_int)0x31a4b2af, (u_int)0x2a3f2331, (u_int)0xc6a59430, (u_int)0x35a266c0,
+ (u_int)0x744ebc37, (u_int)0xfc82caa6, (u_int)0xe090d0b0, (u_int)0x33a7d815,
+ (u_int)0xf104984a, (u_int)0x41ecdaf7, (u_int)0x7fcd500e, (u_int)0x1791f62f,
+ (u_int)0x764dd68d, (u_int)0x43efb04d, (u_int)0xccaa4d54, (u_int)0xe49604df,
+ (u_int)0x9ed1b5e3, (u_int)0x4c6a881b, (u_int)0xc12c1fb8, (u_int)0x4665517f,
+ (u_int)0x9d5eea04, (u_int)0x018c355d, (u_int)0xfa877473, (u_int)0xfb0b412e,
+ (u_int)0xb3671d5a, (u_int)0x92dbd252, (u_int)0xe9105633, (u_int)0x6dd64713,
+ (u_int)0x9ad7618c, (u_int)0x37a10c7a, (u_int)0x59f8148e, (u_int)0xeb133c89,
+ (u_int)0xcea927ee, (u_int)0xb761c935, (u_int)0xe11ce5ed, (u_int)0x7a47b13c,
+ (u_int)0x9cd2df59, (u_int)0x55f2733f, (u_int)0x1814ce79, (u_int)0x73c737bf,
+ (u_int)0x53f7cdea, (u_int)0x5ffdaa5b, (u_int)0xdf3d6f14, (u_int)0x7844db86,
+ (u_int)0xcaaff381, (u_int)0xb968c43e, (u_int)0x3824342c, (u_int)0xc2a3405f,
+ (u_int)0x161dc372, (u_int)0xbce2250c, (u_int)0x283c498b, (u_int)0xff0d9541,
+ (u_int)0x39a80171, (u_int)0x080cb3de, (u_int)0xd8b4e49c, (u_int)0x6456c190,
+ (u_int)0x7bcb8461, (u_int)0xd532b670, (u_int)0x486c5c74, (u_int)0xd0b85742,
+};
+static const u32 Td1[256] = {
+ (u_int)0x5051f4a7, (u_int)0x537e4165, (u_int)0xc31a17a4, (u_int)0x963a275e,
+ (u_int)0xcb3bab6b, (u_int)0xf11f9d45, (u_int)0xabacfa58, (u_int)0x934be303,
+ (u_int)0x552030fa, (u_int)0xf6ad766d, (u_int)0x9188cc76, (u_int)0x25f5024c,
+ (u_int)0xfc4fe5d7, (u_int)0xd7c52acb, (u_int)0x80263544, (u_int)0x8fb562a3,
+ (u_int)0x49deb15a, (u_int)0x6725ba1b, (u_int)0x9845ea0e, (u_int)0xe15dfec0,
+ (u_int)0x02c32f75, (u_int)0x12814cf0, (u_int)0xa38d4697, (u_int)0xc66bd3f9,
+ (u_int)0xe7038f5f, (u_int)0x9515929c, (u_int)0xebbf6d7a, (u_int)0xda955259,
+ (u_int)0x2dd4be83, (u_int)0xd3587421, (u_int)0x2949e069, (u_int)0x448ec9c8,
+ (u_int)0x6a75c289, (u_int)0x78f48e79, (u_int)0x6b99583e, (u_int)0xdd27b971,
+ (u_int)0xb6bee14f, (u_int)0x17f088ad, (u_int)0x66c920ac, (u_int)0xb47dce3a,
+ (u_int)0x1863df4a, (u_int)0x82e51a31, (u_int)0x60975133, (u_int)0x4562537f,
+ (u_int)0xe0b16477, (u_int)0x84bb6bae, (u_int)0x1cfe81a0, (u_int)0x94f9082b,
+ (u_int)0x58704868, (u_int)0x198f45fd, (u_int)0x8794de6c, (u_int)0xb7527bf8,
+ (u_int)0x23ab73d3, (u_int)0xe2724b02, (u_int)0x57e31f8f, (u_int)0x2a6655ab,
+ (u_int)0x07b2eb28, (u_int)0x032fb5c2, (u_int)0x9a86c57b, (u_int)0xa5d33708,
+ (u_int)0xf2302887, (u_int)0xb223bfa5, (u_int)0xba02036a, (u_int)0x5ced1682,
+ (u_int)0x2b8acf1c, (u_int)0x92a779b4, (u_int)0xf0f307f2, (u_int)0xa14e69e2,
+ (u_int)0xcd65daf4, (u_int)0xd50605be, (u_int)0x1fd13462, (u_int)0x8ac4a6fe,
+ (u_int)0x9d342e53, (u_int)0xa0a2f355, (u_int)0x32058ae1, (u_int)0x75a4f6eb,
+ (u_int)0x390b83ec, (u_int)0xaa4060ef, (u_int)0x065e719f, (u_int)0x51bd6e10,
+ (u_int)0xf93e218a, (u_int)0x3d96dd06, (u_int)0xaedd3e05, (u_int)0x464de6bd,
+ (u_int)0xb591548d, (u_int)0x0571c45d, (u_int)0x6f0406d4, (u_int)0xff605015,
+ (u_int)0x241998fb, (u_int)0x97d6bde9, (u_int)0xcc894043, (u_int)0x7767d99e,
+ (u_int)0xbdb0e842, (u_int)0x8807898b, (u_int)0x38e7195b, (u_int)0xdb79c8ee,
+ (u_int)0x47a17c0a, (u_int)0xe97c420f, (u_int)0xc9f8841e, (u_int)0x00000000,
+ (u_int)0x83098086, (u_int)0x48322bed, (u_int)0xac1e1170, (u_int)0x4e6c5a72,
+ (u_int)0xfbfd0eff, (u_int)0x560f8538, (u_int)0x1e3daed5, (u_int)0x27362d39,
+ (u_int)0x640a0fd9, (u_int)0x21685ca6, (u_int)0xd19b5b54, (u_int)0x3a24362e,
+ (u_int)0xb10c0a67, (u_int)0x0f9357e7, (u_int)0xd2b4ee96, (u_int)0x9e1b9b91,
+ (u_int)0x4f80c0c5, (u_int)0xa261dc20, (u_int)0x695a774b, (u_int)0x161c121a,
+ (u_int)0x0ae293ba, (u_int)0xe5c0a02a, (u_int)0x433c22e0, (u_int)0x1d121b17,
+ (u_int)0x0b0e090d, (u_int)0xadf28bc7, (u_int)0xb92db6a8, (u_int)0xc8141ea9,
+ (u_int)0x8557f119, (u_int)0x4caf7507, (u_int)0xbbee99dd, (u_int)0xfda37f60,
+ (u_int)0x9ff70126, (u_int)0xbc5c72f5, (u_int)0xc544663b, (u_int)0x345bfb7e,
+ (u_int)0x768b4329, (u_int)0xdccb23c6, (u_int)0x68b6edfc, (u_int)0x63b8e4f1,
+ (u_int)0xcad731dc, (u_int)0x10426385, (u_int)0x40139722, (u_int)0x2084c611,
+ (u_int)0x7d854a24, (u_int)0xf8d2bb3d, (u_int)0x11aef932, (u_int)0x6dc729a1,
+ (u_int)0x4b1d9e2f, (u_int)0xf3dcb230, (u_int)0xec0d8652, (u_int)0xd077c1e3,
+ (u_int)0x6c2bb316, (u_int)0x99a970b9, (u_int)0xfa119448, (u_int)0x2247e964,
+ (u_int)0xc4a8fc8c, (u_int)0x1aa0f03f, (u_int)0xd8567d2c, (u_int)0xef223390,
+ (u_int)0xc787494e, (u_int)0xc1d938d1, (u_int)0xfe8ccaa2, (u_int)0x3698d40b,
+ (u_int)0xcfa6f581, (u_int)0x28a57ade, (u_int)0x26dab78e, (u_int)0xa43fadbf,
+ (u_int)0xe42c3a9d, (u_int)0x0d507892, (u_int)0x9b6a5fcc, (u_int)0x62547e46,
+ (u_int)0xc2f68d13, (u_int)0xe890d8b8, (u_int)0x5e2e39f7, (u_int)0xf582c3af,
+ (u_int)0xbe9f5d80, (u_int)0x7c69d093, (u_int)0xa96fd52d, (u_int)0xb3cf2512,
+ (u_int)0x3bc8ac99, (u_int)0xa710187d, (u_int)0x6ee89c63, (u_int)0x7bdb3bbb,
+ (u_int)0x09cd2678, (u_int)0xf46e5918, (u_int)0x01ec9ab7, (u_int)0xa8834f9a,
+ (u_int)0x65e6956e, (u_int)0x7eaaffe6, (u_int)0x0821bccf, (u_int)0xe6ef15e8,
+ (u_int)0xd9bae79b, (u_int)0xce4a6f36, (u_int)0xd4ea9f09, (u_int)0xd629b07c,
+ (u_int)0xaf31a4b2, (u_int)0x312a3f23, (u_int)0x30c6a594, (u_int)0xc035a266,
+ (u_int)0x37744ebc, (u_int)0xa6fc82ca, (u_int)0xb0e090d0, (u_int)0x1533a7d8,
+ (u_int)0x4af10498, (u_int)0xf741ecda, (u_int)0x0e7fcd50, (u_int)0x2f1791f6,
+ (u_int)0x8d764dd6, (u_int)0x4d43efb0, (u_int)0x54ccaa4d, (u_int)0xdfe49604,
+ (u_int)0xe39ed1b5, (u_int)0x1b4c6a88, (u_int)0xb8c12c1f, (u_int)0x7f466551,
+ (u_int)0x049d5eea, (u_int)0x5d018c35, (u_int)0x73fa8774, (u_int)0x2efb0b41,
+ (u_int)0x5ab3671d, (u_int)0x5292dbd2, (u_int)0x33e91056, (u_int)0x136dd647,
+ (u_int)0x8c9ad761, (u_int)0x7a37a10c, (u_int)0x8e59f814, (u_int)0x89eb133c,
+ (u_int)0xeecea927, (u_int)0x35b761c9, (u_int)0xede11ce5, (u_int)0x3c7a47b1,
+ (u_int)0x599cd2df, (u_int)0x3f55f273, (u_int)0x791814ce, (u_int)0xbf73c737,
+ (u_int)0xea53f7cd, (u_int)0x5b5ffdaa, (u_int)0x14df3d6f, (u_int)0x867844db,
+ (u_int)0x81caaff3, (u_int)0x3eb968c4, (u_int)0x2c382434, (u_int)0x5fc2a340,
+ (u_int)0x72161dc3, (u_int)0x0cbce225, (u_int)0x8b283c49, (u_int)0x41ff0d95,
+ (u_int)0x7139a801, (u_int)0xde080cb3, (u_int)0x9cd8b4e4, (u_int)0x906456c1,
+ (u_int)0x617bcb84, (u_int)0x70d532b6, (u_int)0x74486c5c, (u_int)0x42d0b857,
+};
+static const u32 Td2[256] = {
+ (u_int)0xa75051f4, (u_int)0x65537e41, (u_int)0xa4c31a17, (u_int)0x5e963a27,
+ (u_int)0x6bcb3bab, (u_int)0x45f11f9d, (u_int)0x58abacfa, (u_int)0x03934be3,
+ (u_int)0xfa552030, (u_int)0x6df6ad76, (u_int)0x769188cc, (u_int)0x4c25f502,
+ (u_int)0xd7fc4fe5, (u_int)0xcbd7c52a, (u_int)0x44802635, (u_int)0xa38fb562,
+ (u_int)0x5a49deb1, (u_int)0x1b6725ba, (u_int)0x0e9845ea, (u_int)0xc0e15dfe,
+ (u_int)0x7502c32f, (u_int)0xf012814c, (u_int)0x97a38d46, (u_int)0xf9c66bd3,
+ (u_int)0x5fe7038f, (u_int)0x9c951592, (u_int)0x7aebbf6d, (u_int)0x59da9552,
+ (u_int)0x832dd4be, (u_int)0x21d35874, (u_int)0x692949e0, (u_int)0xc8448ec9,
+ (u_int)0x896a75c2, (u_int)0x7978f48e, (u_int)0x3e6b9958, (u_int)0x71dd27b9,
+ (u_int)0x4fb6bee1, (u_int)0xad17f088, (u_int)0xac66c920, (u_int)0x3ab47dce,
+ (u_int)0x4a1863df, (u_int)0x3182e51a, (u_int)0x33609751, (u_int)0x7f456253,
+ (u_int)0x77e0b164, (u_int)0xae84bb6b, (u_int)0xa01cfe81, (u_int)0x2b94f908,
+ (u_int)0x68587048, (u_int)0xfd198f45, (u_int)0x6c8794de, (u_int)0xf8b7527b,
+ (u_int)0xd323ab73, (u_int)0x02e2724b, (u_int)0x8f57e31f, (u_int)0xab2a6655,
+ (u_int)0x2807b2eb, (u_int)0xc2032fb5, (u_int)0x7b9a86c5, (u_int)0x08a5d337,
+ (u_int)0x87f23028, (u_int)0xa5b223bf, (u_int)0x6aba0203, (u_int)0x825ced16,
+ (u_int)0x1c2b8acf, (u_int)0xb492a779, (u_int)0xf2f0f307, (u_int)0xe2a14e69,
+ (u_int)0xf4cd65da, (u_int)0xbed50605, (u_int)0x621fd134, (u_int)0xfe8ac4a6,
+ (u_int)0x539d342e, (u_int)0x55a0a2f3, (u_int)0xe132058a, (u_int)0xeb75a4f6,
+ (u_int)0xec390b83, (u_int)0xefaa4060, (u_int)0x9f065e71, (u_int)0x1051bd6e,
+ (u_int)0x8af93e21, (u_int)0x063d96dd, (u_int)0x05aedd3e, (u_int)0xbd464de6,
+ (u_int)0x8db59154, (u_int)0x5d0571c4, (u_int)0xd46f0406, (u_int)0x15ff6050,
+ (u_int)0xfb241998, (u_int)0xe997d6bd, (u_int)0x43cc8940, (u_int)0x9e7767d9,
+ (u_int)0x42bdb0e8, (u_int)0x8b880789, (u_int)0x5b38e719, (u_int)0xeedb79c8,
+ (u_int)0x0a47a17c, (u_int)0x0fe97c42, (u_int)0x1ec9f884, (u_int)0x00000000,
+ (u_int)0x86830980, (u_int)0xed48322b, (u_int)0x70ac1e11, (u_int)0x724e6c5a,
+ (u_int)0xfffbfd0e, (u_int)0x38560f85, (u_int)0xd51e3dae, (u_int)0x3927362d,
+ (u_int)0xd9640a0f, (u_int)0xa621685c, (u_int)0x54d19b5b, (u_int)0x2e3a2436,
+ (u_int)0x67b10c0a, (u_int)0xe70f9357, (u_int)0x96d2b4ee, (u_int)0x919e1b9b,
+ (u_int)0xc54f80c0, (u_int)0x20a261dc, (u_int)0x4b695a77, (u_int)0x1a161c12,
+ (u_int)0xba0ae293, (u_int)0x2ae5c0a0, (u_int)0xe0433c22, (u_int)0x171d121b,
+ (u_int)0x0d0b0e09, (u_int)0xc7adf28b, (u_int)0xa8b92db6, (u_int)0xa9c8141e,
+ (u_int)0x198557f1, (u_int)0x074caf75, (u_int)0xddbbee99, (u_int)0x60fda37f,
+ (u_int)0x269ff701, (u_int)0xf5bc5c72, (u_int)0x3bc54466, (u_int)0x7e345bfb,
+ (u_int)0x29768b43, (u_int)0xc6dccb23, (u_int)0xfc68b6ed, (u_int)0xf163b8e4,
+ (u_int)0xdccad731, (u_int)0x85104263, (u_int)0x22401397, (u_int)0x112084c6,
+ (u_int)0x247d854a, (u_int)0x3df8d2bb, (u_int)0x3211aef9, (u_int)0xa16dc729,
+ (u_int)0x2f4b1d9e, (u_int)0x30f3dcb2, (u_int)0x52ec0d86, (u_int)0xe3d077c1,
+ (u_int)0x166c2bb3, (u_int)0xb999a970, (u_int)0x48fa1194, (u_int)0x642247e9,
+ (u_int)0x8cc4a8fc, (u_int)0x3f1aa0f0, (u_int)0x2cd8567d, (u_int)0x90ef2233,
+ (u_int)0x4ec78749, (u_int)0xd1c1d938, (u_int)0xa2fe8cca, (u_int)0x0b3698d4,
+ (u_int)0x81cfa6f5, (u_int)0xde28a57a, (u_int)0x8e26dab7, (u_int)0xbfa43fad,
+ (u_int)0x9de42c3a, (u_int)0x920d5078, (u_int)0xcc9b6a5f, (u_int)0x4662547e,
+ (u_int)0x13c2f68d, (u_int)0xb8e890d8, (u_int)0xf75e2e39, (u_int)0xaff582c3,
+ (u_int)0x80be9f5d, (u_int)0x937c69d0, (u_int)0x2da96fd5, (u_int)0x12b3cf25,
+ (u_int)0x993bc8ac, (u_int)0x7da71018, (u_int)0x636ee89c, (u_int)0xbb7bdb3b,
+ (u_int)0x7809cd26, (u_int)0x18f46e59, (u_int)0xb701ec9a, (u_int)0x9aa8834f,
+ (u_int)0x6e65e695, (u_int)0xe67eaaff, (u_int)0xcf0821bc, (u_int)0xe8e6ef15,
+ (u_int)0x9bd9bae7, (u_int)0x36ce4a6f, (u_int)0x09d4ea9f, (u_int)0x7cd629b0,
+ (u_int)0xb2af31a4, (u_int)0x23312a3f, (u_int)0x9430c6a5, (u_int)0x66c035a2,
+ (u_int)0xbc37744e, (u_int)0xcaa6fc82, (u_int)0xd0b0e090, (u_int)0xd81533a7,
+ (u_int)0x984af104, (u_int)0xdaf741ec, (u_int)0x500e7fcd, (u_int)0xf62f1791,
+ (u_int)0xd68d764d, (u_int)0xb04d43ef, (u_int)0x4d54ccaa, (u_int)0x04dfe496,
+ (u_int)0xb5e39ed1, (u_int)0x881b4c6a, (u_int)0x1fb8c12c, (u_int)0x517f4665,
+ (u_int)0xea049d5e, (u_int)0x355d018c, (u_int)0x7473fa87, (u_int)0x412efb0b,
+ (u_int)0x1d5ab367, (u_int)0xd25292db, (u_int)0x5633e910, (u_int)0x47136dd6,
+ (u_int)0x618c9ad7, (u_int)0x0c7a37a1, (u_int)0x148e59f8, (u_int)0x3c89eb13,
+ (u_int)0x27eecea9, (u_int)0xc935b761, (u_int)0xe5ede11c, (u_int)0xb13c7a47,
+ (u_int)0xdf599cd2, (u_int)0x733f55f2, (u_int)0xce791814, (u_int)0x37bf73c7,
+ (u_int)0xcdea53f7, (u_int)0xaa5b5ffd, (u_int)0x6f14df3d, (u_int)0xdb867844,
+ (u_int)0xf381caaf, (u_int)0xc43eb968, (u_int)0x342c3824, (u_int)0x405fc2a3,
+ (u_int)0xc372161d, (u_int)0x250cbce2, (u_int)0x498b283c, (u_int)0x9541ff0d,
+ (u_int)0x017139a8, (u_int)0xb3de080c, (u_int)0xe49cd8b4, (u_int)0xc1906456,
+ (u_int)0x84617bcb, (u_int)0xb670d532, (u_int)0x5c74486c, (u_int)0x5742d0b8,
+};
+static const u32 Td3[256] = {
+ (u_int)0xf4a75051, (u_int)0x4165537e, (u_int)0x17a4c31a, (u_int)0x275e963a,
+ (u_int)0xab6bcb3b, (u_int)0x9d45f11f, (u_int)0xfa58abac, (u_int)0xe303934b,
+ (u_int)0x30fa5520, (u_int)0x766df6ad, (u_int)0xcc769188, (u_int)0x024c25f5,
+ (u_int)0xe5d7fc4f, (u_int)0x2acbd7c5, (u_int)0x35448026, (u_int)0x62a38fb5,
+ (u_int)0xb15a49de, (u_int)0xba1b6725, (u_int)0xea0e9845, (u_int)0xfec0e15d,
+ (u_int)0x2f7502c3, (u_int)0x4cf01281, (u_int)0x4697a38d, (u_int)0xd3f9c66b,
+ (u_int)0x8f5fe703, (u_int)0x929c9515, (u_int)0x6d7aebbf, (u_int)0x5259da95,
+ (u_int)0xbe832dd4, (u_int)0x7421d358, (u_int)0xe0692949, (u_int)0xc9c8448e,
+ (u_int)0xc2896a75, (u_int)0x8e7978f4, (u_int)0x583e6b99, (u_int)0xb971dd27,
+ (u_int)0xe14fb6be, (u_int)0x88ad17f0, (u_int)0x20ac66c9, (u_int)0xce3ab47d,
+ (u_int)0xdf4a1863, (u_int)0x1a3182e5, (u_int)0x51336097, (u_int)0x537f4562,
+ (u_int)0x6477e0b1, (u_int)0x6bae84bb, (u_int)0x81a01cfe, (u_int)0x082b94f9,
+ (u_int)0x48685870, (u_int)0x45fd198f, (u_int)0xde6c8794, (u_int)0x7bf8b752,
+ (u_int)0x73d323ab, (u_int)0x4b02e272, (u_int)0x1f8f57e3, (u_int)0x55ab2a66,
+ (u_int)0xeb2807b2, (u_int)0xb5c2032f, (u_int)0xc57b9a86, (u_int)0x3708a5d3,
+ (u_int)0x2887f230, (u_int)0xbfa5b223, (u_int)0x036aba02, (u_int)0x16825ced,
+ (u_int)0xcf1c2b8a, (u_int)0x79b492a7, (u_int)0x07f2f0f3, (u_int)0x69e2a14e,
+ (u_int)0xdaf4cd65, (u_int)0x05bed506, (u_int)0x34621fd1, (u_int)0xa6fe8ac4,
+ (u_int)0x2e539d34, (u_int)0xf355a0a2, (u_int)0x8ae13205, (u_int)0xf6eb75a4,
+ (u_int)0x83ec390b, (u_int)0x60efaa40, (u_int)0x719f065e, (u_int)0x6e1051bd,
+ (u_int)0x218af93e, (u_int)0xdd063d96, (u_int)0x3e05aedd, (u_int)0xe6bd464d,
+ (u_int)0x548db591, (u_int)0xc45d0571, (u_int)0x06d46f04, (u_int)0x5015ff60,
+ (u_int)0x98fb2419, (u_int)0xbde997d6, (u_int)0x4043cc89, (u_int)0xd99e7767,
+ (u_int)0xe842bdb0, (u_int)0x898b8807, (u_int)0x195b38e7, (u_int)0xc8eedb79,
+ (u_int)0x7c0a47a1, (u_int)0x420fe97c, (u_int)0x841ec9f8, (u_int)0x00000000,
+ (u_int)0x80868309, (u_int)0x2bed4832, (u_int)0x1170ac1e, (u_int)0x5a724e6c,
+ (u_int)0x0efffbfd, (u_int)0x8538560f, (u_int)0xaed51e3d, (u_int)0x2d392736,
+ (u_int)0x0fd9640a, (u_int)0x5ca62168, (u_int)0x5b54d19b, (u_int)0x362e3a24,
+ (u_int)0x0a67b10c, (u_int)0x57e70f93, (u_int)0xee96d2b4, (u_int)0x9b919e1b,
+ (u_int)0xc0c54f80, (u_int)0xdc20a261, (u_int)0x774b695a, (u_int)0x121a161c,
+ (u_int)0x93ba0ae2, (u_int)0xa02ae5c0, (u_int)0x22e0433c, (u_int)0x1b171d12,
+ (u_int)0x090d0b0e, (u_int)0x8bc7adf2, (u_int)0xb6a8b92d, (u_int)0x1ea9c814,
+ (u_int)0xf1198557, (u_int)0x75074caf, (u_int)0x99ddbbee, (u_int)0x7f60fda3,
+ (u_int)0x01269ff7, (u_int)0x72f5bc5c, (u_int)0x663bc544, (u_int)0xfb7e345b,
+ (u_int)0x4329768b, (u_int)0x23c6dccb, (u_int)0xedfc68b6, (u_int)0xe4f163b8,
+ (u_int)0x31dccad7, (u_int)0x63851042, (u_int)0x97224013, (u_int)0xc6112084,
+ (u_int)0x4a247d85, (u_int)0xbb3df8d2, (u_int)0xf93211ae, (u_int)0x29a16dc7,
+ (u_int)0x9e2f4b1d, (u_int)0xb230f3dc, (u_int)0x8652ec0d, (u_int)0xc1e3d077,
+ (u_int)0xb3166c2b, (u_int)0x70b999a9, (u_int)0x9448fa11, (u_int)0xe9642247,
+ (u_int)0xfc8cc4a8, (u_int)0xf03f1aa0, (u_int)0x7d2cd856, (u_int)0x3390ef22,
+ (u_int)0x494ec787, (u_int)0x38d1c1d9, (u_int)0xcaa2fe8c, (u_int)0xd40b3698,
+ (u_int)0xf581cfa6, (u_int)0x7ade28a5, (u_int)0xb78e26da, (u_int)0xadbfa43f,
+ (u_int)0x3a9de42c, (u_int)0x78920d50, (u_int)0x5fcc9b6a, (u_int)0x7e466254,
+ (u_int)0x8d13c2f6, (u_int)0xd8b8e890, (u_int)0x39f75e2e, (u_int)0xc3aff582,
+ (u_int)0x5d80be9f, (u_int)0xd0937c69, (u_int)0xd52da96f, (u_int)0x2512b3cf,
+ (u_int)0xac993bc8, (u_int)0x187da710, (u_int)0x9c636ee8, (u_int)0x3bbb7bdb,
+ (u_int)0x267809cd, (u_int)0x5918f46e, (u_int)0x9ab701ec, (u_int)0x4f9aa883,
+ (u_int)0x956e65e6, (u_int)0xffe67eaa, (u_int)0xbccf0821, (u_int)0x15e8e6ef,
+ (u_int)0xe79bd9ba, (u_int)0x6f36ce4a, (u_int)0x9f09d4ea, (u_int)0xb07cd629,
+ (u_int)0xa4b2af31, (u_int)0x3f23312a, (u_int)0xa59430c6, (u_int)0xa266c035,
+ (u_int)0x4ebc3774, (u_int)0x82caa6fc, (u_int)0x90d0b0e0, (u_int)0xa7d81533,
+ (u_int)0x04984af1, (u_int)0xecdaf741, (u_int)0xcd500e7f, (u_int)0x91f62f17,
+ (u_int)0x4dd68d76, (u_int)0xefb04d43, (u_int)0xaa4d54cc, (u_int)0x9604dfe4,
+ (u_int)0xd1b5e39e, (u_int)0x6a881b4c, (u_int)0x2c1fb8c1, (u_int)0x65517f46,
+ (u_int)0x5eea049d, (u_int)0x8c355d01, (u_int)0x877473fa, (u_int)0x0b412efb,
+ (u_int)0x671d5ab3, (u_int)0xdbd25292, (u_int)0x105633e9, (u_int)0xd647136d,
+ (u_int)0xd7618c9a, (u_int)0xa10c7a37, (u_int)0xf8148e59, (u_int)0x133c89eb,
+ (u_int)0xa927eece, (u_int)0x61c935b7, (u_int)0x1ce5ede1, (u_int)0x47b13c7a,
+ (u_int)0xd2df599c, (u_int)0xf2733f55, (u_int)0x14ce7918, (u_int)0xc737bf73,
+ (u_int)0xf7cdea53, (u_int)0xfdaa5b5f, (u_int)0x3d6f14df, (u_int)0x44db8678,
+ (u_int)0xaff381ca, (u_int)0x68c43eb9, (u_int)0x24342c38, (u_int)0xa3405fc2,
+ (u_int)0x1dc37216, (u_int)0xe2250cbc, (u_int)0x3c498b28, (u_int)0x0d9541ff,
+ (u_int)0xa8017139, (u_int)0x0cb3de08, (u_int)0xb4e49cd8, (u_int)0x56c19064,
+ (u_int)0xcb84617b, (u_int)0x32b670d5, (u_int)0x6c5c7448, (u_int)0xb85742d0,
+};
+static const u32 Td4[256] = {
+ (u_int)0x52525252, (u_int)0x09090909, (u_int)0x6a6a6a6a, (u_int)0xd5d5d5d5,
+ (u_int)0x30303030, (u_int)0x36363636, (u_int)0xa5a5a5a5, (u_int)0x38383838,
+ (u_int)0xbfbfbfbf, (u_int)0x40404040, (u_int)0xa3a3a3a3, (u_int)0x9e9e9e9e,
+ (u_int)0x81818181, (u_int)0xf3f3f3f3, (u_int)0xd7d7d7d7, (u_int)0xfbfbfbfb,
+ (u_int)0x7c7c7c7c, (u_int)0xe3e3e3e3, (u_int)0x39393939, (u_int)0x82828282,
+ (u_int)0x9b9b9b9b, (u_int)0x2f2f2f2f, (u_int)0xffffffff, (u_int)0x87878787,
+ (u_int)0x34343434, (u_int)0x8e8e8e8e, (u_int)0x43434343, (u_int)0x44444444,
+ (u_int)0xc4c4c4c4, (u_int)0xdededede, (u_int)0xe9e9e9e9, (u_int)0xcbcbcbcb,
+ (u_int)0x54545454, (u_int)0x7b7b7b7b, (u_int)0x94949494, (u_int)0x32323232,
+ (u_int)0xa6a6a6a6, (u_int)0xc2c2c2c2, (u_int)0x23232323, (u_int)0x3d3d3d3d,
+ (u_int)0xeeeeeeee, (u_int)0x4c4c4c4c, (u_int)0x95959595, (u_int)0x0b0b0b0b,
+ (u_int)0x42424242, (u_int)0xfafafafa, (u_int)0xc3c3c3c3, (u_int)0x4e4e4e4e,
+ (u_int)0x08080808, (u_int)0x2e2e2e2e, (u_int)0xa1a1a1a1, (u_int)0x66666666,
+ (u_int)0x28282828, (u_int)0xd9d9d9d9, (u_int)0x24242424, (u_int)0xb2b2b2b2,
+ (u_int)0x76767676, (u_int)0x5b5b5b5b, (u_int)0xa2a2a2a2, (u_int)0x49494949,
+ (u_int)0x6d6d6d6d, (u_int)0x8b8b8b8b, (u_int)0xd1d1d1d1, (u_int)0x25252525,
+ (u_int)0x72727272, (u_int)0xf8f8f8f8, (u_int)0xf6f6f6f6, (u_int)0x64646464,
+ (u_int)0x86868686, (u_int)0x68686868, (u_int)0x98989898, (u_int)0x16161616,
+ (u_int)0xd4d4d4d4, (u_int)0xa4a4a4a4, (u_int)0x5c5c5c5c, (u_int)0xcccccccc,
+ (u_int)0x5d5d5d5d, (u_int)0x65656565, (u_int)0xb6b6b6b6, (u_int)0x92929292,
+ (u_int)0x6c6c6c6c, (u_int)0x70707070, (u_int)0x48484848, (u_int)0x50505050,
+ (u_int)0xfdfdfdfd, (u_int)0xedededed, (u_int)0xb9b9b9b9, (u_int)0xdadadada,
+ (u_int)0x5e5e5e5e, (u_int)0x15151515, (u_int)0x46464646, (u_int)0x57575757,
+ (u_int)0xa7a7a7a7, (u_int)0x8d8d8d8d, (u_int)0x9d9d9d9d, (u_int)0x84848484,
+ (u_int)0x90909090, (u_int)0xd8d8d8d8, (u_int)0xabababab, (u_int)0x00000000,
+ (u_int)0x8c8c8c8c, (u_int)0xbcbcbcbc, (u_int)0xd3d3d3d3, (u_int)0x0a0a0a0a,
+ (u_int)0xf7f7f7f7, (u_int)0xe4e4e4e4, (u_int)0x58585858, (u_int)0x05050505,
+ (u_int)0xb8b8b8b8, (u_int)0xb3b3b3b3, (u_int)0x45454545, (u_int)0x06060606,
+ (u_int)0xd0d0d0d0, (u_int)0x2c2c2c2c, (u_int)0x1e1e1e1e, (u_int)0x8f8f8f8f,
+ (u_int)0xcacacaca, (u_int)0x3f3f3f3f, (u_int)0x0f0f0f0f, (u_int)0x02020202,
+ (u_int)0xc1c1c1c1, (u_int)0xafafafaf, (u_int)0xbdbdbdbd, (u_int)0x03030303,
+ (u_int)0x01010101, (u_int)0x13131313, (u_int)0x8a8a8a8a, (u_int)0x6b6b6b6b,
+ (u_int)0x3a3a3a3a, (u_int)0x91919191, (u_int)0x11111111, (u_int)0x41414141,
+ (u_int)0x4f4f4f4f, (u_int)0x67676767, (u_int)0xdcdcdcdc, (u_int)0xeaeaeaea,
+ (u_int)0x97979797, (u_int)0xf2f2f2f2, (u_int)0xcfcfcfcf, (u_int)0xcececece,
+ (u_int)0xf0f0f0f0, (u_int)0xb4b4b4b4, (u_int)0xe6e6e6e6, (u_int)0x73737373,
+ (u_int)0x96969696, (u_int)0xacacacac, (u_int)0x74747474, (u_int)0x22222222,
+ (u_int)0xe7e7e7e7, (u_int)0xadadadad, (u_int)0x35353535, (u_int)0x85858585,
+ (u_int)0xe2e2e2e2, (u_int)0xf9f9f9f9, (u_int)0x37373737, (u_int)0xe8e8e8e8,
+ (u_int)0x1c1c1c1c, (u_int)0x75757575, (u_int)0xdfdfdfdf, (u_int)0x6e6e6e6e,
+ (u_int)0x47474747, (u_int)0xf1f1f1f1, (u_int)0x1a1a1a1a, (u_int)0x71717171,
+ (u_int)0x1d1d1d1d, (u_int)0x29292929, (u_int)0xc5c5c5c5, (u_int)0x89898989,
+ (u_int)0x6f6f6f6f, (u_int)0xb7b7b7b7, (u_int)0x62626262, (u_int)0x0e0e0e0e,
+ (u_int)0xaaaaaaaa, (u_int)0x18181818, (u_int)0xbebebebe, (u_int)0x1b1b1b1b,
+ (u_int)0xfcfcfcfc, (u_int)0x56565656, (u_int)0x3e3e3e3e, (u_int)0x4b4b4b4b,
+ (u_int)0xc6c6c6c6, (u_int)0xd2d2d2d2, (u_int)0x79797979, (u_int)0x20202020,
+ (u_int)0x9a9a9a9a, (u_int)0xdbdbdbdb, (u_int)0xc0c0c0c0, (u_int)0xfefefefe,
+ (u_int)0x78787878, (u_int)0xcdcdcdcd, (u_int)0x5a5a5a5a, (u_int)0xf4f4f4f4,
+ (u_int)0x1f1f1f1f, (u_int)0xdddddddd, (u_int)0xa8a8a8a8, (u_int)0x33333333,
+ (u_int)0x88888888, (u_int)0x07070707, (u_int)0xc7c7c7c7, (u_int)0x31313131,
+ (u_int)0xb1b1b1b1, (u_int)0x12121212, (u_int)0x10101010, (u_int)0x59595959,
+ (u_int)0x27272727, (u_int)0x80808080, (u_int)0xecececec, (u_int)0x5f5f5f5f,
+ (u_int)0x60606060, (u_int)0x51515151, (u_int)0x7f7f7f7f, (u_int)0xa9a9a9a9,
+ (u_int)0x19191919, (u_int)0xb5b5b5b5, (u_int)0x4a4a4a4a, (u_int)0x0d0d0d0d,
+ (u_int)0x2d2d2d2d, (u_int)0xe5e5e5e5, (u_int)0x7a7a7a7a, (u_int)0x9f9f9f9f,
+ (u_int)0x93939393, (u_int)0xc9c9c9c9, (u_int)0x9c9c9c9c, (u_int)0xefefefef,
+ (u_int)0xa0a0a0a0, (u_int)0xe0e0e0e0, (u_int)0x3b3b3b3b, (u_int)0x4d4d4d4d,
+ (u_int)0xaeaeaeae, (u_int)0x2a2a2a2a, (u_int)0xf5f5f5f5, (u_int)0xb0b0b0b0,
+ (u_int)0xc8c8c8c8, (u_int)0xebebebeb, (u_int)0xbbbbbbbb, (u_int)0x3c3c3c3c,
+ (u_int)0x83838383, (u_int)0x53535353, (u_int)0x99999999, (u_int)0x61616161,
+ (u_int)0x17171717, (u_int)0x2b2b2b2b, (u_int)0x04040404, (u_int)0x7e7e7e7e,
+ (u_int)0xbabababa, (u_int)0x77777777, (u_int)0xd6d6d6d6, (u_int)0x26262626,
+ (u_int)0xe1e1e1e1, (u_int)0x69696969, (u_int)0x14141414, (u_int)0x63636363,
+ (u_int)0x55555555, (u_int)0x21212121, (u_int)0x0c0c0c0c, (u_int)0x7d7d7d7d,
+};
+static const u32 rcon[] = {
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
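+
+/*
+ * rcon[i] is x^i in GF(2^8), reduced modulo x^8 + x^4 + x^3 + x + 1 and
+ * placed in the most significant byte of the round-constant word.
+ */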
+
+#define SWAP(x) ((_lrotl(x, 8) & 0x00ff00ff) | (_lrotr(x, 8) & 0xff00ff00))
+
+#ifdef _MSC_VER
+#define GETU32(p) SWAP(*((u32 *)(p)))
+#define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
+#else
+#define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
+#define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
+#endif
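+
+/*
+ * GETU32/PUTU32 convert between the byte stream and the cipher's 32-bit
+ * big-endian words; e.g. GETU32 applied to the bytes 01 02 03 04 yields
+ * 0x01020304.  The _MSC_VER variant byte-swaps a native 32-bit load to
+ * the same effect on little-endian machines.
+ */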
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * @return the number of rounds for the given cipher key size.
+ */
+/*
+ * __db_rijndaelKeySetupEnc --
+ *
+ * PUBLIC: int __db_rijndaelKeySetupEnc __P((u32 *, const u8 *, int));
+ */
+int
+__db_rijndaelKeySetupEnc(rk, cipherKey, keyBits)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ const u8 *cipherKey;
+ int keyBits;
+{
+ int i = 0;
+ u32 temp;
+
+ rk[0] = GETU32(cipherKey );
+ rk[1] = GETU32(cipherKey + 4);
+ rk[2] = GETU32(cipherKey + 8);
+ rk[3] = GETU32(cipherKey + 12);
+ if (keyBits == 128) {
+ for (;;) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+ if (++i == 10) {
+ return 10;
+ }
+ rk += 4;
+ }
+ }
+ rk[4] = GETU32(cipherKey + 16);
+ rk[5] = GETU32(cipherKey + 20);
+ if (keyBits == 192) {
+ for (;;) {
+ temp = rk[ 5];
+ rk[ 6] = rk[ 0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ if (++i == 8) {
+ return 12;
+ }
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ }
+ rk[6] = GETU32(cipherKey + 24);
+ rk[7] = GETU32(cipherKey + 28);
+ if (keyBits == 256) {
+ for (;;) {
+ temp = rk[ 7];
+ rk[ 8] = rk[ 0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ if (++i == 7) {
+ return 14;
+ }
+ temp = rk[11];
+ rk[12] = rk[ 4] ^
+ (Te4[(temp >> 24) ] & 0xff000000) ^
+ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp ) & 0xff] & 0x000000ff);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+ return 0;
+}
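+
+/*
+ * Usage sketch (hypothetical names, error handling elided): a 128-bit
+ * key expands into 4 * (10 + 1) = 44 round-key words:
+ *
+ *	u32 rk[4 * (MAXNR + 1)];
+ *	u8 key[16];
+ *	int Nr;
+ *
+ *	Nr = __db_rijndaelKeySetupEnc(rk, key, 128);
+ *
+ * Nr is 10, 12 or 14 for 128-, 192- and 256-bit keys; any other keyBits
+ * value falls through the three if-blocks above and returns 0.
+ */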
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * @return the number of rounds for the given cipher key size.
+ */
+/*
+ * __db_rijndaelKeySetupDec --
+ *
+ * PUBLIC: int __db_rijndaelKeySetupDec __P((u32 *, const u8 *, int));
+ */
+int
+__db_rijndaelKeySetupDec(rk, cipherKey, keyBits)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ const u8 *cipherKey;
+ int keyBits;
+{
+ int Nr, i, j;
+ u32 temp;
+
+ /* expand the cipher key: */
+ Nr = __db_rijndaelKeySetupEnc(rk, cipherKey, keyBits);
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4*Nr; i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+ for (i = 1; i < Nr; i++) {
+ rk += 4;
+ rk[0] =
+ Td0[Te4[(rk[0] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[0] ) & 0xff] & 0xff];
+ rk[1] =
+ Td0[Te4[(rk[1] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[1] ) & 0xff] & 0xff];
+ rk[2] =
+ Td0[Te4[(rk[2] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[2] ) & 0xff] & 0xff];
+ rk[3] =
+ Td0[Te4[(rk[3] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[3] ) & 0xff] & 0xff];
+ }
+ return Nr;
+}
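+
+/*
+ * Note on the loop above: Te4[b] replicates S[b] in all four bytes and
+ * the Td tables carry the inverse S-box internally, so the two boxes
+ * cancel and Td0[Te4[b] & 0xff] reduces to InvMixColumns applied to
+ * byte b in the high column position.  XORing the four per-byte lookups
+ * thus computes rk[i] = InvMixColumns(rk[i]), the "equivalent inverse
+ * cipher" key transform of FIPS-197, which lets __db_rijndaelDecrypt
+ * reuse the same table-lookup round structure as encryption.
+ */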
+
+/*
+ * __db_rijndaelEncrypt --
+ *
+ * PUBLIC: void __db_rijndaelEncrypt __P((u32 *, int, const u8 *, u8 *));
+ */
+void
+__db_rijndaelEncrypt(rk, Nr, pt, ct)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ const u8 *pt;
+ u8 *ct;
+{
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(pt ) ^ rk[0];
+ s1 = GETU32(pt + 4) ^ rk[1];
+ s2 = GETU32(pt + 8) ^ rk[2];
+ s3 = GETU32(pt + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+ if (Nr > 10) {
+ /* round 10: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+ if (Nr > 12) {
+ /* round 12: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+ }
+ }
+ rk += Nr << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = Nr >> 1;
+ for (;;) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Te0[(t0 >> 24) ] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[(t3 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Te0[(t1 >> 24) ] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[(t0 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Te0[(t2 >> 24) ] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[(t1 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Te0[(t3 >> 24) ] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[(t2 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Te4[(t0 >> 24) ] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(ct , s0);
+ s1 =
+ (Te4[(t1 >> 24) ] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(ct + 4, s1);
+ s2 =
+ (Te4[(t2 >> 24) ] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(ct + 8, s2);
+ s3 =
+ (Te4[(t3 >> 24) ] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(ct + 12, s3);
+}
+
+/*
+ * __db_rijndaelDecrypt --
+ *
+ * PUBLIC: void __db_rijndaelDecrypt __P((u32 *, int, const u8 *, u8 *));
+ */
+void
+__db_rijndaelDecrypt(rk, Nr, ct, pt)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ const u8 *ct;
+ u8 *pt;
+{
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(ct ) ^ rk[0];
+ s1 = GETU32(ct + 4) ^ rk[1];
+ s2 = GETU32(ct + 8) ^ rk[2];
+ s3 = GETU32(ct + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+ if (Nr > 10) {
+ /* round 10: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+ if (Nr > 12) {
+ /* round 12: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+ }
+ }
+ rk += Nr << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = Nr >> 1;
+ for (;;) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Td0[(t0 >> 24) ] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[(t1 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Td0[(t1 >> 24) ] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[(t2 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Td0[(t2 >> 24) ] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[(t3 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Td0[(t3 >> 24) ] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[(t0 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Td4[(t0 >> 24) ] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(pt , s0);
+ s1 =
+ (Td4[(t1 >> 24) ] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(pt + 4, s1);
+ s2 =
+ (Td4[(t2 >> 24) ] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(pt + 8, s2);
+ s3 =
+ (Td4[(t3 >> 24) ] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(pt + 12, s3);
+}
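+
+/*
+ * Round-trip sketch (hypothetical buffers): with schedules built from
+ * the same key, the two primitives invert each other on a 16-byte block:
+ *
+ *	u32 erk[4 * (MAXNR + 1)], drk[4 * (MAXNR + 1)];
+ *	u8 key[16], pt[16], ct[16], out[16];
+ *	int Nr;
+ *
+ *	Nr = __db_rijndaelKeySetupEnc(erk, key, 128);
+ *	(void)__db_rijndaelKeySetupDec(drk, key, 128);
+ *	__db_rijndaelEncrypt(erk, Nr, pt, ct);
+ *	__db_rijndaelDecrypt(drk, Nr, ct, out);
+ *
+ * after which memcmp(pt, out, 16) == 0.
+ */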
+
+#ifdef INTERMEDIATE_VALUE_KAT
+
+/*
+ * __db_rijndaelEncryptRound --
+ *
+ * PUBLIC: void __db_rijndaelEncryptRound __P((const u32 *, int, u8 *, int));
+ */
+void
+__db_rijndaelEncryptRound(rk, Nr, block, rounds)
+ const u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ u8 *block;
+ int rounds;
+{
+ int r;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(block ) ^ rk[0];
+ s1 = GETU32(block + 4) ^ rk[1];
+ s2 = GETU32(block + 8) ^ rk[2];
+ s3 = GETU32(block + 12) ^ rk[3];
+ rk += 4;
+
+ /*
+ * Nr - 1 full rounds:
+ */
+ for (r = (rounds < Nr ? rounds : Nr - 1); r > 0; r--) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[0];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[1];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[2];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[3];
+
+ s0 = t0;
+ s1 = t1;
+ s2 = t2;
+ s3 = t3;
+ rk += 4;
+
+ }
+
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ if (rounds == Nr) {
+ t0 =
+ (Te4[(s0 >> 24) ] & 0xff000000) ^
+ (Te4[(s1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ t1 =
+ (Te4[(s1 >> 24) ] & 0xff000000) ^
+ (Te4[(s2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ t2 =
+ (Te4[(s2 >> 24) ] & 0xff000000) ^
+ (Te4[(s3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ t3 =
+ (Te4[(s3 >> 24) ] & 0xff000000) ^
+ (Te4[(s0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+
+ s0 = t0;
+ s1 = t1;
+ s2 = t2;
+ s3 = t3;
+ }
+
+ PUTU32(block , s0);
+ PUTU32(block + 4, s1);
+ PUTU32(block + 8, s2);
+ PUTU32(block + 12, s3);
+}
+
+/*
+ * __db_rijndaelDecryptRound --
+ *
+ * PUBLIC: void __db_rijndaelDecryptRound __P((const u32 *, int, u8 *, int));
+ */
+void
+__db_rijndaelDecryptRound(rk, Nr, block, rounds)
+ const u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ u8 *block;
+ int rounds;
+{
+ int r;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(block ) ^ rk[0];
+ s1 = GETU32(block + 4) ^ rk[1];
+ s2 = GETU32(block + 8) ^ rk[2];
+ s3 = GETU32(block + 12) ^ rk[3];
+ rk += 4;
+
+ /*
+ * Nr - 1 full rounds:
+ */
+ for (r = (rounds < Nr ? rounds : Nr) - 1; r > 0; r--) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[0];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[1];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[2];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[3];
+
+ s0 = t0;
+ s1 = t1;
+ s2 = t2;
+ s3 = t3;
+ rk += 4;
+
+ }
+
+ /*
+ * complete the last round and
+ * map cipher state to byte array block:
+ */
+ t0 =
+ (Td4[(s0 >> 24) ] & 0xff000000) ^
+ (Td4[(s3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s1 ) & 0xff] & 0x000000ff);
+ t1 =
+ (Td4[(s1 >> 24) ] & 0xff000000) ^
+ (Td4[(s0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s2 ) & 0xff] & 0x000000ff);
+ t2 =
+ (Td4[(s2 >> 24) ] & 0xff000000) ^
+ (Td4[(s1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s3 ) & 0xff] & 0x000000ff);
+ t3 =
+ (Td4[(s3 >> 24) ] & 0xff000000) ^
+ (Td4[(s2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s0 ) & 0xff] & 0x000000ff);
+
+ if (rounds == Nr) {
+ t0 ^= rk[0];
+ t1 ^= rk[1];
+ t2 ^= rk[2];
+ t3 ^= rk[3];
+ }
+
+ PUTU32(block , t0);
+ PUTU32(block + 4, t1);
+ PUTU32(block + 8, t2);
+ PUTU32(block + 12, t3);
+}
+
+#endif /* INTERMEDIATE_VALUE_KAT */
diff --git a/src/crypto/rijndael/rijndael-alg-fst.h b/src/crypto/rijndael/rijndael-alg-fst.h
new file mode 100644
index 00000000..7d5e228c
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-alg-fst.h
@@ -0,0 +1,40 @@
+/*
+ * $Id$
+ */
+/**
+ * rijndael-alg-fst.h
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __RIJNDAEL_ALG_FST_H
+#define __RIJNDAEL_ALG_FST_H
+
+#define MAXKC (256/32)
+#define MAXKB (256/8)
+#define MAXNR 14
+
+typedef u_int8_t u8;
+typedef u_int16_t u16;
+typedef u_int32_t u32;
+
+#endif /* __RIJNDAEL_ALG_FST_H */
diff --git a/src/crypto/rijndael/rijndael-api-fst.c b/src/crypto/rijndael/rijndael-api-fst.c
new file mode 100644
index 00000000..3fd6489d
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-api-fst.c
@@ -0,0 +1,491 @@
+/**
+ * rijndael-api-fst.c
+ *
+ * @version 2.9 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Acknowledgements:
+ *
+ * We are deeply indebted to the following people for their bug reports,
+ * fixes, and improvement suggestions to this implementation. Though we
+ * tried to list all contributions, we apologise in advance for any
+ * missing reference.
+ *
+ * Andrew Bales <Andrew.Bales@Honeywell.com>
+ * Markus Friedl <markus.friedl@informatik.uni-erlangen.de>
+ * John Skodon <skodonj@webquill.com>
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+
+#include "crypto/rijndael/rijndael-alg-fst.h"
+#include "crypto/rijndael/rijndael-api-fst.h"
+
+/*
+ * __db_makeKey --
+ *
+ * PUBLIC: int __db_makeKey __P((keyInstance *, int, int, char *));
+ */
+int
+__db_makeKey(key, direction, keyLen, keyMaterial)
+ keyInstance *key;
+ int direction;
+ int keyLen;
+ char *keyMaterial;
+{
+ u8 cipherKey[MAXKB];
+
+ if (key == NULL) {
+ return BAD_KEY_INSTANCE;
+ }
+
+ if ((direction == DIR_ENCRYPT) || (direction == DIR_DECRYPT)) {
+ key->direction = direction;
+ } else {
+ return BAD_KEY_DIR;
+ }
+
+ if ((keyLen == 128) || (keyLen == 192) || (keyLen == 256)) {
+ key->keyLen = keyLen;
+ } else {
+ return BAD_KEY_MAT;
+ }
+
+ if (keyMaterial != NULL) {
+ memcpy(cipherKey, keyMaterial, key->keyLen/8);
+ }
+
+ if (direction == DIR_ENCRYPT) {
+ key->Nr = __db_rijndaelKeySetupEnc(key->rk, cipherKey, keyLen);
+ } else {
+ key->Nr = __db_rijndaelKeySetupDec(key->rk, cipherKey, keyLen);
+ }
+ __db_rijndaelKeySetupEnc(key->ek, cipherKey, keyLen);
+ return TRUE;
+}
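+
+/*
+ * Usage sketch (hypothetical names): build the schedules for AES-128
+ * encryption; keyMaterial must supply at least keyLen/8 raw key bytes:
+ *
+ *	keyInstance ki;
+ *
+ *	if (__db_makeKey(&ki, DIR_ENCRYPT, 128, rawkey) != TRUE)
+ *		handle the BAD_KEY_* error;
+ *
+ * ki.rk then holds the direction-specific schedule, and ki.ek always
+ * holds the encryption schedule, which CFB1 mode needs in both
+ * directions.
+ */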
+
+/*
+ * __db_cipherInit --
+ *
+ * PUBLIC: int __db_cipherInit __P((cipherInstance *, int, char *));
+ */
+int
+__db_cipherInit(cipher, mode, IV)
+ cipherInstance *cipher;
+ int mode;
+ char *IV;
+{
+ if ((mode == MODE_ECB) || (mode == MODE_CBC) || (mode == MODE_CFB1)) {
+ cipher->mode = mode;
+ } else {
+ return BAD_CIPHER_MODE;
+ }
+ if (IV != NULL) {
+ memcpy(cipher->IV, IV, MAX_IV_SIZE);
+ }
+ return TRUE;
+}
+
+/*
+ * __db_blockEncrypt --
+ *
+ * PUBLIC: int __db_blockEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: size_t, u_int8_t *));
+ */
+int
+__db_blockEncrypt(cipher, key, input, inputLen, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ size_t inputLen;
+ u_int8_t *outBuffer;
+{
+ int i, k, t, numBlocks;
+ u8 block[16], *iv;
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ key->direction == DIR_DECRYPT) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputLen == 0) {
+ return 0; /* nothing to do */
+ }
+
+ numBlocks = (int)(inputLen/128);
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelEncrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CBC:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(tmpiv, iv, MAX_IV_SIZE);
+ ((u32*)block)[0] = ((u32*)input)[0] ^ tmpiv[0];
+ ((u32*)block)[1] = ((u32*)input)[1] ^ tmpiv[1];
+ ((u32*)block)[2] = ((u32*)input)[2] ^ tmpiv[2];
+ ((u32*)block)[3] = ((u32*)input)[3] ^ tmpiv[3];
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ iv = outBuffer;
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CFB1:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(outBuffer, input, 16);
+ for (k = 0; k < 128; k++) {
+ __db_rijndaelEncrypt(key->ek, key->Nr, iv, block);
+ outBuffer[k >> 3] ^= (block[0] & (u_int)0x80) >> (k & 7);
+ for (t = 0; t < 15; t++) {
+ iv[t] = (iv[t] << 1) | (iv[t + 1] >> 7);
+ }
+ iv[15] = (iv[15] << 1) | ((outBuffer[k >> 3] >> (7 - (k & 7))) & 1);
+ }
+ outBuffer += 16;
+ input += 16;
+ }
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 128*numBlocks;
+}
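+
+/*
+ * Caller's note: inputLen is measured in bits, not bytes, and only whole
+ * 128-bit blocks are processed; a trailing partial block is silently
+ * dropped.  A sketch (hypothetical buffers, ki from __db_makeKey above)
+ * CBC-encrypting 32 bytes:
+ *
+ *	cipherInstance ci;
+ *	u_int8_t in[32], out[32];
+ *	int n;
+ *
+ *	(void)__db_cipherInit(&ci, MODE_CBC, iv);
+ *	n = __db_blockEncrypt(&ci, &ki, in, 32 * 8, out);
+ *
+ * n is 256 (bits) on success.  CFB1 mode instead walks each block one
+ * bit at a time, feeding a sliding 128-bit IV through the forward
+ * cipher, which is why __db_blockDecrypt also uses key->ek for it.
+ */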
+
+/**
+ * Encrypt data partitioned in octets, using RFC 2040-like padding.
+ *
+ * @param input data to be encrypted (octet sequence)
+ * @param inputOctets input length in octets (not bits)
+ * @param outBuffer encrypted output data
+ *
+ * @return length in octets (not bits) of the encrypted output buffer.
+ */
+/*
+ * __db_padEncrypt --
+ *
+ * PUBLIC: int __db_padEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: int, u_int8_t *));
+ */
+int
+__db_padEncrypt(cipher, key, input, inputOctets, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ int inputOctets;
+ u_int8_t *outBuffer;
+{
+ int i, numBlocks, padLen;
+ u8 block[16], *iv;
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ key->direction == DIR_DECRYPT) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputOctets <= 0) {
+ return 0; /* nothing to do */
+ }
+
+ numBlocks = inputOctets/16;
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelEncrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ padLen = 16 - (inputOctets - 16*numBlocks);
+ DB_ASSERT(NULL, padLen > 0 && padLen <= 16);
+ memcpy(block, input, 16 - padLen);
+ memset(block + 16 - padLen, padLen, padLen);
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ break;
+
+ case MODE_CBC:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(tmpiv, iv, MAX_IV_SIZE);
+ ((u32*)block)[0] = ((u32*)input)[0] ^ tmpiv[0];
+ ((u32*)block)[1] = ((u32*)input)[1] ^ tmpiv[1];
+ ((u32*)block)[2] = ((u32*)input)[2] ^ tmpiv[2];
+ ((u32*)block)[3] = ((u32*)input)[3] ^ tmpiv[3];
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ iv = outBuffer;
+ input += 16;
+ outBuffer += 16;
+ }
+ padLen = 16 - (inputOctets - 16*numBlocks);
+ DB_ASSERT(NULL, padLen > 0 && padLen <= 16);
+ for (i = 0; i < 16 - padLen; i++) {
+ block[i] = input[i] ^ iv[i];
+ }
+ for (i = 16 - padLen; i < 16; i++) {
+ block[i] = (u_int8_t)padLen ^ iv[i];
+ }
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 16*(numBlocks + 1);
+}
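+
+/*
+ * Padding example: the RFC 2040-style scheme always appends padLen
+ * bytes, each equal to padLen, where padLen = 16 - (inputOctets % 16)
+ * and is never zero.  Encrypting 21 octets therefore emits two blocks,
+ * the second holding 5 data bytes followed by 11 bytes of 0x0b, and
+ * the return value is 32.  An input that is already a multiple of 16
+ * gains a whole extra block of 0x10 pad bytes.
+ */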
+
+/*
+ * __db_blockDecrypt --
+ *
+ * PUBLIC: int __db_blockDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: size_t, u_int8_t *));
+ */
+int
+__db_blockDecrypt(cipher, key, input, inputLen, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ size_t inputLen;
+ u_int8_t *outBuffer;
+{
+ int i, k, t, numBlocks;
+ u8 block[16], *iv;
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ (cipher->mode != MODE_CFB1 && key->direction == DIR_ENCRYPT)) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputLen == 0) {
+ return 0; /* nothing to do */
+ }
+
+ numBlocks = (int)(inputLen/128);
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CBC:
+ memcpy(tmpiv, cipher->IV, MAX_IV_SIZE);
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ ((u32*)block)[0] ^= tmpiv[0];
+ ((u32*)block)[1] ^= tmpiv[1];
+ ((u32*)block)[2] ^= tmpiv[2];
+ ((u32*)block)[3] ^= tmpiv[3];
+ memcpy(tmpiv, input, 16);
+ memcpy(outBuffer, block, 16);
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CFB1:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(outBuffer, input, 16);
+ for (k = 0; k < 128; k++) {
+ __db_rijndaelEncrypt(key->ek, key->Nr, iv, block);
+ for (t = 0; t < 15; t++) {
+ iv[t] = (iv[t] << 1) | (iv[t + 1] >> 7);
+ }
+ iv[15] = (iv[15] << 1) | ((input[k >> 3] >> (7 - (k & 7))) & 1);
+ outBuffer[k >> 3] ^= (block[0] & (u_int)0x80) >> (k & 7);
+ }
+ outBuffer += 16;
+ input += 16;
+ }
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 128*numBlocks;
+}
+
+/*
+ * __db_padDecrypt --
+ *
+ * PUBLIC: int __db_padDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: int, u_int8_t *));
+ */
+int
+__db_padDecrypt(cipher, key, input, inputOctets, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ int inputOctets;
+ u_int8_t *outBuffer;
+{
+ int i, numBlocks, padLen;
+ u8 block[16];
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ key->direction == DIR_ENCRYPT) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputOctets <= 0) {
+ return 0; /* nothing to do */
+ }
+ if (inputOctets % 16 != 0) {
+ return BAD_DATA;
+ }
+
+ numBlocks = inputOctets/16;
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ /* all blocks but last */
+ for (i = numBlocks - 1; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ /* last block */
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ padLen = block[15];
+ if (padLen >= 16) {
+ return BAD_DATA;
+ }
+ for (i = 16 - padLen; i < 16; i++) {
+ if (block[i] != padLen) {
+ return BAD_DATA;
+ }
+ }
+ memcpy(outBuffer, block, 16 - padLen);
+ break;
+
+ case MODE_CBC:
+ /* all blocks but last */
+ memcpy(tmpiv, cipher->IV, MAX_IV_SIZE);
+ for (i = numBlocks - 1; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ ((u32*)block)[0] ^= tmpiv[0];
+ ((u32*)block)[1] ^= tmpiv[1];
+ ((u32*)block)[2] ^= tmpiv[2];
+ ((u32*)block)[3] ^= tmpiv[3];
+ memcpy(tmpiv, input, 16);
+ memcpy(outBuffer, block, 16);
+ input += 16;
+ outBuffer += 16;
+ }
+ /* last block */
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ ((u32*)block)[0] ^= tmpiv[0];
+ ((u32*)block)[1] ^= tmpiv[1];
+ ((u32*)block)[2] ^= tmpiv[2];
+ ((u32*)block)[3] ^= tmpiv[3];
+ padLen = block[15];
+ if (padLen <= 0 || padLen > 16) {
+ return BAD_DATA;
+ }
+ for (i = 16 - padLen; i < 16; i++) {
+ if (block[i] != padLen) {
+ return BAD_DATA;
+ }
+ }
+ memcpy(outBuffer, block, 16 - padLen);
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 16*numBlocks - padLen;
+}
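+
+/*
+ * Validation example: the pad length is read from the last byte of the
+ * decrypted final block, and every pad byte must match it.  For the
+ * 21-octet example under __db_padEncrypt, block[15] is 0x0b, bytes
+ * 5..15 are checked against 0x0b, the 5 plaintext bytes are copied out,
+ * and the function returns 16*2 - 11 = 21.  Any mismatch (or an
+ * out-of-range length byte) yields BAD_DATA.
+ */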
+
+#ifdef INTERMEDIATE_VALUE_KAT
+/**
+ * cipherUpdateRounds:
+ *
+ * Encrypts/Decrypts exactly one full block for a specified number of rounds.
+ * Only used in the Intermediate Value Known Answer Test.
+ *
+ * Returns:
+ * TRUE - on success
+ * BAD_CIPHER_STATE - cipher in bad state (e.g., not initialized)
+ * BAD_KEY_DIR - key direction is invalid
+ */
+/*
+ * __db_cipherUpdateRounds --
+ *
+ * PUBLIC: int __db_cipherUpdateRounds __P((cipherInstance *, keyInstance *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t *, int));
+ */
+int
+__db_cipherUpdateRounds(cipher, key, input, inputLen, outBuffer, rounds)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ size_t inputLen;
+ u_int8_t *outBuffer;
+ int rounds;
+{
+ u8 block[16];
+
+ if (cipher == NULL || key == NULL) {
+ return BAD_CIPHER_STATE;
+ }
+
+ memcpy(block, input, 16);
+
+ switch (key->direction) {
+ case DIR_ENCRYPT:
+ __db_rijndaelEncryptRound(key->rk, key->Nr, block, rounds);
+ break;
+
+ case DIR_DECRYPT:
+ __db_rijndaelDecryptRound(key->rk, key->Nr, block, rounds);
+ break;
+
+ default:
+ return BAD_KEY_DIR;
+ }
+
+ memcpy(outBuffer, block, 16);
+
+ return TRUE;
+}
+#endif /* INTERMEDIATE_VALUE_KAT */
diff --git a/src/crypto/rijndael/rijndael-api-fst.h b/src/crypto/rijndael/rijndael-api-fst.h
new file mode 100644
index 00000000..3e31920a
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-api-fst.h
@@ -0,0 +1,91 @@
+/*
+ * $Id$
+ */
+/**
+ * rijndael-api-fst.h
+ *
+ * @version 2.9 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Acknowledgements:
+ *
+ * We are deeply indebted to the following people for their bug reports,
+ * fixes, and improvement suggestions to this implementation. Though we
+ * tried to list all contributions, we apologise in advance for any
+ * missing reference.
+ *
+ * Andrew Bales <Andrew.Bales@Honeywell.com>
+ * Markus Friedl <markus.friedl@informatik.uni-erlangen.de>
+ * John Skodon <skodonj@webquill.com>
+ */
+
+#ifndef __RIJNDAEL_API_FST_H
+#define __RIJNDAEL_API_FST_H
+
+#include "crypto/rijndael/rijndael-alg-fst.h"
+
+/* Generic Defines */
+#define DIR_ENCRYPT 0 /* Are we encrypting? */
+#define DIR_DECRYPT 1 /* Are we decrypting? */
+#define MODE_ECB 1 /* Are we ciphering in ECB mode? */
+#define MODE_CBC 2 /* Are we ciphering in CBC mode? */
+#define MODE_CFB1 3 /* Are we ciphering in 1-bit CFB mode? */
+#undef TRUE
+#define TRUE 1
+#undef FALSE
+#define FALSE 0
+#define BITSPERBLOCK 128 /* Default number of bits in a cipher block */
+
+/* Error Codes */
+#define BAD_KEY_DIR -1 /* Key direction is invalid, e.g., unknown value */
+#define BAD_KEY_MAT -2 /* Key material not of correct length */
+#define BAD_KEY_INSTANCE -3 /* Key passed is not valid */
+#define BAD_CIPHER_MODE -4 /* Params struct passed to cipherInit invalid */
+#define BAD_CIPHER_STATE -5 /* Cipher in wrong state (e.g., not initialized) */
+#define BAD_BLOCK_LENGTH -6
+#define BAD_CIPHER_INSTANCE -7
+#define BAD_DATA -8 /* Data contents are invalid, e.g., invalid padding */
+#define BAD_OTHER -9 /* Unknown error */
+
+/* Algorithm-specific Defines */
+#define MAX_KEY_SIZE 64 /* # of ASCII chars needed to represent a key */
+#define MAX_IV_SIZE 16 /* # bytes needed to represent an IV */
+
+/* Typedefs */
+
+/* The structure for key information */
+typedef struct {
+ u_int8_t direction; /* Key used for encrypting or decrypting? */
+ int keyLen; /* Length of the key */
+ char keyMaterial[MAX_KEY_SIZE+1]; /* Raw key data in ASCII, e.g., user input or KAT values */
+ int Nr; /* key-length-dependent number of rounds */
+ u32 rk[4*(MAXNR + 1)]; /* key schedule */
+ u32 ek[4*(MAXNR + 1)]; /* CFB1 key schedule (encryption only) */
+} keyInstance;
+
+/* The structure for cipher information */
+typedef struct { /* changed order of the components */
+ u_int8_t mode; /* MODE_ECB, MODE_CBC, or MODE_CFB1 */
+ u_int8_t IV[MAX_IV_SIZE]; /* A possible Initialization Vector for ciphering */
+} cipherInstance;
+
+#endif /* __RIJNDAEL_API_FST_H */
diff --git a/src/db/crdel.src b/src/db/crdel.src
new file mode 100644
index 00000000..70473899
--- /dev/null
+++ b/src/db/crdel.src
@@ -0,0 +1,71 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __crdel
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * Metasub: log the creation of a subdatabase meta-data page.
+ *
+ * fileid: identifies the file being acted upon.
+ * pgno: page number on which to write this meta-data page
+ * page: the actual meta-data page
+ * lsn: lsn of the page.
+ */
+BEGIN metasub 42 142
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT page DBT s
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * Inmem_create: Log the creation of an in-memory database.
+ *
+ * fileid: identifies the file being created
+ * name: Name of the database
+ * fid: File id of the database
+ * pgsize: Page size of the database
+ */
+BEGIN inmem_create 44 138
+ARG fileid int32_t ld
+DBT name DBT s
+DBT fid DBT s
+ARG pgsize u_int32_t lu
+END
+
+/*
+ * Inmem_rename: Log the renaming of an in-memory only database.
+ *
+ * oldname: database's starting name
+ * newname: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_rename 44 139
+DBT oldname DBT s
+DBT newname DBT s
+DBT fid DBT s
+END
+
+/*
+ * Inmem_remove: Log the removal of an in-memory only database.
+ *
+ * name: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_remove 44 140
+DBT name DBT s
+DBT fid DBT s
+END
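+
+/*
+ * Reader's note: these specs are input to gen_rec.awk, which generates
+ * the crdel_auto.c and crdel_autop.c files that follow.  Each BEGIN
+ * line carries the record name, a log-version number, and the record's
+ * type id (e.g. 142 for metasub, the value behind DB___crdel_metasub in
+ * the dispatch tables); each field line gives the field class (ARG,
+ * DBT, PGDBT, POINTER, DB), the field name, its C type, and the printf
+ * length/conversion used when the record is dumped.
+ */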
+
diff --git a/src/db/crdel_auto.c b/src/db/crdel_auto.c
new file mode 100644
index 00000000..a2a3f54b
--- /dev/null
+++ b/src/db/crdel_auto.c
@@ -0,0 +1,59 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __crdel_metasub_desc[] = {
+ {LOGREC_DB, SSZ(__crdel_metasub_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__crdel_metasub_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__crdel_metasub_args, page), "page", ""},
+ {LOGREC_POINTER, SSZ(__crdel_metasub_args, lsn), "lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __crdel_inmem_create_desc[] = {
+ {LOGREC_ARG, SSZ(__crdel_inmem_create_args, fileid), "fileid", "%ld"},
+ {LOGREC_DBT, SSZ(__crdel_inmem_create_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_create_args, fid), "fid", ""},
+ {LOGREC_ARG, SSZ(__crdel_inmem_create_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __crdel_inmem_rename_desc[] = {
+ {LOGREC_DBT, SSZ(__crdel_inmem_rename_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_rename_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_rename_args, fid), "fid", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __crdel_inmem_remove_desc[] = {
+ {LOGREC_DBT, SSZ(__crdel_inmem_remove_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_remove_args, fid), "fid", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __crdel_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_recover, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_recover, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_recover, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_recover, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/db/crdel_autop.c b/src/db/crdel_autop.c
new file mode 100644
index 00000000..79bd4d99
--- /dev/null
+++ b/src/db/crdel_autop.c
@@ -0,0 +1,103 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__crdel_metasub_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_metasub", __crdel_metasub_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_inmem_create", __crdel_inmem_create_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_inmem_rename", __crdel_inmem_rename_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_inmem_remove", __crdel_inmem_remove_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_print, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_print, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_print, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_print, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/db/crdel_rec.c b/src/db/crdel_rec.c
new file mode 100644
index 00000000..08e7bae8
--- /dev/null
+++ b/src/db/crdel_rec.c
@@ -0,0 +1,301 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __crdel_metasub_recover --
+ * Recovery function for metasub.
+ *
+ * PUBLIC: int __crdel_metasub_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_metasub_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_metasub_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_p, ret, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__crdel_metasub_print);
+ REC_INTRO(__crdel_metasub_read, ip, 0);
+
+ /*
+ * If we are undoing this operation, but the DB that we got back
+ * was never really opened, then this open was an in-memory open
+ * that did not finish. We can let the file creation take care
+ * of any necessary undo/cleanup.
+ */
+ if (DB_UNDO(op) && !F_ISSET(file_dbp, DB_AM_OPEN_CALLED))
+ goto done;
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ /*
+ * If this is an in-memory file, this might be OK.  Also, heap
+ * can get here through a truncate, and we have to redo page 1.
+ */
+ if ((file_dbp->type == DB_HEAP ||
+ F_ISSET(file_dbp, DB_AM_INMEM)) &&
+ (ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) == 0) {
+ if (F_ISSET(file_dbp, DB_AM_INMEM))
+ LSN_NOT_LOGGED(LSN(pagep));
+ } else {
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+ goto out;
+ }
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->page.data, argp->page.size);
+ LSN(pagep) = *lsnp;
+
+ /*
+ * If this was an in-memory database and we are re-creating
+ * and this is the meta-data page, then we need to set up a
+ * bunch of fields in the dbp as well.
+ */
+ if (F_ISSET(file_dbp, DB_AM_INMEM) &&
+ argp->pgno == PGNO_BASE_MD &&
+ (ret = __db_meta_setup(file_dbp->env, file_dbp,
+ file_dbp->dname, (DBMETA *)pagep, 0, DB_CHK_META)) != 0)
+ goto out;
+ } else if (DB_UNDO(op)) {
+ /*
+ * We want to undo this page creation. The page creation
+ * happened in two parts. First, we called __db_pg_alloc which
+ * was logged separately. Then we wrote the meta-data onto
+ * the page. So long as we restore the LSN, then the recovery
+ * for __db_pg_alloc will do everything else.
+ *
+ * Don't bother checking the lsn on the page. If we are
+ * rolling back the next thing is that this page will get
+ * freed. Opening the subdb will have reinitialized the
+ * page, but not the lsn.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->lsn;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL && (t_ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ REC_CLOSE;
+}
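+
+/*
+ * The function above follows the standard page-LSN recovery idiom:
+ * compare the on-page LSN with the LSN the log record remembers.  On
+ * redo, reapply the change only if the page still shows the old LSN,
+ * then stamp the page with this record's LSN; on undo, roll the page
+ * LSN back so the earlier __db_pg_alloc record can complete the
+ * cleanup.  Either way *lsnp is reset to prev_lsn so the dispatcher
+ * can keep walking the transaction's backward log chain.
+ */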
+
+/*
+ * __crdel_inmem_create_recover --
+ * Recovery function for inmem_create.
+ *
+ * PUBLIC: int __crdel_inmem_create_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_inmem_create_args *argp;
+ DB *dbp;
+ int do_close, ret, t_ret;
+
+ COMPQUIET(info, NULL);
+
+ dbp = NULL;
+ do_close = 0;
+ REC_PRINT(__crdel_inmem_create_print);
+ REC_NOOP_INTRO(__crdel_inmem_create_read);
+
+ /* First, see if the DB handle already exists. */
+ if (argp->fileid == DB_LOGFILEID_INVALID) {
+ if (DB_REDO(op))
+ ret = ENOENT;
+ else
+ ret = 0;
+ } else
+ ret = __dbreg_id_to_db(env, argp->txnp, &dbp, argp->fileid, 0);
+
+ if (DB_REDO(op)) {
+ /*
+ * If the dbreg failed, that means that we're creating a
+ * tmp file.
+ */
+ if (ret != 0) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto out;
+
+ F_SET(dbp, DB_AM_RECOVER | DB_AM_INMEM);
+ memcpy(dbp->fileid, argp->fid.data, DB_FILE_ID_LEN);
+ if (((ret = __os_strdup(env,
+ argp->name.data, &dbp->dname)) != 0))
+ goto out;
+
+ /*
+ * This DBP is never going to be entered into the
+ * dbentry table, so if we leave it open here,
+ * then we're going to lose it.
+ */
+ do_close = 1;
+ }
+
+ /* Now, set the fileid. */
+ memcpy(dbp->fileid, argp->fid.data, argp->fid.size);
+ if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+ goto out;
+ dbp->preserve_fid = 1;
+ MAKE_INMEM(dbp);
+ if ((ret = __env_setup(dbp,
+ NULL, NULL, argp->name.data, TXN_INVALID, 0)) != 0)
+ goto out;
+ ret = __env_mpool(dbp, argp->name.data, 0);
+
+ if (ret == ENOENT) {
+ dbp->pgsize = argp->pgsize;
+ if ((ret = __env_mpool(dbp,
+ argp->name.data, DB_CREATE)) != 0)
+ goto out;
+ } else if (ret != 0)
+ goto out;
+ }
+
+ if (DB_UNDO(op)) {
+ if (ret == 0)
+ ret = __memp_nameop(env, argp->fid.data, NULL,
+ (const char *)argp->name.data, NULL, 1);
+
+ if (ret == ENOENT || ret == DB_DELETED)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ *lsnp = argp->prev_lsn;
+
+out: if (dbp != NULL) {
+ t_ret = 0;
+
+ if (do_close || ret != 0)
+ t_ret = __db_close(dbp, NULL, DB_NOSYNC);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_rename_recover --
+ * Recovery function for inmem_rename.
+ *
+ * PUBLIC: int __crdel_inmem_rename_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_inmem_rename_args *argp;
+ u_int8_t *fileid;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__crdel_inmem_rename_print);
+ REC_NOOP_INTRO(__crdel_inmem_rename_read);
+ fileid = argp->fid.data;
+
+ /* Void out errors because the files may or may not still exist. */
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data,
+ (const char *)argp->oldname.data,
+ (const char *)argp->newname.data, 1);
+
+ if (DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data,
+ (const char *)argp->newname.data,
+ (const char *)argp->oldname.data, 1);
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_remove_recover --
+ * Recovery function for inmem_remove.
+ *
+ * PUBLIC: int __crdel_inmem_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_inmem_remove_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__crdel_inmem_remove_print);
+ REC_NOOP_INTRO(__crdel_inmem_remove_read);
+
+ /*
+ * Since removes are delayed, there is no undo for a remove, only redo.
+ * The remove may fail, which is OK.
+ */
+ if (DB_REDO(op)) {
+ (void)__memp_nameop(env,
+ argp->fid.data, NULL, argp->name.data, NULL, 1);
+ }
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
diff --git a/src/db/db.c b/src/db/db.c
new file mode 100644
index 00000000..0d9d1e6e
--- /dev/null
+++ b/src/db/db.c
@@ -0,0 +1,1659 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_disassociate __P((DB *));
+static int __db_disassociate_foreign __P ((DB *));
+
+#ifdef CONFIG_TEST
+static int __db_makecopy __P((ENV *, const char *, const char *));
+static int __qam_testdocopy __P((DB *, const char *));
+#endif
+
+/*
+ * DB.C --
+ * This file contains the utility functions for the DBP layer.
+ */
+
+/*
+ * __db_master_open --
+ * Open up a handle on a master database.
+ *
+ * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **));
+ */
+int
+__db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
+ DB *subdbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+ int mode;
+ DB **dbpp;
+{
+ DB *dbp;
+ int ret;
+
+ *dbpp = NULL;
+
+ /* Open up a handle on the main database. */
+ if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
+ return (ret);
+
+ /*
+ * It's always a btree.
+ * Run in the transaction we've created.
+ * Set the pagesize in case we're creating a new database.
+ * Flag that we're creating a database with subdatabases.
+ */
+ dbp->pgsize = subdbp->pgsize;
+ F_SET(dbp, DB_AM_SUBDB);
+ F_SET(dbp, F_ISSET(subdbp,
+ DB_AM_RECOVER | DB_AM_SWAP |
+ DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
+
+ /*
+ * If there was a subdb specified, then we only want to apply
+ * DB_EXCL to the subdb, not the actual file. We only got here
+ * because there was a subdb specified.
+ */
+ LF_CLR(DB_EXCL);
+ LF_SET(DB_RDWRMASTER);
+ if ((ret = __db_open(dbp, ip, txn,
+ name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ /*
+ * The items in dbp are initialized from the master file's meta page.
+ * Other items such as checksum and encryption are checked when we
+ * read the meta-page, so we do not check those here. However, if
+ * the meta-page caused checksumming to be turned on and it wasn't
+ * already, set it here.
+ */
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ F_SET(subdbp, DB_AM_CHKSUM);
+
+ /*
+ * The user may have specified a page size for an existing file,
+ * which we want to ignore.
+ */
+ subdbp->pgsize = dbp->pgsize;
+ *dbpp = dbp;
+
+ if (0) {
+err: if (!F_ISSET(dbp, DB_AM_DISCARD))
+ (void)__db_close(dbp, txn, DB_NOSYNC);
+ }
+
+ return (ret);
+}
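
For orientation, `__db_master_open` is reached whenever a database (subdatabase) name is passed to the public `DB->open`. A minimal caller-side sketch; the file and subdatabase names here are hypothetical:

```c
#include <db.h>

/*
 * Open (or create) a named subdatabase. The container file's master
 * database is opened internally, as in __db_master_open above.
 */
int
open_subdb(DB_ENV *dbenv, DB_TXN *txn, DB **dbpp)
{
	DB *dbp;
	int ret;

	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
		return (ret);
	if ((ret = dbp->open(dbp, txn,
	    "file.db", "subdb", DB_BTREE, DB_CREATE, 0644)) != 0) {
		(void)dbp->close(dbp, 0);  /* failed handles must still be closed */
		return (ret);
	}
	*dbpp = dbp;
	return (0);
}
```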
+
+/*
+ * __db_master_update --
+ * Add/Open/Remove a subdatabase from a master database.
+ *
+ * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, DBTYPE, mu_action, const char *, u_int32_t));
+ */
+int
+__db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags)
+ DB *mdbp, *sdbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *subdb;
+ DBTYPE type;
+ mu_action action;
+ const char *newname;
+ u_int32_t flags;
+{
+ DBC *dbc, *ndbc;
+ DBT key, data, ndata;
+ ENV *env;
+ PAGE *p, *r;
+ db_pgno_t t_pgno;
+ int modify, ret, t_ret;
+
+ env = mdbp->env;
+ dbc = ndbc = NULL;
+ p = NULL;
+
+ /*
+ * Open up a cursor. If this is CDB and we're creating the database,
+ * make it an update cursor.
+ *
+ * Might we modify the master database? If so, we'll need to lock.
+ */
+ modify = (!F_ISSET(mdbp, DB_AM_RDONLY) &&
+ (action != MU_OPEN || LF_ISSET(DB_CREATE))) ? 1 : 0;
+
+ if ((ret = __db_cursor(mdbp, ip, txn, &dbc,
+ (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /*
+ * Point the cursor at the record.
+ *
+ * If we're removing or potentially creating an entry, lock the page
+ * with DB_RMW.
+ *
+ * We do multiple cursor operations with the cursor in some cases and
+ * subsequently access the data DBT information. Set DB_DBT_MALLOC so
+ * we don't risk modification of the data between our uses of it.
+ *
+ * !!!
+ * We don't include the name's nul termination in the database.
+ */
+ DB_INIT_DBT(key, subdb, strlen(subdb));
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_MALLOC);
+
+ ret = __dbc_get(dbc, &key, &data,
+ DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
+
+ /*
+ * What we do next--whether or not we found a record for the
+ * specified subdatabase--depends on what the specified action is.
+ * Handle ret appropriately as the first statement of each case.
+ */
+ switch (action) {
+ case MU_REMOVE:
+ /*
+ * We should have found something if we're removing it. Note
+ * that in the common case where the DB we're asking to remove
+ * doesn't exist, we won't get this far; __db_subdb_remove
+ * will already have returned an error from __db_open.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Delete the subdatabase entry first; if this fails,
+ * we don't want to touch the actual subdb pages.
+ */
+ if ((ret = __dbc_del(dbc, 0)) != 0)
+ goto err;
+
+ /*
+ * We're handling actual data, not on-page meta-data,
+ * so it hasn't been converted to/from opposite
+ * endian architectures. Do it explicitly, now.
+ */
+ memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+ if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno,
+ ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0)
+ goto err;
+
+ /* Free the root on the master db if it was created. */
+ if (TYPE(p) == P_BTREEMETA &&
+ ((BTMETA *)p)->root != PGNO_INVALID) {
+ if ((ret = __memp_fget(mdbp->mpf,
+ &((BTMETA *)p)->root, ip, dbc->txn,
+ DB_MPOOL_DIRTY, &r)) != 0)
+ goto err;
+
+ /* Free and put the page. */
+ if ((ret = __db_free(dbc, r, 0)) != 0) {
+ r = NULL;
+ goto err;
+ }
+ }
+ /* Free and put the page. */
+ if ((ret = __db_free(dbc, p, 0)) != 0) {
+ p = NULL;
+ goto err;
+ }
+ p = NULL;
+ break;
+ case MU_RENAME:
+ /* We should have found something if we're renaming it. */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Before we rename, we need to make sure we're not
+ * overwriting another subdatabase, or else this operation
+ * won't be undoable. Open a second cursor and check
+ * for the existence of newname; it shouldn't appear under
+ * us since we hold the metadata lock.
+ */
+ if ((ret = __db_cursor(mdbp, ip, txn, &ndbc,
+ CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ goto err;
+ DB_SET_DBT(key, newname, strlen(newname));
+
+ /*
+ * We don't actually care what the meta page of the potentially-
+ * overwritten DB is; we just care about existence.
+ */
+ memset(&ndata, 0, sizeof(ndata));
+ F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) {
+ /* A subdb called newname exists. Bail. */
+ ret = EEXIST;
+ __db_errx(env, DB_STR_A("0673",
+ "rename: database %s exists", "%s"), newname);
+ goto err;
+ } else if (ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * Now do the put first; we don't want to lose our only
+ * reference to the subdb. Use the second cursor so the
+ * first one continues to point to the old record.
+ */
+ if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0)
+ goto err;
+ if ((ret = __dbc_del(dbc, 0)) != 0) {
+ /*
+ * If the delete fails, try to delete the record
+ * we just put, in case we're not txn-protected.
+ */
+ (void)__dbc_del(ndbc, 0);
+ goto err;
+ }
+
+ break;
+ case MU_OPEN:
+ /*
+ * Get the subdatabase information. If it already exists,
+ * copy out the page number and we're done.
+ */
+ switch (ret) {
+ case 0:
+ if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+ ret = EEXIST;
+ goto err;
+ }
+ memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+ goto done;
+ case DB_NOTFOUND:
+ if (LF_ISSET(DB_CREATE))
+ break;
+ /*
+ * No db_err: this open may be on behalf of a remove, and it
+ * is reasonable to remove a nonexistent db.
+ */
+ ret = ENOENT;
+ goto err;
+ default:
+ goto err;
+ }
+
+ /* Create a subdatabase. */
+ if (F_ISSET(mdbp, DB_AM_RDONLY)) {
+ ret = EBADF;
+ goto err;
+ }
+ if ((ret = __db_new(dbc,
+ type == DB_HASH ? P_HASHMETA : P_BTREEMETA, NULL, &p)) != 0)
+ goto err;
+ sdbp->meta_pgno = PGNO(p);
+
+ /*
+ * XXX
+ * We're handling actual data, not on-page meta-data, so it
+ * hasn't been converted to/from opposite endian architectures.
+ * Do it explicitly, now.
+ */
+ t_pgno = PGNO(p);
+ DB_HTONL_SWAP(env, &t_pgno);
+ memset(&ndata, 0, sizeof(ndata));
+ ndata.data = &t_pgno;
+ ndata.size = sizeof(db_pgno_t);
+ if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
+ goto err;
+ F_SET(sdbp, DB_AM_CREATED);
+ break;
+
+ case MU_MOVE:
+ /* We should have found something if we're moving it. */
+ if (ret != 0)
+ goto err;
+ t_pgno = sdbp->meta_pgno;
+ DB_HTONL_SWAP(env, &t_pgno);
+ memset(&ndata, 0, sizeof(ndata));
+ ndata.data = &t_pgno;
+ ndata.size = sizeof(db_pgno_t);
+ if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
+ goto err;
+ mdbp->mpf->mfp->revision++;
+ break;
+ }
+
+err:
+done: /*
+ * If we allocated a page: if we're successful, mark the page dirty
+ * and return it to the cache, otherwise, discard/free it.
+ */
+ if (p != NULL && (t_ret = __memp_fput(mdbp->mpf,
+ dbc->thread_info, p, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the cursor(s) and data. */
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
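
As the byte-swapping above shows, the master database is an ordinary btree whose keys are subdatabase names (stored without nul termination) and whose data items are meta page numbers in network byte order. The documented way to enumerate it is to open the file with a NULL database name; a sketch, with error handling compressed:

```c
#include <stdio.h>
#include <string.h>
#include <db.h>

/* Print the subdatabase names stored in a file's master database. */
int
list_subdbs(DB_ENV *dbenv, const char *file)
{
	DB *dbp;
	DBC *dbc;
	DBT key, data;
	int ret, t_ret;

	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
		return (ret);
	/* A NULL database name yields the master database itself. */
	if ((ret = dbp->open(dbp,
	    NULL, file, NULL, DB_UNKNOWN, DB_RDONLY, 0)) != 0)
		goto err;
	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
		goto err;
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0)
		printf("%.*s\n", (int)key.size, (char *)key.data);
	if (ret == DB_NOTFOUND)
		ret = 0;
	if ((t_ret = dbc->close(dbc)) != 0 && ret == 0)
		ret = t_ret;
err:	if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
```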
+
+/*
+ * __env_dbreg_setup --
+ * Set up dbreg bookkeeping (an FNAME entry and, when logging,
+ * a log fileid) for a database handle.
+ *
+ * PUBLIC: int __env_dbreg_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbreg_setup(dbp, txn, fname, dname, id)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ u_int32_t id;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+ if (dbp->log_filename == NULL
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+ && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER))
+#endif
+#if !defined(DEBUG_ROP)
+ && !F_ISSET(dbp, DB_AM_RDONLY)
+#endif
+ ) {
+ if ((ret = __dbreg_setup(dbp,
+ F_ISSET(dbp, DB_AM_INMEM) ? dname: fname,
+ F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0)
+ return (ret);
+
+ /*
+ * If we're actively logging and our caller isn't a
+ * recovery function that already did so, then assign
+ * this dbp a log fileid.
+ */
+ if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+ (ret = __dbreg_new_id(dbp, txn)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * __env_setup --
+ * Set up the underlying environment during a db_open.
+ *
+ * PUBLIC: int __env_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
+ */
+int
+__env_setup(dbp, txn, fname, dname, id, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ u_int32_t id, flags;
+{
+ DB *ldbp;
+ DB_ENV *dbenv;
+ ENV *env;
+ u_int32_t maxid;
+ int ret;
+
+ env = dbp->env;
+ dbenv = env->dbenv;
+
+ /*
+ * When verifying an in-memory db, we need to pass dname to
+ * __env_mpool. That is the only time fname will be used.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM) && F_ISSET(dbp, DB_AM_VERIFYING))
+ fname = dname;
+
+ /* If we don't yet have an environment, it's time to create it. */
+ if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+#if defined(HAVE_MIXED_SIZE_ADDRESSING) && (SIZEOF_CHAR_P == 8)
+ __db_errx(env, DB_STR("0701", "DB_PRIVATE is not supported by"
+ " 64-bit applications in mixed-size-addressing mode"));
+ return (EINVAL);
+#endif
+ /* Make sure we have at least DB_MINCACHE pages in our cache. */
+ if (dbenv->mp_gbytes == 0 &&
+ dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE &&
+ (ret = __memp_set_cachesize(
+ dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0)
+ return (ret);
+
+ if ((ret = __env_open(dbenv, NULL, DB_CREATE |
+ DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0)
+ return (ret);
+ }
+
+ /* Join the underlying cache. */
+ if ((!F_ISSET(dbp, DB_AM_INMEM) || F_ISSET(dbp, DB_AM_VERIFYING) ||
+ dname == NULL) && (ret = __env_mpool(dbp, fname, flags)) != 0)
+ return (ret);
+
+ /* We may need a per-thread mutex. */
+ if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc(
+ env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0)
+ return (ret);
+
+ /*
+ * Set up a bookkeeping entry for this database in the log region,
+ * if such a region exists. Note that even if we're in recovery
+ * or a replication client, where we won't log registries, we'll
+ * still need an FNAME struct, so LOGGING_ON is the correct macro.
+ */
+ if (LOGGING_ON(env) &&
+ (!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) &&
+ (ret = __env_dbreg_setup(dbp, txn, fname, dname, id)) != 0)
+ return (ret);
+
+ /*
+ * Insert ourselves into the ENV's dblist. We allocate a
+ * unique ID to each {fileid, meta page number} pair, and to
+ * each temporary file (since they all have a zero fileid).
+ * This ID gives us something to use to tell which DB handles
+ * go with which databases in all the cursor adjustment
+ * routines, where we don't want to do a lot of ugly and
+ * expensive memcmps.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ maxid = 0;
+ TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) {
+ /*
+ * There are three cases: on-disk database (first clause),
+ * named in-memory database (second clause), temporary database
+ * (never matches; no clause).
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN)
+ == 0 && ldbp->meta_pgno == dbp->meta_pgno)
+ break;
+ } else if (dname != NULL) {
+ if (F_ISSET(ldbp, DB_AM_INMEM) &&
+ ldbp->dname != NULL &&
+ strcmp(ldbp->dname, dname) == 0)
+ break;
+ }
+ if (ldbp->adj_fileid > maxid)
+ maxid = ldbp->adj_fileid;
+ }
+
+ /*
+ * If ldbp is NULL, we didn't find a match. Assign the dbp an
+ * adj_fileid one higher than the largest we found, and
+ * insert it at the head of the master dbp list.
+ *
+ * If ldbp is not NULL, it is a match for our dbp. Give dbp
+ * the same ID that ldbp has, and add it after ldbp so they're
+ * together in the list.
+ */
+ if (ldbp == NULL) {
+ dbp->adj_fileid = maxid + 1;
+ TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks);
+ } else {
+ dbp->adj_fileid = ldbp->adj_fileid;
+ TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ return (0);
+}
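
Stripped of DB's structures, the adj_fileid assignment is a small pattern: scan the list, share the ID of the first matching handle, otherwise claim one greater than the maximum seen. The same logic with hypothetical types:

```c
#include <stddef.h>

/* `key' stands in for the {fileid, meta_pgno} / dname match above. */
struct handle {
	struct handle *next;
	unsigned int id;
	int key;
};

unsigned int
assign_id(struct handle *head, int key)
{
	struct handle *h;
	unsigned int maxid;

	maxid = 0;
	for (h = head; h != NULL; h = h->next) {
		if (h->key == key)
			return (h->id);		/* share the existing ID */
		if (h->id > maxid)
			maxid = h->id;
	}
	return (maxid + 1);			/* one above the largest seen */
}
```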
+
+/*
+ * __env_mpool --
+ * Set up the underlying environment cache during a db_open.
+ *
+ * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t));
+ */
+int
+__env_mpool(dbp, fname, flags)
+ DB *dbp;
+ const char *fname;
+ u_int32_t flags;
+{
+ DBT pgcookie;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ int fidset, ftype, ret;
+ int32_t lsn_off;
+ u_int8_t nullfid[DB_FILE_ID_LEN];
+ u_int32_t clear_len;
+
+ env = dbp->env;
+
+ /* The LSN is the first entry on a DB page, byte offset 0. */
+ lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0;
+
+ /* It's possible that this database is already open. */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ return (0);
+
+ /*
+ * If we need to pre- or post-process a file's pages on I/O, set the
+ * file type. If it's a hash file, always call the pgin and pgout
+ * routines. This means that hash files can never be mapped into
+ * process memory. If it's a btree file and requires swapping, we
+ * need to page the file in and out. This has to be right -- we can't
+ * mmap files that are being paged in and out.
+ */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_HEAP:
+ case DB_RECNO:
+ ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
+ ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
+ clear_len = CRYPTO_ON(env) ?
+ (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+ DB_PAGE_DB_LEN;
+ break;
+ case DB_HASH:
+ ftype = DB_FTYPE_SET;
+ clear_len = CRYPTO_ON(env) ?
+ (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+ DB_PAGE_DB_LEN;
+ break;
+ case DB_QUEUE:
+ ftype = F_ISSET(dbp,
+ DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ?
+ DB_FTYPE_SET : DB_FTYPE_NOTSET;
+
+ /*
+ * If we came in here without a pagesize set, then we need
+ * to mark the in-memory handle as having clear_len not
+ * set, because we don't really know the clear length or
+ * the page size yet (since the file doesn't yet exist).
+ */
+ clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET;
+ break;
+ case DB_UNKNOWN:
+ /*
+ * If we're running in the verifier, our database might
+ * be corrupt and we might not know its type--but we may
+ * still want to be able to verify and salvage.
+ *
+ * If we can't identify the type, it's not going to be safe
+ * to call __db_pgin--we pretty much have to give up all
+ * hope of salvaging cross-endianness. Proceed anyway;
+ * at worst, the database will just appear more corrupt
+ * than it actually is, but at best, we may be able
+ * to salvage some data even with no metadata page.
+ */
+ if (F_ISSET(dbp, DB_AM_VERIFYING)) {
+ ftype = DB_FTYPE_NOTSET;
+ clear_len = DB_PAGE_DB_LEN;
+ break;
+ }
+
+ /*
+ * This might be an in-memory file and we won't know its
+ * file type until after we open it and read the meta-data
+ * page.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ clear_len = DB_CLEARLEN_NOTSET;
+ ftype = DB_FTYPE_NOTSET;
+ lsn_off = DB_LSN_OFF_NOTSET;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ return (__db_unknown_type(env, "DB->open", dbp->type));
+ }
+
+ mpf = dbp->mpf;
+
+ memset(nullfid, 0, DB_FILE_ID_LEN);
+ fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN);
+ if (fidset)
+ (void)__memp_set_fileid(mpf, dbp->fileid);
+
+ (void)__memp_set_clear_len(mpf, clear_len);
+ (void)__memp_set_ftype(mpf, ftype);
+ (void)__memp_set_lsn_offset(mpf, lsn_off);
+
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pgcookie.data = &pginfo;
+ pgcookie.size = sizeof(DB_PGINFO);
+ (void)__memp_set_pgcookie(mpf, &pgcookie);
+
+#ifndef DIAG_MVCC
+ if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+#endif
+ if (F_ISSET(dbp, DB_AM_TXN) &&
+ dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
+ LF_SET(DB_MULTIVERSION);
+
+ if ((ret = __memp_fopen(mpf, NULL, fname, &dbp->dirname,
+ LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION |
+ DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) |
+ (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) |
+ (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0),
+ 0, dbp->pgsize)) != 0) {
+ /*
+ * The open didn't work; we need to reset the mpf,
+ * retaining the in-memory semantics (if any).
+ */
+ (void)__memp_fclose(dbp->mpf, 0);
+ (void)__memp_fcreate(env, &dbp->mpf);
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ MAKE_INMEM(dbp);
+ return (ret);
+ }
+
+ /*
+ * Set the open flag. We use it to mean that the dbp has gone
+ * through mpf setup, including dbreg_register. Also, below,
+ * the underlying access method open functions may want to do
+ * things like acquire cursors, so the open flag has to be set
+ * before calling them.
+ */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+ if (!fidset && fname != NULL) {
+ (void)__memp_get_fileid(dbp->mpf, dbp->fileid);
+ dbp->preserve_fid = 1;
+ }
+
+ return (0);
+}
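
The ftype and pgcookie settings above are the internal counterpart of the public mpool page-conversion interface: a file registered with a file type has its pgin/pgout callbacks run on every page read and write, which is also why such files can never be memory-mapped. A sketch of the application-level equivalent; MY_FTYPE and the callback bodies are placeholders:

```c
#include <db.h>

#define	MY_FTYPE	20		/* application-chosen file-type code */

static int
my_pgin(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
{
	/* Convert the page to its in-memory form after it is read. */
	return (0);
}

static int
my_pgout(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
{
	/* Convert the page back to its on-disk form before it is written. */
	return (0);
}

int
register_conversion(DB_ENV *dbenv)
{
	return (dbenv->memp_register(dbenv, MY_FTYPE, my_pgin, my_pgout));
}
```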
+
+/*
+ * __db_close --
+ * DB->close method.
+ *
+ * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__db_close(dbp, txn, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ ENV *env;
+ int db_ref, deferred_close, ret, t_ret;
+
+ env = dbp->env;
+ deferred_close = 0;
+
+ PERFMON4(env, db, close,
+ dbp->fname, dbp->dname, flags, &dbp->fileid[0]);
+
+ /* Refresh the structure and close any underlying resources. */
+ ret = __db_refresh(dbp, txn, flags, &deferred_close, 0);
+
+ /*
+ * If we've deferred the close because the logging of the close failed,
+ * return our failure right away without destroying the handle.
+ */
+ if (deferred_close)
+ return (ret);
+
+ /* !!!
+ * This code has an apparent race between the moment we read and
+ * decrement env->db_ref and the moment we check whether it's 0.
+ * However, if the environment is DBLOCAL, the user shouldn't have a
+ * reference to the env handle anyway; the only way we can get
+ * multiple dbps sharing a local env is if we open them internally
+ * during something like a subdatabase open. If any such thing is
+ * going on while the user is closing the original dbp with a local
+ * env, someone's already badly screwed up, so there's no reason
+ * to bother engineering around this possibility.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ db_ref = --env->db_ref;
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+ if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 &&
+ (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free the database handle. */
+ memset(dbp, CLEAR_BYTE, sizeof(*dbp));
+ __os_free(env, dbp);
+
+ return (ret);
+}
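
One consequence of the final memset/free pair: the handle is gone when `DB->close` returns, whether or not it reports an error (the deferred-close path above is the lone exception, kept alive for the abort). A caller-side sketch:

```c
#include <db.h>

/* Close a handle; DB_NOSYNC skips the final flush for discardable data. */
int
close_db(DB *dbp, int discard)
{
	int ret;

	ret = dbp->close(dbp, discard ? DB_NOSYNC : 0);
	/* dbp has been freed here; never touch it again, even on error. */
	return (ret);
}
```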
+
+/*
+ * __db_refresh --
+ * Refresh the DB structure, releasing any allocated resources.
+ * This does most of the work of closing files now because refresh
+ * is what is used during abort processing (since we can't destroy
+ * the actual handle) and during abort processing, we may have a
+ * fully opened handle.
+ *
+ * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
+ */
+int
+__db_refresh(dbp, txn, flags, deferred_closep, reuse)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+ int *deferred_closep, reuse;
+{
+ DB *sdbp;
+ DBC *dbc;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ u_int32_t save_flags;
+ int resync, ret, t_ret;
+
+ ret = 0;
+
+ env = dbp->env;
+ infop = env->reginfo;
+ if (infop != NULL)
+ renv = infop->primary;
+ else
+ renv = NULL;
+
+ /*
+ * If this dbp is not completely open, there is no mpool file to
+ * sync; set DB_NOSYNC so we don't trap trying to sync one.
+ */
+ if (dbp->mpf == NULL)
+ LF_SET(DB_NOSYNC);
+
+ /* If never opened, or not currently open, it's easy. */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ goto never_opened;
+
+ /*
+ * If we have any secondary indices, disassociate them from us.
+ * We don't bother with the mutex here; it only protects some
+ * of the ops that will make us core-dump mid-close anyway, and
+ * if you're trying to do something with a secondary *while* you're
+ * closing the primary, you deserve what you get. The disassociation
+ * is mostly done just so we can close primaries and secondaries in
+ * any order--but within one thread of control.
+ */
+ LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) {
+ LIST_REMOVE(sdbp, s_links);
+ if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY))
+ LIST_REMOVE(dbp, s_links);
+
+ /*
+ * Disassociate ourself from any databases using us as a foreign key
+ * database by clearing the referring db's pointer. Reclaim memory.
+ */
+ f_info = LIST_FIRST(&dbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ LIST_REMOVE(f_info, f_links);
+ f_info->dbp->s_foreign = NULL;
+ __os_free(env, f_info);
+ f_info = tmp;
+ }
+
+ if (dbp->s_foreign != NULL &&
+ (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Sync the underlying access method. Do before closing the cursors
+ * because DB->sync allocates cursors in order to write Recno backing
+ * source text files.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (!LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __db_sync(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Go through the active cursors, unregister each cursor from its
+ * transaction if any, and call the cursor recycle routine,
+ * which resolves pending operations and moves the cursors onto the
+ * free list. Then, walk the free list and call the cursor destroy
+ * routine. Note that any failure on a close is considered "really
+ * bad" and we just break out of the loop and force forward.
+ */
+ resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1;
+ while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) {
+ if (dbc->txn != NULL)
+ TAILQ_REMOVE(&(dbc->txn->my_cursors), dbc, txn_cursors);
+
+ if ((t_ret = __dbc_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+ }
+
+ while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Close any outstanding join cursors. Join cursors destroy themselves
+ * on close and have no separate destroy routine. We don't have to set
+ * the resync flag here, because join cursors aren't write cursors.
+ */
+ while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
+ if ((t_ret = __db_join_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Sync the memory pool, even though we've already called DB->sync,
+ * because closing cursors can dirty pages by deleting items they
+ * referenced.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (resync && !LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If there is a file extension watermark associated with this
+ * database, we don't need it any more.
+ */
+ __txn_remove_fe_watermark(txn, dbp);
+
+never_opened:
+ MUTEX_LOCK(env, env->mtx_dblist);
+ /*
+ * At this point, we haven't done anything to render the DB handle
+ * unusable, at least by a transaction abort. Take the opportunity
+ * now to log the file close if we have initialized the logging
+ * information. If this log fails and we're in a transaction,
+ * we have to bail out of the attempted close; we'll need a dbp in
+ * order to successfully abort the transaction, and we can't conjure
+ * a new one up because we haven't gotten out the dbreg_register
+ * record that represents the close. In this case, we put off
+ * actually closing the dbp until we've performed the abort.
+ */
+ if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) {
+ /*
+ * Discard the log file id, if any. We want to log the close
+ * if and only if this is not a recovery dbp, a client dbp,
+ * or a dead dbp handle.
+ */
+ DB_ASSERT(env, renv != NULL);
+ if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) ||
+ dbp->timestamp != renv->rep_timestamp) {
+ if ((t_ret = __dbreg_revoke_id(dbp,
+ 0, DB_LOGFILEID_INVALID)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if ((t_ret = __dbreg_close_id(dbp,
+ txn, DBREG_CLOSE)) != 0 && txn != NULL) {
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+ /*
+ * We're in a txn and the attempt to log the
+ * close failed; let the txn subsystem know
+ * that we need to destroy this dbp once we're
+ * done with the abort, then bail from the
+ * close.
+ *
+ * Note that if the attempt to put off the
+ * close -also- fails--which it won't unless
+ * we're out of heap memory--we're really
+ * screwed. Panic.
+ */
+ if ((ret =
+ __txn_closeevent(env, txn, dbp)) != 0)
+ return (__env_panic(env, ret));
+ if (deferred_closep != NULL)
+ *deferred_closep = 1;
+ return (t_ret);
+ }
+ /*
+ * If dbreg_close_id failed and we were not in a
+ * transaction, then we need to finish this close
+ * because the caller can't do anything with the
+ * handle after we return an error. We rely on
+ * dbreg_close_id to mark the entry in some manner
+ * so that we do not do a clean shutdown of this
+ * environment. If shutdown isn't clean, then the
+ * application *must* run recovery and that will
+ * generate the RCLOSE record.
+ */
+ }
+
+ }
+
+ /* Close any handle we've been holding since the open. */
+ if (dbp->saved_open_fhp != NULL &&
+ (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /*
+ * Remove this DB handle from the ENV's dblist, if it's been added.
+ *
+ * Close our reference to the underlying cache while locked, we don't
+ * want to race with a thread searching for our underlying cache link
+ * while opening a DB handle.
+ *
+ * The DB handle may not yet have been added to the ENV list, don't
+ * blindly call the underlying TAILQ_REMOVE macro. Explicitly reset
+ * the field values to NULL so that we can't call TAILQ_REMOVE twice.
+ */
+ if (!reuse &&
+ (dbp->dblistlinks.tqe_next != NULL ||
+ dbp->dblistlinks.tqe_prev != NULL)) {
+ TAILQ_REMOVE(&env->dblist, dbp, dblistlinks);
+ dbp->dblistlinks.tqe_next = NULL;
+ dbp->dblistlinks.tqe_prev = NULL;
+ }
+
+ /* Close the memory pool file handle. */
+ if (dbp->mpf != NULL) {
+ if ((t_ret = __memp_fclose(dbp->mpf,
+ F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ dbp->mpf = NULL;
+ if (reuse &&
+ (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Call the access specific close function.
+ *
+ * We do this here rather than in __db_close as we need to do this when
+ * aborting an open so that file descriptors are closed and abort of
+ * renames can succeed on platforms that lock open files (such as
+ * Windows). In particular, we need to ensure that all the extents
+ * associated with a queue are closed so that queue renames can be
+ * aborted.
+ *
+ * It is also important that we do this before releasing the handle
+ * lock, because dbremove and dbrename assume that once they have the
+ * handle lock, it is safe to modify the underlying file(s).
+ *
+ * !!!
+ * Because of where these functions are called in the DB handle close
+ * process, these routines can't do anything that would dirty pages or
+ * otherwise affect closing down the database. Specifically, we can't
+ * abort and recover any of the information they control.
+ */
+#ifdef HAVE_PARTITION
+ if (dbp->p_internal != NULL &&
+ (t_ret = __partition_close(dbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __heap_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * !!!
+ * At this point, the access-method specific information has been
+ * freed. From now on, we can use the dbp, but not touch any
+ * access-method specific data.
+ */
+
+ if (!reuse && dbp->locker != NULL) {
+ /* We may have pending trade operations on this dbp. */
+ if (txn == NULL)
+ txn = dbp->cur_txn;
+ if (IS_REAL_TXN(txn))
+ __txn_remlock(env,
+ txn, &dbp->handle_lock, dbp->locker);
+
+ /* We may be holding the handle lock; release it. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ if ((t_ret = __lock_vec(env,
+ dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret =
+ __lock_id_free(env, dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->locker = NULL;
+ LOCK_INIT(dbp->handle_lock);
+ }
+
+ /*
+ * If this is a temporary file (un-named in-memory file), then
+ * discard the locker ID allocated as the fileid.
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid &&
+ *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) {
+ if ((t_ret = __lock_getlocker(env->lk_handle,
+ *(u_int32_t *)dbp->fileid, 0, &locker)) == 0)
+ t_ret = __lock_id_free(env, locker);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if (reuse) {
+ /*
+ * If we are reusing this dbp, then we're done now. Re-init
+ * the handle, preserving important flags, and then return.
+ * This code is borrowed from __db_init, which does more
+ * than we can do here.
+ */
+ save_flags = F_ISSET(dbp, DB_AM_INMEM |
+ DB_AM_RDONLY | DB_AM_TXN);
+
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __heap_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+ /* Restore flags */
+ dbp->flags = dbp->orig_flags | save_flags;
+
+ if (FLD_ISSET(save_flags, DB_AM_INMEM)) {
+ /*
+ * If this is inmem, then it may have a fileid
+ * even if it was never opened, and we need to
+ * clear out that fileid.
+ */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ MAKE_INMEM(dbp);
+ }
+ return (ret);
+ }
+
+ dbp->type = DB_UNKNOWN;
+
+ /*
+ * The thread mutex may have been invalidated in __dbreg_close_id if
+ * the fname refcount did not go to 0.  If it was not invalidated,
+ * discard it now.
+ */
+ if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard any memory allocated for the file and database names. */
+ if (dbp->fname != NULL) {
+ __os_free(dbp->env, dbp->fname);
+ dbp->fname = NULL;
+ }
+ if (dbp->dname != NULL) {
+ __os_free(dbp->env, dbp->dname);
+ dbp->dname = NULL;
+ }
+
+ /* Discard any memory used to store returned data. */
+ if (dbp->my_rskey.data != NULL)
+ __os_free(dbp->env, dbp->my_rskey.data);
+ if (dbp->my_rkey.data != NULL)
+ __os_free(dbp->env, dbp->my_rkey.data);
+ if (dbp->my_rdata.data != NULL)
+ __os_free(dbp->env, dbp->my_rdata.data);
+
+ /* For safety's sake; we may refresh twice. */
+ memset(&dbp->my_rskey, 0, sizeof(DBT));
+ memset(&dbp->my_rkey, 0, sizeof(DBT));
+ memset(&dbp->my_rdata, 0, sizeof(DBT));
+
+ /* Clear out fields that normally get set during open. */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ dbp->adj_fileid = 0;
+ dbp->meta_pgno = 0;
+ dbp->cur_locker = NULL;
+ dbp->cur_txn = NULL;
+ dbp->associate_locker = NULL;
+ dbp->open_flags = 0;
+
+ /*
+ * If we are being refreshed with a txn specified, then we need
+ * to make sure that we clear out the lock handle field, because
+ * releasing all the locks for this transaction will release this
+ * lock and we don't want close to stumble upon this handle and
+ * try to close it.
+ */
+ if (txn != NULL)
+ LOCK_INIT(dbp->handle_lock);
+
+ /* Reset flags to whatever the user configured. */
+ dbp->flags = dbp->orig_flags;
+
+ return (ret);
+}
+
+/*
+ * __db_disassociate --
+ * Destroy the association between a given secondary and its primary.
+ */
+static int
+__db_disassociate(sdbp)
+ DB *sdbp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ ret = 0;
+
+ sdbp->s_callback = NULL;
+ sdbp->s_primary = NULL;
+ sdbp->get = sdbp->stored_get;
+ sdbp->close = sdbp->stored_close;
+
+ /*
+ * Complain, but proceed, if we have any active cursors. (We're in
+ * the middle of a close, so there's really no turning back.)
+ */
+ if (sdbp->s_refcnt != 1 ||
+ TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_errx(sdbp->env, DB_STR("0674",
+"Closing a primary DB while a secondary DB has active cursors is unsafe"));
+ ret = EINVAL;
+ }
+ sdbp->s_refcnt = 0;
+
+ while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(sdbp, DB_AM_SECONDARY);
+ return (ret);
+}
+
+/*
+ * __db_disassociate_foreign --
+ * Destroy the association between a given secondary and its foreign.
+ */
+static int
+__db_disassociate_foreign(sdbp)
+ DB *sdbp;
+{
+ DB *fdbp;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ int ret;
+
+ if (sdbp->s_foreign == NULL)
+ return (0);
+
+ fdbp = sdbp->s_foreign;
+ ret = 0;
+ f_info = LIST_FIRST(&fdbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ if (f_info->dbp == sdbp) {
+ LIST_REMOVE(f_info, f_links);
+ __os_free(sdbp->env, f_info);
+ }
+ f_info = tmp;
+ }
+
+ return (ret);
+}
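
This routine undoes the linkage created by `DB->associate_foreign`. A hedged sketch of the forward direction, which populates the f_primaries list walked above; the callback shape and DB_FOREIGN_NULLIFY flag follow the public API, but verify against db.h:

```c
#include <db.h>

/*
 * Nullify callback: invoked when a foreign key is deleted so matching
 * secondary records can be rewritten in place. Body is a placeholder.
 */
static int
nullify_cb(DB *sdbp, const DBT *key, DBT *data, const DBT *fkey, int *changedp)
{
	/* Blank the foreign-key portion of `data' and set *changedp = 1. */
	*changedp = 0;
	return (0);
}

int
hook_foreign(DB *fdbp, DB *sdbp)
{
	/* Adds an f_primaries entry in fdbp pointing back at sdbp. */
	return (fdbp->associate_foreign(fdbp, sdbp,
	    nullify_cb, DB_FOREIGN_NULLIFY));
}
```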
+
+/*
+ * __db_log_page --
+ * Log a meta-data or root page during a subdatabase create operation.
+ *
+ * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
+ */
+int
+__db_log_page(dbp, txn, lsn, pgno, page)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_LSN *lsn;
+ db_pgno_t pgno;
+ PAGE *page;
+{
+ DBT page_dbt;
+ DB_LSN new_lsn;
+ int ret;
+
+ if (!LOGGING_ON(dbp->env) || txn == NULL)
+ return (0);
+
+ memset(&page_dbt, 0, sizeof(page_dbt));
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = page;
+
+ ret = __crdel_metasub_log(dbp, txn, &new_lsn, F_ISSET(dbp,
+ DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0, pgno, &page_dbt, lsn);
+
+ if (ret == 0)
+ page->lsn = new_lsn;
+ return (ret);
+}
+
+/*
+ * __db_walk_cursors --
+ * Walk all cursors for a database.
+ *
+ * PUBLIC: int __db_walk_cursors __P((DB *, DBC *,
+ * PUBLIC: int (*) __P((DBC *, DBC *,
+ * PUBLIC: u_int32_t *, db_pgno_t, u_int32_t, void *)),
+ * PUBLIC: u_int32_t *, db_pgno_t, u_int32_t, void *));
+ */
+int
+__db_walk_cursors(dbp, my_dbc, func, countp, pgno, indx, args)
+ DB *dbp;
+ DBC *my_dbc;
+ int (*func)__P((DBC *, DBC *,
+ u_int32_t *, db_pgno_t, u_int32_t, void *));
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ ENV *env;
+ DB *ldbp;
+ DBC *dbc;
+ int ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (*countp = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+loop: MUTEX_LOCK(env, ldbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
+ if ((ret = (func)(dbc, my_dbc,
+ countp, pgno, indx, args)) != 0)
+ break;
+ /*
+ * We use the error to communicate that function
+ * dropped the mutex.
+ */
+ if (ret == DB_LOCK_NOTGRANTED)
+ goto loop;
+ MUTEX_UNLOCK(env, ldbp->mutex);
+ if (ret != 0)
+ break;
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+ return (ret);
+}
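
Callbacks passed to `__db_walk_cursors` must match the declared signature, and returning DB_LOCK_NOTGRANTED is reserved to mean "the handle mutex was dropped, rescan this handle". A hypothetical callback, for shape only:

```c
/* Count cursors (other than the caller's own) positioned on page `pgno'. */
static int
count_on_page(dbc, my_dbc, countp, pgno, indx, args)
	DBC *dbc, *my_dbc;
	u_int32_t *countp;
	db_pgno_t pgno;
	u_int32_t indx;
	void *args;
{
	COMPQUIET(indx, 0);
	COMPQUIET(args, NULL);

	if (dbc != my_dbc && dbc->internal->pgno == pgno)
		++*countp;
	return (0);
}
```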
+
+/*
+ * __db_backup_name --
+ * Create the backup file name for a given file.
+ *
+ * PUBLIC: int __db_backup_name __P((ENV *,
+ * PUBLIC: const char *, DB_TXN *, char **));
+ */
+#undef BACKUP_PREFIX
+#define BACKUP_PREFIX "__db."
+
+#undef MAX_INT_TO_HEX
+#define MAX_INT_TO_HEX 8
+
+int
+__db_backup_name(env, name, txn, backup)
+ ENV *env;
+ const char *name;
+ DB_TXN *txn;
+ char **backup;
+{
+ u_int32_t id;
+ size_t len;
+ int ret;
+ char *p, *retp;
+
+ *backup = NULL;
+
+ /*
+ * Part of the name may be a full path, so we need to make sure that
+ * we allocate enough space for it, even in the case where we don't
+ * use the entire filename for the backup name.
+ */
+ len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1;
+ if ((ret = __os_malloc(env, len, &retp)) != 0)
+ return (ret);
+
+ /*
+ * Create the name. Backup file names are in one of 2 forms: in a
+ * transactional env "__db.TXNID.ID", where ID is a random number,
+ * and in any other env "__db.FILENAME".
+ *
+ * In addition, the name passed may contain an env-relative path.
+ * In that case, put the "__db." in the right place (in the last
+ * component of the pathname).
+ *
+ * There are four cases here:
+ * 1. simple path w/out transaction
+ * 2. simple path + transaction
+ * 3. multi-component path w/out transaction
+ * 4. multi-component path + transaction
+ */
+ p = __db_rpath(name);
+ if (IS_REAL_TXN(txn)) {
+ __os_unique_id(env, &id);
+ if (p == NULL) /* Case 2. */
+ snprintf(retp, len, "%s%x.%x",
+ BACKUP_PREFIX, txn->txnid, id);
+ else /* Case 4. */
+ snprintf(retp, len, "%.*s%x.%x",
+ (int)(p - name) + 1, name, txn->txnid, id);
+ } else {
+ if (p == NULL) /* Case 1. */
+ snprintf(retp, len, "%s%s", BACKUP_PREFIX, name);
+ else /* Case 3. */
+ snprintf(retp, len, "%.*s%s%s",
+ (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1);
+ }
+
+ *backup = retp;
+ return (0);
+}
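
Concretely, the four cases expand as follows (hex values hypothetical). Note that, as written, case 4 omits BACKUP_PREFIX entirely and relies on the txnid/unique-id pair alone for uniqueness:

```c
__db_backup_name(env, "file.db", NULL, &bp);	/* "__db.file.db"	 (1) */
__db_backup_name(env, "file.db", txn, &bp);	/* "__db.80000001.3a7f"	 (2) */
__db_backup_name(env, "a/file.db", NULL, &bp);	/* "a/__db.file.db"	 (3) */
__db_backup_name(env, "a/file.db", txn, &bp);	/* "a/80000001.3a7f"	 (4) */
```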
+
+#ifdef CONFIG_TEST
+/*
+ * __db_testcopy --
+ * Create a copy of all backup files and our "main" DB.
+ *
+ * PUBLIC: #ifdef CONFIG_TEST
+ * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *));
+ * PUBLIC: #endif
+ */
+int
+__db_testcopy(env, dbp, name)
+ ENV *env;
+ DB *dbp;
+ const char *name;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *mpf;
+
+ DB_ASSERT(env, dbp != NULL || name != NULL);
+
+ if (name == NULL) {
+ dbmp = env->mp_handle;
+ mpf = dbp->mpf;
+ name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off);
+ }
+
+ if (dbp != NULL && dbp->type == DB_QUEUE)
+ return (__qam_testdocopy(dbp, name));
+ else
+#ifdef HAVE_PARTITION
+ if (dbp != NULL && DB_IS_PARTITIONED(dbp))
+ return (__part_testdocopy(dbp, name));
+ else
+#endif
+ return (__db_testdocopy(env, name));
+}
+
+static int
+__qam_testdocopy(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST *filelist, *fp;
+ int ret;
+ char buf[DB_MAXPATHLEN], *dir;
+
+ filelist = NULL;
+ if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+ return (ret);
+
+ /* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ if (dbp->mpf != NULL &&
+ (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ goto done;
+
+ if (filelist == NULL)
+ return (0);
+ dir = ((QUEUE *)dbp->q_internal)->dir;
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ snprintf(buf, sizeof(buf),
+ QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
+ if ((ret = __db_testdocopy(dbp->env, buf)) != 0)
+ break;
+ }
+
+done: __os_free(dbp->env, filelist);
+ return (ret);
+}
+
+/*
+ * __db_testdocopy --
+ * Create a copy of all backup files and our "main" DB.
+ * PUBLIC: int __db_testdocopy __P((ENV *, const char *));
+ */
+int
+__db_testdocopy(env, name)
+ ENV *env;
+ const char *name;
+{
+ size_t len;
+ int dircnt, i, ret;
+ char *copy, **namesp, *p, *real_name;
+
+ dircnt = 0;
+ copy = NULL;
+ namesp = NULL;
+
+ /* Create the real backing file name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, NULL, &real_name)) != 0)
+ return (ret);
+
+ /*
+ * !!!
+ * There are tests that attempt to copy non-existent files. I'd guess
+ * it's a testing bug, but I don't have time to figure it out. Block
+ * the case here.
+ */
+ if (__os_exists(env, real_name, NULL) != 0) {
+ __os_free(env, real_name);
+ return (0);
+ }
+
+ /*
+ * Copy the file itself.
+ *
+ * Allocate space for the file name, including adding an ".afterop" and
+ * trailing nul byte.
+ */
+ len = strlen(real_name) + sizeof(".afterop");
+ if ((ret = __os_malloc(env, len, &copy)) != 0)
+ goto err;
+ snprintf(copy, len, "%s.afterop", real_name);
+ if ((ret = __db_makecopy(env, real_name, copy)) != 0)
+ goto err;
+
+ /*
+ * Get the directory path to call __os_dirlist().
+ */
+ if ((p = __db_rpath(real_name)) != NULL)
+ *p = '\0';
+ if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0)
+ goto err;
+
+ /*
+ * Walk the directory looking for backup files. Backup file names in
+ * transactional environments are of the form:
+ *
+ * BACKUP_PREFIX.TXNID.ID
+ */
+ for (i = 0; i < dircnt; i++) {
+ /* Check for a related backup file name. */
+ if (strncmp(
+ namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0)
+ continue;
+ p = namesp[i] + sizeof(BACKUP_PREFIX) - 1;
+ p += strspn(p, "0123456789ABCDEFabcdef");
+ if (*p != '.')
+ continue;
+ ++p;
+ p += strspn(p, "0123456789ABCDEFabcdef");
+ if (*p != '\0')
+ continue;
+
+ /*
+ * Copy the backup file.
+ *
+ * Allocate space for the file name, including adding a
+ * ".afterop" and trailing nul byte.
+ */
+ if (real_name != NULL) {
+ __os_free(env, real_name);
+ real_name = NULL;
+ }
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, namesp[i], NULL, &real_name)) != 0)
+ goto err;
+ if (copy != NULL) {
+ __os_free(env, copy);
+ copy = NULL;
+ }
+ len = strlen(real_name) + sizeof(".afterop");
+ if ((ret = __os_malloc(env, len, &copy)) != 0)
+ goto err;
+ snprintf(copy, len, "%s.afterop", real_name);
+ if ((ret = __db_makecopy(env, real_name, copy)) != 0)
+ goto err;
+ }
+
+err: if (namesp != NULL)
+ __os_dirfree(env, namesp, dircnt);
+ if (copy != NULL)
+ __os_free(env, copy);
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
+static int
+__db_makecopy(env, src, dest)
+ ENV *env;
+ const char *src, *dest;
+{
+ DB_FH *rfhp, *wfhp;
+ size_t rcnt, wcnt;
+ int ret;
+ char *buf;
+
+ rfhp = wfhp = NULL;
+
+ if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0)
+ goto err;
+
+ if ((ret = __os_open(env, src, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0)
+ goto err;
+ if ((ret = __os_open(env, dest, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0)
+ goto err;
+
+ for (;;) {
+ /*
+ * Note: buf is a char pointer, so sizeof(buf) would transfer
+ * only 4 or 8 bytes per pass; use the allocated buffer size
+ * on reads and write back only the bytes actually read.
+ */
+ if ((ret =
+ __os_read(env, rfhp, buf, 64 * 1024, &rcnt)) != 0)
+ goto err;
+ if (rcnt == 0)
+ break;
+ if ((ret =
+ __os_write(env, wfhp, buf, rcnt, &wcnt)) != 0)
+ goto err;
+ }
+
+ if (0) {
+err: __db_err(env, ret, "__db_makecopy: %s -> %s", src, dest);
+ }
+
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (rfhp != NULL)
+ (void)__os_closehandle(env, rfhp);
+ if (wfhp != NULL)
+ (void)__os_closehandle(env, wfhp);
+ return (ret);
+}
+#endif
diff --git a/src/db/db.src b/src/db/db.src
new file mode 100644
index 00000000..879c7856
--- /dev/null
+++ b/src/db/db.src
@@ -0,0 +1,431 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __db
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * addrem -- Add or remove an entry from a duplicate page.
+ *
+ * opcode: identifies if this is an add or delete.
+ * fileid: file identifier of the file being modified.
+ * pgno: duplicate page number.
+ * indx: location at which to insert or delete.
+ * nbytes: number of bytes added/removed to/from the page.
+ * hdr: header for the data item.
+ * dbt: data that is deleted or is to be added.
+ * pagelsn: former lsn of the page.
+ *
+ * If the hdr is NULL, then the dbt is a regular B_KEYDATA.
+ * If the dbt is NULL, then the hdr is a complete item to be
+ * pasted on the page.
+ */
+BEGIN addrem 50 41
+OP opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+HDR hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
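
Each BEGIN/END description in this file is input to the log-record generator (dist/gen_rec.awk), which emits the matching _log, _print, and _read functions into db_auto.c; by the convention used here, the two numbers after a record name are the first release that writes the format (50 = 5.0, 42 = 4.2) and the record's type id. As an assumption-laden sketch, the addrem block above yields a logging function with roughly this prototype, one argument per field in order; check src/db/db_auto.c for the generated source:

```c
/* Sketch of the generated prototype; not copied from db_auto.c. */
int __db_addrem_log __P((DB *, DB_TXN *, DB_LSN *, u_int32_t /* flags */,
    u_int32_t /* opcode */, db_pgno_t /* pgno */, u_int32_t /* indx */,
    u_int32_t /* nbytes */, const DBT * /* hdr */, const DBT * /* dbt */,
    DB_LSN * /* pagelsn */));
```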
+
+BEGIN_COMPAT addrem 42 41
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+DBT hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * big -- Handles addition and deletion of big key/data items.
+ *
+ * opcode: identifies get/put.
+ * fileid: file identifier of the file being modified.
+ * pgno: page onto which data is being added/removed.
+ * prev_pgno: the page before the one we are logging.
+ * next_pgno: the page after the one we are logging.
+ * dbt: data being written onto the page.
+ * pagelsn: former lsn of the orig_page.
+ * prevlsn: former lsn of the prev_pgno.
+ * nextlsn: former lsn of the next_pgno. This is not currently used, but
+ * may be used later if we actually do overwrites of big key/
+ * data items in place.
+ */
+BEGIN big 50 43
+OP opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+ARG next_pgno db_pgno_t lu
+HDR dbt DBT s
+POINTER pagelsn DB_LSN * lu
+POINTER prevlsn DB_LSN * lu
+POINTER nextlsn DB_LSN * lu
+END
+
+BEGIN_COMPAT big 42 43
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+ARG next_pgno db_pgno_t lu
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+POINTER prevlsn DB_LSN * lu
+POINTER nextlsn DB_LSN * lu
+END
+
+/*
+ * ovref -- Handles increment/decrement of overflow page reference count.
+ *
+ * fileid: identifies the file being modified.
+ * pgno: page number whose ref count is being incremented/decremented.
+ * adjust: the adjustment being made.
+ * lsn: the page's original lsn.
+ */
+BEGIN ovref 42 44
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG adjust int32_t ld
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * relink -- Handles relinking around a page.
+ *
+ * opcode: indicates if this is an add-page or a delete-page operation.
+ * pgno: the page being changed.
+ * lsn: the page's original lsn.
+ * prev: the previous page.
+ * lsn_prev: the previous page's original lsn.
+ * next: the next page.
+ * lsn_next: the next page's original lsn.
+ */
+BEGIN_COMPAT relink 42 45
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG prev db_pgno_t lu
+POINTER lsn_prev DB_LSN * lu
+ARG next db_pgno_t lu
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * Debug -- log an operation upon entering an access method.
+ * op: Operation (cursor, c_close, c_get, c_put, c_del,
+ * get, put, delete).
+ * fileid: identifies the file being acted upon.
+ * key: key parameter
+ * data: data parameter
+ * flags: flags parameter
+ */
+BEGIN debug 42 47
+DBT op DBT s
+ARG fileid int32_t ld
+DBT key DBT s
+DBT data DBT s
+ARG arg_flags u_int32_t lu
+END
+
+/*
+ * noop -- do nothing, but get an LSN.
+ */
+BEGIN noop 42 48
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER prevlsn DB_LSN * lu
+END
+
+/*
+ * pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the original lsn of the page referenced by meta_pgno.
+ * meta_pgno: the page pointing at the allocated page in the free list.
+ * If the list is unsorted this is the metadata page.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * next: the next page on the free list.
+ * last_pgno: the last page in the file after this op (4.3+).
+ */
+BEGIN_COMPAT pg_alloc 42 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_alloc 43 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * pg_free: used to record freeing a page.
+ * If we are maintaining a sorted free list (during compact) meta_pgno
+ * will be non-zero and refer to the page that precedes the one we are freeing
+ * in the free list. Meta_lsn will then be the lsn of that page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_free 42 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_free 43 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * cksum --
+ * This log record is written when we're unable to checksum a page,
+ * before returning DB_RUNRECOVERY. This log record causes normal
+ * recovery to itself return DB_RUNRECOVERY, as only catastrophic
+ * recovery can fix things.
+ */
+BEGIN cksum 42 51
+END
+
+/*
+ * pg_freedata: used to record freeing a page with data on it.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header and index entries from the free'd page.
+ * data: the data from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_freedata 42 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+PGDDBT data DBT s
+END
+
+BEGIN pg_freedata 43 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+PGDDBT data DBT s
+END
+
+/*
+ * pg_prepare: used to record an aborted page in a prepared transaction.
+ *
+ * pgno: the page being freed.
+ */
+X BEGIN pg_prepare 42 53
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X END
+
+/*
+ * pg_new: used to record a new page put on the free list.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ */
+X BEGIN pg_new 42 54
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X POINTER meta_lsn DB_LSN * lu
+X ARG meta_pgno db_pgno_t lu
+X PGDBT header DBT s
+X ARG next db_pgno_t lu
+X END
+
+/*
+ * pg_init: used to reinitialize a page during truncate.
+ *
+ * pgno: the page being initialized.
+ * header: the header from the page.
+ * data: data that used to be on the page.
+ */
+BEGIN pg_init 43 60
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT header DBT s
+PGDDBT data DBT s
+END
+
+/*
+ * pg_sort: sort the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of the last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns to sort.
+ */
+BEGIN_COMPAT pg_sort 44 61
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG last_pgno db_pgno_t lu
+DBT list DBT s
+END
+
+
+/*
+ * pg_trunc: truncate the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of the last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns on free list.
+ */
+BEGIN pg_trunc 50 66
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG next_free db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+PGLIST list DBT s
+END
+
+/*
+ * realloc: allocate a range of pages from the free list
+ * prev_pgno: page number of the page preceding the set of pages to
+ * be allocated
+ * page_lsn: LSN from the prev_pgno page
+ * next_free: page number of the page immediately following the set
+ * of pages to be allocated
+ * ptype: the type of page being allocated
+ * list: pairs of page numbers and LSNs corresponding to the pages on
+ * the free list that are being reallocated
+ */
+BEGIN realloc 50 36
+DB fileid int32_t ld
+ARG prev_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG next_free db_pgno_t lu
+ARG ptype u_int32_t lu
+PGLIST list DBT s
+END
+
+/*
+ * relink: relink next and previous page pointers
+ * NOTE: moved from btree so its number is from that range.
+ * pgno: The page being removed.
+ * new_pgno: The new page number, if any.
+ * prev_pgno: The previous page, if any.
+ * lsn_prev: The previous page's original lsn.
+ * next_pgno: The next page, if any.
+ * lsn_next: The next page's original lsn.
+ */
+BEGIN relink 44 147
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG new_pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+POINTER lsn_prev DB_LSN * lu
+ARG next_pgno db_pgno_t lu
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * Merge: merge two pages.
+ * NOTE: moved from btree so its number is from that range.
+ * pgno: The page number of the target page.
+ * lsn: Original LSN of the page.
+ * npgno: The page number of the next, or merged, page.
+ * nlsn: The LSN of the next page.
+ * hdr: The page header of the next page.
+ * data: The data from the next page.
+ * pg_copy: If 1, then the whole page was copied.
+ */
+BEGIN merge 47 148
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+PGDBT hdr DBT s
+PGDDBT data DBT s
+ARG pg_copy int32_t lu
+END
+
+
+/*
+ * pgno -- Handles replacing a page number in the record
+ * reference at index indx on page pgno.
+ * NOTE: moved from btree so its number is from that range.
+ * pgno: The page that is being updated.
+ * lsn: The LSN of the page.
+ * indx: The index of the record being updated.
+ * opgno: Old page number.
+ * npgno: New page number.
+ */
+BEGIN pgno 44 149
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG opgno db_pgno_t lu
+ARG npgno db_pgno_t lu
+END
diff --git a/src/db/db_am.c b/src/db/db_am.c
new file mode 100644
index 00000000..1cf3a505
--- /dev/null
+++ b/src/db/db_am.c
@@ -0,0 +1,1150 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __dbc_set_priority __P((DBC *, DB_CACHE_PRIORITY));
+static int __dbc_get_priority __P((DBC *, DB_CACHE_PRIORITY *));
+
+/*
+ * __db_cursor_int --
+ * Internal routine to create a cursor.
+ *
+ * PUBLIC: int __db_cursor_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
+ */
+int
+__db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBTYPE dbtype;
+ db_pgno_t root;
+ int flags;
+ DB_LOCKER *locker;
+ DBC **dbcp;
+{
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ DB_LOCKREQ req;
+ ENV *env;
+ db_threadid_t tid;
+ int allocated, envlid, ret;
+ pid_t pid;
+
+ env = dbp->env;
+ allocated = envlid = 0;
+
+ /*
+ * If dbcp is non-NULL it is assumed to point to an area to initialize
+ * as a cursor.
+ *
+ * Take one from the free list if it's available. Take only the
+ * right type. With off page dups we may have different kinds
+ * of cursors on the queue for a single database.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+#ifndef HAVE_NO_DB_REFCOUNT
+ /*
+ * If this DBP is being logged then refcount the log filename
+ * relative to this transaction. We do this here because we have
+ * the dbp->mutex which protects the refcount. We want to avoid
+ * calling the function if the transaction handle has a shared parent
+ * locker or we are duplicating a cursor. This includes the case of
+ * creating an off page duplicate cursor.
+ * If we knew this cursor will not be used in an update, we could avoid
+ * this, but we don't have that information.
+ */
+ if (IS_REAL_TXN(txn) &&
+ !LF_ISSET(DBC_OPD | DBC_DUPLICATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ dbp->log_filename != NULL && !IS_REP_CLIENT(env) &&
+ (ret = __txn_record_fname(env, txn, dbp->log_filename)) != 0) {
+ MUTEX_UNLOCK(env, dbp->mutex);
+ return (ret);
+ }
+
+#endif
+
+ TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+ if (dbtype == dbc->dbtype) {
+ TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+ F_CLR(dbc, ~DBC_OWN_LID);
+ break;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (dbc == NULL) {
+ if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+ return (ret);
+ allocated = 1;
+ dbc->flags = 0;
+
+ dbc->dbp = dbp;
+ dbc->dbenv = dbp->dbenv;
+ dbc->env = dbp->env;
+
+ /* Set up locking information. */
+ if (LOCKING_ON(env)) {
+ /*
+ * If we are not threaded, we share a locker ID among
+ * all cursors opened in the environment handle,
+ * allocating one if this is the first cursor.
+ *
+ * This relies on the fact that non-threaded DB handles
+ * always have non-threaded environment handles, since
+ * we set DB_THREAD on DB handles created with threaded
+ * environment handles.
+ */
+ if (!DB_IS_THREADED(dbp)) {
+ if (env->env_lref == NULL) {
+ if ((ret = __lock_id(env,
+ NULL, &env->env_lref)) != 0)
+ goto err;
+ envlid = 1;
+ }
+ dbc->lref = env->env_lref;
+ }
+
+ /*
+ * In CDB, secondary indices should share a lock file
+ * ID with the primary; otherwise we're susceptible
+ * to deadlocks. We also use __db_cursor_int rather
+ * than __db_cursor to create secondary update cursors
+ * in c_put and c_del; these won't acquire a new lock.
+ *
+ * !!!
+ * Since this is in the one-time cursor allocation
+ * code, we need to be sure to destroy, not just
+ * close, all cursors in the secondary when we
+ * associate.
+ */
+ if (CDB_LOCKING(env) &&
+ F_ISSET(dbp, DB_AM_SECONDARY))
+ memcpy(dbc->lock.fileid,
+ dbp->s_primary->fileid, DB_FILE_ID_LEN);
+ else
+ memcpy(dbc->lock.fileid,
+ dbp->fileid, DB_FILE_ID_LEN);
+
+ if (CDB_LOCKING(env)) {
+ if (F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+ /*
+ * If we are doing a single lock per
+ * environment, set up the global
+ * lock object just like we do to
+ * single thread creates.
+ */
+ DB_ASSERT(env, sizeof(db_pgno_t) ==
+ sizeof(u_int32_t));
+ dbc->lock_dbt.size = sizeof(u_int32_t);
+ dbc->lock_dbt.data = &dbc->lock.pgno;
+ dbc->lock.pgno = 0;
+ } else {
+ dbc->lock_dbt.size = DB_FILE_ID_LEN;
+ dbc->lock_dbt.data = dbc->lock.fileid;
+ }
+ } else {
+ dbc->lock.type = DB_PAGE_LOCK;
+ dbc->lock_dbt.size = sizeof(dbc->lock);
+ dbc->lock_dbt.data = &dbc->lock;
+ }
+ }
+ /* Init the DBC internal structure. */
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp)) {
+ if ((ret = __partc_init(dbc)) != 0)
+ goto err;
+ } else
+#endif
+ switch (dbtype) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_init(dbc, dbtype)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __hamc_init(dbc)) != 0)
+ goto err;
+ break;
+ case DB_HEAP:
+ if ((ret = __heapc_init(dbc)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qamc_init(dbc)) != 0)
+ goto err;
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->cursor", dbtype);
+ goto err;
+ }
+
+ cp = dbc->internal;
+ }
+
+ /* Refresh the DBC structure. */
+ dbc->dbtype = dbtype;
+ RESET_RET_MEM(dbc);
+ dbc->set_priority = __dbc_set_priority;
+ dbc->get_priority = __dbc_get_priority;
+ dbc->priority = dbp->priority;
+ dbc->txn_cursors.tqe_next = NULL;
+ dbc->txn_cursors.tqe_prev = NULL;
+
+ /*
+ * If the DB handle is not threaded, there is one locker ID for the
+	 * whole environment. There should be only one family transaction active
+ * as well. This doesn't apply to CDS group transactions, where the
+ * cursor can simply use the transaction's locker directly.
+ */
+ if (!CDB_LOCKING(env) && txn != NULL && F_ISSET(txn, TXN_FAMILY) &&
+ (F_ISSET(dbc, DBC_OWN_LID) || dbc->lref == NULL || envlid)) {
+ if (LOCKING_ON(env)) {
+ if (dbc->lref == NULL) {
+ if ((ret =
+ __lock_id(env, NULL, &dbc->lref)) != 0)
+ goto err;
+ F_SET(dbc, DBC_OWN_LID);
+ }
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbc->lref->id, 1)) != 0)
+ goto err;
+ }
+ F_SET(dbc, DBC_FAMILY);
+ txn = NULL;
+ }
+
+ if ((dbc->txn = txn) != NULL)
+ dbc->locker = txn->locker;
+ else if (LOCKING_ON(env)) {
+ /*
+ * There are certain cases in which we want to create a
+ * new cursor with a particular locker ID that is known
+ * to be the same as (and thus not conflict with) an
+ * open cursor.
+ *
+ * The most obvious case is cursor duplication; when we
+ * call DBC->dup or __dbc_idup, we want to use the original
+ * cursor's locker ID.
+ *
+ * Another case is when updating secondary indices. Standard
+ * CDB locking would mean that we might block ourself: we need
+ * to open an update cursor in the secondary while an update
+ * cursor in the primary is open, and when the secondary and
+ * primary are subdatabases or we're using env-wide locking,
+ * this is disastrous.
+ *
+ * In these cases, our caller will pass a nonzero locker
+ * ID into this function. Use this locker ID instead of
+ * the default as the locker ID for our new cursor.
+ */
+ if (locker != NULL)
+ dbc->locker = locker;
+ else if (LF_ISSET(DB_RECOVER))
+ dbc->locker = NULL;
+ else {
+ if (dbc->lref == NULL) {
+ if ((ret =
+ __lock_id(env, NULL, &dbc->lref)) != 0)
+ goto err;
+ F_SET(dbc, DBC_OWN_LID);
+ }
+ /*
+ * If we are threaded then we need to set the
+ * proper thread id into the locker.
+ */
+ if (DB_IS_THREADED(dbp)) {
+ env->dbenv->thread_id(env->dbenv, &pid, &tid);
+ __lock_set_thread_id(dbc->lref, pid, tid);
+ }
+ dbc->locker = dbc->lref;
+ }
+ }
+
+ /*
+ * These fields change when we are used as a secondary index, so
+ * if the DB is a secondary, make sure they're set properly just
+ * in case we opened some cursors before we were associated.
+ *
+ * __dbc_get is used by all access methods, so this should be safe.
+ */
+ if (F_ISSET(dbp, DB_AM_SECONDARY))
+ dbc->get = dbc->c_get = __dbc_secondary_get_pp;
+
+ /*
+ * Don't enable bulk for btrees with record numbering, since avoiding
+ * a full search avoids taking write locks necessary to maintain
+ * consistent numbering.
+ */
+ if (LF_ISSET(DB_CURSOR_BULK) && dbtype == DB_BTREE &&
+ !F_ISSET(dbp, DB_AM_RECNUM))
+ F_SET(dbc, DBC_BULK);
+ if (LF_ISSET(DB_CURSOR_TRANSIENT))
+ F_SET(dbc, DBC_TRANSIENT);
+ if (LF_ISSET(DBC_OPD))
+ F_SET(dbc, DBC_OPD);
+ if (F_ISSET(dbp, DB_AM_RECOVER) || LF_ISSET(DB_RECOVER))
+ F_SET(dbc, DBC_RECOVER);
+ if (F_ISSET(dbp, DB_AM_COMPENSATE))
+ F_SET(dbc, DBC_DONTLOCK);
+ /*
+ * If this database is exclusive then the cursor
+ * does not need to get locks.
+ */
+ if (F2_ISSET(dbp, DB2_AM_EXCL)) {
+ F_SET(dbc, DBC_DONTLOCK);
+		if (IS_REAL_TXN(txn) && !LF_ISSET(DBC_OPD | DBC_DUPLICATE)) {
+ /*
+ * Exclusive databases can only have one active
+ * transaction at a time since there are no internal
+ * locks to prevent one transaction from reading and
+ * writing another's uncommitted changes.
+ */
+ if (dbp->cur_txn != NULL && dbp->cur_txn != txn) {
+ __db_errx(env, DB_STR("0749",
+"Exclusive database handles can only have one active transaction at a time."));
+ ret = EINVAL;
+ goto err;
+ }
+ /* Do not trade a second time. */
+ if (dbp->cur_txn != txn) {
+ /* Trade the handle lock to the txn locker. */
+ memset(&req, 0, sizeof(req));
+ req.lock = dbp->handle_lock;
+ req.op = DB_LOCK_TRADE;
+ if ((ret = __lock_vec(env, txn->locker, 0,
+ &req, 1, 0)) != 0)
+ goto err;
+ dbp->cur_txn = txn;
+ dbp->cur_locker = txn->locker;
+ if ((ret = __txn_lockevent(env, txn, dbp,
+ &dbp->handle_lock, dbp->locker)) != 0)
+ goto err;
+ }
+ }
+ }
+#ifdef HAVE_REPLICATION
+ /*
+ * If we are replicating from a down rev version then we must
+ * use old locking protocols.
+ */
+ if (LOGGING_ON(env) &&
+ ((LOG *)env->lg_handle->
+ reginfo.primary)->persist.version < DB_LOGVERSION_LATCHING)
+ F_SET(dbc, DBC_DOWNREV);
+#endif
+
+ /* Refresh the DBC internal structure. */
+ cp = dbc->internal;
+ cp->opd = NULL;
+ cp->pdbc = NULL;
+
+ cp->indx = 0;
+ cp->page = NULL;
+ cp->pgno = PGNO_INVALID;
+ cp->root = root;
+ cp->stream_start_pgno = cp->stream_curr_pgno = PGNO_INVALID;
+ cp->stream_off = 0;
+
+ if (DB_IS_PARTITIONED(dbp)) {
+ DBC_PART_REFRESH(dbc);
+ } else switch (dbtype) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_refresh(dbc)) != 0)
+ goto err;
+ break;
+ case DB_HEAP:
+ if ((ret = __heapc_refresh(dbc)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ case DB_QUEUE:
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->cursor", dbp->type);
+ goto err;
+ }
+
+ /*
+ * The transaction keeps track of how many cursors were opened within
+ * it to catch application errors where the cursor isn't closed when
+ * the transaction is resolved.
+ */
+ if (txn != NULL)
+ ++txn->cursors;
+ if (ip != NULL) {
+ dbc->thread_info = ip;
+#ifdef DIAGNOSTIC
+ if (dbc->locker != NULL)
+ ip->dbth_locker =
+ R_OFFSET(&(env->lk_handle->reginfo), dbc->locker);
+ else
+ ip->dbth_locker = INVALID_ROFF;
+#endif
+ } else if (txn != NULL)
+ dbc->thread_info = txn->thread_info;
+ else
+ ENV_GET_THREAD_INFO(env, dbc->thread_info);
+
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
+ F_SET(dbc, DBC_ACTIVE);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ *dbcp = dbc;
+ return (0);
+
+err: if (allocated)
+ __os_free(env, dbc);
+ return (ret);
+}
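For reference, the flags interpreted above arrive through the public cursor method. A minimal application-side sketch of requesting the bulk-optimized cursor (handle names hypothetical, error handling elided):

	#include <db.h>

	/*
	 * Ask for a cursor tuned for many successive writes; for btrees
	 * without record numbering, __db_cursor_int sets DBC_BULK.
	 */
	int
	open_bulk_cursor(DB *dbp, DB_TXN *txn, DBC **dbcp)
	{
		return (dbp->cursor(dbp, txn, dbcp, DB_CURSOR_BULK));
	}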
+
+/*
+ * __db_put --
+ * Store a key/data pair.
+ *
+ * PUBLIC: int __db_put __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put(dbp, ip, txn, key, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB_HEAP_RID rid;
+ DBC *dbc;
+ DBT tdata, tkey;
+ ENV *env;
+ void *bulk_kptr, *bulk_ptr;
+ db_recno_t recno;
+ u_int32_t cursor_flags;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ /*
+ * See the comment in __db_get() regarding DB_CURSOR_TRANSIENT.
+ *
+ * Note that the get in the DB_NOOVERWRITE case is safe to do with this
+ * flag set; if it errors in any way other than DB_NOTFOUND, we're
+ * going to close the cursor without doing anything else, and if it
+ * returns DB_NOTFOUND then it's safe to do a c_put(DB_KEYLAST) even if
+ * an access method moved the cursor, since that's not
+ * position-dependent.
+ */
+ cursor_flags = DB_WRITELOCK;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+ cursor_flags |= DB_CURSOR_BULK;
+ else
+ cursor_flags |= DB_CURSOR_TRANSIENT;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "DB->put", key, data, flags);
+ PERFMON6(env, db, put, dbp->fname,
+ dbp->dname, txn == NULL ? 0 : txn->txnid, key, data, flags);
+
+ SET_RET_MEM(dbc, dbp);
+
+ if (flags == DB_APPEND && !DB_IS_PRIMARY(dbp)) {
+ /*
+ * If there is an append callback, the value stored in
+ * data->data may be replaced and then freed. To avoid
+ * passing a freed pointer back to the user, just operate
+ * on a copy of the data DBT.
+ */
+ tdata = *data;
+
+ /*
+ * Append isn't a normal put operation; call the appropriate
+ * access method's append function.
+ */
+ switch (dbp->type) {
+ case DB_HEAP:
+ if ((ret = __heap_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_BTREE:
+ case DB_HASH:
+ case DB_UNKNOWN:
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(env,
+ dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+ ret = __db_ferr(env, "DB->put", 0);
+ goto err;
+ }
+
+ /*
+ * The append callback, if one exists, may have allocated
+ * a new tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(env, &tdata);
+
+ /* No need for a cursor put; we're done. */
+#ifdef HAVE_COMPRESSION
+ } else if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+ !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+ ret = __dbc_put(dbc, key, data, flags);
+#endif
+ } else if (LF_ISSET(DB_MULTIPLE)) {
+ ret = 0;
+ memset(&tkey, 0, sizeof(tkey));
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+ tkey.data = &recno;
+ tkey.size = sizeof(recno);
+ }
+ memset(&tdata, 0, sizeof(tdata));
+ DB_MULTIPLE_INIT(bulk_kptr, key);
+ DB_MULTIPLE_INIT(bulk_ptr, data);
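+		/* We return the number of pairs put in doff. */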
+ key->doff = 0;
+ while (ret == 0) {
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+ DB_MULTIPLE_RECNO_NEXT(bulk_kptr, key,
+ recno, tdata.data, tdata.size);
+ else
+ DB_MULTIPLE_NEXT(bulk_kptr, key,
+ tkey.data, tkey.size);
+ DB_MULTIPLE_NEXT(bulk_ptr, data,
+ tdata.data, tdata.size);
+ if (bulk_kptr == NULL || bulk_ptr == NULL)
+ break;
+ if (dbp->type == DB_HEAP) {
+ memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
+ tkey.data = &rid;
+ }
+ ret = __dbc_put(dbc, &tkey, &tdata,
+ LF_ISSET(DB_OPFLAGS_MASK));
+ if (ret == 0)
+ ++key->doff;
+ }
+ } else if (LF_ISSET(DB_MULTIPLE_KEY)) {
+ ret = 0;
+ memset(&tkey, 0, sizeof(tkey));
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+ tkey.data = &recno;
+ tkey.size = sizeof(recno);
+ }
+ memset(&tdata, 0, sizeof(tdata));
+ DB_MULTIPLE_INIT(bulk_ptr, key);
+ while (ret == 0) {
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+ DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, recno,
+ tdata.data, tdata.size);
+ else
+ DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, tkey.data,
+ tkey.size, tdata.data, tdata.size);
+ if (bulk_ptr == NULL)
+ break;
+ if (dbp->type == DB_HEAP) {
+ memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
+ tkey.data = &rid;
+ }
+ ret = __dbc_put(dbc, &tkey, &tdata,
+ LF_ISSET(DB_OPFLAGS_MASK));
+ if (ret == 0)
+ ++key->doff;
+ }
+ } else
+ ret = __dbc_put(dbc, key, data, flags);
+
+err: /* Close the cursor. */
+ if (!DB_RETOK_DBPUT(ret))
+ F_SET(dbc, DBC_ERROR);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
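The DB_MULTIPLE branches above consume bulk buffers packed by the caller. A sketch of the application side, assuming nul-terminated string keys and values and eliding buffer-overflow handling (the write macros set the pointer to NULL when the buffer fills; ulen must be a multiple of 1024 and large enough for all pairs):

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>
	#include <db.h>

	int
	bulk_put_pairs(DB *dbp, DB_TXN *txn, char **keys, char **vals, int n)
	{
		DBT key, data;
		void *kp, *dp;
		int i, ret;

		memset(&key, 0, sizeof(key));
		memset(&data, 0, sizeof(data));
		key.ulen = data.ulen = 64 * 1024;
		key.flags = data.flags = DB_DBT_USERMEM;
		if ((key.data = malloc(key.ulen)) == NULL ||
		    (data.data = malloc(data.ulen)) == NULL) {
			free(key.data);		/* free(NULL) is a no-op */
			return (ENOMEM);
		}

		/* Pack the pairs; the macros advance kp/dp through the buffers. */
		DB_MULTIPLE_WRITE_INIT(kp, &key);
		DB_MULTIPLE_WRITE_INIT(dp, &data);
		for (i = 0; i < n; i++) {
			DB_MULTIPLE_WRITE_NEXT(kp, &key,
			    keys[i], strlen(keys[i]) + 1);
			DB_MULTIPLE_WRITE_NEXT(dp, &data,
			    vals[i], strlen(vals[i]) + 1);
		}

		ret = dbp->put(dbp, txn, &key, &data, DB_MULTIPLE);
		free(key.data);
		free(data.data);
		return (ret);
	}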
+
+/*
+ * __db_del --
+ * Delete the items referenced by a key.
+ *
+ * PUBLIC: int __db_del __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_del(dbp, ip, txn, key, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DB_HEAP_RID rid;
+ DBC *dbc;
+ DBT data, tkey;
+ void *bulk_ptr;
+ db_recno_t recno;
+ u_int32_t cursor_flags, f_init, f_next;
+ int ret, t_ret;
+
+ COMPQUIET(bulk_ptr, NULL);
+ /* Allocate a cursor. */
+ cursor_flags = DB_WRITELOCK;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+ cursor_flags |= DB_CURSOR_BULK;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "DB->del", key, NULL, flags);
+ PERFMON5(env, db, del,
+ dbp->fname, dbp->dname, txn == NULL ? 0 : txn->txnid, key, flags);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+ !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+ F_SET(dbc, DBC_TRANSIENT);
+ ret = __dbc_bulk_del(dbc, key, flags);
+ goto err;
+ }
+#endif
+
+ /*
+ * Walk a cursor through the key/data pairs, deleting as we go. Set
+ * the DB_DBT_USERMEM flag, as this might be a threaded application
+ * and the flags checking will catch us. We don't actually want the
+	 * keys or data, so set DB_DBT_ISSET. We rely on __dbc_get to clear
+ * this.
+ */
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_USERMEM);
+ tkey = *key;
+
+ f_init = LF_ISSET(DB_MULTIPLE_KEY) ? DB_GET_BOTH : DB_SET;
+ f_next = DB_NEXT_DUP;
+
+ /*
+ * If locking (and we haven't already acquired CDB locks), set the
+ * read-modify-write flag.
+ */
+ if (STD_LOCKING(dbc)) {
+ f_init |= DB_RMW;
+ f_next |= DB_RMW;
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+ memset(&tkey, 0, sizeof(tkey));
+ tkey.data = &recno;
+ tkey.size = sizeof(recno);
+ }
+ DB_MULTIPLE_INIT(bulk_ptr, key);
+ /* We return the number of keys deleted in doff. */
+ key->doff = 0;
+bulk_next: if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+ DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key,
+ recno, data.data, data.size);
+ else if (LF_ISSET(DB_MULTIPLE))
+ DB_MULTIPLE_NEXT(bulk_ptr, key, tkey.data, tkey.size);
+ else
+ DB_MULTIPLE_KEY_NEXT(bulk_ptr, key,
+ tkey.data, tkey.size, data.data, data.size);
+ if (bulk_ptr == NULL)
+ goto err;
+ if (dbp->type == DB_HEAP) {
+ memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
+ tkey.data = &rid;
+ }
+	}
+
+ /* We're not interested in the data -- do not return it. */
+ F_SET(&tkey, DB_DBT_ISSET);
+ F_SET(&data, DB_DBT_ISSET);
+
+ /*
+ * Optimize the simple cases. For all AMs if we don't have secondaries
+ * and are not a secondary and we aren't a foreign database and there
+ * are no dups then we can avoid a bunch of overhead. For queue we
+ * don't need to fetch the record since we delete by direct calculation
+ * from the record number.
+ *
+ * Hash permits an optimization in DB->del: since on-page duplicates are
+ * stored in a single HKEYDATA structure, it's possible to delete an
+ * entire set of them at once, and as the HKEYDATA has to be rebuilt
+ * and re-put each time it changes, this is much faster than deleting
+ * the duplicates one by one. Thus, if not pointing at an off-page
+ * duplicate set, and we're not using secondary indices (in which case
+ * we'd have to examine the items one by one anyway), let hash do this
+ * "quick delete".
+ *
+ * !!!
+ * Note that this is the only application-executed delete call in
+ * Berkeley DB that does not go through the __dbc_del function.
+ * If anything other than the delete itself (like a secondary index
+ * update) has to happen there in a particular situation, the
+ * conditions here should be modified not to use these optimizations.
+ * The ordinary AM-independent alternative will work just fine;
+ * it'll just be slower.
+ */
+ if (!F_ISSET(dbp, DB_AM_SECONDARY) && !DB_IS_PRIMARY(dbp) &&
+ LIST_FIRST(&dbp->f_primaries) == NULL) {
+#ifdef HAVE_QUEUE
+ if (dbp->type == DB_QUEUE) {
+ ret = __qam_delete(dbc, &tkey, flags);
+ goto next;
+ }
+#endif
+
+ /* Fetch the first record. */
+ if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ /*
+ * Hash "quick delete" removes all on-page duplicates. We
+ * can't do that if deleting specific key/data pairs.
+ */
+ if (dbp->type == DB_HASH && !LF_ISSET(DB_MULTIPLE_KEY)) {
+ DBC *sdbc;
+ sdbc = dbc;
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ sdbc =
+ ((PART_CURSOR*)dbc->internal)->sub_cursor;
+#endif
+ if (sdbc->internal->opd == NULL) {
+ ret = __ham_quick_delete(sdbc);
+ goto next;
+ }
+ }
+#endif
+
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ ret = dbc->am_del(dbc, 0);
+ goto next;
+ }
+ } else if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+ goto err;
+
+ /* Walk through the set of key/data pairs, deleting as we go. */
+ for (;;) {
+ if ((ret = __dbc_del(dbc, flags)) != 0)
+ break;
+ /*
+ * With DB_MULTIPLE_KEY, the application has specified the
+ * exact records they want deleted. We don't need to walk
+ * through a set of duplicates.
+ */
+ if (LF_ISSET(DB_MULTIPLE_KEY))
+ break;
+
+ F_SET(&tkey, DB_DBT_ISSET);
+ F_SET(&data, DB_DBT_ISSET);
+ if ((ret = __dbc_get(dbc, &tkey, &data, f_next)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ break;
+ }
+ }
+
+next: if (ret == 0 && LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ ++key->doff;
+ goto bulk_next;
+ }
+err: /* Discard the cursor. */
+ if (!DB_RETOK_DBDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
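Correspondingly, the bulk branch of __db_del is driven by a key-only DB_MULTIPLE buffer. A sketch, with the same caveats as the put example:

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>
	#include <db.h>

	int
	bulk_del_keys(DB *dbp, DB_TXN *txn, char **keys, int n)
	{
		DBT key;
		void *kp;
		int i, ret;

		memset(&key, 0, sizeof(key));
		key.ulen = 16 * 1024;	/* must be a multiple of 1024 */
		key.flags = DB_DBT_USERMEM;
		if ((key.data = malloc(key.ulen)) == NULL)
			return (ENOMEM);

		DB_MULTIPLE_WRITE_INIT(kp, &key);
		for (i = 0; i < n; i++)
			DB_MULTIPLE_WRITE_NEXT(kp, &key,
			    keys[i], strlen(keys[i]) + 1);

		/* On success, key.doff holds the number of keys deleted. */
		ret = dbp->del(dbp, txn, &key, DB_MULTIPLE);
		free(key.data);
		return (ret);
	}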
+
+/*
+ * __db_sync --
+ * Flush the database cache.
+ *
+ * PUBLIC: int __db_sync __P((DB *));
+ */
+int
+__db_sync(dbp)
+ DB *dbp;
+{
+ int ret, t_ret;
+
+ ret = 0;
+
+ /* If the database was read-only, we're done. */
+ if (F_ISSET(dbp, DB_AM_RDONLY))
+ return (0);
+
+ /* If it's a Recno tree, write the backing source text file. */
+ if (dbp->type == DB_RECNO)
+ ret = __ram_writeback(dbp);
+
+ /* If the database was never backed by a database file, we're done. */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ return (ret);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __partition_sync(dbp);
+ else
+#endif
+ if (dbp->type == DB_QUEUE)
+ ret = __qam_sync(dbp);
+ else
+ /* Flush any dirty pages from the cache to the backing file. */
+ if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_associate --
+ * Associate another database as a secondary index to this one.
+ *
+ * PUBLIC: int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate(dbp, ip, txn, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ DBC *pdbc, *sdbc;
+ DBT key, data, skey, *tskeyp;
+ ENV *env;
+ int build, ret, t_ret;
+ u_int32_t nskey;
+
+ env = dbp->env;
+ pdbc = sdbc = NULL;
+ ret = 0;
+
+ memset(&skey, 0, sizeof(DBT));
+ nskey = 0;
+ tskeyp = NULL;
+
+ /*
+ * Check to see if the secondary is empty -- and thus if we should
+ * build it -- before we link it in and risk making it show up in other
+ * threads. Do this first so that the databases remain unassociated on
+ * error.
+ */
+ build = 0;
+ if (LF_ISSET(DB_CREATE)) {
+ FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_CREATE);
+
+ if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about key or data; we're just doing
+ * an existence check.
+ */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(sdbc, &key, &data,
+ (STD_LOCKING(sdbc) ? DB_RMW : 0) |
+ DB_FIRST)) == DB_NOTFOUND) {
+ build = 1;
+ ret = 0;
+ }
+
+ if (ret != 0)
+ F_SET(sdbc, DBC_ERROR);
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Reset for later error check. */
+ sdbc = NULL;
+
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Set up the database handle as a secondary.
+ */
+ sdbp->s_callback = callback;
+ sdbp->s_primary = dbp;
+
+ sdbp->stored_get = sdbp->get;
+ sdbp->get = __db_secondary_get;
+
+ sdbp->stored_close = sdbp->close;
+ sdbp->close = __db_secondary_close_pp;
+
+ F_SET(sdbp, DB_AM_SECONDARY);
+
+ if (LF_ISSET(DB_IMMUTABLE_KEY))
+ FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY);
+
+ /*
+ * Add the secondary to the list on the primary. Do it here
+ * so that we see any updates that occur while we're walking
+ * the primary.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+ /* See __db_s_next for an explanation of secondary refcounting. */
+ DB_ASSERT(env, sdbp->s_refcnt == 0);
+ sdbp->s_refcnt = 1;
+ LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (build) {
+ /*
+ * We loop through the primary, putting each item we
+ * find into the new secondary.
+ *
+ * If we're using CDB, opening these two cursors puts us
+ * in a bit of a locking tangle: CDB locks are done on the
+ * primary, so that we stay deadlock-free, but that means
+ * that updating the secondary while we have a read cursor
+ * open on the primary will self-block. To get around this,
+ * we force the primary cursor to use the same locker ID
+ * as the secondary, so they won't conflict. This should
+ * be harmless even if we're not using CDB.
+ */
+ if ((ret = __db_cursor(sdbp, ip, txn, &sdbc,
+ CDB_LOCKING(sdbp->env) ? DB_WRITECURSOR : 0)) != 0)
+ goto err;
+ if ((ret = __db_cursor_int(dbp, ip,
+ txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+ goto err;
+
+ /* Lock out other threads, now that we have a locker. */
+ dbp->associate_locker = sdbc->locker;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ while ((ret = __dbc_get(pdbc, &key, &data, DB_NEXT)) == 0) {
+ if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
+ if (ret == DB_DONOTINDEX)
+ continue;
+ goto err;
+ }
+ if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, &skey);
+#endif
+ nskey = skey.size;
+ tskeyp = (DBT *)skey.data;
+ } else {
+ nskey = 1;
+ tskeyp = &skey;
+ }
+ SWAP_IF_NEEDED(sdbp, &key);
+ for (; nskey > 0; nskey--, tskeyp++) {
+ if ((ret = __dbc_put(sdbc,
+ tskeyp, &key, DB_UPDATE_SECONDARY)) != 0)
+ goto err;
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+ SWAP_IF_NEEDED(sdbp, &key);
+ FREE_IF_NEEDED(env, &skey);
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ }
+
+err: if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbp->associate_locker = NULL;
+
+ for (; nskey > 0; nskey--, tskeyp++)
+ FREE_IF_NEEDED(env, tskeyp);
+ FREE_IF_NEEDED(env, &skey);
+
+ return (ret);
+}
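A sketch of the application side of this routine: a callback that extracts a fixed-width field as the secondary key, plus the DB->associate call that triggers the build loop above when DB_CREATE is set. The record layout is hypothetical:

	#include <string.h>
	#include <db.h>

	struct person {			/* hypothetical record layout */
		char first_name[64];
		char last_name[64];
	};

	static int
	by_last_name(DB *sdbp, const DBT *pkey, const DBT *pdata, DBT *skey)
	{
		(void)sdbp; (void)pkey;
		memset(skey, 0, sizeof(DBT));
		skey->data = ((struct person *)pdata->data)->last_name;
		skey->size = sizeof(((struct person *)pdata->data)->last_name);
		return (0);
	}

	int
	link_secondary(DB *primary, DB *secondary, DB_TXN *txn)
	{
		/* DB_CREATE bulk-builds the secondary if it is empty. */
		return (primary->associate(primary,
		    txn, secondary, by_last_name, DB_CREATE));
	}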
+
+/*
+ * __db_secondary_get --
+ * This wrapper function for DB->pget() is the DB->get() function
+ * on a database which has been made into a secondary index.
+ */
+static int
+__db_secondary_get(sdbp, txn, skey, data, flags)
+ DB *sdbp;
+ DB_TXN *txn;
+ DBT *skey, *data;
+ u_int32_t flags;
+{
+ DB_ASSERT(sdbp->env, F_ISSET(sdbp, DB_AM_SECONDARY));
+ return (__db_pget_pp(sdbp, txn, skey, NULL, data, flags));
+}
+
+/*
+ * __db_secondary_close --
+ * Wrapper function for DB->close() which we use on secondaries to
+ * manage refcounting and make sure we don't close them underneath
+ * a primary that is updating.
+ *
+ * PUBLIC: int __db_secondary_close __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close(sdbp, flags)
+ DB *sdbp;
+ u_int32_t flags;
+{
+ DB *primary;
+ ENV *env;
+ int doclose;
+
+ /*
+ * If the opening transaction is rolled back then the db handle
+	 * will have already been refreshed; we just need to call
+ * __db_close to free the data.
+ */
+ if (!F_ISSET(sdbp, DB_AM_OPEN_CALLED)) {
+ doclose = 1;
+ goto done;
+ }
+ doclose = 0;
+ primary = sdbp->s_primary;
+ env = primary->env;
+
+ MUTEX_LOCK(env, primary->mutex);
+ /*
+ * Check the refcount--if it was at 1 when we were called, no
+ * thread is currently updating this secondary through the primary,
+ * so it's safe to close it for real.
+ *
+ * If it's not safe to do the close now, we do nothing; the
+ * database will actually be closed when the refcount is decremented,
+ * which can happen in either __db_s_next or __db_s_done.
+ */
+ DB_ASSERT(env, sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ /* We don't want to call close while the mutex is held. */
+ doclose = 1;
+ }
+ MUTEX_UNLOCK(env, primary->mutex);
+
+ /*
+ * sdbp->close is this function; call the real one explicitly if
+ * need be.
+ */
+done: return (doclose ? __db_close(sdbp, NULL, flags) : 0);
+}
+
+/*
+ * __db_associate_foreign --
+ * Associate this database (fdbp) as a foreign constraint to another
+ * database (pdbp). That is, dbp's keys appear as foreign key values in
+ * pdbp.
+ *
+ * PUBLIC: int __db_associate_foreign __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_associate_foreign(fdbp, pdbp, callback, flags)
+ DB *fdbp, *pdbp;
+ int (*callback)(DB *, const DBT *, DBT *, const DBT *, int *);
+ u_int32_t flags;
+{
+ DB_FOREIGN_INFO *f_info;
+ ENV *env;
+ int ret;
+
+ env = fdbp->env;
+ ret = 0;
+
+ if ((ret = __os_malloc(env, sizeof(DB_FOREIGN_INFO), &f_info)) != 0) {
+ return (ret);
+ }
+ memset(f_info, 0, sizeof(DB_FOREIGN_INFO));
+
+ f_info->dbp = pdbp;
+ f_info->callback = callback;
+
+ /*
+ * It might be wise to filter this, but for now the flags only
+ * set the delete action type.
+ */
+ FLD_SET(f_info->flags, flags);
+
+ /*
+ * Add f_info to the foreign database's list of primaries. That is to
+ * say, fdbp->f_primaries lists all databases for which fdbp is a
+ * foreign constraint.
+ */
+ MUTEX_LOCK(env, fdbp->mutex);
+ LIST_INSERT_HEAD(&fdbp->f_primaries, f_info, f_links);
+ MUTEX_UNLOCK(env, fdbp->mutex);
+
+ /*
+ * Associate fdbp as pdbp's foreign db, for referential integrity
+ * checks. We don't allow the foreign db to be changed, because we
+ * currently have no way of removing pdbp from the old foreign db's list
+ * of primaries.
+ */
+ if (pdbp->s_foreign != NULL)
+ return (EINVAL);
+ pdbp->s_foreign = fdbp;
+
+ return (ret);
+}
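A sketch of wiring this up through the public handle method. With DB_FOREIGN_CASCADE no nullify callback is needed, so NULL is passed; handle names are hypothetical:

	#include <db.h>

	/*
	 * fdbp holds the allowed foreign keys; sdbp is a secondary whose
	 * keys must appear in fdbp. Deleting a key from fdbp cascades the
	 * delete into the secondary's primary.
	 */
	int
	link_foreign(DB *fdbp, DB *sdbp)
	{
		return (fdbp->associate_foreign(fdbp,
		    sdbp, NULL, DB_FOREIGN_CASCADE));
	}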
+
+static int
+__dbc_set_priority(dbc, priority)
+ DBC *dbc;
+ DB_CACHE_PRIORITY priority;
+{
+ dbc->priority = priority;
+ return (0);
+}
+
+static int
+__dbc_get_priority(dbc, priority)
+ DBC *dbc;
+ DB_CACHE_PRIORITY *priority;
+{
+ if (dbc->priority == DB_PRIORITY_UNCHANGED)
+ return (__memp_get_priority(dbc->dbp->mpf, priority));
+ else
+ *priority = dbc->priority;
+
+ return (0);
+}
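These two functions back the public DBC->set_priority/get_priority pair. For example, an application making a single scan over a large database might mark the cursor's pages as the first cache victims (a sketch, names hypothetical):

	#include <db.h>

	int
	mark_scan_cursor(DBC *dbc)
	{
		/* Pages touched by this cursor become cheap to evict. */
		return (dbc->set_priority(dbc, DB_PRIORITY_VERY_LOW));
	}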
diff --git a/src/db/db_auto.c b/src/db/db_auto.c
new file mode 100644
index 00000000..7c6b7e66
--- /dev/null
+++ b/src/db/db_auto.c
@@ -0,0 +1,276 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __db_addrem_desc[] = {
+ {LOGREC_OP, SSZ(__db_addrem_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_addrem_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_addrem_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_HDR, SSZ(__db_addrem_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__db_addrem_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_addrem_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_addrem_42_desc[] = {
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_addrem_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_DBT, SSZ(__db_addrem_42_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__db_addrem_42_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_addrem_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_big_desc[] = {
+ {LOGREC_OP, SSZ(__db_big_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_big_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_big_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_HDR, SSZ(__db_big_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_big_args, pagelsn), "pagelsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_args, prevlsn), "prevlsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_args, nextlsn), "nextlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_big_42_desc[] = {
+ {LOGREC_ARG, SSZ(__db_big_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_big_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_big_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_42_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_42_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_DBT, SSZ(__db_big_42_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_big_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_42_args, prevlsn), "prevlsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_42_args, nextlsn), "nextlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_ovref_desc[] = {
+ {LOGREC_DB, SSZ(__db_ovref_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_ovref_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_ovref_args, adjust), "adjust", "%ld"},
+ {LOGREC_POINTER, SSZ(__db_ovref_args, lsn), "lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_relink_42_desc[] = {
+ {LOGREC_ARG, SSZ(__db_relink_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_relink_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_relink_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_42_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__db_relink_42_args, prev), "prev", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_42_args, lsn_prev), "lsn_prev", ""},
+ {LOGREC_ARG, SSZ(__db_relink_42_args, next), "next", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_42_args, lsn_next), "lsn_next", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_debug_desc[] = {
+ {LOGREC_DBT, SSZ(__db_debug_args, op), "op", ""},
+ {LOGREC_ARG, SSZ(__db_debug_args, fileid), "fileid", "%ld"},
+ {LOGREC_DBT, SSZ(__db_debug_args, key), "key", ""},
+ {LOGREC_DBT, SSZ(__db_debug_args, data), "data", ""},
+ {LOGREC_ARG, SSZ(__db_debug_args, arg_flags), "arg_flags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_noop_desc[] = {
+ {LOGREC_DB, SSZ(__db_noop_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_noop_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_noop_args, prevlsn), "prevlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_alloc_42_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_alloc_42_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_42_args, page_lsn), "page_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, ptype), "ptype", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, next), "next", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_alloc_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_alloc_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_args, page_lsn), "page_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, ptype), "ptype", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, next), "next", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_free_42_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_free_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_free_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_free_42_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_42_args, next), "next", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_free_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_free_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_free_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_free_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, next), "next", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_cksum_desc[] = {
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_freedata_42_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_freedata_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_freedata_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_freedata_42_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_42_args, next), "next", "%lu"},
+ {LOGREC_PGDDBT, SSZ(__db_pg_freedata_42_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_freedata_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_freedata_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_freedata_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_freedata_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, next), "next", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_PGDDBT, SSZ(__db_pg_freedata_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_init_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_init_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_init_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_init_args, header), "header", ""},
+ {LOGREC_PGDDBT, SSZ(__db_pg_init_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_sort_44_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_sort_44_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_sort_44_args, meta), "meta", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_sort_44_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_sort_44_args, last_free), "last_free", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_sort_44_args, last_lsn), "last_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_sort_44_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_DBT, SSZ(__db_pg_sort_44_args, list), "list", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_trunc_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_trunc_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, meta), "meta", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_trunc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, last_free), "last_free", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_trunc_args, last_lsn), "last_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, next_free), "next_free", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_PGLIST, SSZ(__db_pg_trunc_args, list), "list", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_realloc_desc[] = {
+ {LOGREC_DB, SSZ(__db_realloc_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_realloc_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_realloc_args, page_lsn), "page_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_realloc_args, next_free), "next_free", "%lu"},
+ {LOGREC_ARG, SSZ(__db_realloc_args, ptype), "ptype", "%lu"},
+ {LOGREC_PGLIST, SSZ(__db_realloc_args, list), "list", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_relink_desc[] = {
+ {LOGREC_DB, SSZ(__db_relink_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_relink_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_relink_args, new_pgno), "new_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_relink_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_args, lsn_prev), "lsn_prev", ""},
+ {LOGREC_ARG, SSZ(__db_relink_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_args, lsn_next), "lsn_next", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_merge_desc[] = {
+ {LOGREC_DB, SSZ(__db_merge_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_merge_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_merge_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__db_merge_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_merge_args, nlsn), "nlsn", ""},
+ {LOGREC_PGDBT, SSZ(__db_merge_args, hdr), "hdr", ""},
+ {LOGREC_PGDDBT, SSZ(__db_merge_args, data), "data", ""},
+ {LOGREC_ARG, SSZ(__db_merge_args, pg_copy), "pg_copy", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pgno_desc[] = {
+ {LOGREC_DB, SSZ(__db_pgno_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pgno_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pgno_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pgno_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pgno_args, opgno), "opgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pgno_args, npgno), "npgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __db_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_recover, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_recover, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_recover, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_recover, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_recover, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_recover, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_recover, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_recover, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_recover, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_recover, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_recover, DB___db_pg_trunc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_realloc_recover, DB___db_realloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_relink_recover, DB___db_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_merge_recover, DB___db_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pgno_recover, DB___db_pgno)) != 0)
+ return (ret);
+ return (0);
+}
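Each call above binds one log record type to its recovery function in the DB_DISTAB. A simplified model of the dispatch this enables (field and function names hypothetical; the real table and types live in dbinc/db_dispatch.h):

	#include <stddef.h>
	#include <errno.h>

	typedef int (*recover_fn)(void *env, void *rec, void *lsnp,
	    int op, void *info);

	struct distab_model {
		recover_fn *dispatch;	/* indexed by log record type */
		size_t size;
	};

	static int
	dispatch_record(struct distab_model *dtab, unsigned rectype,
	    void *env, void *rec, void *lsnp, int op, void *info)
	{
		if (rectype >= dtab->size || dtab->dispatch[rectype] == NULL)
			return (EINVAL);	/* nothing registered */
		return (dtab->dispatch[rectype](env, rec, lsnp, op, info));
	}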
diff --git a/src/db/db_autop.c b/src/db/db_autop.c
new file mode 100644
index 00000000..6fe77039
--- /dev/null
+++ b/src/db/db_autop.c
@@ -0,0 +1,441 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __db_addrem_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_addrem_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_addrem", __db_addrem_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_addrem_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_addrem_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_addrem_42", __db_addrem_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_big", __db_big_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_big_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_big_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_big_42", __db_big_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_ovref_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_ovref_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_ovref", __db_ovref_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_relink_42", __db_relink_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_debug_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_debug_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_debug", __db_debug_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_noop_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_noop_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_noop", __db_noop_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_alloc_42", __db_pg_alloc_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_alloc", __db_pg_alloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_free_42", __db_pg_free_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_free", __db_pg_free_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_cksum_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_cksum_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_cksum", __db_cksum_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_freedata_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_freedata_42", __db_pg_freedata_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_freedata_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_freedata", __db_pg_freedata_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_init_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_init", __db_pg_init_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_sort_44_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_sort_44", __db_pg_sort_44_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_trunc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_trunc", __db_pg_trunc_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_realloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_realloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_realloc", __db_realloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_relink_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_relink", __db_relink_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_merge_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_merge_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_merge", __db_merge_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pgno_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pgno_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pgno", __db_pgno_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_print, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_print, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_print, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_print, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_print, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_print, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_print, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_print, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_print, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_print, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_print, DB___db_pg_trunc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_realloc_print, DB___db_realloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_relink_print, DB___db_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_merge_print, DB___db_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pgno_print, DB___db_pgno)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/db/db_backup.c b/src/db/db_backup.c
new file mode 100644
index 00000000..66d7382a
--- /dev/null
+++ b/src/db/db_backup.c
@@ -0,0 +1,775 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_QUEUE
+#include "dbinc/qam.h"
+#endif
+
+static void save_error __P((const DB_ENV *, const char *, const char *));
+static int backup_read_log_dir __P((DB_ENV *, const char *, int *, u_int32_t));
+static int backup_read_data_dir
+ __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t));
+static int backup_dir_clean
+ __P((DB_ENV *, const char *, const char *, int *, u_int32_t));
+static int backup_data_copy
+ __P((DB_ENV *, const char *, const char *, const char *, int));
+
+/*
+ * __db_dbbackup_pp --
+ * Copy a database file coordinated with mpool.
+ *
+ * PUBLIC: int __db_dbbackup_pp __P((DB_ENV *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__db_dbbackup_pp(dbenv, dbfile, target, flags)
+ DB_ENV *dbenv;
+ const char *dbfile, *target;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ if ((ret = __db_fchk(dbenv->env,
+ "DB_ENV->dbbackup", flags, DB_EXCL)) != 0)
+ return (ret);
+ ENV_ENTER(dbenv->env, ip);
+
+ ret = __db_dbbackup(dbenv, ip, dbfile, target, flags);
+
+ ENV_LEAVE(dbenv->env, ip);
+ return (ret);
+}
+
+/*
+ * __db_dbbackup --
+ * Copy a database file coordinated with mpool.
+ *
+ * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__db_dbbackup(dbenv, ip, dbfile, target, flags)
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ const char *dbfile, *target;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_FH *fp;
+ void *handle;
+ int ret, retry_count, t_ret;
+
+ dbp = NULL;
+ retry_count = 0;
+
+retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 &&
+ (ret = __db_open(dbp, ip, NULL, dbfile, NULL,
+ DB_UNKNOWN, DB_AUTO_COMMIT | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
+ (void)__db_close(dbp, NULL, DB_NOSYNC);
+ dbp = NULL;
+ if (++retry_count > 100)
+ return (ret);
+ __db_errx(dbenv->env, DB_STR_A("0702",
+ "Deadlock while opening %s, retrying", "%s"), dbfile);
+ __os_yield(dbenv->env, 1, 0);
+ goto retry;
+ }
+ }
+
+ if (ret == 0) {
+ if ((ret = __memp_backup_open(dbenv->env,
+ dbp->mpf, dbfile, target, flags, &fp, &handle)) == 0) {
+ if (dbp->type == DB_HEAP)
+ ret = __heap_backup(
+ dbenv, dbp, ip, fp, handle, flags);
+ else
+ ret = __memp_backup_mpf(
+ dbenv->env, dbp->mpf,
+ ip, 0, dbp->mpf->mfp->last_pgno,
+ fp, handle, flags);
+ }
+ if ((t_ret = __memp_backup_close(dbenv->env,
+ dbp->mpf, dbfile, fp, handle)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+#ifdef HAVE_QUEUE
+ /*
+	 * For compatibility with the 5.2 and patch versions of db_copy,
+ * dump the queue extents here.
+ */
+ if (ret == 0 && dbp->type == DB_QUEUE)
+ ret = __qam_backup_extents(dbp, ip, target, flags);
+#endif
+
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ __db_err(dbenv->env, ret, "Backup Failed");
+ return (ret);
+}
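The corresponding public call, for reference (paths illustrative; DB_EXCL is the only flag __db_dbbackup_pp accepts):

	#include <db.h>

	int
	backup_one_db(DB_ENV *dbenv)
	{
		/* Hot-copy accounts.db into /backup, coordinated with mpool. */
		return (dbenv->dbbackup(dbenv, "accounts.db", "/backup", 0));
	}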
+
+/*
+ * backup_dir_clean --
+ * Clean out the backup directory.
+ */
+static int
+backup_dir_clean(dbenv, backup_dir, log_dir, remove_maxp, flags)
+ DB_ENV *dbenv;
+ const char *backup_dir, *log_dir;
+ int *remove_maxp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int cnt, fcnt, ret, v;
+ const char *dir;
+ char **names, buf[DB_MAXPATHLEN], path[DB_MAXPATHLEN];
+
+ env = dbenv->env;
+
+ /* We may be cleaning a log directory separate from the target. */
+ if (log_dir != NULL) {
+ if ((ret = __os_concat_path(buf,
+ sizeof(buf), backup_dir, log_dir)) != 0) {
+ buf[sizeof(buf) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0717",
+ "%s: path too long", "%s"), buf);
+ return (EINVAL);
+ }
+ dir = buf;
+ } else
+ dir = backup_dir;
+
+ /* Get a list of file names. */
+ if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
+ if (log_dir != NULL && !LF_ISSET(DB_BACKUP_UPDATE))
+ return (0);
+ __db_err(env,
+ ret, DB_STR_A("0718", "%s: directory read", "%s"), dir);
+ return (ret);
+ }
+ for (cnt = fcnt; --cnt >= 0;) {
+ /*
+ * Skip non-log files (if update was specified).
+ */
+ if (!IS_LOG_FILE(names[cnt])) {
+ if (LF_ISSET(DB_BACKUP_UPDATE))
+ continue;
+ } else {
+ /* Track the highest-numbered log file removed. */
+ v = atoi(names[cnt] + sizeof(LFPREFIX) - 1);
+ if (*remove_maxp < v)
+ *remove_maxp = v;
+ }
+ if ((ret = __os_concat_path(path,
+ sizeof(path), dir, names[cnt])) != 0) {
+ path[sizeof(path) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0714",
+ "%s: path too long", "%s"), path);
+ return (EINVAL);
+ }
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0715", "removing %s",
+ "%s"), path);
+ if ((ret = __os_unlink(env, path, 0)) != 0)
+ return (ret);
+ }
+
+ __os_dirfree(env, names, fcnt);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP) && *remove_maxp != 0)
+ __db_msg(env, DB_STR_A("0719",
+ "highest numbered log file removed: %d", "%d"),
+ *remove_maxp);
+
+ return (0);
+}
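+
+/*
+ * Note on the log-name parsing above: log file names are LFPREFIX
+ * followed by a decimal sequence number (e.g. "log.0000000007" when
+ * LFPREFIX is "log."), so
+ *
+ *	v = atoi(name + sizeof(LFPREFIX) - 1);
+ *
+ * skips the prefix (sizeof includes the trailing NUL, hence the - 1)
+ * and yields 7 for that example.
+ */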
+
+/*
+ * backup_data_copy --
+ * Copy a non-database file into the backup directory.
+ */
+static int
+backup_data_copy(dbenv, file, from_dir, to_dir, log)
+ DB_ENV *dbenv;
+ const char *file, *from_dir, *to_dir;
+ int log;
+{
+ DB_BACKUP *backup;
+ DB_FH *rfhp, *wfhp;
+ ENV *env;
+ u_int32_t gigs, off;
+ size_t nr, nw;
+ int ret, t_ret;
+ char *buf;
+ void *handle;
+ char from[DB_MAXPATHLEN], to[DB_MAXPATHLEN];
+
+ rfhp = wfhp = NULL;
+ handle = NULL;
+ buf = NULL;
+ env = dbenv->env;
+ backup = env->backup_handle;
+
+ if ((ret = __os_concat_path(from,
+ sizeof(from), from_dir, file)) != 0) {
+ from[sizeof(from) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0728",
+ "%s: path too long", "%s"), from);
+ goto err;
+ }
+ if ((ret = __os_concat_path(to,
+ sizeof(to), to_dir, file)) != 0) {
+ to[sizeof(to) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0729",
+ "%s: path too long", "%s"), to);
+ goto err;
+ }
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0726",
+ "copying %s to %s", "%s %s"), from, to);
+
+ if ((ret = __os_malloc(env, MEGABYTE, &buf)) != 0) {
+ __db_err(env, ret, DB_STR_A("0727",
+ "%lu buffer allocation", "%lu"), (u_long)MEGABYTE);
+ return (ret);
+ }
+
+ /* Open the input file. */
+ if ((ret = __os_open(env, from, 0, DB_OSO_RDONLY, 0, &rfhp)) != 0) {
+ if (ret == ENOENT && !log) {
+ ret = 0;
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0730",
+ "%s%c%s not present", "%s %c %s"),
+ from_dir, PATH_SEPARATOR[0], file);
+ goto done;
+ }
+		__db_err(env, ret, "%s", from);
+ goto err;
+ }
+
+ /* Open the output file. */
+ if (backup != NULL && backup->open != NULL)
+ ret = backup->open(env->dbenv, file, to_dir, &handle);
+ else {
+ if ((ret = __os_open(env, to, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0) {
+ __db_err(env, ret, "%s", to);
+ goto err;
+ }
+ }
+
+ off = 0;
+ gigs = 0;
+ /* Copy the data. */
+ while ((ret = __os_read(env, rfhp, buf, MEGABYTE, &nr)) == 0 &&
+ nr > 0) {
+ if (backup != NULL && backup->write != NULL) {
+ if ((ret = backup->write(env->dbenv, gigs,
+ off, (u_int32_t)nr, (u_int8_t *)buf, handle)) != 0)
+ break;
+ } else {
+ if ((ret = __os_write(env, wfhp, buf, nr, &nw)) != 0)
+ break;
+ if (nr != nw) {
+ ret = EIO;
+ break;
+ }
+ }
+ off += (u_int32_t)nr;
+ if (off >= GIGABYTE) {
+ gigs++;
+ off -= GIGABYTE;
+ }
+ }
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("0748", "Write failed."));
+
+err:
+done: if (buf != NULL)
+ __os_free(env, buf);
+
+ if (backup != NULL && backup->close != NULL &&
+	    (t_ret = backup->close(env->dbenv, file, handle)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rfhp != NULL &&
+ (t_ret = __os_closehandle(env, rfhp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* We may be running on a remote filesystem; force the flush. */
+ if (ret == 0 && wfhp != NULL) {
+ ret = __os_fsync(env, wfhp);
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("0731", "Sync failed"));
+ }
+ if (wfhp != NULL &&
+ (t_ret = __os_closehandle(env, wfhp)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
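+
+/*
+ * The backup->open/write/close calls above come from an optional
+ * application-supplied handler stored in env->backup_handle; a minimal
+ * sketch of installing one (the my_backup_* callbacks are hypothetical):
+ *
+ *	if ((ret = dbenv->set_backup_callbacks(dbenv,
+ *	    my_backup_open, my_backup_write, my_backup_close)) != 0)
+ *		dbenv->err(dbenv, ret, "DB_ENV->set_backup_callbacks");
+ *
+ * When no handler is set, the plain __os_open/__os_write path is used.
+ */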
+
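+/*
+ * save_error --
+ *	Error callback used to capture, rather than print, error messages
+ *	while probing files that may not be databases.
+ */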
+static void
+save_error(dbenv, prefix, errstr)
+ const DB_ENV *dbenv;
+ const char *prefix;
+ const char *errstr;
+{
+ COMPQUIET(prefix, NULL);
+ if (DB_GLOBAL(saved_errstr) != NULL)
+ __os_free(dbenv->env, DB_GLOBAL(saved_errstr));
+ (void)__os_strdup(dbenv->env, errstr, &DB_GLOBAL(saved_errstr));
+}
+
+/*
+ * backup_read_data_dir --
+ * Read a directory looking for databases to copy.
+ */
+static int
+backup_read_data_dir(dbenv, ip, dir, backup_dir, flags)
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ const char *dir, *backup_dir;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb;
+ ENV *env;
+ FILE *savefile;
+ int fcnt, ret;
+ size_t cnt;
+ const char *bd;
+ char **names, buf[DB_MAXPATHLEN], bbuf[DB_MAXPATHLEN];
+ void (*savecall) (const DB_ENV *, const char *, const char *);
+
+ env = dbenv->env;
+ memset(bbuf, 0, sizeof(bbuf));
+
+ bd = backup_dir;
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && dir != env->db_home) {
+ cnt = sizeof(bbuf);
+ /* Build a path name to the destination. */
+ if ((ret = __os_concat_path(bbuf, sizeof(bbuf),
+ backup_dir, dir)) != 0 ||
+ (((cnt = strlen(bbuf)) == sizeof(bbuf) ||
+ (cnt == sizeof(bbuf) - 1 &&
+ strchr(PATH_SEPARATOR, bbuf[cnt - 1]) == NULL)) &&
+ LF_ISSET(DB_CREATE))) {
+ bbuf[sizeof(bbuf) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0720",
+ "%s: path too long", "%s"), bbuf);
+			return (EINVAL);
+ }
+
+ /* Create the path. */
+ if (LF_ISSET(DB_CREATE)) {
+ if (strchr(PATH_SEPARATOR, bbuf[cnt - 1]) == NULL)
+ bbuf[cnt] = PATH_SEPARATOR[0];
+
+ if ((ret = __db_mkpath(env, bbuf)) != 0) {
+ __db_err(env, ret, DB_STR_A("0721",
+ "%s: cannot create", "%s"), bbuf);
+ return (ret);
+ }
+ /* step on the trailing '/' */
+ bbuf[cnt] = '\0';
+ }
+ bd = bbuf;
+	}
+ if (!__os_abspath(dir) && dir != env->db_home) {
+ /* Build a path name to the source. */
+ if ((ret = __os_concat_path(buf,
+ sizeof(buf), env->db_home, dir)) != 0) {
+ buf[sizeof(buf) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0722",
+ "%s: path too long", "%s"), buf);
+ return (EINVAL);
+ }
+ dir = buf;
+ }
+ /* Get a list of file names. */
+ if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
+ __db_err(env, ret, DB_STR_A("0723", "%s: directory read",
+ "%s"), dir);
+ return (ret);
+ }
+ for (cnt = (size_t)fcnt; cnt-- > 0;) {
+ /*
+ * Skip files in DB's name space, except replication dbs.
+ */
+ if (IS_LOG_FILE(names[cnt]))
+ continue;
+ if (IS_DB_FILE(names[cnt]) && !IS_REP_FILE(names[cnt])
+#ifdef HAVE_PARTITION
+ && !IS_PARTITION_DB_FILE(names[cnt])
+#endif
+ )
+ continue;
+
+ /*
+ * Skip DB_CONFIG.
+ */
+ if (LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ !strncmp(names[cnt], "DB_CONFIG", sizeof("DB_CONFIG")))
+ continue;
+
+ /*
+ * Copy the database.
+ */
+
+ DB_MSGBUF_INIT(&mb);
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msgadd(env, &mb, DB_STR_A("0724",
+ "copying database %s%c%s to %s%c%s",
+ "%s%c%s %s%c%s"),
+ dir, PATH_SEPARATOR[0], names[cnt],
+ bd, PATH_SEPARATOR[0], names[cnt]);
+
+ /*
+ * Suppress errors on non-db files.
+ */
+ savecall = dbenv->db_errcall;
+ dbenv->db_errcall = save_error;
+ savefile = dbenv->db_errfile;
+ dbenv->db_errfile = NULL;
+
+ ret = __db_dbbackup(dbenv, ip, names[cnt], bd, flags);
+
+ dbenv->db_errcall = savecall;
+ dbenv->db_errfile = savefile;
+
+ /* The file might not be a database. */
+ if (ret == ENOENT || ret == EINVAL) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP)) {
+ __db_msgadd(env, &mb, " -- Not a database");
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+ if (LF_ISSET(DB_BACKUP_FILES))
+ ret = backup_data_copy(
+ dbenv, names[cnt], dir, bd, 0);
+ else
+ ret = 0;
+ } else if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ DB_MSGBUF_FLUSH(env, &mb);
+
+ if (ret != 0) {
+ if (DB_GLOBAL(saved_errstr) != NULL) {
+ __db_errx(env, "%s", DB_GLOBAL(saved_errstr));
+ __os_free(env, DB_GLOBAL(saved_errstr));
+ DB_GLOBAL(saved_errstr) = NULL;
+ }
+ break;
+ }
+ }
+
+ __os_dirfree(env, names, fcnt);
+
+ return (ret);
+}
+
+/*
+ * backup_read_log_dir --
+ * Read a directory looking for log files to copy.
+ */
+static int
+backup_read_log_dir(dbenv, backup_dir, copy_minp, flags)
+ DB_ENV *dbenv;
+ const char *backup_dir;
+ int *copy_minp;
+ u_int32_t flags;
+{
+ ENV *env;
+ u_int32_t aflag;
+ size_t cnt;
+ int ret, update, v;
+ const char *backupd;
+ char **begin, **names, *logd;
+ char from[DB_MAXPATHLEN], to[DB_MAXPATHLEN];
+
+ env = dbenv->env;
+ ret = 0;
+ begin = NULL;
+ memset(to, 0, sizeof(to));
+
+ /*
+ * Figure out where the log files are and create the log
+ * destination directory if necessary.
+ */
+ backupd = backup_dir;
+ if ((logd = dbenv->db_log_dir) == NULL)
+ logd = env->db_home;
+ else {
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)) {
+ cnt = sizeof(to);
+ if ((ret = __os_concat_path(to,
+ sizeof(to), backup_dir, logd)) != 0 ||
+ (((cnt = strlen(to)) == sizeof(to) ||
+ (cnt == sizeof(to) - 1 &&
+ strchr(PATH_SEPARATOR, to[cnt - 1]) == NULL)) &&
+ LF_ISSET(DB_CREATE))) {
+ to[sizeof(to) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0733",
+ "%s: path too long", "%s"), to);
+ goto err;
+ }
+ if (LF_ISSET(DB_CREATE)) {
+ if (strchr(PATH_SEPARATOR, to[cnt - 1]) == NULL)
+ to[cnt] = PATH_SEPARATOR[0];
+
+ if ((ret = __db_mkpath(env, to)) != 0) {
+ __db_err(env, ret, DB_STR_A("0734",
+ "%s: cannot create", "%s"), to);
+ goto err;
+ }
+ to[cnt] = '\0';
+ }
+			if ((ret = __os_strdup(env, to, (void *)&backupd)) != 0)
+ goto err;
+ }
+ if (!__os_abspath(logd)) {
+ if ((ret = __os_concat_path(from,
+ sizeof(from), env->db_home, logd)) != 0) {
+ from[sizeof(from) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0732",
+ "%s: path too long", "%s"), from);
+ goto err;
+ }
+ if ((ret = __os_strdup(env, from, &logd)) != 0)
+ goto err;
+ }
+ }
+
+ update = LF_ISSET(DB_BACKUP_UPDATE);
+again: aflag = DB_ARCH_LOG;
+
+ /*
+ * If this is an update and we are deleting files, first process
+ * those files that can be removed, then repeat with the rest.
+ */
+ if (update)
+ aflag = 0;
+
+ /* Flush the log to get latest info. */
+ if ((ret = __log_flush(env, NULL)) != 0) {
+ __db_err(env, ret, DB_STR("0735", "Can't flush log"));
+ goto err;
+ }
+
+ /* Get a list of file names to be copied. */
+ if ((ret = __log_archive(env, &names, aflag)) != 0) {
+ __db_err(env, ret, DB_STR("0736", "Can't get log file names"));
+ goto err;
+ }
+ if (names == NULL)
+ goto done;
+ begin = names;
+ for (; *names != NULL; names++) {
+ /* Track the lowest-numbered log file copied. */
+ v = atoi(*names + sizeof(LFPREFIX) - 1);
+ if (*copy_minp == 0 || *copy_minp > v)
+ *copy_minp = v;
+
+ if ((ret = __os_concat_path(from,
+ sizeof(from), logd, *names)) != 0) {
+ from[sizeof(from) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0737",
+ "%s: path too long", "%s"), from);
+ goto err;
+ }
+
+ /*
+ * If we're going to remove the file, attempt to rename it
+ * instead of copying and then removing. The likely failure
+ * is EXDEV (source and destination are on different volumes).
+ * Fall back to a copy, regardless of the error. We don't
+	 * worry about partial contents; the copy truncates the file
+ * on open.
+ */
+ if (update) {
+ if ((ret = __os_concat_path(to,
+ sizeof(to), backupd, *names)) != 0) {
+ to[sizeof(to) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0738",
+ "%s: path too long", "%s"), to);
+ goto err;
+ }
+ if (__os_rename(env, from, to, 1) == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0739",
+ "moving %s to %s",
+ "%s %s"), from, to);
+ continue;
+ }
+ }
+
+ /* Copy the file. */
+		if ((ret = backup_data_copy(dbenv,
+		    *names, logd, backupd, 1)) != 0)
+			goto err;
+
+ if (update) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0740",
+ "removing %s", "%s"), from);
+ if ((ret = __os_unlink(env, from, 0)) != 0) {
+ __db_err(env, ret, DB_STR_A("0741",
+ "unlink of %s failed", "%s"), from);
+ goto err;
+ }
+ }
+	}
+
+ __os_ufree(env, begin);
+ begin = NULL;
+done: if (update) {
+ update = 0;
+ goto again;
+ }
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP) && *copy_minp != 0)
+ __db_msg(env, DB_STR_A("0742",
+ "lowest numbered log file copied: %d", "%d"),
+ *copy_minp);
+err: if (logd != dbenv->db_log_dir && logd != env->db_home)
+ __os_free(env, logd);
+ if (backupd != NULL && backupd != backup_dir)
+ __os_free(env, (void *)backupd);
+ if (begin != NULL)
+ __os_ufree(env, begin);
+
+ return (ret);
+}
+
+/*
+ * __db_backup --
+ *	Back up databases in the environment.
+ *
+ * PUBLIC: int __db_backup __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__db_backup(dbenv, target, flags)
+ DB_ENV *dbenv;
+ const char *target;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int copy_min, remove_max, ret;
+ char **dir;
+
+ env = dbenv->env;
+ remove_max = copy_min = 0;
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_CREATE | DB_EXCL | DB_BACKUP_FILES | DB_BACKUP_SINGLE_DIR | \
+ DB_BACKUP_UPDATE | DB_BACKUP_NO_LOGS | DB_BACKUP_CLEAN)
+
+ if ((ret = __db_fchk(env, "DB_ENV->backup", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ if (target == NULL) {
+ __db_errx(env,
+ DB_STR("0716", "Target directory may not be null."));
+ return (EINVAL);
+ }
+
+ /*
+ * If the target directory for the backup does not exist, create it
+ * with mode read-write-execute for the owner. Ignore errors here,
+ * it's simpler and more portable to just always try the create. If
+ * there's a problem, we'll fail with reasonable errors later.
+ */
+ if (LF_ISSET(DB_CREATE))
+ (void)__os_mkdir(NULL, target, DB_MODE_700);
+
+ if (LF_ISSET(DB_BACKUP_CLEAN)) {
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ dbenv->db_log_dir != NULL &&
+ (ret = backup_dir_clean(dbenv, target,
+ dbenv->db_log_dir, &remove_max, flags)) != 0)
+ return (ret);
+ if ((ret = backup_dir_clean(dbenv,
+ target, NULL, &remove_max, flags)) != 0)
+ return (ret);
+	}
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * If the UPDATE option was not specified, copy all database
+ * files found in the database environment home directory and
+	 * data directories.
+ */
+ if ((ret = __env_set_backup(env, 1)) != 0)
+ goto end;
+ F_SET(dbenv, DB_ENV_HOTBACKUP);
+ if (!LF_ISSET(DB_BACKUP_UPDATE)) {
+ if ((ret = backup_read_data_dir(dbenv,
+ ip, env->db_home, target, flags)) != 0)
+ goto err;
+ for (dir = dbenv->db_data_dir;
+ dir != NULL && *dir != NULL; ++dir) {
+ /*
+ * Don't allow absolute path names taken from the
+			 * environment -- running recovery with them would
+ * corrupt the source files.
+ */
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)
+ && __os_abspath(*dir)) {
+ __db_errx(env, DB_STR_A("0725",
+"data directory '%s' is absolute path, not permitted unless backup is to a single directory",
+ "%s"), *dir);
+ ret = EINVAL;
+ goto err;
+ }
+ if ((ret = backup_read_data_dir(
+ dbenv, ip, *dir, target, flags)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * Copy all log files found in the log directory.
+ * The log directory defaults to the home directory.
+ */
+ if ((ret = backup_read_log_dir(dbenv, target, &copy_min, flags)) != 0)
+ goto err;
+ /*
+ * If we're updating a snapshot, the lowest-numbered log file copied
+ * into the backup directory should be less than, or equal to, the
+ * highest-numbered log file removed from the backup directory during
+ * cleanup.
+ */
+ if (LF_ISSET(DB_BACKUP_UPDATE) && remove_max < copy_min &&
+ !(remove_max == 0 && copy_min == 1)) {
+ __db_errx(env, DB_STR_A("0743",
+"the largest log file removed (%d) must be greater than or equal the smallest log file copied (%d)",
+ "%d %d"), remove_max, copy_min);
+ ret = EINVAL;
+ }
+
+err: F_CLR(dbenv, DB_ENV_HOTBACKUP);
+ (void)__env_set_backup(env, 0);
+end: ENV_LEAVE(env, ip);
+ return (ret);
+}
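+
+/*
+ * A minimal usage sketch of the public method this implements, assuming
+ * a configured environment handle "dbenv" (hypothetical): create the
+ * target if necessary and take a full backup, including non-database
+ * files:
+ *
+ *	if ((ret = dbenv->backup(dbenv,
+ *	    "/var/backup", DB_CREATE | DB_BACKUP_FILES)) != 0)
+ *		dbenv->err(dbenv, ret, "DB_ENV->backup");
+ *
+ * A later call with DB_BACKUP_UPDATE | DB_BACKUP_CLEAN against the same
+ * target refreshes only the log files, as implemented above.
+ */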
diff --git a/src/db/db_cam.c b/src/db/db_cam.c
new file mode 100644
index 00000000..6ee8b579
--- /dev/null
+++ b/src/db/db_cam.c
@@ -0,0 +1,3506 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_s_count __P((DB *));
+static int __db_wrlock_err __P((ENV *));
+static int __dbc_del_foreign __P((DBC *));
+static int __dbc_del_oldskey __P((DB *, DBC *, DBT *, DBT *, DBT *));
+static int __dbc_del_secondary __P((DBC *));
+static int __dbc_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_append __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_fixed_len __P((DBC *, DBT *, DBT *));
+static inline int __dbc_put_partial __P((DBC *,
+ DBT *, DBT *, DBT *, DBT *, u_int32_t *, u_int32_t));
+static int __dbc_put_primary __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_resolve_key __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_secondaries __P((DBC *,
+ DBT *, DBT *, DBT *, int, DBT *, u_int32_t *));
+
+#define CDB_LOCKING_INIT(env, dbc) \
+ /* \
+ * If we are running CDB, this had better be either a write \
+ * cursor or an immediate writer. If it's a regular writer, \
+ * that means we have an IWRITE lock and we need to upgrade \
+ * it to a write lock. \
+ */ \
+ if (CDB_LOCKING(env)) { \
+ if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER)) \
+ return (__db_wrlock_err(env)); \
+ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR) && \
+ (ret = __lock_get(env, \
+ (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \
+ DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \
+ return (ret); \
+ }
+#define CDB_LOCKING_DONE(env, dbc) \
+ /* Release the upgraded lock. */ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) \
+ (void)__lock_downgrade( \
+ env, &(dbc)->mylock, DB_LOCK_IWRITE, 0);
+
+#define SET_READ_LOCKING_FLAGS(dbc, var) do { \
+ var = 0; \
+ if (!F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED)) { \
+ if (LF_ISSET(DB_READ_COMMITTED)) \
+ var = DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED; \
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) \
+ var = DBC_READ_UNCOMMITTED; \
+ } \
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); \
+} while (0)
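+
+/*
+ * The CDB_LOCKING_* macros only matter in Concurrent Data Store
+ * environments, where a cursor that writes must have been created with
+ * DB_WRITECURSOR; a minimal sketch (dbp, dbc, key and data are
+ * hypothetical, initialized handles and DBTs):
+ *
+ *	if ((ret = dbp->cursor(dbp, NULL, &dbc, DB_WRITECURSOR)) == 0 &&
+ *	    (ret = dbc->get(dbc, &key, &data, DB_FIRST)) == 0)
+ *		ret = dbc->del(dbc, 0);
+ *
+ * CDB_LOCKING_INIT upgrades such a cursor's IWRITE lock to a write lock
+ * for the duration of the operation; CDB_LOCKING_DONE downgrades it.
+ */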
+
+/*
+ * __dbc_close --
+ * DBC->close.
+ *
+ * PUBLIC: int __dbc_close __P((DBC *));
+ */
+int
+__dbc_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBC *opd;
+ DBC_INTERNAL *cp;
+ DB_TXN *txn;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = dbc->internal;
+ opd = cp->opd;
+ ret = 0;
+
+ /*
+ * Remove the cursor(s) from the active queue. We may be closing two
+ * cursors at once here, a top-level one and a lower-level, off-page
+ * duplicate one. The access-method specific cursor close routine must
+ * close both of them in a single call.
+ *
+ * !!!
+ * Cursors must be removed from the active queue before calling the
+ * access specific cursor close routine, btree depends on having that
+ * order of operations.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+ if (opd != NULL) {
+ DB_ASSERT(env, F_ISSET(opd, DBC_ACTIVE));
+ F_CLR(opd, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, opd, links);
+ }
+ DB_ASSERT(env, F_ISSET(dbc, DBC_ACTIVE));
+ F_CLR(dbc, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, dbc, links);
+
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Call the access specific cursor close routine. */
+ if ((t_ret =
+ dbc->am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Release the lock after calling the access method specific close
+ * routine, a Btree cursor may have had pending deletes.
+ *
+ * Also, be sure not to free anything if mylock.off is INVALID; in
+ * some cases, such as idup'ed read cursors and secondary update
+ * cursors, a cursor in a CDB environment may not have a lock at all.
+ */
+ if (LOCK_ISSET(dbc->mylock)) {
+ if ((t_ret = __LPUT(dbc, dbc->mylock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* For safety's sake, since this is going on the free queue. */
+ memset(&dbc->mylock, 0, sizeof(dbc->mylock));
+ if (opd != NULL)
+ memset(&opd->mylock, 0, sizeof(opd->mylock));
+ }
+
+ /*
+ * Remove this cursor's locker ID from its family.
+ */
+ if (F_ISSET(dbc, DBC_OWN_LID) && F_ISSET(dbc, DBC_FAMILY)) {
+ if ((t_ret = __lock_familyremove(env->lk_handle,
+ dbc->lref)) != 0 && ret == 0)
+ ret = t_ret;
+ F_CLR(dbc, DBC_FAMILY);
+ }
+
+ if ((txn = dbc->txn) != NULL)
+ txn->cursors--;
+
+ /* Move the cursor(s) to the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ if (opd != NULL) {
+ if (txn != NULL)
+ txn->cursors--;
+ TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links);
+ }
+ TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (txn != NULL && F_ISSET(txn, TXN_PRIVATE) && txn->cursors == 0 &&
+ (t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __dbc_destroy --
+ * Destroy the cursor, called after DBC->close.
+ *
+ * PUBLIC: int __dbc_destroy __P((DBC *));
+ */
+int
+__dbc_destroy(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Remove the cursor from the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Free up allocated memory. */
+ if (dbc->my_rskey.data != NULL)
+ __os_free(env, dbc->my_rskey.data);
+ if (dbc->my_rkey.data != NULL)
+ __os_free(env, dbc->my_rkey.data);
+ if (dbc->my_rdata.data != NULL)
+ __os_free(env, dbc->my_rdata.data);
+
+ /* Call the access specific cursor destroy routine. */
+ ret = dbc->am_destroy == NULL ? 0 : dbc->am_destroy(dbc);
+
+ /*
+ * Release the lock id for this cursor.
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbc, DBC_OWN_LID) &&
+ (t_ret = __lock_id_free(env, dbc->lref)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __dbc_cmp --
+ * Compare the position of two cursors. Return whether two cursors are
+ * pointing to the same key/data pair.
+ *
+ * result == 0 if both cursors refer to the same item.
+ *	result == 1 otherwise.
+ *
+ * PUBLIC: int __dbc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__dbc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DBC *curr_dbc, *curr_odbc;
+ DBC_INTERNAL *dbc_int, *odbc_int;
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+ ret = 0;
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbc->dbp)) {
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+ other_dbc = ((PART_CURSOR *)other_dbc->internal)->sub_cursor;
+ }
+ /* Both cursors must still be valid. */
+ if (dbc == NULL || other_dbc == NULL) {
+ __db_errx(env, DB_STR("0692",
+"Both cursors must be initialized before calling DBC->cmp."));
+ return (EINVAL);
+ }
+
+ if (dbc->dbp != other_dbc->dbp) {
+ *result = 1;
+ return (0);
+ }
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_cmp(dbc, other_dbc, result));
+#endif
+
+ curr_dbc = dbc;
+ curr_odbc = other_dbc;
+ dbc_int = dbc->internal;
+ odbc_int = other_dbc->internal;
+
+ /* Both cursors must be on valid positions. */
+ if (dbc_int->pgno == PGNO_INVALID || odbc_int->pgno == PGNO_INVALID) {
+ __db_errx(env, DB_STR("0693",
+"Both cursors must be initialized before calling DBC->cmp."));
+ return (EINVAL);
+ }
+
+ /*
+	 * Use a loop since cursors can be nested. Off-page duplicate
+	 * sets can only be nested one level deep, so it is safe to use a
+	 * while (1) loop.
+ */
+ while (1) {
+ if (dbc_int->pgno == odbc_int->pgno &&
+ dbc_int->indx == odbc_int->indx) {
+ /*
+ * If one cursor is sitting on an off page duplicate
+ * set, the other will be pointing to the same set. Be
+ * careful, and check anyway.
+ */
+ if (dbc_int->opd != NULL && odbc_int->opd != NULL) {
+ curr_dbc = dbc_int->opd;
+ curr_odbc = odbc_int->opd;
+ dbc_int = dbc_int->opd->internal;
+				odbc_int = odbc_int->opd->internal;
+ continue;
+ } else if (dbc_int->opd == NULL &&
+ odbc_int->opd == NULL)
+ *result = 0;
+ else {
+ __db_errx(env, DB_STR("0694",
+ "DBCursor->cmp mismatched off page duplicate cursor pointers."));
+ return (EINVAL);
+ }
+
+ switch (curr_dbc->dbtype) {
+ case DB_HASH:
+ /*
+ * Make sure that on-page duplicate data
+ * indexes match, and that the deleted
+ * flags are consistent.
+ */
+ ret = __hamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ /*
+			 * Check for consistent deleted flags on
+			 * btree-specific cursors.
+ */
+ ret = __bamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ default:
+ /* NO-OP break out. */
+ break;
+ }
+ } else
+ *result = 1;
+ return (ret);
+ }
+	/* NOTREACHED */
+ return (ret);
+}
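+
+/*
+ * A minimal usage sketch of the public method this implements, with two
+ * positioned cursors "dbc1" and "dbc2" (hypothetical):
+ *
+ *	if ((ret = dbc1->cmp(dbc1, dbc2, &result, 0)) == 0 && result == 0)
+ *		...both cursors refer to the same item...
+ */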
+
+/*
+ * __dbc_count --
+ * Return a count of duplicate data items.
+ *
+ * PUBLIC: int __dbc_count __P((DBC *, db_recno_t *));
+ */
+int
+__dbc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbc->dbp))
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+#endif
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+ switch (dbc->dbtype) {
+ case DB_HEAP:
+ case DB_QUEUE:
+ case DB_RECNO:
+ *recnop = 1;
+ break;
+ case DB_HASH:
+ if (dbc->internal->opd == NULL) {
+ if ((ret = __hamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_BTREE:
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_count(dbc, recnop));
+#endif
+ if ((ret = __bamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ case DB_UNKNOWN:
+ default:
+ return (__db_unknown_type(env, "__dbc_count", dbc->dbtype));
+ }
+ return (0);
+}
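+
+/*
+ * A minimal usage sketch of the public method this implements
+ * (DBC->count), assuming a positioned cursor "dbc" (hypothetical):
+ *
+ *	db_recno_t count;
+ *
+ *	if ((ret = dbc->count(dbc, &count, 0)) == 0)
+ *		printf("%lu duplicates\n", (u_long)count);
+ */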
+
+/*
+ * __dbc_del --
+ * DBC->del.
+ *
+ * PUBLIC: int __dbc_del __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ CDB_LOCKING_INIT(env, dbc);
+ F_CLR(dbc, DBC_ERROR);
+
+ /*
+ * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set
+ * (which it only is if we're being called from a primary update),
+ * then we need to call through to the primary and delete the item.
+ *
+ * Note that this will delete the current item; we don't need to
+ * delete it ourselves as well, so we can just goto done.
+ */
+ if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) {
+ ret = __dbc_del_secondary(dbc);
+ goto done;
+ }
+
+ /*
+ * If we are a foreign db, go through and check any foreign key
+ * constraints first, which will make rolling back changes on an abort
+ * simpler.
+ */
+ if (LIST_FIRST(&dbp->f_primaries) != NULL &&
+ (ret = __dbc_del_foreign(dbc)) != 0)
+ goto done;
+
+ /*
+ * If we are a primary and have secondary indices, go through
+ * and delete any secondary keys that point at the current record.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ (ret = __dbc_del_primary(dbc)) != 0)
+ goto done;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ ret = __bamc_compress_del(dbc, flags);
+ else
+#endif
+ ret = __dbc_idel(dbc, flags);
+
+done: CDB_LOCKING_DONE(env, dbc);
+
+ if (!DB_RETOK_DBCDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ return (ret);
+}
+
+/*
+ * __dbc_idel --
+ *	Implementation of DBC->del.
+ *
+ * PUBLIC: int __dbc_idel __P((DBC *, u_int32_t));
+ */
+int
+__dbc_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *opd;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+
+ /*
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the del operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ opd = dbc->internal->opd;
+ if (opd == NULL)
+ ret = dbc->am_del(dbc, flags);
+ else if ((ret = dbc->am_writelock(dbc)) == 0)
+ ret = opd->am_del(opd, flags);
+
+ /*
+ * If this was an update that is supporting dirty reads
+ * then we may have just swapped our read for a write lock
+ * which is held by the surviving cursor. We need
+ * to explicitly downgrade this lock. The closed cursor
+ * may only have had a read lock.
+ */
+ if (ret == 0 && F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ dbc->internal->lock_mode == DB_LOCK_WRITE) {
+ if ((ret = __TLPUT(dbc, dbc->internal->lock)) == 0)
+ dbc->internal->lock_mode = DB_LOCK_WWRITE;
+ if (dbc->internal->page != NULL && (t_ret =
+ __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+#ifdef HAVE_COMPRESSION
+/*
+ * __dbc_bulk_del --
+ * Bulk del for a cursor.
+ *
+ *	Only implemented for compressed BTrees. It is in this file in
+ *	order to use the CDB_LOCKING_* macros.
+ *
+ * PUBLIC: #ifdef HAVE_COMPRESSION
+ * PUBLIC: int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__dbc_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ DB_ASSERT(env, DB_IS_COMPRESSED(dbc->dbp));
+
+ CDB_LOCKING_INIT(env, dbc);
+ F_CLR(dbc, DBC_ERROR);
+
+ ret = __bamc_compress_bulk_del(dbc, key, flags);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ return (ret);
+}
+#endif
+
+/*
+ * __dbc_dup --
+ *	Duplicate a cursor.
+ *
+ * PUBLIC: int __dbc_dup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *dbc_nopd;
+ int ret;
+
+ dbc_n = dbc_nopd = NULL;
+
+ /* Allocate a new cursor and initialize it. */
+ if ((ret = __dbc_idup(dbc_orig, &dbc_n, flags)) != 0)
+ goto err;
+ *dbcp = dbc_n;
+
+ /*
+ * If the cursor references an off-page duplicate tree, allocate a
+ * new cursor for that tree and initialize it.
+ */
+ if (dbc_orig->internal->opd != NULL) {
+ if ((ret =
+ __dbc_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0)
+ goto err;
+ dbc_n->internal->opd = dbc_nopd;
+ dbc_nopd->internal->pdbc = dbc_n;
+ }
+ return (0);
+
+err: if (dbc_n != NULL)
+ (void)__dbc_close(dbc_n);
+ if (dbc_nopd != NULL)
+ (void)__dbc_close(dbc_nopd);
+
+ return (ret);
+}
+
+/*
+ * __dbc_idup --
+ * Internal version of __dbc_dup.
+ *
+ * PUBLIC: int __dbc_idup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_idup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig, **dbcp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *dbc_n;
+ DBC_INTERNAL *int_n, *int_orig;
+ ENV *env;
+ int ret;
+
+ dbp = dbc_orig->dbp;
+ dbc_n = *dbcp;
+ env = dbp->env;
+
+ if ((ret = __db_cursor_int(dbp, dbc_orig->thread_info,
+ dbc_orig->txn, dbc_orig->dbtype, dbc_orig->internal->root,
+ F_ISSET(dbc_orig, DBC_OPD) | DBC_DUPLICATE,
+ dbc_orig->locker, &dbc_n)) != 0)
+ return (ret);
+
+ /* Position the cursor if requested, acquiring the necessary locks. */
+ if (LF_ISSET(DB_POSITION)) {
+ int_n = dbc_n->internal;
+ int_orig = dbc_orig->internal;
+
+ dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID;
+
+ int_n->indx = int_orig->indx;
+ int_n->pgno = int_orig->pgno;
+ int_n->root = int_orig->root;
+ int_n->lock_mode = int_orig->lock_mode;
+
+ int_n->stream_start_pgno = int_orig->stream_start_pgno;
+ int_n->stream_off = int_orig->stream_off;
+ int_n->stream_curr_pgno = int_orig->stream_curr_pgno;
+
+ switch (dbc_orig->dbtype) {
+ case DB_QUEUE:
+ if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_dup(dbc_orig, dbc_n, flags)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __hamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_HEAP:
+ if ((ret = __heapc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env,
+ "__dbc_idup", dbc_orig->dbtype);
+ goto err;
+ }
+ } else if (F_ISSET(dbc_orig, DBC_BULK)) {
+ /*
+		 * For bulk cursors, remember what page we're on, even if we
+ * don't know that the next operation will be nearby.
+ */
+ dbc_n->internal->pgno = dbc_orig->internal->pgno;
+ }
+
+ /* Copy the locking flags to the new cursor. */
+ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_BULK |
+ DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED | DBC_WRITECURSOR));
+
+ /*
+ * If we're in CDB and this isn't an offpage dup cursor, then
+ * we need to get a lock for the duplicated cursor.
+ */
+ if (CDB_LOCKING(env) && !F_ISSET(dbc_n, DBC_OPD) &&
+ (ret = __lock_get(env, dbc_n->locker, 0,
+ &dbc_n->lock_dbt, F_ISSET(dbc_orig, DBC_WRITECURSOR) ?
+ DB_LOCK_IWRITE : DB_LOCK_READ, &dbc_n->mylock)) != 0)
+ goto err;
+
+ dbc_n->priority = dbc_orig->priority;
+ dbc_n->internal->pdbc = dbc_orig->internal->pdbc;
+ *dbcp = dbc_n;
+ return (0);
+
+err: (void)__dbc_close(dbc_n);
+ return (ret);
+}
+
+/*
+ * __dbc_newopd --
+ * Create a new off-page duplicate cursor.
+ *
+ * PUBLIC: int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
+ */
+int
+__dbc_newopd(dbc_parent, root, oldopd, dbcp)
+ DBC *dbc_parent;
+ db_pgno_t root;
+ DBC *oldopd;
+ DBC **dbcp;
+{
+ DB *dbp;
+ DBC *opd;
+ DBTYPE dbtype;
+ int ret;
+
+ dbp = dbc_parent->dbp;
+ dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE;
+
+ /*
+ * On failure, we want to default to returning the old off-page dup
+ * cursor, if any; our caller can't be left with a dangling pointer
+ * to a freed cursor. On error the only allowable behavior is to
+ * close the cursor (and the old OPD cursor it in turn points to), so
+ * this should be safe.
+ */
+ *dbcp = oldopd;
+
+ if ((ret = __db_cursor_int(dbp, dbc_parent->thread_info,
+ dbc_parent->txn,
+ dbtype, root, DBC_OPD, dbc_parent->locker, &opd)) != 0)
+ return (ret);
+
+ opd->priority = dbc_parent->priority;
+ opd->internal->pdbc = dbc_parent;
+ *dbcp = opd;
+
+ /*
+ * Check to see if we already have an off-page dup cursor that we've
+ * passed in. If we do, close it. It'd be nice to use it again
+ * if it's a cursor belonging to the right tree, but if we're doing
+ * a cursor-relative operation this might not be safe, so for now
+ * we'll take the easy way out and always close and reopen.
+ *
+ * Note that under no circumstances do we want to close the old
+ * cursor without returning a valid new one; we don't want to
+ * leave the main cursor in our caller with a non-NULL pointer
+ * to a freed off-page dup cursor.
+ */
+ if (oldopd != NULL && (ret = __dbc_close(oldopd)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __dbc_get --
+ * Get using a cursor.
+ *
+ * PUBLIC: int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ F_CLR(dbc, DBC_ERROR);
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ return (__partc_get(dbc, key, data, flags));
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_get(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iget(dbc, key, data, flags));
+}
+
+/*
+ * __dbc_iget --
+ * Implementation of get using a cursor.
+ *
+ * PUBLIC: int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *ddbc, *dbc_n, *opd;
+ DBC_INTERNAL *cp, *cp_n;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t pgno;
+ db_indx_t indx_off;
+ u_int32_t multi, orig_ulen, tmp_flags, tmp_read_locking, tmp_rmw;
+ u_int8_t type;
+ int key_small, ret, t_ret;
+
+ COMPQUIET(orig_ulen, 0);
+
+ key_small = 0;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ dbc_n = NULL;
+ opd = NULL;
+
+ PERFMON6(env, db, get, dbp->fname, dbp->dname,
+ dbc->txn == NULL ? 0 : dbc->txn->txnid, key, data, flags);
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ tmp_rmw = LF_ISSET(DB_RMW);
+ LF_CLR(DB_RMW);
+
+ SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ /*
+ * Return a cursor's record number. It has nothing to do with the
+ * cursor get code except that it was put into the interface.
+ */
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc, DBC_RMW);
+ F_SET(dbc, tmp_read_locking);
+ ret = __bamc_rget(dbc, data);
+ if (tmp_rmw)
+ F_CLR(dbc, DBC_RMW);
+ /* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+ F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+ return (ret);
+ }
+
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_INIT(env, dbc);
+
+ /* Don't return the key or data if it was passed to us. */
+ if (!DB_RETURNS_A_KEY(dbp, flags))
+ F_SET(key, DB_DBT_ISSET);
+ if (flags == DB_GET_BOTH &&
+ (dbp->dup_compare == NULL || dbp->dup_compare == __bam_defcmp))
+ F_SET(data, DB_DBT_ISSET);
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the DB_RMW flag was specified and the get
+ * operation is done in an off-page duplicate tree, call the primary
+ * cursor's upgrade routine first.
+ */
+ cp = dbc->internal;
+ if (cp->opd != NULL &&
+ (flags == DB_CURRENT || flags == DB_GET_BOTHC ||
+ flags == DB_NEXT || flags == DB_NEXT_DUP ||
+ flags == DB_PREV || flags == DB_PREV_DUP)) {
+ if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0)
+ goto err;
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ opd = cp->opd;
+ else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0)
+ goto err;
+
+ if ((ret = opd->am_get(opd, key, data, flags, NULL)) == 0)
+ goto done;
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND &&
+ (flags == DB_PREV || flags == DB_NEXT)) {
+ ret = __dbc_close(opd);
+ opd = NULL;
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ cp->opd = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ } else if (cp->opd != NULL && F_ISSET(dbc, DBC_TRANSIENT)) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto err;
+ cp->opd = NULL;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * upgrade the lock as required, and call the underlying function.
+ */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ tmp_flags = 0;
+ break;
+ }
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ ret = __dbc_idup(dbc, &dbc_n, tmp_flags);
+
+ if (ret != 0)
+ goto err;
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if (tmp_rmw)
+ F_SET(dbc_n, DBC_RMW);
+ F_SET(dbc_n, tmp_read_locking);
+
+ switch (multi) {
+ case DB_MULTIPLE:
+ F_SET(dbc_n, DBC_MULTIPLE);
+ break;
+ case DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE_KEY);
+ break;
+ case DB_MULTIPLE | DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ break;
+ case 0:
+ default:
+ break;
+ }
+
+retry: pgno = PGNO_INVALID;
+ ret = dbc_n->am_get(dbc_n, key, data, flags, &pgno);
+ if (tmp_rmw)
+ F_CLR(dbc_n, DBC_RMW);
+ /*
+ * Clear the temporary locking flags in the new cursor. The user's
+ * (old) cursor needs to have the WAS_READ_COMMITTED flag because this
+ * is used on the next call on that cursor.
+ */
+ F_CLR(dbc_n, tmp_read_locking);
+ F_SET(dbc, tmp_read_locking & DBC_WAS_READ_COMMITTED);
+ F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ if (ret != 0)
+ goto err;
+
+ cp_n = dbc_n->internal;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ if ((ret = __dbc_newopd(dbc,
+ pgno, cp_n->opd, &cp_n->opd)) != 0)
+ goto err;
+
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET:
+ case DB_SET_RECNO:
+ case DB_SET_RANGE:
+ tmp_flags = DB_FIRST;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_LAST;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ tmp_flags = flags;
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__dbc_get", flags);
+ goto err;
+ }
+ ret = cp_n->opd->am_get(cp_n->opd, key, data, tmp_flags, NULL);
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND) {
+ PERFMON5(env, race, dbc_get,
+ dbp->fname, dbp->dname, ret, tmp_flags, key);
+
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ flags = DB_NEXT;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ flags = DB_PREV;
+ break;
+ default:
+ goto err;
+ }
+
+ ret = __dbc_close(cp_n->opd);
+ cp_n->opd = NULL;
+ if (ret == 0)
+ goto retry;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+done: /*
+ * Return a key/data item. The only exception is that we don't return
+ * a key if the user already gave us one, that is, if the DB_SET flag
+ * was set. The DB_SET flag is necessary. In a Btree, the user's key
+	 * doesn't have to be the same as the key stored in the tree, depending
+ * the magic performed by the comparison function. As we may not have
+ * done any key-oriented operation here, the page reference may not be
+ * valid. Fill it in as necessary. We don't have to worry about any
+ * locks, the cursor must already be holding appropriate locks.
+ *
+ * XXX
+ * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key
+ * either, should we?
+ */
+ cp_n = dbc_n == NULL ? dbc->internal : dbc_n->internal;
+ if (!F_ISSET(key, DB_DBT_ISSET)) {
+ if (cp_n->page == NULL && (ret = __memp_fget(mpf, &cp_n->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp_n->page)) != 0)
+ goto err;
+
+ if ((ret = __db_ret(dbc, cp_n->page, cp_n->indx, key,
+ &dbc->rkey->data, &dbc->rkey->ulen)) != 0) {
+ /*
+ * If the key DBT is too small, we still want to return
+ * the size of the data. Otherwise applications are
+ * forced to check each one with a separate call. We
+ * don't want to copy the data, so we set the ulen to
+ * zero before calling __db_ret.
+ */
+ if (ret == DB_BUFFER_SMALL &&
+ F_ISSET(data, DB_DBT_USERMEM)) {
+ key_small = 1;
+ orig_ulen = data->ulen;
+ data->ulen = 0;
+ } else
+ goto err;
+ }
+ }
+ if (multi != 0 && dbc->am_bulk != NULL) {
+ /*
+ * Even if fetching from the OPD cursor we need a duplicate
+ * primary cursor if we are going after multiple keys.
+ */
+ if (dbc_n == NULL) {
+ /*
+ * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor,
+ * so it's safe to just use dbc, unless the cursor
+ * has an open off-page duplicate cursor whose state
+ * might need to be preserved.
+ */
+ if ((!(multi & DB_MULTIPLE_KEY) &&
+ dbc->internal->opd == NULL) ||
+ F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_idup(dbc,
+ &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ if ((ret = dbc_n->am_get(dbc_n,
+ key, data, DB_CURRENT, &pgno)) != 0)
+ goto err;
+ }
+ cp_n = dbc_n->internal;
+ }
+
+ /*
+ * If opd is set then we dupped the opd that we came in with.
+ * When we return we may have a new opd if we went to another
+ * key.
+ */
+ if (opd != NULL) {
+ DB_ASSERT(env, cp_n->opd == NULL);
+ cp_n->opd = opd;
+ opd = NULL;
+ }
+
+ /*
+ * Bulk get doesn't use __db_retcopy, so data.size won't
+ * get set up unless there is an error. Assume success
+ * here. This is the only call to am_bulk, and it avoids
+		 * setting it exactly the same everywhere. If we have a
+ * DB_BUFFER_SMALL error, it'll get overwritten with the
+ * needed value.
+ */
+ data->size = data->ulen;
+ ret = dbc_n->am_bulk(dbc_n, data, flags | multi);
+ } else if (!F_ISSET(data, DB_DBT_ISSET)) {
+ ddbc = opd != NULL ? opd :
+ cp_n->opd != NULL ? cp_n->opd : dbc_n;
+ cp = ddbc->internal;
+ if (cp->page == NULL &&
+ (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, ddbc->txn, 0, &cp->page)) != 0)
+ goto err;
+
+ type = TYPE(cp->page);
+ indx_off = ((type == P_LBTREE ||
+ type == P_HASH || type == P_HASH_UNSORTED) ? O_INDX : 0);
+ ret = __db_ret(ddbc, cp->page, cp->indx + indx_off,
+ data, &dbc->rdata->data, &dbc->rdata->ulen);
+ }
+
+err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ /* Cleanup and cursor resolution. */
+ if (opd != NULL) {
+ /*
+ * To support dirty reads we must reget the write lock
+ * if we have just stepped off a deleted record.
+ * Since the OPD cursor does not know anything
+ * about the referencing page or cursor we need
+ * to peek at the OPD cursor and get the lock here.
+ */
+ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ if ((t_ret =
+ dbc->am_writelock(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_cleanup(
+ dbc->internal->opd, opd, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (key_small) {
+ data->ulen = orig_ulen;
+ if (ret == 0)
+ ret = DB_BUFFER_SMALL;
+ }
+
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_DONE(env, dbc);
+ return (ret);
+}
+
+/* Internal flags shared by the dbc_put functions. */
+#define DBC_PUT_RMW 0x001
+#define DBC_PUT_NODEL 0x002
+#define DBC_PUT_HAVEREC 0x004
+
+/*
+ * __dbc_put_resolve_key --
+ * Get the current key and data so that we can correctly update the
+ * secondary and foreign databases.
+ */
+static inline int
+__dbc_put_resolve_key(dbc, oldkey, olddata, put_statep, flags)
+ DBC *dbc;
+ DBT *oldkey, *olddata;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ DB_ASSERT(env, flags == DB_CURRENT);
+ COMPQUIET(flags, 0);
+
+ /*
+ * This is safe to do on the cursor we already have;
+ * error or no, it won't move.
+ *
+ * We use DB_RMW for all of these gets because we'll be
+ * writing soon enough in the "normal" put code. In
+ * transactional databases we'll hold those write locks
+ * even if we close the cursor we're reading with.
+ *
+ * The DB_KEYEMPTY return needs special handling -- if the
+ * cursor is on a deleted key, we return DB_NOTFOUND.
+ */
+ memset(oldkey, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, oldkey, olddata, rmw | DB_CURRENT)) != 0)
+ return (ret == DB_KEYEMPTY ? DB_NOTFOUND : ret);
+
+ /* Record that we've looked for the old record. */
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ return (0);
+}
+
+/*
+ * __dbc_put_append --
+ * Handle an append to a primary.
+ */
+static inline int
+__dbc_put_append(dbc, key, data, put_statep, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ DBC *dbc_n;
+ DBT tdata;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+ dbc_n = NULL;
+
+ DB_ASSERT(env, flags == DB_APPEND);
+ COMPQUIET(flags, 0);
+
+ /*
+ * With DB_APPEND, we need to do the insert to populate the key value.
+ * So we swap the 'normal' order of updating secondary / verifying
+ * foreign databases and inserting.
+ *
+ * If there is an append callback, the value stored in data->data may
+ * be replaced and then freed. To avoid passing a freed pointer back
+ * to the user, just operate on a copy of the data DBT.
+ */
+ tdata = *data;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * Append isn't a normal put operation; call the appropriate access
+ * method's append function.
+ */
+ switch (dbp->type) {
+ case DB_HEAP:
+ if ((ret = __heap_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(env,
+ dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+ ret = __db_ferr(env, "DBC->put", 0);
+ goto err;
+ }
+
+ /*
+ * The append callback, if one exists, may have allocated a new
+ * tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(env, &tdata);
+
+ /*
+ * The key value may have been generated by the above operation, but
+ * not set in the data buffer. Make sure it is there so that secondary
+ * updates can complete.
+ */
+ __dbt_userfree(env, key, NULL, NULL);
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+
+ /* An append cannot be replacing an existing item. */
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+
+err: if (dbc_n != NULL &&
+ (t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_put_partial --
+ * Ensure that the data item we are using is complete and correct.
+ * Otherwise we could break the secondary constraints.
+ */
+static inline int
+__dbc_put_partial(dbc, pkey, data, orig_data, out_data, put_statep, flags)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *out_data;
+ u_int32_t *put_statep, flags;
+{
+ DB *dbp;
+ DBC *pdbc;
+ ENV *env;
+ int ret, rmw, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ if (!FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL)) {
+ /*
+ * We're going to have to search the tree for the
+ * specified key. Dup a cursor (so we have the same
+ * locking info) and do a c_get.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * When doing a put with DB_CURRENT, partial data items have
+ * already been resolved.
+ */
+ DB_ASSERT(env, flags != DB_CURRENT);
+
+ F_SET(pkey, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, pkey, orig_data, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ if ((t_ret = __dbc_close(pdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ }
+
+ COMPQUIET(flags, 0);
+
+ /*
+ * Now build the new datum from orig_data and the partial data
+ * we were given. It's okay to do this if no record was
+ * returned above: a partial put on an empty record is allowed,
+ * if a little strange. The data is zero-padded.
+ */
+ return (__db_buildpartial(dbp, orig_data, data, out_data));
+}
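+
+/*
+ * Worked example of the partial-put resolution above: if the stored
+ * record is "abcdef" and the caller's DBT has DB_DBT_PARTIAL set with
+ * doff == 2, dlen == 3 and two bytes of data "XY", __db_buildpartial
+ * yields "abXYf" -- the three bytes at offset 2 are replaced by the two
+ * supplied bytes. A partial put beyond the end of an empty record is
+ * zero-padded, as noted above.
+ */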
+
+/*
+ * __dbc_put_fixed_len --
+ * Handle padding for fixed-length records.
+ */
+static inline int
+__dbc_put_fixed_len(dbc, data, out_data)
+ DBC *dbc;
+ DBT *data, *out_data;
+{
+ DB *dbp;
+ ENV *env;
+ int re_pad, ret;
+ u_int32_t re_len, size;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Handle fixed-length records. If the primary database has
+ * fixed-length records, we need to pad out the datum before
+ * we pass it into the callback function; we always index the
+ * "real" record.
+ */
+ if (dbp->type == DB_QUEUE) {
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ re_pad = ((QUEUE *)dbp->q_internal)->re_pad;
+ } else {
+ re_len = ((BTREE *)dbp->bt_internal)->re_len;
+ re_pad = ((BTREE *)dbp->bt_internal)->re_pad;
+ }
+
+ size = data->size;
+ if (size > re_len) {
+ ret = __db_rec_toobig(env, size, re_len);
+ return (ret);
+ } else if (size < re_len) {
+ /*
+ * If we're not doing a partial put, copy data->data into
+ * out_data->data, then pad out out_data->data. This overrides
+ * the assignment made above, which is used in the more common
+ * case when padding is not needed.
+ *
+ * If we're doing a partial put, the data we want are already
+ * in out_data.data; we just need to pad.
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __os_realloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ /*
+ * In the partial case, we have built the item into
+ * out_data already using __db_buildpartial. Just need
+ * to pad from the end of out_data, not from data->size.
+ */
+ size = out_data->size;
+ } else {
+ if ((ret = __os_malloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ memcpy(out_data->data, data->data, size);
+ }
+ memset((u_int8_t *)out_data->data + size, re_pad,
+ re_len - size);
+ out_data->size = re_len;
+ }
+
+ return (ret);
+}
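+
+/*
+ * Worked example of the padding above: with re_len == 8 and
+ * re_pad == ' ', a 3-byte datum "abc" is stored as "abc     " -- three
+ * data bytes followed by five pad bytes. A datum larger than re_len
+ * fails with __db_rec_toobig.
+ */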
+
+/*
+ * __dbc_put_secondaries --
+ * Insert the secondary keys, and validate the foreign key constraints.
+ */
+static inline int
+__dbc_put_secondaries(dbc,
+ pkey, data, orig_data, s_count, s_keys_buf, put_statep)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *s_keys_buf;
+ int s_count;
+ u_int32_t *put_statep;
+{
+ DB *dbp, *sdbp;
+ DBC *fdbc, *sdbc;
+ DBT fdata, oldpkey, *skeyp, temppkey, tempskey, *tskeyp;
+ ENV *env;
+ int cmp, ret, rmw, t_ret;
+ u_int32_t nskey;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ fdbc = sdbc = NULL;
+ sdbp = NULL;
+ t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ /*
+ * Loop through the secondaries. (Step 3.)
+ *
+ * Note that __db_s_first and __db_s_next will take care of
+ * thread-locking and refcounting issues.
+ */
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = s_keys_buf;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), ++skeyp) {
+ DB_ASSERT(env, skeyp - s_keys_buf < s_count);
+ /*
+ * Don't process this secondary if the key is immutable and we
+ * know that the old record exists. This optimization can't be
+ * used if we have not checked for the old record yet.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL) &&
+ FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ /*
+ * Call the callback for this secondary, to get the
+ * appropriate secondary key.
+ */
+ if ((ret = sdbp->s_callback(sdbp,
+ pkey, data, skeyp)) != 0) {
+ /* Not indexing is equivalent to an empty key set. */
+ if (ret == DB_DONOTINDEX) {
+ F_SET(skeyp, DB_DBT_MULTIPLE);
+ skeyp->size = 0;
+ ret = 0;
+ } else
+ goto err;
+ }
+
+ if (sdbp->s_foreign != NULL &&
+ (ret = __db_cursor_int(sdbp->s_foreign,
+ dbc->thread_info, dbc->txn, sdbp->s_foreign->type,
+ PGNO_INVALID, 0, dbc->locker, &fdbc)) != 0)
+ goto err;
+
+ /*
+ * Mark the secondary key DBT(s) as set -- that is, the
+ * callback returned at least one secondary key.
+ *
+ * Also, if this secondary index is associated with a foreign
+ * database, check that the foreign db contains the key(s) to
+ * maintain referential integrity. Set flags in fdata to avoid
+ * mem copying, we just need to know existence. We need to do
+	 * mem copying; we just need to know existence. We need to do
+ * will overwrite the flag values.
+ */
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, skeyp);
+#endif
+ for (tskeyp = (DBT *)skeyp->data, nskey = skeyp->size;
+ nskey > 0; nskey--, tskeyp++) {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata,
+ DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(
+ fdbc, tskeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY) {
+ ret = DB_FOREIGN_CONFLICT;
+ break;
+ }
+ }
+ F_SET(tskeyp, DB_DBT_ISSET);
+ }
+ tskeyp = (DBT *)skeyp->data;
+ nskey = skeyp->size;
+ } else {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(fdbc, skeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY)
+ ret = DB_FOREIGN_CONFLICT;
+ }
+ F_SET(skeyp, DB_DBT_ISSET);
+ tskeyp = skeyp;
+ nskey = 1;
+ }
+ if (fdbc != NULL && (t_ret = __dbc_close(fdbc)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ fdbc = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we have the old record, we can generate and remove any
+ * old secondary key(s) now. We can also skip the secondary
+ * put if there is no change.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC)) {
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, pkey, orig_data)) == DB_KEYEXIST)
+ continue;
+ else if (ret != 0)
+ goto err;
+ }
+ if (nskey == 0)
+ continue;
+
+ /*
+ * Open a cursor in this secondary.
+ *
+ * Use the same locker ID as our primary cursor, so that
+ * we're guaranteed that the locks don't conflict (e.g. in CDB
+ * or if we're subdatabases that share and want to lock a
+ * metadata page).
+ */
+ if ((ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+ sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+
+ /*
+ * If we're in CDB, updates will fail since the new cursor
+ * isn't a writer. However, we hold the WRITE lock in the
+ * primary and will for as long as our new cursor lasts,
+ * and the primary and secondary share a lock file ID,
+ * so it's safe to consider this a WRITER. The close
+ * routine won't try to put anything because we don't
+ * really have a lock.
+ */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * Swap the primary key to the byte order of this secondary, if
+ * necessary. By doing this now, we can compare directly
+ * against the data already in the secondary without having to
+ * swap it after reading.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ for (; nskey > 0 && ret == 0; nskey--, tskeyp++) {
+ /* Skip this key if it is already in the database. */
+ if (!F_ISSET(tskeyp, DB_DBT_ISSET))
+ continue;
+
+ /*
+ * There are three cases here--
+ * 1) The secondary supports sorted duplicates.
+ * If we attempt to put a secondary/primary pair
+ * that already exists, that's a duplicate
+ * duplicate, and c_put will return DB_KEYEXIST
+ * (see __db_duperr). This will leave us with
+ * exactly one copy of the secondary/primary pair,
+ * and this is just right--we'll avoid deleting it
+ * later, as the old and new secondaries will
+ * match (since the old secondary is the dup dup
+ * that's already there).
+ * 2) The secondary supports duplicates, but they're not
+ * sorted. We need to avoid putting a duplicate
+ * duplicate, because the matching old and new
+ * secondaries will prevent us from deleting
+ * anything and we'll wind up with two secondary
+ * records that point to the same primary key. Do
+ * a c_get(DB_GET_BOTH); only do the put if the
+ * secondary doesn't exist.
+ * 3) The secondary doesn't support duplicates at all.
+ * In this case, secondary keys must be unique;
+ * if another primary key already exists for this
+ * secondary key, we have to either overwrite it
+ * or not put this one, and in either case we've
+ * corrupted the secondary index. Do a
+ * c_get(DB_SET). If the secondary/primary pair
+ * already exists, do nothing; if the secondary
+ * exists with a different primary, return an
+ * error; and if the secondary does not exist,
+ * put it.
+ */
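+			/*
+			 * (Editor's note) The same three cases in tabular
+			 * form:
+			 *
+			 *	DUP	DUPSORT	pre-put check
+			 *	no	-	c_get(DB_SET)		case 3
+			 *	yes	no	c_get(DB_GET_BOTH)	case 2
+			 *	yes	yes	none (KEYEXIST benign)	case 1
+			 */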
+ if (!F_ISSET(sdbp, DB_AM_DUP)) {
+ /* Case 3. */
+ memset(&oldpkey, 0, sizeof(DBT));
+ F_SET(&oldpkey, DB_DBT_MALLOC);
+ ret = __dbc_get(sdbc,
+ tskeyp, &oldpkey, rmw | DB_SET);
+ if (ret == 0) {
+ cmp = __bam_defcmp(sdbp,
+ &oldpkey, pkey);
+ __os_ufree(env, oldpkey.data);
+ /*
+ * If the secondary key is unchanged,
+ * skip the put and go on to the next
+ * one.
+ */
+ if (cmp == 0)
+ continue;
+
+ __db_errx(env, DB_STR("0695",
+ "Put results in a non-unique secondary key in an "
+ "index not configured to support duplicates"));
+ ret = EINVAL;
+ }
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) {
+ /* Case 2. */
+ DB_INIT_DBT(tempskey,
+ tskeyp->data, tskeyp->size);
+ DB_INIT_DBT(temppkey,
+ pkey->data, pkey->size);
+ ret = __dbc_get(sdbc, &tempskey, &temppkey,
+ rmw | DB_GET_BOTH);
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ }
+
+ ret = __dbc_put(sdbc, tskeyp, pkey,
+ DB_UPDATE_SECONDARY);
+
+ /*
+ * We don't know yet whether this was a put-overwrite
+ * that in fact changed nothing. If it was, we may get
+ * DB_KEYEXIST. This is not an error.
+ */
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ }
+
+ /* Make sure the primary key is back in native byte-order. */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Mark that we have a key for this secondary so we can check
+ * it later before deleting the old one. We can't set it
+ * earlier or it would be cleared in the calls above.
+ */
+ F_SET(skeyp, DB_DBT_ISSET);
+ }
+err: if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+ COMPQUIET(s_count, 0);
+ return (ret);
+}
+
+static int
+__dbc_put_primary(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp, *sdbp;
+ DBC *dbc_n, *pdbc;
+ DBT oldkey, olddata, newdata;
+ DBT *all_skeys, *skeyp, *tskeyp;
+ ENV *env;
+ int ret, t_ret, s_count;
+ u_int32_t nskey, put_state, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t_ret = 0;
+ put_state = 0;
+ sdbp = NULL;
+ pdbc = dbc_n = NULL;
+ all_skeys = NULL;
+ memset(&newdata, 0, sizeof(DBT));
+ memset(&olddata, 0, sizeof(DBT));
+
+ /*
+ * We do multiple cursor operations in some cases and subsequently
+ * access the data DBT information. Set DB_DBT_MALLOC so we don't risk
+ * modification of the data between our uses of it.
+ */
+ F_SET(&olddata, DB_DBT_MALLOC);
+
+ /*
+ * We have at least one secondary which we may need to update.
+ *
+ * There is a rather vile locking issue here. Secondary gets
+ * will always involve acquiring a read lock in the secondary,
+ * then acquiring a read lock in the primary. Ideally, we
+ * would likewise perform puts by updating all the secondaries
+ * first, then doing the actual put in the primary, to avoid
+ * deadlock (since having multiple threads doing secondary
+ * gets and puts simultaneously is probably a common case).
+ *
+ * However, if this put is a put-overwrite--and we have no way to
+ * tell in advance whether it will be--we may need to delete
+ * an outdated secondary key. In order to find that old
+ * secondary key, we need to get the record we're overwriting,
+ * before we overwrite it.
+ *
+ * (XXX: It would be nice to avoid this extra get, and have the
+ * underlying put routines somehow pass us the old record
+ * since they need to traverse the tree anyway. I'm saving
+ * this optimization for later, as it's a lot of work, and it
+ * would be hard to fit into this locking paradigm anyway.)
+ *
+ * The simple thing to do would be to go get the old record before
+ * we do anything else. Unfortunately, though, doing so would
+ * violate our "secondary, then primary" lock acquisition
+ * ordering--even in the common case where no old primary record
+ * exists, we'll still acquire and keep a lock on the page where
+ * we're about to do the primary insert.
+ *
+ * To get around this, we do the following gyrations, which
+ * hopefully solve this problem in the common case:
+ *
+ * 1) If this is a c_put(DB_CURRENT), go ahead and get the
+ * old record. We already hold the lock on this page in
+ * the primary, so no harm done, and we'll need the primary
+ * key (which we weren't passed in this case) to do any
+ * secondary puts anyway.
+	 *    If this is a put(DB_APPEND), then we need to insert the item
+	 *    first, so that we know the key value.  (A put(DB_APPEND) on a
+	 *    database without secondaries is instead handled in the __db_put
+	 *    method as an optimization.)
+ *
+ * 2) If we're doing a partial put, we need to perform the
+ * get on the primary key right away, since we don't have
+ * the whole datum that the secondary key is based on.
+ * We may also need to pad out the record if the primary
+ * has a fixed record length.
+ *
+ * 3) Loop through the secondary indices, putting into each a
+ * new secondary key that corresponds to the new record.
+ *
+ * 4) If we haven't done so in (1) or (2), get the old primary
+ * key/data pair. If one does not exist--the common case--we're
+ * done with secondary indices, and can go straight on to the
+ * primary put.
+ *
+ * 5) If we do have an old primary key/data pair, however, we need
+ * to loop through all the secondaries a second time and delete
+ * the old secondary in each.
+ */
+ s_count = __db_s_count(dbp);
+ if ((ret = __os_calloc(env,
+ (u_int)s_count, sizeof(DBT), &all_skeys)) != 0)
+ goto err;
+
+ /*
+ * Primary indices can't have duplicates, so only DB_APPEND,
+ * DB_CURRENT, DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags
+ * should have been caught by the checking routine, but
+ * add a sprinkling of paranoia.
+ */
+ DB_ASSERT(env, flags == DB_APPEND || flags == DB_CURRENT ||
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP);
+
+ /*
+ * We'll want to use DB_RMW in a few places, but it's only legal
+ * when locking is on.
+ */
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+ if (rmw)
+ FLD_SET(put_state, DBC_PUT_RMW);
+
+ /* Resolve the primary key if required (Step 1). */
+ if (flags == DB_CURRENT) {
+ if ((ret = __dbc_put_resolve_key(dbc,
+ &oldkey, &olddata, &put_state, flags)) != 0)
+ goto err;
+ key = &oldkey;
+ } else if (flags == DB_APPEND) {
+ if ((ret = __dbc_put_append(dbc,
+ key, data, &put_state, flags)) != 0)
+ goto err;
+ }
+
+ /*
+ * PUT_NOOVERWRITE with secondaries is a troublesome case. We need
+ * to check that the insert will work prior to making any changes
+ * to secondaries. Try to work within the locking constraints outlined
+ * above.
+ *
+ * This is DB->put (DB_NOOVERWRITE). DBC->put(DB_NODUPDATA) is not
+ * relevant since it is only valid on DBs that support duplicates,
+ * which primaries with secondaries can't have.
+ */
+ if (flags == DB_NOOVERWRITE) {
+ /* Don't bother retrieving the data. */
+ F_SET(key, DB_DBT_ISSET);
+ olddata.dlen = 0;
+ olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+ ret = __dbc_get(dbc, key, &olddata, DB_SET);
+ if (ret == 0) {
+ ret = DB_KEYEXIST;
+ goto done;
+ } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ goto err;
+ }
+
+ /*
+ * Check for partial puts using DB_DBT_PARTIAL (Step 2).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __dbc_put_partial(dbc,
+ key, data, &olddata, &newdata, &put_state, flags)) != 0)
+ goto err;
+ } else {
+ newdata = *data;
+ }
+
+ /*
+ * Check for partial puts, with fixed length record databases (Step 2).
+ */
+ if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) ||
+ (dbp->type == DB_QUEUE)) {
+ if ((ret = __dbc_put_fixed_len(dbc, data, &newdata)) != 0)
+ goto err;
+ }
+
+	/* Validate any foreign databases and update secondaries (Step 3). */
+	if ((ret = __dbc_put_secondaries(dbc, key, &newdata,
+	    &olddata, s_count, all_skeys, &put_state)) != 0)
+		goto err;
+ /*
+ * If we've already got the old primary key/data pair, the secondary
+ * updates are already done.
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_HAVEREC))
+ goto done;
+
+ /*
+ * If still necessary, go get the old primary key/data. (Step 4.)
+ *
+	 * See the comments in step 2; this get closely mirrors that one.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(env, flags != DB_CURRENT);
+ F_SET(key, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, key, &olddata, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ FLD_SET(put_state, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Check whether we do in fact have an old record we may need to
+ * delete. (Step 5).
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_NODEL))
+ goto done;
+
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), skeyp++) {
+ DB_ASSERT(env, skeyp - all_skeys < s_count);
+ /*
+ * Don't process this secondary if the key is immutable. We
+ * know that the old record exists, so this optimization can
+ * always be used.
+ */
+ if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, key, &olddata)) != 0 && ret != DB_KEYEXIST)
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+done:
+err:
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* If newdata or olddata were used, free their buffers. */
+ if (newdata.data != NULL && newdata.data != data->data)
+ __os_free(env, newdata.data);
+ if (olddata.data != NULL)
+ __os_ufree(env, olddata.data);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (all_skeys != NULL) {
+ for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) {
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+ for (nskey = skeyp->size,
+ tskeyp = (DBT *)skeyp->data;
+ nskey > 0;
+ nskey--, tskeyp++)
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+ FREE_IF_NEEDED(env, skeyp);
+ }
+ __os_free(env, all_skeys);
+ }
+ return (ret);
+}
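+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * The update path above only runs for a primary with associated
+ * secondaries.  A minimal association that would drive it might look like
+ * this; the record layout and all example_* names are invented for
+ * illustration.
+ */
+struct example_rec {
+	u_int32_t id;			/* Primary key. */
+	char last_name[64];		/* Secondary key. */
+};
+
+static int
+example_getname(sdbp, pkey, pdata, skey)
+	DB *sdbp;
+	const DBT *pkey, *pdata;
+	DBT *skey;
+{
+	struct example_rec *rec;
+
+	COMPQUIET(sdbp, NULL);
+	COMPQUIET(pkey, NULL);
+	rec = pdata->data;
+	memset(skey, 0, sizeof(DBT));
+	skey->data = rec->last_name;
+	skey->size = (u_int32_t)strlen(rec->last_name) + 1;
+	return (0);
+}
+
+/*
+ * An application would then call
+ *	primary->associate(primary, txn, secondary, example_getname, 0);
+ * after which every put through the primary maintains the secondary via
+ * __dbc_put_secondaries and __dbc_put_primary above.
+ */
+#endif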
+
+/*
+ * __dbc_put --
+ * Put using a cursor.
+ *
+ * PUBLIC: int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+ ret = 0;
+ F_CLR(dbc, DBC_ERROR);
+
+ /*
+ * Putting to secondary indices is forbidden; when we need to
+ * internally update one, we're called with a private flag,
+ * DB_UPDATE_SECONDARY, which does the right thing but won't return an
+ * error during flag checking.
+ *
+ * As a convenience, many places that want the default DB_KEYLAST
+ * behavior call DBC->put with flags == 0. Protect lower-level code
+ * here by translating that.
+ *
+ * Lastly, the DB_OVERWRITE_DUP flag is equivalent to DB_KEYLAST unless
+ * there are sorted duplicates. Limit the number of places that need
+ * to test for it explicitly.
+ */
+ if (flags == DB_UPDATE_SECONDARY || flags == 0 ||
+ (flags == DB_OVERWRITE_DUP && !F_ISSET(dbp, DB_AM_DUPSORT)))
+ flags = DB_KEYLAST;
+
+ CDB_LOCKING_INIT(dbc->env, dbc);
+
+ PERFMON6(env, db, put, dbp->fname, dbp->dname,
+ dbc->txn == NULL ? 0 : dbc->txn->txnid, key, data, flags);
+ /*
+ * Check to see if we are a primary and have secondary indices.
+ * If we are not, we save ourselves a good bit of trouble and
+ * just skip to the "normal" put.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ ((ret = __dbc_put_primary(dbc, key, data, flags)) != 0))
+ return (ret);
+
+ /*
+ * If this is an append operation, the insert was done prior to the
+ * secondary updates, so we are finished.
+ */
+ if (flags == DB_APPEND)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ return (__bamc_compress_put(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iput(dbc, key, data, flags));
+}
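+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * An ordinary application cursor put that exercises the routine above:
+ * with secondaries associated it is routed to __dbc_put_primary, otherwise
+ * it falls through to __dbc_iput below.
+ */
+static int
+example_cursor_put(dbc, key, data)
+	DBC *dbc;
+	DBT *key, *data;
+{
+	return (dbc->put(dbc, key, data, DB_KEYFIRST));
+}
+#endif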
+
+/*
+ * __dbc_iput --
+ * Implementation of put using a cursor.
+ *
+ * PUBLIC: int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *oldopd, *opd;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbc_n = NULL;
+ ret = t_ret = 0;
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the put operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ if (dbc->internal->opd != NULL &&
+ (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) {
+ /*
+ * A special case for hash off-page duplicates. Hash doesn't
+ * support (and is documented not to support) put operations
+ * relative to a cursor which references an already deleted
+ * item. For consistency, apply the same criteria to off-page
+ * duplicates as well.
+ */
+ if (dbc->dbtype == DB_HASH && F_ISSET(
+ ((BTREE_CURSOR *)(dbc->internal->opd->internal)),
+ C_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if ((ret = dbc->am_writelock(dbc)) != 0 ||
+ (ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ opd = dbc_n->internal->opd;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * and call the underlying function.
+ */
+ if (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)
+ tmp_flags = DB_POSITION;
+ else
+ tmp_flags = 0;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ pgno = PGNO_INVALID;
+ if ((ret = dbc_n->am_put(dbc_n, key, data, flags, &pgno)) != 0)
+ goto err;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ oldopd = dbc_n->internal->opd;
+ if ((ret = __dbc_newopd(dbc, pgno, oldopd, &opd)) != 0) {
+ dbc_n->internal->opd = opd;
+ goto err;
+ }
+
+ dbc_n->internal->opd = opd;
+ opd->internal->pdbc = dbc_n;
+
+ if (flags == DB_NOOVERWRITE)
+ flags = DB_KEYLAST;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ }
+
+done:
+err: /* Cleanup and cursor resolution. */
+ if (dbc_n != NULL && !DB_RETOK_DBCPUT(ret))
+ F_SET(dbc_n, DBC_ERROR);
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_del_oldskey --
+ * Delete an old secondary key, if necessary.
+ *	Returns DB_KEYEXIST if the new and old keys match.
+ */
+static int
+__dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata)
+ DB *sdbp;
+ DBC *dbc;
+ DBT *skey, *pkey, *olddata;
+{
+ DB *dbp;
+ DBC *sdbc;
+ DBT *toldskeyp, *tskeyp;
+ DBT oldskey, temppkey, tempskey;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t i, noldskey, nsame, nskey, rmw;
+
+ sdbc = NULL;
+ dbp = sdbp->s_primary;
+ env = dbp->env;
+ nsame = 0;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * Get the old secondary key.
+ */
+ memset(&oldskey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp, pkey, olddata, &oldskey)) != 0) {
+ if (ret == DB_DONOTINDEX ||
+ (F_ISSET(&oldskey, DB_DBT_MULTIPLE) && oldskey.size == 0))
+ /* There's no old key to delete. */
+ ret = 0;
+ return (ret);
+ }
+
+ if (F_ISSET(&oldskey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, &oldskey);
+#endif
+ toldskeyp = (DBT *)oldskey.data;
+ noldskey = oldskey.size;
+ } else {
+ toldskeyp = &oldskey;
+ noldskey = 1;
+ }
+
+ if (F_ISSET(skey, DB_DBT_MULTIPLE)) {
+ nskey = skey->size;
+ skey = (DBT *)skey->data;
+ } else
+ nskey = F_ISSET(skey, DB_DBT_ISSET) ? 1 : 0;
+
+ for (; noldskey > 0 && ret == 0; noldskey--, toldskeyp++) {
+ /*
+ * Check whether this old secondary key is also a new key
+ * before we delete it. Note that bt_compare is (and must be)
+ * set no matter what access method we're in.
+ */
+ for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++)
+ if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ toldskeyp, tskeyp) == 0) {
+ nsame++;
+ F_CLR(tskeyp, DB_DBT_ISSET);
+ break;
+ }
+
+ if (i < nskey) {
+ FREE_IF_NEEDED(env, toldskeyp);
+ continue;
+ }
+
+ if (sdbc == NULL) {
+ if ((ret = __db_cursor_int(sdbp,
+ dbc->thread_info, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env,
+ sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+ }
+
+ /*
+ * Don't let c_get(DB_GET_BOTH) stomp on our data. Use
+ * temporary DBTs instead.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+ DB_INIT_DBT(temppkey, pkey->data, pkey->size);
+ DB_INIT_DBT(tempskey, toldskeyp->data, toldskeyp->size);
+ if ((ret = __dbc_get(sdbc,
+ &tempskey, &temppkey, rmw | DB_GET_BOTH)) == 0)
+ ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(dbp);
+ SWAP_IF_NEEDED(sdbp, pkey);
+ FREE_IF_NEEDED(env, toldskeyp);
+ }
+
+err: for (; noldskey > 0; noldskey--, toldskeyp++)
+ FREE_IF_NEEDED(env, toldskeyp);
+ FREE_IF_NEEDED(env, &oldskey);
+ if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0 && nsame == nskey)
+ return (DB_KEYEXIST);
+ return (ret);
+}
+
+/*
+ * __db_duperr()
+ * Error message: we don't currently support sorted duplicate duplicates.
+ * PUBLIC: int __db_duperr __P((DB *, u_int32_t));
+ */
+int
+__db_duperr(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ /*
+ * If we run into this error while updating a secondary index,
+ * don't yell--there's no clean way to pass DB_NODUPDATA in along
+ * with DB_UPDATE_SECONDARY, but we may run into this problem
+ * in a normal, non-error course of events.
+ *
+ * !!!
+ * If and when we ever permit duplicate duplicates in sorted-dup
+ * databases, we need to either change the secondary index code
+ * to check for dup dups, or we need to maintain the implicit
+ * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set.
+ */
+ if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY))
+ __db_errx(dbp->env, DB_STR("0696",
+ "Duplicate data items are not supported with sorted data"));
+ return (DB_KEYEXIST);
+}
+
+/*
+ * __dbc_cleanup --
+ * Clean up duplicate cursors.
+ *
+ * PUBLIC: int __dbc_cleanup __P((DBC *, DBC *, int));
+ */
+int
+__dbc_cleanup(dbc, dbc_n, failed)
+ DBC *dbc, *dbc_n;
+ int failed;
+{
+ DB *dbp;
+ DBC *opd;
+ DBC_INTERNAL *internal;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ internal = dbc->internal;
+ ret = 0;
+
+ /* Discard any pages we're holding. */
+ if (internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ internal->page = NULL;
+ }
+ opd = internal->opd;
+ if (opd != NULL && opd->internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ opd->internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ opd->internal->page = NULL;
+ }
+
+ /*
+ * If dbc_n is NULL, there's no internal cursor swapping to be done
+ * and no dbc_n to close--we probably did the entire operation on an
+ * offpage duplicate cursor. Just return.
+ *
+ * If dbc and dbc_n are the same, we're either inside a DB->{put/get}
+ * operation, and as an optimization we performed the operation on
+ * the main cursor rather than on a duplicated one, or we're in a
+ * bulk get that can't have moved the cursor (DB_MULTIPLE with the
+ * initial c_get operation on an off-page dup cursor). Just
+ * return--either we know we didn't move the cursor, or we're going
+ * to close it before we return to application code, so we're sure
+ * not to visibly violate the "cursor stays put on error" rule.
+ */
+ if (dbc_n == NULL || dbc == dbc_n)
+ goto done;
+
+ if (dbc_n->internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ dbc_n->internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ dbc_n->internal->page = NULL;
+ }
+ opd = dbc_n->internal->opd;
+ if (opd != NULL && opd->internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ opd->internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ opd->internal->page = NULL;
+ }
+
+ /*
+ * If we didn't fail before entering this routine or just now when
+ * freeing pages, swap the interesting contents of the old and new
+ * cursors.
+ */
+ if (!failed && ret == 0) {
+ if (opd != NULL)
+ opd->internal->pdbc = dbc;
+ if (internal->opd != NULL)
+ internal->opd->internal->pdbc = dbc_n;
+ dbc->internal = dbc_n->internal;
+ dbc_n->internal = internal;
+ }
+
+ /*
+ * Close the cursor we don't care about anymore. The close can fail,
+ * but we only expect DB_LOCK_DEADLOCK failures. This violates our
+ * "the cursor is unchanged on error" semantics, but since all you can
+ * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe
+ * that's OK.
+ *
+ * XXX
+ * There's no way to recover from failure to close the old cursor.
+ * All we can do is move to the new position and return an error.
+ *
+ * XXX
+ * We might want to consider adding a flag to the cursor, so that any
+ * subsequent operations other than close just return an error?
+ */
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If this was an update that is supporting dirty reads
+ * then we may have just swapped our read for a write lock
+ * which is held by the surviving cursor. We need
+ * to explicitly downgrade this lock. The closed cursor
+ * may only have had a read lock.
+ */
+ if (ret == 0 && failed == 0 && F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ dbc->internal->lock_mode == DB_LOCK_WRITE &&
+ (ret = __TLPUT(dbc, dbc->internal->lock)) == 0)
+ dbc->internal->lock_mode = DB_LOCK_WWRITE;
+
+done:
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
+
+/*
+ * __dbc_secondary_get_pp --
+ * This wrapper function for DBC->pget() is the DBC->get() function
+ * for a secondary index cursor.
+ *
+ * PUBLIC: int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_secondary_get_pp(dbc, skey, data, flags)
+ DBC *dbc;
+ DBT *skey, *data;
+ u_int32_t flags;
+{
+ DB_ASSERT(dbc->env, F_ISSET(dbc->dbp, DB_AM_SECONDARY));
+ return (__dbc_pget_pp(dbc, skey, NULL, data, flags));
+}
+
+/*
+ * __dbc_pget --
+ * Get a primary key/data pair through a secondary index.
+ *
+ * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget(dbc, skey, pkey, data, flags)
+ DBC *dbc;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB *pdbp, *sdbp;
+ DBC *dbc_n, *pdbc;
+ DBT nullpkey, *save_data;
+ u_int32_t save_pkey_flags, tmp_flags, tmp_read_locking, tmp_rmw;
+ int pkeymalloc, ret, t_ret;
+
+ sdbp = dbc->dbp;
+ pdbp = sdbp->s_primary;
+ dbc_n = NULL;
+ save_data = NULL;
+ pkeymalloc = t_ret = 0;
+
+ /*
+ * The challenging part of this function is getting the behavior
+ * right for all the various permutations of DBT flags. The
+ * next several blocks handle the various cases we need to
+ * deal with specially.
+ */
+
+ /*
+ * We may be called with a NULL pkey argument, if we've been
+ * wrapped by a 2-DBT get call. If so, we need to use our
+ * own DBT.
+ */
+ if (pkey == NULL) {
+ memset(&nullpkey, 0, sizeof(DBT));
+ pkey = &nullpkey;
+ }
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ tmp_rmw = LF_ISSET(DB_RMW);
+ LF_CLR(DB_RMW);
+
+ SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+ /*
+ * DB_GET_RECNO is a special case, because we're interested not in
+ * the primary key/data pair, but rather in the primary's record
+ * number.
+ */
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc, DBC_RMW);
+ F_SET(dbc, tmp_read_locking);
+ ret = __dbc_pget_recno(dbc, pkey, data, flags);
+ if (tmp_rmw)
+ F_CLR(dbc, DBC_RMW);
+ /* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+ F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+ return (ret);
+ }
+
+ /*
+ * If the DBTs we've been passed don't have any of the
+ * user-specified memory management flags set, we want to make sure
+ * we return values using the DBTs dbc->rskey, dbc->rkey, and
+ * dbc->rdata, respectively.
+ *
+ * There are two tricky aspects to this: first, we need to pass
+ * skey and pkey *in* to the initial c_get on the secondary key,
+ * since either or both may be looked at by it (depending on the
+ * get flag). Second, we must not use a normal DB->get call
+ * on the secondary, even though that's what we want to accomplish,
+ * because the DB handle may be free-threaded. Instead,
+ * we open a cursor, then take steps to ensure that we actually use
+ * the rkey/rdata from the *secondary* cursor.
+ *
+ * We accomplish all this by passing in the DBTs we started out
+ * with to the c_get, but swapping the contents of rskey and rkey,
+ * respectively, into rkey and rdata; __db_ret will treat them like
+ * the normal key/data pair in a c_get call, and will realloc them as
+ * need be (this is "step 1"). Then, for "step 2", we swap back
+ * rskey/rkey/rdata to normal, and do a get on the primary with the
+ * secondary dbc appointed as the owner of the returned-data memory.
+ *
+ * Note that in step 2, we copy the flags field in case we need to
+ * pass down a DB_DBT_PARTIAL or other flag that is compatible with
+ * letting DB do the memory management.
+ */
+
+ /*
+ * It is correct, though slightly sick, to attempt a partial get of a
+ * primary key. However, if we do so here, we'll never find the
+ * primary record; clear the DB_DBT_PARTIAL field of pkey just for the
+ * duration of the next call.
+ */
+ save_pkey_flags = pkey->flags;
+ F_CLR(pkey, DB_DBT_PARTIAL);
+
+ /*
+ * Now we can go ahead with the meat of this call. First, get the
+ * primary key from the secondary index. (What exactly we get depends
+ * on the flags, but the underlying cursor get will take care of the
+ * dirty work.) Duplicate the cursor, in case the later get on the
+ * primary fails.
+ */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ tmp_flags = 0;
+ break;
+ }
+
+ if (dbc->internal->opd != NULL ||
+ F_ISSET(dbc, DBC_PARTITIONED | DBC_TRANSIENT)) {
+ dbc_n = dbc;
+ save_data = dbc_n->rdata;
+ } else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+ dbc_n->rdata = dbc->rkey;
+ dbc_n->rkey = dbc->rskey;
+
+ if (tmp_rmw)
+ F_SET(dbc_n, DBC_RMW);
+ F_SET(dbc_n, tmp_read_locking);
+
+ /*
+ * If we've been handed a primary key, it will be in native byte order,
+ * so we need to swap it before reading from the secondary.
+ */
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+ flags == DB_GET_BOTH_RANGE)
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+retry: /* Step 1. */
+ ret = __dbc_get(dbc_n, skey, pkey, flags);
+ /* Restore pkey's flags in case we stomped the PARTIAL flag. */
+ pkey->flags = save_pkey_flags;
+
+ /*
+ * We need to swap the primary key to native byte order if we read it
+ * successfully, or if we swapped it on entry above. We can't return
+ * with the application's data modified.
+ */
+ if (ret == 0 || flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+ flags == DB_GET_BOTH_RANGE)
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now we're ready for "step 2". If either or both of pkey and data do
+ * not have memory management flags set--that is, if DB is managing
+ * their memory--we need to swap around the rkey/rdata structures so
+ * that we don't wind up trying to use memory managed by the primary
+ * database cursor, which we'll close before we return.
+ *
+ * !!!
+ * If you're carefully following the bouncing ball, you'll note that in
+ * the DB-managed case, the buffer hanging off of pkey is the same as
+ * dbc->rkey->data. This is just fine; we may well realloc and stomp
+ * on it when we return, if we're doing a DB_GET_BOTH and need to
+ * return a different partial or key (depending on the comparison
+ * function), but this is safe.
+ *
+ * !!!
+ * We need to use __db_cursor_int here rather than simply calling
+ * pdbp->cursor, because otherwise, if we're in CDB, we'll allocate a
+ * new locker ID and leave ourselves open to deadlocks. (Even though
+ * we're only acquiring read locks, we'll still block if there are any
+ * waiters.)
+ */
+ if ((ret = __db_cursor_int(pdbp, dbc->thread_info,
+ dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+ goto err;
+
+ F_SET(pdbc, tmp_read_locking |
+ F_ISSET(dbc, DBC_READ_UNCOMMITTED | DBC_READ_COMMITTED | DBC_RMW));
+
+ /*
+ * We're about to use pkey a second time. If DB_DBT_MALLOC is set on
+ * it, we'll leak the memory we allocated the first time. Thus, set
+ * DB_DBT_REALLOC instead so that we reuse that memory instead of
+ * leaking it.
+ *
+ * Alternatively, if the application is handling copying for pkey, we
+ * need to take a copy now. The copy will be freed on exit from
+ * __dbc_pget_pp (and we must be coming through there if DB_DBT_USERCOPY
+ * is set). In the case of DB_GET_BOTH_RANGE, the pkey supplied by
+ * the application has already been copied in but the value may have
+ * changed in the search. In that case, free the original copy and get
+ * a new one.
+ *
+ * !!!
+ * This assumes that the user must always specify a compatible realloc
+ * function if a malloc function is specified. I think this is a
+ * reasonable requirement.
+ */
+ if (F_ISSET(pkey, DB_DBT_MALLOC)) {
+ F_CLR(pkey, DB_DBT_MALLOC);
+ F_SET(pkey, DB_DBT_REALLOC);
+ pkeymalloc = 1;
+ } else if (F_ISSET(pkey, DB_DBT_USERCOPY)) {
+ if (flags == DB_GET_BOTH_RANGE)
+ __dbt_userfree(sdbp->env, NULL, pkey, NULL);
+ if ((ret = __dbt_usercopy(sdbp->env, pkey)) != 0)
+ goto err;
+ }
+
+ /*
+ * Do the actual get. Set DBC_TRANSIENT since we don't care about
+ * preserving the position on error, and it's faster. SET_RET_MEM so
+ * that the secondary DBC owns any returned-data memory.
+ */
+ F_SET(pdbc, DBC_TRANSIENT);
+ SET_RET_MEM(pdbc, dbc);
+ ret = __dbc_get(pdbc, pkey, data, DB_SET);
+ DB_ASSERT(pdbp->env, ret != DB_PAGE_NOTFOUND);
+
+	/* Now close the primary cursor. */
+	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	/*
+	 * If the item wasn't found in the primary, this is a bug; our
+	 * secondary has somehow gotten corrupted, and contains elements that
+	 * don't correspond to anything in the primary.  Complain, unless a
+	 * racing update under a dirty read may explain it, in which case
+	 * retry.
+	 */
+	else if (ret == DB_NOTFOUND) {
+ if (!F_ISSET(dbc, DBC_READ_UNCOMMITTED))
+ ret = __db_secondary_corrupt(pdbp);
+ else switch (flags) {
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ PERFMON5(pdbp->env, race, dbc_get,
+ sdbp->fname, sdbp->dname, ret, flags, pkey);
+ goto retry;
+ default:
+ break;
+ }
+ }
+
+err: /* Cleanup and cursor resolution. */
+ if (dbc_n == dbc) {
+ dbc_n->rkey = dbc_n->rdata;
+ dbc_n->rdata = save_data;
+ }
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pkeymalloc) {
+ /*
+ * If pkey had a MALLOC flag, we need to restore it; otherwise,
+ * if the user frees the buffer but reuses the DBT without
+ * NULL'ing its data field or changing the flags, we may drop
+ * core.
+ */
+ F_CLR(pkey, DB_DBT_REALLOC);
+ F_SET(pkey, DB_DBT_MALLOC);
+ }
+
+ return (ret);
+}
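+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * Hypothetical application use of the pget path above: iterate a secondary
+ * index, retrieving the secondary key, the primary key and the primary
+ * data together.  All example_* names are invented for illustration.
+ */
+static int
+example_pget_scan(sdbp, txn)
+	DB *sdbp;
+	DB_TXN *txn;
+{
+	DBC *sdbc;
+	DBT data, pkey, skey;
+	int ret, t_ret;
+
+	if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0)
+		return (ret);
+	memset(&skey, 0, sizeof(DBT));
+	memset(&pkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	while ((ret = sdbc->pget(sdbc, &skey, &pkey, &data, DB_NEXT)) == 0) {
+		/* skey, pkey and data are all valid here. */
+	}
+	if (ret == DB_NOTFOUND)			/* End of the index. */
+		ret = 0;
+	if ((t_ret = sdbc->close(sdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+#endif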
+
+/*
+ * __dbc_pget_recno --
+ * Perform a DB_GET_RECNO c_pget on a secondary index. Returns
+ * the secondary's record number in the pkey field and the primary's
+ * in the data field.
+ */
+static int
+__dbc_pget_recno(sdbc, pkey, data, flags)
+ DBC *sdbc;
+ DBT *pkey, *data;
+ u_int32_t flags;
+{
+ DB *pdbp, *sdbp;
+ DBC *pdbc;
+ DBT discardme, primary_key;
+ ENV *env;
+ db_recno_t oob;
+ u_int32_t rmw;
+ int ret, t_ret;
+
+ sdbp = sdbc->dbp;
+ pdbp = sdbp->s_primary;
+ env = sdbp->env;
+ pdbc = NULL;
+ ret = t_ret = 0;
+
+ rmw = LF_ISSET(DB_RMW);
+
+ memset(&discardme, 0, sizeof(DBT));
+ F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ oob = RECNO_OOB;
+
+ /*
+ * If the primary is an rbtree, we want its record number, whether
+ * or not the secondary is one too. Fetch the recno into "data".
+ *
+ * If it's not an rbtree, return RECNO_OOB in "data".
+ */
+ if (F_ISSET(pdbp, DB_AM_RECNUM)) {
+ /*
+ * Get the primary key, so we can find the record number
+ * in the primary. (We're uninterested in the secondary key.)
+ */
+ memset(&primary_key, 0, sizeof(DBT));
+ F_SET(&primary_key, DB_DBT_MALLOC);
+ if ((ret = __dbc_get(sdbc,
+ &discardme, &primary_key, rmw | DB_CURRENT)) != 0)
+ return (ret);
+
+ /*
+ * Open a cursor on the primary, set it to the right record,
+ * and fetch its recno into "data".
+ *
+ * (See __dbc_pget for comments on the use of __db_cursor_int.)
+ *
+ * SET_RET_MEM so that the secondary DBC owns any returned-data
+ * memory.
+ */
+ if ((ret = __db_cursor_int(pdbp, sdbc->thread_info, sdbc->txn,
+ pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+ goto perr;
+ SET_RET_MEM(pdbc, sdbc);
+ if ((ret = __dbc_get(pdbc,
+ &primary_key, &discardme, rmw | DB_SET)) != 0)
+ goto perr;
+
+ ret = __dbc_get(pdbc, &discardme, data, rmw | DB_GET_RECNO);
+
+perr: __os_ufree(env, primary_key.data);
+ if (pdbc != NULL &&
+ (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ } else if ((ret = __db_retcopy(env, data, &oob,
+ sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0)
+ return (ret);
+
+ /*
+ * If the secondary is an rbtree, we want its record number, whether
+ * or not the primary is one too. Fetch the recno into "pkey".
+ *
+ * If it's not an rbtree, return RECNO_OOB in "pkey".
+ */
+ if (F_ISSET(sdbp, DB_AM_RECNUM))
+ return (__dbc_get(sdbc, &discardme, pkey, flags));
+ else
+ return (__db_retcopy(env, pkey, &oob,
+ sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen));
+}
+
+/*
+ * __db_wrlock_err -- do not have a write lock.
+ */
+static int
+__db_wrlock_err(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0697", "Write attempted on read-only cursor"));
+ return (EPERM);
+}
+
+/*
+ * __dbc_del_secondary --
+ * Perform a delete operation on a secondary index: call through
+ * to the primary and delete the primary record that this record
+ * points to.
+ *
+ * Note that deleting the primary record will call c_del on all
+ * the secondaries, including this one; thus, it is not necessary
+ * to execute both this function and an actual delete.
+ */
+static int
+__dbc_del_secondary(dbc)
+ DBC *dbc;
+{
+ DB *pdbp;
+ DBC *pdbc;
+ DBT skey, pkey;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t rmw;
+
+ pdbp = dbc->dbp->s_primary;
+ env = pdbp->env;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * Get the current item that we're pointing at.
+ * We don't actually care about the secondary key, just
+ * the primary.
+ */
+ memset(&skey, 0, sizeof(DBT));
+ memset(&pkey, 0, sizeof(DBT));
+ F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(dbc, &skey, &pkey, DB_CURRENT)) != 0)
+ return (ret);
+
+ SWAP_IF_NEEDED(dbc->dbp, &pkey);
+ DEBUG_LWRITE(dbc, dbc->txn, "del_secondary", &skey, &pkey, 0);
+
+ /*
+ * Create a cursor on the primary with our locker ID,
+ * so that when it calls back, we don't conflict.
+ *
+ * We create a cursor explicitly because there's no
+ * way to specify the same locker ID if we're using
+ * locking but not transactions if we use the DB->del
+ * interface. This shouldn't be any less efficient
+ * anyway.
+ */
+ if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn,
+ pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+ return (ret);
+
+ /*
+ * See comment in __dbc_put--if we're in CDB,
+ * we already hold the locks we need, and we need to flag
+ * the cursor as a WRITER so we don't run into errors
+ * when we try to delete.
+ */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID);
+ F_SET(pdbc, DBC_WRITER);
+ }
+
+ /*
+ * Set the new cursor to the correct primary key. Then
+ * delete it. We don't really care about the datum;
+ * just reuse our skey DBT.
+ *
+ * If the primary get returns DB_NOTFOUND, something is amiss--
+ * every record in the secondary should correspond to some record
+ * in the primary.
+ */
+ if ((ret = __dbc_get(pdbc, &pkey, &skey, DB_SET | rmw)) == 0)
+ ret = __dbc_del(pdbc, 0);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(pdbp);
+
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
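+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * Hypothetical application-level counterpart of the above: position a
+ * cursor in a secondary index and delete.  The delete is redirected to
+ * the primary, which in turn removes the entries in every associated
+ * secondary, including this one.
+ */
+static int
+example_delete_via_secondary(sdbp, txn, skey)
+	DB *sdbp;
+	DB_TXN *txn;
+	DBT *skey;
+{
+	DBC *sdbc;
+	DBT data;
+	int ret, t_ret;
+
+	if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0)
+		return (ret);
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = sdbc->get(sdbc, skey, &data, DB_SET)) == 0)
+		ret = sdbc->del(sdbc, 0);
+	if ((t_ret = sdbc->close(sdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+#endif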
+
+/*
+ * __dbc_del_primary --
+ * Perform a delete operation on a primary index. Loop through
+ * all the secondary indices which correspond to this primary
+ * database, and delete any secondary keys that point at the current
+ * record.
+ *
+ * PUBLIC: int __dbc_del_primary __P((DBC *));
+ */
+int
+__dbc_del_primary(dbc)
+ DBC *dbc;
+{
+ DB *dbp, *sdbp;
+ DBC *sdbc;
+ DBT *tskeyp;
+ DBT data, pkey, skey, temppkey, tempskey;
+ ENV *env;
+ u_int32_t nskey, rmw;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ sdbp = NULL;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * If we're called at all, we have at least one secondary.
+ * (Unfortunately, we can't assert this without grabbing the mutex.)
+ * Get the current record so that we can construct appropriate
+ * secondary keys as needed.
+ */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, &pkey, &data, DB_CURRENT)) != 0)
+ return (ret);
+
+ memset(&skey, 0, sizeof(DBT));
+ for (ret = __db_s_first(dbp, &sdbp);
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn)) {
+ /*
+ * Get the secondary key for this secondary and the current
+ * item.
+ */
+ if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) {
+ /* Not indexing is equivalent to an empty key set. */
+ if (ret == DB_DONOTINDEX) {
+ F_SET(&skey, DB_DBT_MULTIPLE);
+ skey.size = 0;
+ } else /* We had a substantive error. Bail. */
+ goto err;
+ }
+
+#ifdef DIAGNOSTIC
+ if (F_ISSET(&skey, DB_DBT_MULTIPLE))
+ __db_check_skeyset(sdbp, &skey);
+#endif
+
+ if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+ tskeyp = (DBT *)skey.data;
+ nskey = skey.size;
+ if (nskey == 0)
+ continue;
+ } else {
+ tskeyp = &skey;
+ nskey = 1;
+ }
+
+ /* Open a secondary cursor. */
+ if ((ret = __db_cursor_int(sdbp,
+ dbc->thread_info, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+ /* See comment above and in __dbc_put. */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ for (; nskey > 0; nskey--, tskeyp++) {
+ /*
+ * Set the secondary cursor to the appropriate item.
+ * Delete it.
+ *
+ * We want to use DB_RMW if locking is on; it's only
+ * legal then, though.
+ *
+ * !!!
+ * Don't stomp on any callback-allocated buffer in skey
+ * when we do a c_get(DB_GET_BOTH); use a temp DBT
+ * instead. Similarly, don't allow pkey to be
+ * invalidated when the cursor is closed.
+ */
+ DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size);
+ SWAP_IF_NEEDED(sdbp, &pkey);
+ DB_INIT_DBT(temppkey, pkey.data, pkey.size);
+ if ((ret = __dbc_get(sdbc, &tempskey, &temppkey,
+ DB_GET_BOTH | rmw)) == 0)
+ ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(dbp);
+ SWAP_IF_NEEDED(sdbp, &pkey);
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * In the common case where there is a single secondary key, we
+ * will have freed any application-allocated data in skey
+ * already. In the multiple key case, we need to free it here.
+ * It is safe to do this twice as the macro resets the data
+ * field.
+ */
+ FREE_IF_NEEDED(env, &skey);
+ }
+
+err: if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+ FREE_IF_NEEDED(env, &skey);
+ return (ret);
+}
+
+/*
+ * __dbc_del_foreign --
+ * Apply the foreign database constraints for a particular foreign
+ * database when an item is being deleted (dbc points at item being deleted
+ * in the foreign database.)
+ *
+ * Delete happens in dbp; check for occurrences of the key in pdbp.
+ * Terminology:
+ * Foreign db = Where delete occurs (dbp).
+ * Secondary db = Where references to dbp occur (sdbp, a secondary)
+ * Primary db = sdbp's primary database, references to dbp are secondary
+ * keys here
+ * Foreign Key = Key being deleted in dbp (fkey)
+ * Primary Key = Key of the corresponding entry in sdbp's primary (pkey).
+ */
+static int
+__dbc_del_foreign(dbc)
+ DBC *dbc;
+{
+ DB_FOREIGN_INFO *f_info;
+ DB *dbp, *pdbp, *sdbp;
+ DBC *pdbc, *sdbc;
+ DBT data, fkey, pkey;
+ ENV *env;
+ u_int32_t flags, rmw;
+ int changed, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ memset(&fkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, &fkey, &data, DB_CURRENT)) != 0)
+ return (ret);
+
+ LIST_FOREACH(f_info, &(dbp->f_primaries), f_links) {
+ sdbp = f_info->dbp;
+ pdbp = sdbp->s_primary;
+ flags = f_info->flags;
+
+ rmw = (STD_LOCKING(dbc) &&
+ !LF_ISSET(DB_FOREIGN_ABORT)) ? DB_RMW : 0;
+
+ /*
+ * Handle CDB locking. Some of this is copied from
+ * __dbc_del_primary, but a bit more acrobatics are required.
+ * If we're not going to abort, then we need to get a write
+ * cursor. If CDB_ALLDB is set, then only one write cursor is
+ * allowed and we hold it, so we fudge things and promote the
+		 * cursor on the other DBs manually; it won't cause a problem.
+ * If CDB_ALLDB is not set, then we go through the usual route
+ * to make sure we block as necessary. If there are any open
+ * read cursors on sdbp, the delete or put call later will
+ * block.
+ *
+ * If NULLIFY is set, we'll need a cursor on the primary to
+ * update it with the nullified data. Because primary and
+ * secondary dbs share a lock file ID in CDB, we open a cursor
+ * on the secondary and then get another writable cursor on the
+ * primary via __db_cursor_int to avoid deadlocking.
+ */
+ sdbc = pdbc = NULL;
+ if (!LF_ISSET(DB_FOREIGN_ABORT) && CDB_LOCKING(env) &&
+ !F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+ ret = __db_cursor(sdbp,
+ dbc->thread_info, dbc->txn, &sdbc, DB_WRITECURSOR);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) {
+ ret = __db_cursor_int(pdbp,
+ dbc->thread_info, dbc->txn, pdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &pdbc);
+ F_SET(pdbc, DBC_WRITER);
+ }
+ } else {
+ ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+ sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0)
+ ret = __db_cursor_int(pdbp, dbc->thread_info,
+ dbc->txn, pdbp->type, PGNO_INVALID, 0,
+ dbc->locker, &pdbc);
+ }
+ if (ret != 0) {
+ if (sdbc != NULL)
+ (void)__dbc_close(sdbc);
+ return (ret);
+ }
+ if (CDB_LOCKING(env) && F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && pdbc != NULL) {
+ DB_ASSERT(env,
+ pdbc->mylock.off == LOCK_INVALID);
+ F_SET(pdbc, DBC_WRITER);
+ }
+ }
+
+ /*
+		 * There are three possible actions when records in the
+		 * associated databases correspond to the deleted foreign key:
+		 *	DB_FOREIGN_ABORT - The delete operation should be aborted.
+		 *	DB_FOREIGN_CASCADE - All corresponding foreign items should
+		 *	    be deleted.
+		 *	DB_FOREIGN_NULLIFY - A callback needs to be made, allowing
+		 *	    the application to modify the data DBT from the
+		 *	    associated database.  If the callback makes a
+		 *	    modification, the updated item replaces the
+		 *	    original item in that database.
+ */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_SET|rmw);
+
+ if (ret == DB_NOTFOUND) {
+ /* No entry means no constraint */
+ ret = __dbc_close(sdbc);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+ (t_ret = __dbc_close(pdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ continue;
+ } else if (ret != 0) {
+ /* Just return the error code from the pget */
+ (void)__dbc_close(sdbc);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY))
+ (void)__dbc_close(pdbc);
+ return (ret);
+ } else if (LF_ISSET(DB_FOREIGN_ABORT)) {
+ /* If the record exists and ABORT is set, we're done */
+ if ((ret = __dbc_close(sdbc)) != 0)
+ return (ret);
+ return (DB_FOREIGN_CONFLICT);
+ }
+
+ /*
+ * There were matching items in the primary DB, and the action
+ * is either DB_FOREIGN_CASCADE or DB_FOREIGN_NULLIFY.
+ */
+ while (ret == 0) {
+ if (LF_ISSET(DB_FOREIGN_CASCADE)) {
+ /*
+ * Don't use the DB_UPDATE_SECONDARY flag,
+ * since we want the delete to cascade into the
+ * secondary's primary.
+ */
+ if ((ret = __dbc_del(sdbc, 0)) != 0) {
+ __db_err(env, ret, DB_STR("0698",
+ "Attempt to execute cascading delete in a foreign index failed"));
+ break;
+ }
+ } else if (LF_ISSET(DB_FOREIGN_NULLIFY)) {
+ changed = 0;
+ if ((ret = f_info->callback(sdbp,
+ &pkey, &data, &fkey, &changed)) != 0) {
+ __db_err(env, ret, DB_STR("0699",
+ "Foreign database application callback"));
+ break;
+ }
+
+				/*
+				 * If the user callback modified the DBT,
+				 * write the nullified record back to the
+				 * primary and report any failure of that
+				 * put.
+				 */
+ if (changed && (ret = __dbc_put(pdbc,
+ &pkey, &data, DB_KEYFIRST)) != 0) {
+ __db_err(env, ret, DB_STR("0700",
+"Attempt to overwrite item in foreign database with nullified value failed"));
+ break;
+ }
+ }
+			/* Retrieve the next matching item from the primary. */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ ret = __dbc_pget(sdbc,
+ &fkey, &pkey, &data, DB_NEXT_DUP|rmw);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+ (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (ret);
+}
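+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * Hypothetical setup for the constraint enforced above: fdbp is the
+ * foreign database, sdbp a secondary whose keys must exist in fdbp.  With
+ * DB_FOREIGN_CASCADE no callback is needed; deleting a key from fdbp then
+ * deletes every primary record whose secondary key matches it.
+ */
+static int
+example_foreign_setup(fdbp, sdbp)
+	DB *fdbp, *sdbp;
+{
+	return (fdbp->associate_foreign(fdbp, sdbp, NULL, DB_FOREIGN_CASCADE));
+}
+#endif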
+
+/*
+ * __db_s_first --
+ * Get the first secondary, if any are present, from the primary.
+ *
+ * PUBLIC: int __db_s_first __P((DB *, DB **));
+ */
+int
+__db_s_first(pdbp, sdbpp)
+ DB *pdbp, **sdbpp;
+{
+ DB *sdbp;
+
+ MUTEX_LOCK(pdbp->env, pdbp->mutex);
+ sdbp = LIST_FIRST(&pdbp->s_secondaries);
+
+ /* See __db_s_next. */
+ if (sdbp != NULL)
+ sdbp->s_refcnt++;
+ MUTEX_UNLOCK(pdbp->env, pdbp->mutex);
+
+ *sdbpp = sdbp;
+
+ return (0);
+}
+
+/*
+ * __db_s_next --
+ * Get the next secondary in the list.
+ *
+ * PUBLIC: int __db_s_next __P((DB **, DB_TXN *));
+ */
+int
+__db_s_next(sdbpp, txn)
+ DB **sdbpp;
+ DB_TXN *txn;
+{
+ DB *sdbp, *pdbp, *closeme;
+ ENV *env;
+ int ret;
+
+ /*
+ * Secondary indices are kept in a linked list, s_secondaries,
+ * off each primary DB handle. If a primary is free-threaded,
+ * this list may only be traversed or modified while the primary's
+ * thread mutex is held.
+ *
+ * The tricky part is that we don't want to hold the thread mutex
+ * across the full set of secondary puts necessary for each primary
+ * put, or we'll wind up essentially single-threading all the puts
+ * to the handle; the secondary puts will each take about as
+ * long as the primary does, and may require I/O. So we instead
+ * hold the thread mutex only long enough to follow one link to the
+ * next secondary, and then we release it before performing the
+ * actual secondary put.
+ *
+ * The only danger here is that we might legitimately close a
+ * secondary index in one thread while another thread is performing
+ * a put and trying to update that same secondary index. To
+ * prevent this from happening, we refcount the secondary handles.
+ * If close is called on a secondary index handle while we're putting
+ * to it, it won't really be closed--the refcount will simply drop,
+ * and we'll be responsible for closing it here.
+ */
+ sdbp = *sdbpp;
+ pdbp = sdbp->s_primary;
+ env = pdbp->env;
+ closeme = NULL;
+
+ MUTEX_LOCK(env, pdbp->mutex);
+ DB_ASSERT(env, sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ closeme = sdbp;
+ }
+ sdbp = LIST_NEXT(sdbp, s_links);
+ if (sdbp != NULL)
+ sdbp->s_refcnt++;
+ MUTEX_UNLOCK(env, pdbp->mutex);
+
+ *sdbpp = sdbp;
+
+ /*
+ * closeme->close() is a wrapper; call __db_close explicitly.
+ */
+ if (closeme == NULL)
+ ret = 0;
+ else
+ ret = __db_close(closeme, txn, 0);
+
+ return (ret);
+}
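+
+/*
+ * (Editor's note) The canonical traversal, used repeatedly in this file:
+ *
+ *	for (ret = __db_s_first(pdbp, &sdbp);
+ *	    sdbp != NULL && ret == 0;
+ *	    ret = __db_s_next(&sdbp, txn))
+ *		...
+ *
+ * with __db_s_done called on any early exit so the refcount is released.
+ */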
+
+/*
+ * __db_s_done --
+ * Properly decrement the refcount on a secondary database handle we're
+ * using, without calling __db_s_next.
+ *
+ * PUBLIC: int __db_s_done __P((DB *, DB_TXN *));
+ */
+int
+__db_s_done(sdbp, txn)
+ DB *sdbp;
+ DB_TXN *txn;
+{
+ DB *pdbp;
+ ENV *env;
+ int doclose, ret;
+
+ pdbp = sdbp->s_primary;
+ env = pdbp->env;
+ doclose = 0;
+
+ MUTEX_LOCK(env, pdbp->mutex);
+ DB_ASSERT(env, sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ doclose = 1;
+ }
+ MUTEX_UNLOCK(env, pdbp->mutex);
+
+ if (doclose == 0)
+ ret = 0;
+ else
+ ret = __db_close(sdbp, txn, 0);
+ return (ret);
+}
+
+/*
+ * __db_s_count --
+ * Count the number of secondaries associated with a given primary.
+ */
+static int
+__db_s_count(pdbp)
+ DB *pdbp;
+{
+ DB *sdbp;
+ ENV *env;
+ int count;
+
+ env = pdbp->env;
+ count = 0;
+
+ MUTEX_LOCK(env, pdbp->mutex);
+ for (sdbp = LIST_FIRST(&pdbp->s_secondaries);
+ sdbp != NULL;
+ sdbp = LIST_NEXT(sdbp, s_links))
+ ++count;
+ MUTEX_UNLOCK(env, pdbp->mutex);
+
+ return (count);
+}
+
+/*
+ * __db_buildpartial --
+ * Build the record that will result after a partial put is applied to
+ * an existing record.
+ *
+ * This should probably be merged with __bam_build, but that requires
+ * a little trickery if we plan to keep the overflow-record optimization
+ * in that function.
+ *
+ * PUBLIC: int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
+ */
+int
+__db_buildpartial(dbp, oldrec, partial, newrec)
+ DB *dbp;
+ DBT *oldrec, *partial, *newrec;
+{
+ ENV *env;
+ u_int32_t len, nbytes;
+ u_int8_t *buf;
+ int ret;
+
+ env = dbp->env;
+
+ DB_ASSERT(env, F_ISSET(partial, DB_DBT_PARTIAL));
+
+ memset(newrec, 0, sizeof(DBT));
+
+ nbytes = __db_partsize(oldrec->size, partial);
+ newrec->size = nbytes;
+
+ if ((ret = __os_malloc(env, nbytes, &buf)) != 0)
+ return (ret);
+ newrec->data = buf;
+
+	/* NUL out or pad the buffer for any part that isn't specified. */
+ memset(buf,
+ F_ISSET(dbp, DB_AM_FIXEDLEN) ? ((BTREE *)dbp->bt_internal)->re_pad :
+ 0, nbytes);
+
+ /* Copy in any leading data from the original record. */
+ memcpy(buf, oldrec->data,
+ partial->doff > oldrec->size ? oldrec->size : partial->doff);
+
+ /* Copy the data from partial. */
+ memcpy(buf + partial->doff, partial->data, partial->size);
+
+ /* Copy any trailing data from the original record. */
+ len = partial->doff + partial->dlen;
+ if (oldrec->size > len)
+ memcpy(buf + partial->doff + partial->size,
+ (u_int8_t *)oldrec->data + len, oldrec->size - len);
+
+ return (0);
+}
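+
+/*
+ * (Editor's note) A worked example: with oldrec = "ABCDEFGH" (size 8) and
+ * a partial DBT where doff = 2, dlen = 3, size = 2, data = "xy", the new
+ * size is 8 - 3 + 2 = 7; "AB" is copied in, "xy" lands at offset 2, and
+ * the trailing bytes from old offset 5 ("FGH") follow, yielding "ABxyFGH".
+ */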
+
+/*
+ * __db_partsize --
+ * Given the number of bytes in an existing record and a DBT that
+ * is about to be partial-put, calculate the size of the record
+ * after the put.
+ *
+ * This code is called from __bam_partsize.
+ *
+ * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *));
+ */
+u_int32_t
+__db_partsize(nbytes, data)
+ u_int32_t nbytes;
+ DBT *data;
+{
+
+ /*
+ * There are really two cases here:
+ *
+ * Case 1: We are replacing some bytes that do not exist (i.e., they
+ * are past the end of the record). In this case the number of bytes
+ * we are replacing is irrelevant and all we care about is how many
+ * bytes we are going to add from offset. So, the new record length
+ * is going to be the size of the new bytes (size) plus wherever those
+ * new bytes begin (doff).
+ *
+ * Case 2: All the bytes we are replacing exist. Therefore, the new
+ * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
+ * plus the bytes we are adding (size).
+ */
+ if (nbytes < data->doff + data->dlen) /* Case 1 */
+ return (data->doff + data->size);
+
+ return (nbytes + data->size - data->dlen); /* Case 2 */
+}
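+
+/*
+ * (Editor's note) Numeric examples: with nbytes = 10, doff = 20, dlen = 5,
+ * size = 3, the replaced bytes lie past the end of the record, so the new
+ * size is doff + size = 23 (case 1).  With nbytes = 10, doff = 2, dlen = 5,
+ * size = 3, all replaced bytes exist, so the result is 10 - 5 + 3 = 8
+ * (case 2).
+ */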
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_check_skeyset --
+ * Diagnostic check that the application's callback returns a set of
+ * secondary keys without repeats.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __db_check_skeyset __P((DB *, DBT *));
+ * PUBLIC: #endif
+ */
+void
+__db_check_skeyset(sdbp, skeyp)
+ DB *sdbp;
+ DBT *skeyp;
+{
+ DBT *first_key, *last_key, *key1, *key2;
+ ENV *env;
+
+ env = sdbp->env;
+
+ first_key = (DBT *)skeyp->data;
+ last_key = first_key + skeyp->size;
+ for (key1 = first_key; key1 < last_key; key1++)
+ for (key2 = key1 + 1; key2 < last_key; key2++)
+ DB_ASSERT(env,
+ ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ key1, key2) != 0);
+}
+#endif
diff --git a/src/db/db_cds.c b/src/db/db_cds.c
new file mode 100644
index 00000000..185d5487
--- /dev/null
+++ b/src/db/db_cds.c
@@ -0,0 +1,201 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+static int __cdsgroup_abort __P((DB_TXN *txn));
+static int __cdsgroup_commit __P((DB_TXN *txn, u_int32_t flags));
+static int __cdsgroup_discard __P((DB_TXN *txn, u_int32_t flags));
+static u_int32_t __cdsgroup_id __P((DB_TXN *txn));
+static int __cdsgroup_notsup __P((ENV *env, const char *meth));
+static int __cdsgroup_prepare __P((DB_TXN *txn, u_int8_t *gid));
+static int __cdsgroup_get_name __P((DB_TXN *txn, const char **namep));
+static int __cdsgroup_set_name __P((DB_TXN *txn, const char *name));
+static int __cdsgroup_set_timeout
+ __P((DB_TXN *txn, db_timeout_t timeout, u_int32_t flags));
+
+/*
+ * __cdsgroup_notsup --
+ * Error when CDS groups don't support a method.
+ */
+static int
+__cdsgroup_notsup(env, meth)
+ ENV *env;
+ const char *meth;
+{
+ __db_errx(env, DB_STR_A("0687", "CDS groups do not support %s", "%s"),
+ meth);
+ return (DB_OPNOTSUP);
+}
+
+static int
+__cdsgroup_abort(txn)
+ DB_TXN *txn;
+{
+ return (__cdsgroup_notsup(txn->mgrp->env, "abort"));
+}
+
+static int
+__cdsgroup_commit(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ env = txn->mgrp->env;
+
+ /* Check for live cursors. */
+ if (txn->cursors != 0) {
+ __db_errx(env, DB_STR("0688", "CDS group has active cursors"));
+ return (EINVAL);
+ }
+
+ /* We may be holding handle locks; release them. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ ret = __lock_vec(env, txn->locker, 0, &lreq, 1, NULL);
+
+	locker = txn->locker;
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__cdsgroup_discard(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "discard"));
+}
+
+static u_int32_t
+__cdsgroup_id(txn)
+ DB_TXN *txn;
+{
+ return (txn->txnid);
+}
+
+static int
+__cdsgroup_prepare(txn, gid)
+ DB_TXN *txn;
+ u_int8_t *gid;
+{
+ COMPQUIET(gid, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "prepare"));
+}
+
+static int
+__cdsgroup_get_name(txn, namep)
+ DB_TXN *txn;
+ const char **namep;
+{
+ COMPQUIET(namep, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "get_name"));
+}
+
+static int
+__cdsgroup_set_name(txn, name)
+ DB_TXN *txn;
+ const char *name;
+{
+ COMPQUIET(name, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_name"));
+}
+
+static int
+__cdsgroup_set_timeout(txn, timeout, flags)
+ DB_TXN *txn;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_timeout"));
+}
+
+/*
+ * PUBLIC: int __cdsgroup_begin __P((ENV *, DB_TXN **));
+ */
+int
+__cdsgroup_begin(env, txnpp)
+ ENV *env;
+ DB_TXN **txnpp;
+{
+ DB_TXN *txn;
+ int ret;
+
+ *txnpp = txn = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0)
+ goto err;
+ /*
+ * We need a dummy DB_TXNMGR -- it's the only way to get from a
+ * transaction handle to the environment handle.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &txn->mgrp)) != 0)
+ goto err;
+ txn->mgrp->env = env;
+
+ if ((ret = __lock_id(env, &txn->txnid, &txn->locker)) != 0)
+ goto err;
+
+ txn->flags = TXN_FAMILY;
+ txn->abort = __cdsgroup_abort;
+ txn->commit = __cdsgroup_commit;
+ txn->discard = __cdsgroup_discard;
+ txn->id = __cdsgroup_id;
+ txn->prepare = __cdsgroup_prepare;
+ txn->get_name = __cdsgroup_get_name;
+ txn->set_name = __cdsgroup_set_name;
+ txn->set_timeout = __cdsgroup_set_timeout;
+
+ *txnpp = txn;
+
+ if (0) {
+err: if (txn != NULL) {
+ if (txn->mgrp != NULL)
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ }
+ }
+ return (ret);
+}
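+
+/*
+ * Editor's note: the "if (0) { err: ... }" construct above is a
+ * cleanup idiom used throughout this code: the error label is only
+ * reachable by goto, so the success path skips the cleanup without
+ * needing a second return. A minimal sketch of the pattern (the
+ * helper names are placeholders):
+ *
+ *    if ((ret = step1()) != 0)
+ *        goto err;
+ *    if ((ret = step2()) != 0)
+ *        goto err;
+ *    *resultp = result;
+ *    if (0) {
+ *err:      undo_partial_work();
+ *    }
+ *    return (ret);
+ */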
+
+/*
+ * __cdsgroup_begin_pp --
+ * DB_ENV->cdsgroup_begin
+ *
+ * PUBLIC: int __cdsgroup_begin_pp __P((DB_ENV *, DB_TXN **));
+ */
+int
+__cdsgroup_begin_pp(dbenv, txnpp)
+ DB_ENV *dbenv;
+ DB_TXN **txnpp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "cdsgroup_begin");
+ if (!CDB_LOCKING(env))
+ return (__env_not_config(env, "cdsgroup_begin", DB_INIT_CDB));
+
+ ENV_ENTER(env, ip);
+ ret = __cdsgroup_begin(env, txnpp);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
diff --git a/src/db/db_compact.c b/src/db/db_compact.c
new file mode 100644
index 00000000..d0f4801e
--- /dev/null
+++ b/src/db/db_compact.c
@@ -0,0 +1,1087 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+#ifdef HAVE_FTRUNCATE
+static int __db_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *));
+static int __db_setup_freelist __P((DB *, db_pglist_t *, u_int32_t));
+#endif
+
+#define SAVE_START \
+ do { \
+ save_data = *c_data; \
+ ret = __db_retcopy(env, \
+ &save_start, current.data, current.size, \
+ &save_start.data, &save_start.ulen); \
+ } while (0)
+
+/*
+ * Only restore those things that are negated by aborting the
+ * transaction. We don't restore the number of deadlocks, for example.
+ */
+
+#define RESTORE_START \
+ do { \
+ c_data->compact_pages_free = \
+ save_data.compact_pages_free; \
+ c_data->compact_levels = save_data.compact_levels; \
+ c_data->compact_truncate = save_data.compact_truncate; \
+ c_data->compact_empty_buckets = \
+ save_data.compact_empty_buckets; \
+ ret = __db_retcopy(env, &current, \
+ save_start.data, save_start.size, \
+ &current.data, &current.ulen); \
+ } while (0)
+
+/*
+ * __db_compact_int -- compact a database.
+ *
+ * PUBLIC: int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__db_compact_int(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DBC *dbc;
+ DBT current, save_start;
+ DB_COMPACT save_data;
+ DB_TXN *txn_orig;
+ ENV *env;
+ u_int32_t empty_buckets, factor, retry;
+ int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local;
+
+#ifdef HAVE_FTRUNCATE
+ db_pglist_t *list;
+ db_pgno_t last_pgno;
+ u_int32_t nelems, truncated;
+#endif
+
+ env = dbp->env;
+
+ memset(&current, 0, sizeof(current));
+ memset(&save_start, 0, sizeof(save_start));
+ dbc = NULL;
+ factor = 0;
+ have_freelist = deadlock = isdone = span = 0;
+ ret = retry = 0;
+ txn_orig = txn;
+
+#ifdef HAVE_FTRUNCATE
+ list = NULL;
+ last_pgno = 0;
+ nelems = truncated = 0;
+#endif
+
+ /*
+ * We pass "current" to the internal routine, indicating where that
+ * routine should begin its work and expecting that it will return to
+ * us the last key that it processed.
+ */
+ if (start != NULL && (ret = __db_retcopy(env,
+ &current, start->data, start->size,
+ &current.data, &current.ulen)) != 0)
+ return (ret);
+
+ empty_buckets = c_data->compact_empty_buckets;
+
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ txn_local = 1;
+ LF_SET(DB_AUTO_COMMIT);
+ } else
+ txn_local = 0;
+ if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
+ goto no_free;
+ if (LF_ISSET(DB_FREELIST_ONLY))
+ LF_SET(DB_FREE_SPACE);
+
+#ifdef HAVE_FTRUNCATE
+ /* Sort the freelist and set up the in-memory list representation. */
+ if (txn_local && (ret = __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_free_truncate(dbp, ip,
+ txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
+ LF_CLR(DB_FREE_SPACE);
+ goto terr;
+ }
+
+ /* If the freelist is empty and we are not filling, get out. */
+ if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
+ ret = 0;
+ LF_CLR(DB_FREE_SPACE);
+ goto terr;
+ }
+ if ((ret = __db_setup_freelist(dbp, list, nelems)) != 0) {
+ /* Someone else owns the free list. */
+ if (ret == EBUSY)
+ ret = 0;
+ }
+ if (ret == 0)
+ have_freelist = 1;
+
+ /* Commit the txn and release the meta page lock. */
+terr: if (txn_local) {
+ if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ txn = NULL;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Save the number truncated so far, we will add what we get below. */
+ truncated = c_data->compact_pages_truncated;
+ if (LF_ISSET(DB_FREELIST_ONLY))
+ goto done;
+#endif
+
+ /*
+ * We want factor to be the target number of free bytes on each page,
+ * so we know when to stop adding items to a page. Make sure to
+ * subtract the page overhead when computing this target. This can
+ * result in a 1-2% error on the smallest page.
+ * First figure out how many bytes we should use:
+ */
+no_free:
+ factor = dbp->pgsize - SIZEOF_PAGE;
+ if (c_data->compact_fillpercent != 0) {
+ factor *= c_data->compact_fillpercent;
+ factor /= 100;
+ }
+ /* Now convert to the number of free bytes to target. */
+ factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
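+	/*
+	 * Editor's example (illustrative numbers, not in the original
+	 * source): with a 4096-byte page and a SIZEOF_PAGE of 26, usable
+	 * space is 4070 bytes; with compact_fillpercent 90 the fill
+	 * target is 3663, so factor becomes 4070 - 3663 = 407 free bytes
+	 * to leave on each page.
+	 */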
+
+ if (c_data->compact_pages == 0)
+ c_data->compact_pages = DB_MAX_PAGES;
+
+ do {
+ deadlock = 0;
+
+ SAVE_START;
+ if (ret != 0)
+ break;
+
+ if (txn_local) {
+ if ((ret =
+ __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
+ break;
+
+ if (c_data->compact_timeout != 0 &&
+ (ret = __txn_set_timeout(txn,
+ c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH)
+ ret = __ham_compact_int(dbc,
+ &current, stop, factor, c_data, &isdone, flags);
+ else
+#endif
+ ret = __bam_compact_int(dbc, &current, stop, factor,
+ &span, c_data, &isdone);
+ if (ret == DB_LOCK_DEADLOCK && txn_local) {
+ /*
+ * We retry on deadlock. Cancel the statistics
+ * and reset the start point to before this
+ * iteration.
+ */
+ deadlock = 1;
+ c_data->compact_deadlock++;
+ RESTORE_START;
+ }
+ /*
+ * If we could not get a lock while holding an internal
+		 * node latched, commit the current local transaction;
+		 * otherwise report a deadlock.
+ */
+ if (ret == DB_LOCK_NOTGRANTED) {
+ if (txn_local || retry++ < 5)
+ ret = 0;
+ else
+ ret = DB_LOCK_DEADLOCK;
+ } else
+ retry = 0;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if (txn_local && txn != NULL) {
+ if (ret == 0 && deadlock == 0)
+ ret = __txn_commit(txn, DB_TXN_NOSYNC);
+ else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ txn = NULL;
+ }
+ DB_ASSERT(env, ip == NULL || ip->dbth_pincount == 0);
+ } while (ret == 0 && !isdone);
+
+ if (ret == 0 && end != NULL)
+ ret = __db_retcopy(env, end, current.data, current.size,
+ &end->data, &end->ulen);
+ if (current.data != NULL)
+ __os_free(env, current.data);
+ if (save_start.data != NULL)
+ __os_free(env, save_start.data);
+
+#ifdef HAVE_FTRUNCATE
+ /*
+ * Finish up truncation work. If there are pages left in the free
+ * list we can try to move the internal structures around so that we
+ * can remove more pages from the file.
+	 * For BTREE, search the internal nodes of the tree, as we may have
+	 * missed some while walking the leaf nodes.
+	 * For HASH, we compact the hash table itself, moving segments
+	 * to lower-numbered pages where possible.
+ * Then calculate how many pages we have truncated and release
+ * the in-memory free list.
+ */
+done: if (LF_ISSET(DB_FREE_SPACE)) {
+ DBMETA *meta;
+ db_pgno_t pgno;
+
+ pgno = PGNO_BASE_MD;
+ isdone = 1;
+ if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) &&
+ __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) {
+ isdone = meta->free == PGNO_INVALID;
+ ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority);
+ }
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ c_data->compact_empty_buckets -= empty_buckets;
+ if (!isdone || c_data->compact_empty_buckets != 0)
+ ret = __ham_compact_hash(dbp,
+ ip, txn_orig, c_data);
+ c_data->compact_empty_buckets += empty_buckets;
+ } else
+#endif
+ if (!isdone)
+ ret = __bam_truncate_ipages(dbp, ip, txn_orig, c_data);
+
+ /* Clean up the free list. */
+ if (list != NULL)
+ __os_free(env, list);
+
+ if ((t_ret =
+ __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
+ c_data->compact_pages_truncated =
+ truncated + last_pgno - meta->last_pgno;
+ if ((t_ret = __memp_fput(dbp->mpf, ip,
+ meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ } else if (ret == 0)
+ ret = t_ret;
+
+ if (have_freelist && (t_ret =
+ __db_free_freelist(dbp, ip, txn_orig)) != 0 && ret == 0)
+			ret = t_ret;
+ }
+#endif
+
+ return (ret);
+}
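+
+/*
+ * Editor's note: the driver loop above follows a save/try/restore
+ * shape that generalizes as the following pseudocode (a simplified
+ * sketch, not the original control flow in full detail):
+ *
+ *    do {
+ *        save_progress();                -- SAVE_START
+ *        begin_txn();
+ *        ret = compact_one_chunk(&isdone);
+ *        if (ret == DB_LOCK_DEADLOCK) {
+ *            restore_progress();         -- RESTORE_START
+ *            abort_txn();
+ *            ret = 0;                    -- retry the chunk
+ *        } else
+ *            commit_txn();
+ *    } while (ret == 0 && !isdone);
+ */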
+
+#ifdef HAVE_FTRUNCATE
+static int
+__db_setup_freelist(dbp, list, nelems)
+ DB *dbp;
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ DB_MPOOLFILE *mpf;
+ db_pgno_t *plist;
+ int ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
+ return (ret);
+
+ while (nelems-- != 0)
+ *plist++ = list++->pgno;
+
+ return (0);
+}
+
+static int
+__db_free_freelist(dbp, ip, txn)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ DB_LOCK lock;
+ int auto_commit, ret, t_ret;
+
+ LOCK_INIT(lock);
+ auto_commit = ret = 0;
+
+ /*
+ * If we are not in a transaction then we need to get
+ * a lock on the meta page, otherwise we should already
+ * have the lock.
+ */
+
+ dbc = NULL;
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ /*
+ * We must not timeout the lock or we will not free the list.
+ * We ignore errors from txn_begin as there is little that
+ * the application can do with the error and we want to
+ * get the lock and free the list if at all possible.
+ */
+ if (__txn_begin(dbp->env, ip, txn, &txn, 0) == 0) {
+ (void)__lock_set_timeout(dbp->env,
+ txn->locker, 0, DB_SET_TXN_TIMEOUT);
+ (void)__lock_set_timeout(dbp->env,
+ txn->locker, 0, DB_SET_LOCK_TIMEOUT);
+ auto_commit = 1;
+ }
+ /* Get a cursor so we can call __db_lget. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ }
+
+ ret = __memp_free_freelist(dbp->mpf);
+
+err: if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (auto_commit && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+#endif
+
+/*
+ * __db_exchange_page -- swap a page with a lower numbered page.
+ * The routine will optionally free the higher numbered page. The cursor
+ * has a stack which includes at least the immediate parent of this page.
+ * PUBLIC: int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int));
+ */
+int
+__db_exchange_page(dbc, pgp, opg, newpgno, flags)
+ DBC *dbc;
+ PAGE **pgp, *opg;
+ db_pgno_t newpgno;
+ int flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT data, *dp, hdr;
+ DB_LSN lsn;
+ DB_LOCK lock;
+ EPG *epg;
+ PAGE *newpage;
+ db_pgno_t oldpgno, *pgnop;
+ int ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ LOCK_INIT(lock);
+
+ /*
+ * We want to free a page that lives in the part of the file that
+ * can be truncated, so we're going to move it onto a free page
+ * that is in the part of the file that need not be truncated.
+ * In the case of compacting hash table segments the caller already
+	 * identified a contiguous set of pages to use. Otherwise,
+	 * since the freelist is now sorted, we can simply call __db_new,
+	 * which will grab the first element off the freelist; we know this
+	 * is the lowest-numbered free page.
+ */
+ if (newpgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(dbp->mpf, &newpgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &newpage)) != 0)
+ return (ret);
+ } else if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
+ STD_LOCKING(dbc) && TYPE(*pgp) != P_OVERFLOW ? &lock : NULL,
+ &newpage)) != 0)
+ return (ret);
+
+ /*
+ * If newpage is null then __db_new would have had to allocate
+ * a new page from the filesystem, so there is no reason
+ * to continue this action.
+ */
+ if (newpage == NULL)
+ return (0);
+
+ /*
+ * It is possible that a higher page is allocated if other threads
+ * are allocating at the same time, if so, just put it back.
+ */
+ if (PGNO(newpage) > PGNO(*pgp)) {
+		/* It's unfortunate, but you can't just free a new overflow. */
+ if (TYPE(newpage) == P_OVERFLOW)
+ OV_LEN(newpage) = 0;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ return (__db_free(dbc, newpage, 0));
+ }
+
+ /* Log if necessary. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = *pgp;
+ hdr.size = P_OVERHEAD(dbp);
+ memset(&data, 0, sizeof(data));
+ dp = &data;
+ switch (TYPE(*pgp)) {
+ case P_OVERFLOW:
+ data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
+ data.size = OV_LEN(*pgp);
+ break;
+ case P_BTREEMETA:
+ hdr.size = sizeof(BTMETA);
+ dp = NULL;
+ break;
+ case P_HASHMETA:
+ hdr.size = sizeof(HMETA);
+ dp = NULL;
+ break;
+ default:
+ data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
+ data.size = dbp->pgsize - HOFFSET(*pgp);
+ hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
+ }
+ if ((ret = __db_merge_log(dbp, dbc->txn,
+ &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
+ PGNO(*pgp), &LSN(*pgp), &hdr, dp, 1)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(newpage));
+
+ oldpgno = PGNO(*pgp);
+ newpgno = PGNO(newpage);
+ lsn = LSN(newpage);
+ memcpy(newpage, *pgp, dbp->pgsize);
+ PGNO(newpage) = newpgno;
+ LSN(newpage) = lsn;
+
+ /* Empty the old page. */
+ if ((ret = __memp_dirty(dbp->mpf,
+ pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if (TYPE(*pgp) == P_OVERFLOW)
+ OV_LEN(*pgp) = 0;
+ else {
+ HOFFSET(*pgp) = dbp->pgsize;
+ NUM_ENT(*pgp) = 0;
+ }
+ LSN(*pgp) = lsn;
+
+ /* Update siblings. */
+ switch (TYPE(newpage)) {
+ case P_OVERFLOW:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ case P_HASH:
+ if (NEXT_PGNO(newpage) == PGNO_INVALID &&
+ PREV_PGNO(newpage) == PGNO_INVALID)
+ break;
+ if ((ret = __db_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
+ goto err;
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * For HASH we may reuse the old page for an even higher numbered
+ * page. Otherwise we free the old page.
+ */
+ if (!LF_ISSET(DB_EXCH_FREE)) {
+ NEXT_PGNO(*pgp) = PREV_PGNO(*pgp) = PGNO_INVALID;
+ ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, *pgp, dbc->priority);
+ } else
+ ret = __db_free(dbc, *pgp, 0);
+ *pgp = newpage;
+
+ if (ret != 0)
+ return (ret);
+
+ if (!LF_ISSET(DB_EXCH_PARENT))
+ goto done;
+
+ /* Update the parent. */
+ cp = (BTREE_CURSOR *)dbc->internal;
+ epg = &cp->csp[-1];
+
+ switch (TYPE(epg->page)) {
+ case P_IBTREE:
+ pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
+ break;
+ case P_IRECNO:
+ pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(epg->page)));
+ }
+ DB_ASSERT(dbp->env, oldpgno == *pgnop);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(epg->page),
+ 0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
+ *pgnop, PGNO(newpage))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(epg->page));
+
+ *pgnop = PGNO(newpage);
+ cp->csp->page = newpage;
+ if ((ret = __TLPUT(dbc, lock)) != 0)
+ return (ret);
+
+done: return (0);
+
+err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+}
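+
+/*
+ * Editor's note: stripped of logging, locking and error paths, the
+ * swap above is "copy the image, keep the destination's identity"
+ * (sketch only):
+ *
+ *    pgno = PGNO(newpage);               -- save identity
+ *    lsn = LSN(newpage);
+ *    memcpy(newpage, oldpage, pgsize);   -- take the old image
+ *    PGNO(newpage) = pgno;               -- restore identity
+ *    LSN(newpage) = lsn;
+ *
+ * after which the old page is emptied and freed, sibling links are
+ * repaired via __db_relink, and the parent's child pointer is updated.
+ */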
+
+/*
+ * __db_truncate_overflow -- find overflow pages to truncate.
+ * Walk the pages of an overflow chain and swap out
+ * high numbered pages. We are passed the first page
+ * but only deal with the second and subsequent pages.
+ * PUBLIC: int __db_truncate_overflow __P((DBC *,
+ * PUBLIC: db_pgno_t, PAGE **, DB_COMPACT *));
+ */
+int
+__db_truncate_overflow(dbc, pgno, ppg, c_data)
+ DBC *dbc;
+ db_pgno_t pgno;
+ PAGE **ppg;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *page;
+ db_pgno_t ppgno;
+ int have_lock, ret, t_ret;
+
+ dbp = dbc->dbp;
+ page = NULL;
+ LOCK_INIT(lock);
+ have_lock = ppg == NULL;
+
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ return (ret);
+
+ while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
+ if ((ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, page, dbc->priority)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ return (ret);
+ if (pgno <= c_data->compact_truncate)
+ continue;
+ if (have_lock == 0) {
+ DB_ASSERT(dbp->env, ppg != NULL);
+ ppgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, ppgno,
+ DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &ppgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ have_lock = 1;
+ }
+ if ((ret = __db_exchange_page(dbc,
+ &page, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ break;
+ }
+
+err: if (page != NULL &&
+	    (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_truncate_root -- swap a root page for a lower numbered page.
+ * PUBLIC: int __db_truncate_root __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t));
+ */
+int
+__db_truncate_root(dbc, ppg, indx, pgnop, tlen)
+ DBC *dbc;
+ PAGE *ppg;
+ u_int32_t indx;
+ db_pgno_t *pgnop;
+ u_int32_t tlen;
+{
+ DB *dbp;
+ DBT orig;
+ PAGE *page;
+ int ret, t_ret;
+ db_pgno_t newpgno;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbc->dbp->env, IS_DIRTY(ppg));
+ if ((ret = __memp_fget(dbp->mpf, pgnop,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ goto err;
+
+ /*
+	 * If this is a multiply referenced overflow key, then we will just
+ * copy it and decrement the reference count. This is part of a
+ * fix to get rid of multiple references.
+ */
+ if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
+ COMPQUIET(newpgno, 0);
+ if ((ret = __db_ovref(dbc, *pgnop)) != 0)
+ goto err;
+ memset(&orig, 0, sizeof(orig));
+ if ((ret = __db_goff(dbc, &orig, tlen, *pgnop,
+ &orig.data, &orig.size)) == 0)
+ ret = __db_poff(dbc, &orig, &newpgno);
+ if (orig.data != NULL)
+ __os_free(dbp->env, orig.data);
+ if (ret != 0)
+ goto err;
+ } else {
+ LOCK_CHECK_OFF(dbc->thread_info);
+ ret = __db_exchange_page(dbc,
+ &page, NULL, PGNO_INVALID, DB_EXCH_FREE);
+ LOCK_CHECK_ON(dbc->thread_info);
+ if (ret != 0)
+ goto err;
+ newpgno = PGNO(page);
+		/* If we could not allocate from the free list, give up. */
+ if (newpgno == *pgnop)
+ goto err;
+ }
+
+ /* Update the reference. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pgno_log(dbp,
+ dbc->txn, &LSN(ppg), 0, PGNO(ppg),
+ &LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(ppg));
+
+ *pgnop = newpgno;
+
+err: if (page != NULL && (t_ret =
+ __memp_fput(dbp->mpf, dbc->thread_info,
+ page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_find_free --
+ *	Find a contiguous "size" range of free pages numbered lower
+ * than the pages starting at "bstart". We can also return a set of pages
+ * that overlaps with the pages at "bstart".
+ * PUBLIC: int __db_find_free __P((DBC *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t *));
+ */
+int
+__db_find_free(dbc, type, size, bstart, freep)
+ DBC *dbc;
+ u_int32_t type;
+ u_int32_t size;
+ db_pgno_t bstart, *freep;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DBT listdbt;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *page, *freepg;
+ u_int32_t i, j, start, nelems;
+ db_pgno_t *list, next_free, pgno;
+ db_pglist_t *lp, *pglist;
+ int hash, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ nelems = 0;
+ hash = 0;
+ page = NULL;
+ pglist = NULL;
+ meta = NULL;
+ LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ }
+
+ if ((ret = __memp_get_freelist(mpf, &nelems, &list)) != 0)
+ goto err;
+
+ if (nelems == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ for (i = 0; i < nelems; i++) {
+ if (list[i] > bstart) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ start = i;
+ if (size == 1)
+ goto found;
+ while (i < nelems - 1 && list[i] + 1 == list[i + 1]) {
+ i++;
+ if (i - start == size - 1)
+ goto found;
+ }
+ if (i - start == size - 1)
+ goto found;
+ /*
+ * If the last set of contiguous free pages we found
+ * are contiguous to the chunk we are trying to move,
+ * then we can slide the allocated chunk back some number
+ * of pages -- figure out how many by calculating the
+ * number of pages before the allocated ones that we have
+ * found in the free list.
+ */
+ if (list[i] == bstart - 1) {
+ size = (i - start) + 1;
+ goto found;
+ }
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+
+found:	/* We have a run of "size" contiguous pages. Remove them. */
+ next_free = i == nelems - 1 ? PGNO_INVALID : list[i + 1];
+ *freep = list[start];
+ if (start == 0) {
+ page = (PAGE *)meta;
+ } else if ((ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &page)) != 0)
+		goto err;
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __os_malloc(dbp->env,
+ size * sizeof(db_pglist_t), &pglist)) != 0)
+ goto err;
+ lp = pglist;
+ for (j = start; j < start + size; j++, lp++) {
+ if ((ret = __memp_fget(mpf, &list[j],
+ dbc->thread_info, dbc->txn, 0, &freepg)) != 0)
+ goto err;
+ lp->pgno = PGNO(freepg);
+ lp->next_pgno = NEXT_PGNO(freepg);
+ lp->lsn = LSN(freepg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, freepg, dbc->priority)) != 0)
+ goto err;
+ }
+ listdbt.size = size * sizeof(*pglist);
+ listdbt.data = pglist;
+ if ((ret = __db_realloc_log(dbp, dbc->txn, &lsn, 0,
+ PGNO(page), &LSN(page), next_free, type, &listdbt)) != 0)
+ goto err;
+ __os_free(dbp->env, pglist);
+ pglist = NULL;
+ } else
+ LSN_NOT_LOGGED(lsn);
+
+ LSN(page) = lsn;
+ if (start == 0)
+ meta->free = next_free;
+ else
+ NEXT_PGNO(page) = next_free;
+
+ if (page != (PAGE *)meta && (ret = __memp_fput(mpf,
+ dbc->thread_info, page, dbc->priority)) != 0)
+ goto err;
+
+ for (j = start; j < start + size; j++) {
+ if ((ret = __memp_fget(mpf,
+ &list[j], dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &freepg)) != 0)
+ goto err;
+ P_INIT(freepg, dbp->pgsize,
+ list[j], PGNO_INVALID, PGNO_INVALID, 0, type);
+ LSN(freepg) = lsn;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, freepg, dbc->priority)) != 0)
+ goto err;
+ }
+
+ if (++i != nelems)
+ memmove(&list[start], &list[i], (nelems - i) * sizeof(*list));
+ if ((ret = __memp_extend_freelist(mpf, nelems - size, &list)) != 0)
+ goto err;
+ if (hash == 0)
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ t_ret = __TLPUT(dbc, metalock);
+
+ return (ret == 0 ? t_ret : ret);
+
+err: if (page != NULL && page != (PAGE *)meta)
+ (void)__memp_fput(mpf, dbc->thread_info, page, dbc->priority);
+ if (pglist != NULL)
+ __os_free(dbp->env, pglist);
+ if (meta != NULL && hash == 0)
+ (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ (void)__TLPUT(dbc, metalock);
+ return (ret);
+}
+#endif
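+
+/*
+ * Editor's sketch (illustration, not part of the original source): the
+ * scan above, reduced to finding a run of "size" consecutive page
+ * numbers in a sorted list, all below "bstart":
+ *
+ *    static int
+ *    find_run(const db_pgno_t *list, u_int32_t n, u_int32_t size,
+ *        db_pgno_t bstart, u_int32_t *startp)
+ *    {
+ *        u_int32_t i, start;
+ *
+ *        for (i = 0; i < n; i++) {
+ *            if (list[i] > bstart)
+ *                return (0);
+ *            start = i;
+ *            while (i < n - 1 && list[i] + 1 == list[i + 1] &&
+ *                i - start < size - 1)
+ *                i++;
+ *            if (i - start == size - 1) {
+ *                *startp = start;
+ *                return (1);
+ *            }
+ *        }
+ *        return (0);
+ *    }
+ *
+ * The original additionally accepts a shorter run that abuts bstart,
+ * since the allocated chunk can then slide down over those pages.
+ */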
+
+/*
+ * __db_relink --
+ * Relink around a deleted page.
+ *	Otherp can be either the previous or the next page to use if
+ *	the caller already holds that page.
+ *
+ * PUBLIC: int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
+ */
+int
+__db_relink(dbc, pagep, otherp, new_pgno)
+ DBC *dbc;
+ PAGE *pagep, *otherp;
+ db_pgno_t new_pgno;
+{
+ DB *dbp;
+ DB_LOCK npl, ppl;
+ DB_LSN *nlsnp, *plsnp, ret_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *np, *pp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ np = pp = NULL;
+ LOCK_INIT(npl);
+ LOCK_INIT(ppl);
+ nlsnp = plsnp = NULL;
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /*
+ * Retrieve the one/two pages. The caller must have them locked
+ * because the parent is latched. For a remove, we may need
+ * two pages (the before and after). For an add, we only need one
+	 * because the split took care of the prev.
+ */
+ if (pagep->next_pgno != PGNO_INVALID) {
+ if (((np = otherp) == NULL ||
+ PGNO(otherp) != pagep->next_pgno) &&
+ (ret = __memp_fget(mpf, &pagep->next_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
+ ret = __db_pgerr(dbp, pagep->next_pgno, ret);
+ goto err;
+ }
+ nlsnp = &np->lsn;
+ }
+ if (pagep->prev_pgno != PGNO_INVALID) {
+ if (((pp = otherp) == NULL ||
+ PGNO(otherp) != pagep->prev_pgno) &&
+ (ret = __memp_fget(mpf, &pagep->prev_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
+ ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
+ goto err;
+ }
+ plsnp = &pp->lsn;
+ }
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn, &ret_lsn, 0,
+ pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
+ pagep->next_pgno, nlsnp)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(ret_lsn);
+ if (np != NULL)
+ np->lsn = ret_lsn;
+ if (pp != NULL)
+ pp->lsn = ret_lsn;
+
+ /*
+ * Modify and release the two pages.
+ */
+ if (np != NULL) {
+ if (new_pgno == PGNO_INVALID)
+ np->prev_pgno = pagep->prev_pgno;
+ else
+ np->prev_pgno = new_pgno;
+ if (np != otherp)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, np, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ if (pp != NULL) {
+ if (new_pgno == PGNO_INVALID)
+ pp->next_pgno = pagep->next_pgno;
+ else
+ pp->next_pgno = new_pgno;
+ if (pp != otherp)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, pp, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+ return (0);
+
+err: if (np != NULL && np != otherp)
+ (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
+ if (pp != NULL && pp != otherp)
+ (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
+ return (ret);
+}
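+
+/*
+ * Editor's sketch (not in the original source): ignoring locking,
+ * logging and the otherp shortcut, the pointer surgery is a standard
+ * doubly-linked-list splice, where new_pgno == PGNO_INVALID means
+ * "unlink the page" and any other value means "replace it":
+ *
+ *    if (np != NULL)
+ *        np->prev_pgno = new_pgno == PGNO_INVALID ?
+ *            pagep->prev_pgno : new_pgno;
+ *    if (pp != NULL)
+ *        pp->next_pgno = new_pgno == PGNO_INVALID ?
+ *            pagep->next_pgno : new_pgno;
+ */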
+
+/*
+ * __db_move_metadata -- move a metadata page to a lower page number.
+ * The meta data page must be exclusively latched on entry.
+ *
+ * PUBLIC: int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *));
+ */
+int
+__db_move_metadata(dbc, metap, c_data)
+ DBC *dbc;
+ DBMETA **metap;
+ DB_COMPACT *c_data;
+{
+ BTREE *bt;
+ DB *dbp, *mdbp;
+ DB_LOCK handle_lock;
+ HASH *ht;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ c_data->compact_pages_examine++;
+ if ((ret = __db_exchange_page(dbc,
+ (PAGE**)metap, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ return (ret);
+
+ if (PGNO(*metap) == dbp->meta_pgno)
+ return (0);
+
+ if ((ret = __db_master_open(dbp,
+ dbc->thread_info, dbc->txn, dbp->fname, 0, 0, &mdbp)) != 0)
+ return (ret);
+
+ dbp->meta_pgno = PGNO(*metap);
+
+ if ((ret = __db_master_update(mdbp, dbp, dbc->thread_info,
+ dbc->txn, dbp->dname, dbp->type, MU_MOVE, NULL, 0)) != 0)
+ goto err;
+
+ /*
+ * The handle lock for subdb's depends on the metadata page number:
+ * swap the old one for the new one.
+ */
+ if (STD_LOCKING(dbc)) {
+ /*
+ * If this dbp is still in an opening transaction we need to
+ * change its lock in the event.
+ */
+ if (dbp->cur_txn != NULL)
+ __txn_remlock(dbp->env,
+ dbp->cur_txn, &dbp->handle_lock, DB_LOCK_INVALIDID);
+
+ handle_lock = dbp->handle_lock;
+ if ((ret = __fop_lock_handle(dbp->env, dbp,
+ dbp->cur_locker != NULL ? dbp->cur_locker : dbp->locker,
+ dbp->cur_txn != NULL ? DB_LOCK_WRITE : DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto err;
+
+ /* Move all the other handles to the new lock. */
+ if ((ret = __lock_change(dbp->env,
+ &handle_lock, &dbp->handle_lock)) != 0)
+ goto err;
+
+ /* Reregister the event. */
+ if (dbp->cur_txn != NULL)
+ ret = __txn_lockevent(dbp->env,
+ dbp->cur_txn, dbp, &dbp->handle_lock, dbp->locker);
+ }
+ if (dbp->log_filename != NULL)
+ dbp->log_filename->meta_pgno = dbp->meta_pgno;
+ if (dbp->type == DB_HASH) {
+ ht = dbp->h_internal;
+ ht->meta_pgno = dbp->meta_pgno;
+ ht->revision = ++dbp->mpf->mfp->revision;
+ } else {
+ bt = dbp->bt_internal;
+ bt->bt_meta = dbp->meta_pgno;
+ bt->revision = ++dbp->mpf->mfp->revision;
+ }
+
+err: if ((t_ret = __db_close(mdbp, dbc->txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/db/db_conv.c b/src/db/db_conv.c
new file mode 100644
index 00000000..210b4d6e
--- /dev/null
+++ b/src/db/db_conv.c
@@ -0,0 +1,890 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_pgin --
+ * Primary page-swap routine.
+ *
+ * PUBLIC: int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__db_pgin(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB dummydb, *dbp;
+ DB_CIPHER *db_cipher;
+ DB_LSN not_used;
+ DB_PGINFO *pginfo;
+ ENV *env;
+ PAGE *pagep;
+ size_t sum_len;
+ int is_hmac, ret;
+ u_int8_t *chksum;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ env = dbenv->env;
+ pagep = (PAGE *)pp;
+
+ ret = is_hmac = 0;
+ chksum = NULL;
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dbp->dbenv = dbenv;
+ dbp->env = env;
+ dbp->flags = pginfo->flags;
+ dbp->pgsize = pginfo->db_pagesize;
+ db_cipher = env->crypto_handle;
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * If checksumming is set on the meta-page, we must set
+ * it in the dbp.
+ */
+ if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ else
+ F_CLR(dbp, DB_AM_CHKSUM);
+ if (((DBMETA *)pp)->encrypt_alg != 0 ||
+ F_ISSET(dbp, DB_AM_ENCRYPT))
+ is_hmac = 1;
+ /*
+ * !!!
+ * For all meta pages it is required that the chksum
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ chksum = ((BTMETA *)pp)->chksum;
+ sum_len = DBMETASIZE;
+ break;
+ case P_INVALID:
+ /*
+ * We assume that we've read a file hole if we have
+ * a zero LSN, zero page number and P_INVALID. Otherwise
+ * we have an invalid page that might contain real data.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) {
+ sum_len = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ chksum = P_CHKSUM(dbp, pagep);
+ sum_len = pginfo->db_pagesize;
+ /*
+		 * If we are reading in a non-meta page and we have a
+		 * db_cipher, then we are using hmac.
+ */
+ is_hmac = CRYPTO_ON(env) ? 1 : 0;
+ break;
+ }
+
+ /*
+ * We expect a checksum error if there was a configuration problem.
+ * If there is no configuration problem and we don't get a match,
+ * it's fatal: panic the system.
+ */
+ if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0) {
+ if (F_ISSET(dbp, DB_AM_SWAP) && is_hmac == 0)
+ P_32_SWAP(chksum);
+ switch (ret = __db_check_chksum(
+ env, NULL, db_cipher, chksum, pp, sum_len, is_hmac)) {
+ case 0:
+ break;
+ case -1:
+ if (DBENV_LOGGING(env))
+ (void)__db_cksum_log(
+ env, NULL, &not_used, DB_FLUSH);
+ __db_errx(env, DB_STR_A("0684",
+ "checksum error: page %lu: catastrophic recovery required",
+ "%lu"), (u_long)pg);
+ return (__env_panic(env, DB_RUNRECOVERY));
+ default:
+ return (ret);
+ }
+ }
+ if ((ret = __db_decrypt_pg(env, dbp, pagep)) != 0)
+ return (ret);
+ switch (pagep->type) {
+ case P_INVALID:
+ if (pginfo->type == DB_QUEUE)
+ return (__qam_pgin_out(env, pg, pp, cookie));
+ else if (pginfo->type == DB_HEAP)
+ return (__heap_pgin(dbp, pg, pp, cookie));
+ /*
+ * This page is either newly allocated from the end of the
+ * file, or from the free list, or it is an as-yet unwritten
+ * hash bucket page. In this last case it needs to be
+ * initialized, but never byte-swapped. Otherwise the header
+ * may need swapping. It will not be a metadata page, so the
+ * byte swapping code of __ham_pgin is adequate. If hash
+ * is not configured fall back to btree swapping.
+ */
+#ifdef HAVE_HASH
+ return (__ham_pgin(dbp, pg, pp, cookie));
+#else
+ return (__bam_pgin(dbp, pg, pp, cookie));
+#endif
+ /* NOTREACHED. */
+ break;
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ case P_HASHMETA:
+ return (__ham_pgin(dbp, pg, pp, cookie));
+ case P_HEAP:
+ case P_HEAPMETA:
+ case P_IHEAP:
+ return (__heap_pgin(dbp, pg, pp, cookie));
+ case P_BTREEMETA:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ return (__bam_pgin(dbp, pg, pp, cookie));
+ case P_QAMMETA:
+ case P_QAMDATA:
+ return (__qam_pgin_out(env, pg, pp, cookie));
+ default:
+ break;
+ }
+ return (__db_pgfmt(env, pg));
+}
+
+/*
+ * __db_pgout --
+ * Primary page-swap routine.
+ *
+ * PUBLIC: int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__db_pgout(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB dummydb, *dbp;
+ DB_PGINFO *pginfo;
+ ENV *env;
+ PAGE *pagep;
+ int ret;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ env = dbenv->env;
+ pagep = (PAGE *)pp;
+
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dbp->dbenv = dbenv;
+ dbp->env = env;
+ dbp->flags = pginfo->flags;
+ dbp->pgsize = pginfo->db_pagesize;
+ ret = 0;
+ switch (pagep->type) {
+ case P_INVALID:
+ switch (pginfo->type) {
+ case DB_QUEUE:
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ break;
+#ifdef HAVE_HASH
+ case DB_HASH:
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+#endif
+#ifdef HAVE_HEAP
+ case DB_HEAP:
+ ret = __heap_pgout(dbp, pg, pp, cookie);
+ break;
+#endif
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_pgout(dbp, pg, pp, cookie);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ /*
+		 * Support pgout of unsorted hash pages, since online
+ * replication upgrade can cause pages of this type to be
+ * written out.
+ *
+ * FALLTHROUGH
+ */
+ case P_HASHMETA:
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_HEAP:
+ case P_HEAPMETA:
+ case P_IHEAP:
+ ret = __heap_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_BTREEMETA:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ ret = __bam_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_QAMMETA:
+ case P_QAMDATA:
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+ if (ret)
+ return (ret);
+
+ return (__db_encrypt_and_checksum_pg(env, dbp, pagep));
+}
+
+/*
+ * __db_decrypt_pg --
+ * Utility function to decrypt a db page.
+ *
+ * PUBLIC: int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
+ */
+int
+__db_decrypt_pg(env, dbp, pagep)
+ ENV *env;
+ DB *dbp;
+ PAGE *pagep;
+{
+ DB_CIPHER *db_cipher;
+ size_t pg_len, pg_off;
+ u_int8_t *iv;
+ int ret;
+
+ db_cipher = env->crypto_handle;
+ ret = 0;
+ iv = NULL;
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ DB_ASSERT(env, db_cipher != NULL);
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
+
+ pg_off = P_OVERHEAD(dbp);
+ DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);
+
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the iv
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ iv = ((BTMETA *)pagep)->iv;
+ pg_len = DBMETASIZE;
+ break;
+ case P_INVALID:
+ if (IS_ZERO_LSN(LSN(pagep)) &&
+ pagep->pgno == PGNO_INVALID) {
+ pg_len = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ iv = P_IV(dbp, pagep);
+ pg_len = dbp->pgsize;
+ break;
+ }
+ if (pg_len != 0)
+ ret = db_cipher->decrypt(env, db_cipher->data,
+ iv, ((u_int8_t *)pagep) + pg_off,
+ pg_len - pg_off);
+ }
+ return (ret);
+}
+
+/*
+ * __db_encrypt_and_checksum_pg --
+ * Utility function to encrypt and checksum a db page.
+ *
+ * PUBLIC: int __db_encrypt_and_checksum_pg
+ * PUBLIC: __P((ENV *, DB *, PAGE *));
+ */
+int
+__db_encrypt_and_checksum_pg(env, dbp, pagep)
+ ENV *env;
+ DB *dbp;
+ PAGE *pagep;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+ size_t pg_off, pg_len, sum_len;
+ u_int8_t *chksum, *iv, *key;
+
+ chksum = iv = key = NULL;
+ db_cipher = env->crypto_handle;
+
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ DB_ASSERT(env, db_cipher != NULL);
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
+
+ pg_off = P_OVERHEAD(dbp);
+ DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);
+
+ key = db_cipher->mac_key;
+
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the iv
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ iv = ((BTMETA *)pagep)->iv;
+ pg_len = DBMETASIZE;
+ break;
+ default:
+ iv = P_IV(dbp, pagep);
+ pg_len = dbp->pgsize;
+ break;
+ }
+ if ((ret = db_cipher->encrypt(env, db_cipher->data,
+ iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0)
+ return (ret);
+ }
+ if (F_ISSET(dbp, DB_AM_CHKSUM)) {
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the chksum
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ chksum = ((BTMETA *)pagep)->chksum;
+ sum_len = DBMETASIZE;
+ break;
+ default:
+ chksum = P_CHKSUM(dbp, pagep);
+ sum_len = dbp->pgsize;
+ break;
+ }
+ __db_chksum(NULL, (u_int8_t *)pagep, sum_len, key, chksum);
+ if (F_ISSET(dbp, DB_AM_SWAP) && !F_ISSET(dbp, DB_AM_ENCRYPT))
+ P_32_SWAP(chksum);
+ }
+ return (0);
+}
+
+/*
+ * __db_metaswap --
+ * Byteswap the common part of the meta-data page.
+ *
+ * PUBLIC: void __db_metaswap __P((PAGE *));
+ */
+void
+__db_metaswap(pg)
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ p = (u_int8_t *)pg;
+
+ /* Swap the meta-data information. */
+ SWAP32(p); /* lsn.file */
+ SWAP32(p); /* lsn.offset */
+ SWAP32(p); /* pgno */
+ SWAP32(p); /* magic */
+ SWAP32(p); /* version */
+ SWAP32(p); /* pagesize */
+ p += 4; /* unused, page type, unused, unused */
+ SWAP32(p); /* free */
+ SWAP32(p); /* alloc_lsn part 1 */
+ SWAP32(p); /* alloc_lsn part 2 */
+ SWAP32(p); /* cached key count */
+ SWAP32(p); /* cached record count */
+ SWAP32(p); /* flags */
+}
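+
+/*
+ * Editor's note: SWAP32 both byte-reverses the 32-bit value at p and
+ * advances p past it, so the routine above reads as a walk down the
+ * common meta-page layout. An open-coded equivalent of one step
+ * (sketch only):
+ *
+ *    u_int32_t v;
+ *
+ *    memcpy(&v, p, sizeof(v));
+ *    v = ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >> 8) |
+ *        ((v & 0x0000ff00) << 8) | ((v & 0x000000ff) << 24);
+ *    memcpy(p, &v, sizeof(v));
+ *    p += sizeof(v);
+ */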
+
+/*
+ * __db_byteswap --
+ * Byteswap an ordinary database page.
+ *
+ * PUBLIC: int __db_byteswap
+ * PUBLIC: __P((DB *, db_pgno_t, PAGE *, size_t, int));
+ */
+int
+__db_byteswap(dbp, pg, h, pagesize, pgin)
+ DB *dbp;
+ db_pgno_t pg;
+ PAGE *h;
+ size_t pagesize;
+ int pgin;
+{
+ ENV *env;
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ RINTERNAL *ri;
+ db_indx_t i, *inp, len, tmp;
+ u_int8_t *end, *p, *pgend;
+
+ if (pagesize == 0)
+ return (0);
+
+ if (pgin) {
+ M_32_SWAP(h->lsn.file);
+ M_32_SWAP(h->lsn.offset);
+ M_32_SWAP(h->pgno);
+ M_32_SWAP(h->prev_pgno);
+ M_32_SWAP(h->next_pgno);
+ M_16_SWAP(h->entries);
+ M_16_SWAP(h->hf_offset);
+ }
+
+ if (dbp == NULL)
+ return (0);
+ env = dbp->env;
+
+ pgend = (u_int8_t *)h + pagesize;
+
+ inp = P_INP(dbp, h);
+ if ((u_int8_t *)inp >= pgend)
+ goto out;
+
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ if (P_ENTRY(dbp, h, i) >= pgend)
+ continue;
+
+ switch (HPAGE_TYPE(dbp, h, i)) {
+ case H_KEYDATA:
+ break;
+ case H_DUPLICATE:
+ len = LEN_HKEYDATA(dbp, h, pagesize, i);
+ p = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
+ for (end = p + len; p < end;) {
+ if (pgin) {
+ P_16_SWAP(p);
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ } else {
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ SWAP16(p);
+ }
+ p += tmp;
+ SWAP16(p);
+ }
+ break;
+ case H_OFFDUP:
+ p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
+ SWAP32(p); /* pgno */
+ break;
+ case H_OFFPAGE:
+ p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
+ SWAP32(p); /* pgno */
+ SWAP32(p); /* tlen */
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+ }
+
+ /*
+ * The offsets in the inp array are used to determine
+ * the size of entries on a page; therefore they
+ * cannot be converted until we've done all the
+ * entries.
+ */
+ if (!pgin)
+ for (i = 0; i < NUM_ENT(h); i++)
+ M_16_SWAP(inp[i]);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ /*
+ * In the case of on-page duplicates, key information
+ * should only be swapped once.
+ */
+ if (h->type == P_LBTREE && i > 1) {
+ if (pgin) {
+ if (inp[i] == inp[i - 2])
+ continue;
+ } else {
+ M_16_SWAP(inp[i]);
+ if (inp[i] == inp[i - 2])
+ continue;
+ M_16_SWAP(inp[i]);
+ }
+ }
+
+ bk = GET_BKEYDATA(dbp, h, i);
+ if ((u_int8_t *)bk >= pgend)
+ continue;
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ M_16_SWAP(bk->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bo = (BOVERFLOW *)bk;
+ M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
+ case P_IBTREE:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ bi = GET_BINTERNAL(dbp, h, i);
+ if ((u_int8_t *)bi >= pgend)
+ continue;
+
+ M_16_SWAP(bi->len);
+ M_32_SWAP(bi->pgno);
+ M_32_SWAP(bi->nrecs);
+
+ switch (B_TYPE(bi->type)) {
+ case B_KEYDATA:
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bo = (BOVERFLOW *)bi->data;
+ M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
+ case P_IRECNO:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ ri = GET_RINTERNAL(dbp, h, i);
+ if ((u_int8_t *)ri >= pgend)
+ continue;
+
+ M_32_SWAP(ri->pgno);
+ M_32_SWAP(ri->nrecs);
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
+ case P_HEAP:
+ case P_IHEAP:
+ case P_INVALID:
+ case P_OVERFLOW:
+ case P_QAMDATA:
+ /* Nothing to do. */
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+out: if (!pgin) {
+ /* Swap the header information. */
+ M_32_SWAP(h->lsn.file);
+ M_32_SWAP(h->lsn.offset);
+ M_32_SWAP(h->pgno);
+ M_32_SWAP(h->prev_pgno);
+ M_32_SWAP(h->next_pgno);
+ M_16_SWAP(h->entries);
+ M_16_SWAP(h->hf_offset);
+ }
+ return (0);
+}
+
+/*
+ * __db_pageswap --
+ * Byteswap any database page. Normally, the page to be swapped will be
+ * referenced by the "pp" argument and the pdata argument will be NULL.
+ * This function is also called by automatically generated log functions,
+ * where the page may be split into separate header and data parts. In
+ * that case, pdata is not NULL and we reconstitute the full page from
+ * the two parts before swapping it.
+ *
+ * PUBLIC: int __db_pageswap
+ * PUBLIC: __P((ENV *, DB *, void *, size_t, DBT *, int));
+ */
+int
+__db_pageswap(env, dbp, pp, len, pdata, pgin)
+ ENV *env;
+ DB *dbp;
+ void *pp;
+ size_t len;
+ DBT *pdata;
+ int pgin;
+{
+ db_pgno_t pg;
+ size_t pgsize;
+ void *pgcopy;
+ int ret;
+ u_int16_t hoffset;
+
+ switch (TYPE(pp)) {
+ case P_BTREEMETA:
+ return (__bam_mswap(env, pp));
+
+ case P_HASHMETA:
+ return (__ham_mswap(env, pp));
+
+ case P_QAMMETA:
+ return (__qam_mswap(env, pp));
+
+ case P_INVALID:
+ case P_OVERFLOW:
+ case P_QAMDATA:
+ /*
+ * We may have been passed an invalid page, or a queue data
+ * page, or an overflow page where fields like hoffset have a
+ * special meaning. In that case, no swapping of the page data
+ * is required, just the fields in the page header.
+ */
+ pdata = NULL;
+ break;
+
+ default:
+ break;
+ }
+
+ if (pgin) {
+ P_32_COPYSWAP(&PGNO(pp), &pg);
+ P_16_COPYSWAP(&HOFFSET(pp), &hoffset);
+ } else {
+ pg = PGNO(pp);
+ hoffset = HOFFSET(pp);
+ }
+
+ if (pdata == NULL)
+ ret = __db_byteswap(dbp, pg, (PAGE *)pp, len, pgin);
+ else {
+ pgsize = hoffset + pdata->size;
+ if ((ret = __os_malloc(env, pgsize, &pgcopy)) != 0)
+ return (ret);
+ memset(pgcopy, 0, pgsize);
+ memcpy(pgcopy, pp, len);
+ memcpy((u_int8_t *)pgcopy + hoffset, pdata->data, pdata->size);
+
+ ret = __db_byteswap(dbp, pg, (PAGE *)pgcopy, pgsize, pgin);
+ memcpy(pp, pgcopy, len);
+
+ /*
+ * If we are swapping data to be written to the log, we can't
+ * overwrite the buffer that was passed in: it may be a pointer
+ * into a page in cache. We set DB_DBT_APPMALLOC here so that
+ * the calling code can free the memory we allocate here.
+ */
+ if (!pgin) {
+ if ((ret =
+ __os_malloc(env, pdata->size, &pdata->data)) != 0) {
+ __os_free(env, pgcopy);
+ return (ret);
+ }
+ F_SET(pdata, DB_DBT_APPMALLOC);
+ }
+ memcpy(pdata->data, (u_int8_t *)pgcopy + hoffset, pdata->size);
+ __os_free(env, pgcopy);
+ }
+
+ return (ret);
+}
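+
+/*
+ * Editor's note: item data grows down from the end of a page, so in
+ * the split case the bytes between the header (length "len") and
+ * HOFFSET are an unused gap. The reassembled image looks like
+ * (sketch):
+ *
+ *    [ header | ...zero gap... | data                  ]
+ *    0         len              hoffset   hoffset+size
+ *
+ * which is why pgcopy is zero-filled before the two memcpy calls, and
+ * why the swapped header and data are copied back out separately.
+ */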
+
+/*
+ * __db_recordswap --
+ * Byteswap any database record.
+ *
+ * PUBLIC: void __db_recordswap __P((u_int32_t,
+ * PUBLIC: u_int32_t, void *, void *, u_int32_t));
+ */
+void
+__db_recordswap(op, size, hdr, data, pgin)
+ u_int32_t op;
+ u_int32_t size;
+ void *hdr, *data;
+ u_int32_t pgin;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BINTERNAL *bi;
+ RINTERNAL *ri;
+ db_indx_t tmp;
+ u_int8_t *p, *end;
+
+ if (size == 0)
+ return;
+ switch (OP_PAGE_GET(op)) {
+ case P_LDUP:
+ case P_LBTREE:
+ case P_LRECNO:
+ bk = (BKEYDATA *)hdr;
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ M_16_SWAP(bk->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bo = (BOVERFLOW *)hdr;
+ M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
+ break;
+ default:
+ DB_ASSERT(NULL, bk->type != bk->type);
+ }
+ break;
+ case P_IBTREE:
+ bi = (BINTERNAL *)hdr;
+ M_16_SWAP(bi->len);
+ M_32_SWAP(bi->pgno);
+ M_32_SWAP(bi->nrecs);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ if (data == NULL) {
+ DB_ASSERT(NULL,
+ size == BINTERNAL_SIZE(BOVERFLOW_SIZE));
+ bo = (BOVERFLOW *)bi->data;
+ } else
+ bo = (BOVERFLOW *)data;
+ M_32_SWAP(bo->pgno);
+ }
+ break;
+ case P_IRECNO:
+ ri = (RINTERNAL *)hdr;
+ M_32_SWAP(ri->pgno);
+ M_32_SWAP(ri->nrecs);
+ break;
+ case P_OVERFLOW:
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ switch (OP_MODE_GET(op)) {
+ /* KEYDATA and DUPLICATE records do not include the header. */
+ case H_KEYDATA:
+ break;
+ case H_DUPLICATE:
+ p = (u_int8_t *)hdr;
+ for (end = p + size; p < end;) {
+ if (pgin) {
+ P_16_SWAP(p);
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ } else {
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ SWAP16(p);
+ }
+ p += tmp;
+ SWAP16(p);
+ }
+ break;
+ /* These two record types include the full header. */
+ case H_OFFDUP:
+ p = (u_int8_t *)hdr;
+ p += SSZ(HOFFPAGE, pgno);
+ SWAP32(p); /* pgno */
+ break;
+ case H_OFFPAGE:
+ p = (u_int8_t *)hdr;
+ p += SSZ(HOFFPAGE, pgno);
+ SWAP32(p); /* pgno */
+ SWAP32(p); /* tlen */
+ break;
+ default:
+ DB_ASSERT(NULL, op != op);
+ }
+ break;
+
+ default:
+ DB_ASSERT(NULL, op != op);
+ }
+}
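+
+/*
+ * Editor's note: an on-page duplicate set is a sequence of elements of
+ * the form [len][data][len], with the 16-bit length stored both before
+ * and after the data so the set can be walked in either direction.
+ * A forward walk in native byte order (sketch, not original code):
+ *
+ *    u_int8_t *p, *end;
+ *    db_indx_t len;
+ *
+ *    for (p = first, end = p + size; p < end;) {
+ *        memcpy(&len, p, sizeof(len));    -- leading length
+ *        p += sizeof(len) + len;          -- skip the data
+ *        p += sizeof(len);                -- trailing length
+ *    }
+ *
+ * which is why the swap loops above flip two 16-bit values per element.
+ */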
diff --git a/src/db/db_copy.c b/src/db/db_copy.c
new file mode 100644
index 00000000..359c74be
--- /dev/null
+++ b/src/db/db_copy.c
@@ -0,0 +1,31 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * db_copy --
+ * Copy a database file coordinated with mpool.
+ * This is for backward compatibility to the quick fix in 5.2.
+ *
+ * EXTERN: int db_copy __P((DB_ENV *,
+ * EXTERN: const char *, const char *, const char *));
+ */
+int
+db_copy(dbenv, dbfile, target, passwd)
+ DB_ENV *dbenv;
+ const char *dbfile;
+ const char *target;
+ const char *passwd;
+{
+ COMPQUIET(passwd, NULL);
+ return (__db_dbbackup_pp(dbenv, dbfile, target, 0));
+}
diff --git a/src/db/db_dispatch.c b/src/db/db_dispatch.c
new file mode 100644
index 00000000..06de4ef7
--- /dev/null
+++ b/src/db/db_dispatch.c
@@ -0,0 +1,977 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/log_verify.h"
+
+static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
+ db_txnlist_type, u_int32_t, DB_TXNLIST **,
+ int, u_int32_t *));
+
+/*
+ * __db_dispatch --
+ *
+ * This is the transaction dispatch function used by the db access methods.
+ * It is designed to handle the record format used by all the access
+ * methods (the one automatically generated by the db_{h,log,read}.sh
+ * scripts in the tools directory). An application using a different
+ * recovery paradigm will supply a different dispatch function to txn_open.
+ *
+ * PUBLIC: int __db_dispatch __P((ENV *,
+ * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_dispatch(env, dtab, db, lsnp, redo, params)
+ ENV *env; /* The environment. */
+ DB_DISTAB *dtab;
+ DBT *db; /* The log record upon which to dispatch. */
+ DB_LSN *lsnp; /* The lsn of the record being dispatched. */
+ db_recops redo; /* Redo this op (or undo it). */
+ void *params;
+{
+ DB_ENV *dbenv;
+ DB_TXNHEAD *info; /* Transaction list. */
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN prev_lsn;
+ u_int32_t rectype, status, txnid, urectype;
+ int make_call, ret;
+
+ dbenv = env->dbenv;
+ make_call = ret = 0;
+ lvh = NULL;
+ info = NULL;
+ LOGCOPY_32(env, &rectype, db->data);
+ LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));
+
+ /*
+ * Log verification passes a DB_LOG_VRFY_INFO structure, others
+ * pass a DB_TXNHEAD structure.
+ */
+ if (redo != DB_TXN_LOG_VERIFY)
+ info = (DB_TXNHEAD *)params;
+ else
+ lvh = (DB_LOG_VRFY_INFO *)params;
+
+ /* If we don't have a dispatch table, it's hard to dispatch. */
+ DB_ASSERT(env, dtab != NULL);
+
+ /*
+ * If we find a record that is in the user's number space and they
+ * have specified a recovery routine, let them handle it. If they
+ * didn't specify a recovery routine, then we expect that they've
+ * followed all our rules and registered new recovery functions.
+ */
+ switch (redo) {
+ case DB_TXN_ABORT:
+ case DB_TXN_APPLY:
+ case DB_TXN_LOG_VERIFY:
+ case DB_TXN_PRINT:
+ make_call = 1;
+ break;
+ case DB_TXN_OPENFILES:
+ /*
+ * We collect all the transactions that have
+ * "begin" records, those with no previous LSN,
+ * so that we do not abort partial transactions.
+ * These are known to be undone, otherwise the
+ * log would not have been freeable.
+ */
+ LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
+ sizeof(rectype) + sizeof(txnid));
+ if (txnid != 0 && prev_lsn.file == 0 && (ret =
+ __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0)
+ return (ret);
+
+ /* FALLTHROUGH */
+ case DB_TXN_POPENFILES:
+ if (rectype == DB___dbreg_register ||
+ rectype == DB___txn_child ||
+ rectype == DB___txn_ckp || rectype == DB___txn_recycle)
+ return ((dtab->int_dispatch[rectype])(env,
+ db, lsnp, redo, info));
+ break;
+ case DB_TXN_BACKWARD_ROLL:
+ /*
+ * Running full recovery in the backward pass. In general,
+ * we only process records during this pass that belong
+ * to aborted transactions. Unfortunately, there are several
+ * exceptions:
+ * 1. If this is a meta-record, one not associated with
+ * a transaction, then we must always process it.
+ * 2. If this is a transaction commit/abort, we must
+ * always process it, so that we know the status of
+ * every transaction.
+ * 3. If this is a child commit, we need to process it
+ * because the outcome of the child transaction depends
+ * on the outcome of the parent.
+	 * 4. If this is a dbreg_register record, we must always
+	 *    process it because they contain non-transactional
+	 *    closes that must be properly handled.
+	 * 5. If this is a noop, we must always undo it so that we
+	 *    properly handle any aborts before a file was closed.
+	 * 6. If this is a file remove, we need to process it to
+ * determine if the on-disk file is the same as the
+ * one being described.
+ */
+ switch (rectype) {
+ /*
+ * These either do not belong to a transaction or (regop)
+ * must be processed regardless of the status of the
+ * transaction.
+ */
+ case DB___txn_regop:
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ make_call = 1;
+ break;
+ /*
+ * These belong to a transaction whose status must be
+ * checked.
+ */
+ case DB___txn_child:
+ case DB___db_noop:
+ case DB___fop_file_remove:
+ case DB___dbreg_register:
+ make_call = 1;
+
+ /* FALLTHROUGH */
+ default:
+ if (txnid == 0)
+ break;
+
+ ret = __db_txnlist_find(env, info, txnid, &status);
+
+ /* If not found, this is an incomplete abort. */
+ if (ret == DB_NOTFOUND)
+ return (__db_txnlist_add(env,
+ info, txnid, TXN_IGNORE, lsnp));
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * If we ignore the transaction, ignore the operation
+ * UNLESS this is a child commit in which case we need
+ * to make sure that the child also gets marked as
+ * ignore.
+ */
+ if (status == TXN_IGNORE && rectype != DB___txn_child) {
+ make_call = 0;
+ break;
+ }
+ if (status == TXN_COMMIT)
+ break;
+
+			/* Set make_call in case we fell through from default. */
+ make_call = 1;
+ if (status == TXN_OK &&
+ (ret = __db_txnlist_update(env,
+ info, txnid, rectype == DB___txn_prepare ?
+ TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
+ return (ret);
+ }
+ break;
+ case DB_TXN_FORWARD_ROLL:
+ /*
+ * In the forward pass, if we haven't seen the transaction,
+ * do nothing, else recover it.
+ *
+ * We need to always redo DB___db_noop records, so that we
+ * properly handle any commits after the file was closed.
+ */
+ switch (rectype) {
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ case DB___db_noop:
+ case DB___dbreg_register:
+ make_call = 1;
+ break;
+
+ default:
+ if (txnid == 0)
+ status = 0;
+ else {
+ ret = __db_txnlist_find(env,
+ info, txnid, &status);
+
+ if (ret == DB_NOTFOUND)
+					/* Break out of the if clause. */
+ ;
+ else if (ret != 0)
+ return (ret);
+ else if (status == TXN_COMMIT) {
+ make_call = 1;
+ break;
+ }
+ }
+
+ }
+ break;
+ default:
+ return (__db_unknown_flag(
+ env, "__db_dispatch", (u_int32_t)redo));
+ }
+
+ if (make_call) {
+ /*
+ * If the debug flag is set then we are logging
+ * records for a non-durable update so that they
+ * may be examined for diagnostic purposes.
+ * So only make the call if we are printing,
+ * otherwise we need to extract the previous
+ * lsn so undo will work properly.
+ */
+ if (rectype & DB_debug_FLAG) {
+ if (redo == DB_TXN_PRINT)
+ rectype &= ~DB_debug_FLAG;
+ else {
+ LOGCOPY_TOLSN(env, lsnp,
+ (u_int8_t *)db->data +
+ sizeof(rectype) +
+ sizeof(txnid));
+ return (0);
+ }
+ }
+ if (rectype >= DB_user_BEGIN) {
+ /*
+ * Increment user log count, we can't pass any extra
+ * args into app_dispatch, so this has to be done here.
+ */
+ if (redo == DB_TXN_LOG_VERIFY)
+ lvh->external_logrec_cnt++;
+ if (dbenv->app_dispatch != NULL)
+ return (dbenv->app_dispatch(dbenv,
+ db, lsnp, redo));
+
+ /* No application-specific dispatch */
+ urectype = rectype - DB_user_BEGIN;
+			if (urectype >= dtab->ext_size ||
+ dtab->ext_dispatch[urectype] == NULL) {
+ __db_errx(env, DB_STR_A("0512",
+ "Illegal application-specific record type %lu in log",
+ "%lu"), (u_long)rectype);
+ return (EINVAL);
+ }
+
+ return ((dtab->ext_dispatch[urectype])(dbenv,
+ db, lsnp, redo));
+ } else {
+			if (rectype >= dtab->int_size ||
+ dtab->int_dispatch[rectype] == NULL) {
+ __db_errx(env, DB_STR_A("0513",
+ "Illegal record type %lu in log", "%lu"),
+ (u_long)rectype);
+ if (redo == DB_TXN_LOG_VERIFY)
+ lvh->unknown_logrec_cnt++;
+
+ return (EINVAL);
+ }
+
+ return ((dtab->int_dispatch[rectype])(env,
+ db, lsnp, redo, params));
+ }
+ }
+
+ return (0);
+}
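+
+/*
+ * A minimal sketch (an illustration, not part of the original source)
+ * of the fixed header shared by every log record dispatched above; it
+ * is what lets __db_dispatch read the record type, transaction ID and
+ * previous LSN without knowing a record's full layout:
+ *
+ *	u_int32_t rectype;	record type; DB_user_BEGIN splits the space
+ *	u_int32_t txnid;	0 if not part of a transaction
+ *	DB_LSN prev_lsn;	LSN of the transaction's previous record
+ *
+ *	LOGCOPY_32(env, &rectype, db->data);
+ *	LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));
+ *	LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
+ *	    sizeof(rectype) + sizeof(txnid));
+ */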
+
+/*
+ * __db_add_recovery -- Add recovery functions to the dispatch table.
+ *
+ * We have two versions of this, an external one and an internal one,
+ * because application-specific functions take different arguments
+ * for dispatch (ENV versus DB_ENV).
+ *
+ * This is the external version.
+ *
+ * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
+ */
+int
+__db_add_recovery(dbenv, dtab, func, ndx)
+ DB_ENV *dbenv;
+ DB_DISTAB *dtab;
+ int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ /* Make sure this is an application-specific record. */
+ if (ndx < DB_user_BEGIN) {
+ __db_errx(dbenv->env, DB_STR_A("0514",
+ "Attempting to add application-specific record with invalid type %lu",
+ "%lu"), (u_long)ndx);
+ return (EINVAL);
+ }
+ ndx -= DB_user_BEGIN;
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->ext_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(dbenv->env, nsize *
+ sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch))
+ != 0)
+ return (ret);
+ for (i = dtab->ext_size; i < nsize; ++i)
+ (dtab->ext_dispatch)[i] = NULL;
+ dtab->ext_size = nsize;
+ }
+
+ (dtab->ext_dispatch)[ndx] = func;
+ return (0);
+}
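+
+/*
+ * A usage sketch (illustrative; my_rec and the chosen record type are
+ * hypothetical): an application registers a recovery function for a
+ * record type at or above DB_user_BEGIN, after which __db_dispatch
+ * routes matching records through ext_dispatch whenever no
+ * dbenv->app_dispatch callback is configured:
+ *
+ *	int my_rec __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ *
+ *	if ((ret = __db_add_recovery(dbenv,
+ *	    dtab, my_rec, DB_user_BEGIN + 1)) != 0)
+ *		return (ret);
+ */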
+
+/*
+ * __db_add_recovery_int --
+ *
+ * Internal version of dispatch addition function.
+ *
+ * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+ */
+int
+__db_add_recovery_int(env, dtab, func, ndx)
+ ENV *env;
+ DB_DISTAB *dtab;
+ int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ if (ndx >= DB_user_BEGIN) {
+ __db_errx(env, DB_STR_A("0515",
+ "Attempting to add internal record with invalid type %lu",
+ "%lu"), (u_long)ndx);
+ return (EINVAL);
+ }
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->int_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]),
+ &dtab->int_dispatch)) != 0)
+ return (ret);
+ for (i = dtab->int_size; i < nsize; ++i)
+ (dtab->int_dispatch)[i] = NULL;
+ dtab->int_size = nsize;
+ }
+
+ (dtab->int_dispatch)[ndx] = func;
+ return (0);
+}
+
+/*
+ * __db_txnlist_init --
+ * Initialize transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
+ */
+int
+__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t low_txn, hi_txn;
+ DB_LSN *trunc_lsn;
+ DB_TXNHEAD **retp;
+{
+ DB_TXNHEAD *headp;
+ u_int32_t size, tmp;
+ int ret;
+
+ /*
+ * Size a hash table.
+ * If low is zero then we are being called during rollback
+ * and we need only one slot.
+	 * Hi may be lower than low if we have recycled txnids.
+	 * The numbers here are guesses about txn density; we can afford
+ * to look at a few entries in each slot.
+ */
+ if (low_txn == 0)
+ size = 1;
+ else {
+ if (hi_txn < low_txn) {
+ tmp = hi_txn;
+ hi_txn = low_txn;
+ low_txn = tmp;
+ }
+ tmp = hi_txn - low_txn;
+ /* See if we wrapped around. */
+ if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
+ tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
+ size = tmp / 5;
+ if (size < 100)
+ size = 100;
+ }
+ if ((ret = __os_malloc(env,
+ sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
+ return (ret);
+
+ memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
+ headp->maxid = hi_txn;
+ headp->generation = 0;
+ headp->nslots = size;
+ headp->gen_alloc = 8;
+ headp->thread_info = ip;
+ if ((ret = __os_malloc(env, headp->gen_alloc *
+ sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
+ __os_free(env, headp);
+ return (ret);
+ }
+ headp->gen_array[0].generation = 0;
+ headp->gen_array[0].txn_min = TXN_MINIMUM;
+ headp->gen_array[0].txn_max = TXN_MAXIMUM;
+ if (trunc_lsn != NULL) {
+ headp->trunc_lsn = *trunc_lsn;
+ headp->maxlsn = *trunc_lsn;
+ } else {
+ ZERO_LSN(headp->trunc_lsn);
+ ZERO_LSN(headp->maxlsn);
+ }
+ ZERO_LSN(headp->ckplsn);
+
+ *retp = headp;
+ return (0);
+}
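+
+/*
+ * A worked example of the sizing heuristic above (the numbers are
+ * illustrative): with low_txn 0x80000100 and hi_txn 0x80025000 the
+ * spread is 0x24f00 (151,296), giving size = 151296 / 5 = 30,259
+ * slots, or about five entries per slot if every txnid is present.
+ * A spread of 79 yields a quotient below 100, so the 100-slot floor
+ * applies instead.
+ */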
+
+#define FIND_GENERATION(hp, txnid, gen) do { \
+ u_int32_t __i; \
+ for (__i = 0; __i <= (hp)->generation; __i++) \
+ /* The range may wrap around the end. */ \
+ if ((hp)->gen_array[__i].txn_min < \
+ (hp)->gen_array[__i].txn_max ? \
+ ((txnid) >= (hp)->gen_array[__i].txn_min && \
+ (txnid) <= (hp)->gen_array[__i].txn_max) : \
+ ((txnid) >= (hp)->gen_array[__i].txn_min || \
+ (txnid) <= (hp)->gen_array[__i].txn_max)) \
+ break; \
+ DB_ASSERT(env, __i <= (hp)->generation); \
+ gen = (hp)->gen_array[__i].generation; \
+} while (0)
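+
+/*
+ * A worked example of the wrap test above (values are illustrative):
+ * a generation whose range wrapped past TXN_MAXIMUM might have
+ * txn_min 0xfffffff0 and txn_max 0x80000100.  Since txn_min > txn_max,
+ * the "min || max" arm applies and matches both txnid 0xfffffffa
+ * (>= txn_min) and txnid 0x80000004 (<= txn_max); the "min && max"
+ * arm handles ordinary, non-wrapped ranges.
+ */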
+
+/*
+ * __db_txnlist_add --
+ * Add an element to our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_add __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
+ */
+int
+__db_txnlist_add(env, hp, txnid, status, lsn)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, status;
+ DB_LSN *lsn;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
+ return (ret);
+
+ LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
+
+ /* Find the most recent generation containing this ID */
+ FIND_GENERATION(hp, txnid, elp->u.t.generation);
+ elp->type = TXNLIST_TXNID;
+ elp->u.t.txnid = txnid;
+ elp->u.t.status = status;
+ if (txnid > hp->maxid)
+ hp->maxid = txnid;
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ DB_ASSERT(env, lsn == NULL ||
+ status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_remove --
+ * Remove an element from our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
+ */
+int
+__db_txnlist_remove(env, hp, txnid)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid;
+{
+ DB_TXNLIST *entry;
+ u_int32_t status;
+
+ return (__db_txnlist_find_internal(env,
+ hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
+}
+
+/*
+ * __db_txnlist_ckp --
+ * Used to record the maximum checkpoint that will be retained
+ * after recovery. Typically this is simply the max checkpoint, but
+ * if we are doing client replication recovery or timestamp-based
+ * recovery, we are going to virtually truncate the log and we need
+ * to retain the last checkpoint before the truncation point.
+ *
+ * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+void
+__db_txnlist_ckp(env, hp, ckp_lsn)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *ckp_lsn;
+{
+
+ COMPQUIET(env, NULL);
+
+ if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
+ LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
+ hp->ckplsn = *ckp_lsn;
+}
+
+/*
+ * __db_txnlist_end --
+ * Discard transaction linked list.
+ *
+ * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
+ */
+void
+__db_txnlist_end(env, hp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+{
+ u_int32_t i;
+ DB_TXNLIST *p;
+
+ if (hp == NULL)
+ return;
+
+ for (i = 0; i < hp->nslots; i++)
+ while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
+ switch (p->type) {
+ case TXNLIST_LSN:
+ __os_free(env, p->u.l.lsn_stack);
+ break;
+ case TXNLIST_DELETE:
+ case TXNLIST_TXNID:
+ default:
+ /*
+ * Possibly an incomplete DB_TXNLIST; just
+ * free it.
+ */
+ break;
+ }
+ LIST_REMOVE(p, links);
+ __os_free(env, p);
+ }
+
+ if (hp->gen_array != NULL)
+ __os_free(env, hp->gen_array);
+ __os_free(env, hp);
+}
+
+/*
+ * __db_txnlist_find --
+ * Checks to see if a txnid with the current generation is in the
+ * txnid list. This returns DB_NOTFOUND if the item isn't in the
+ * list otherwise it returns (like __db_txnlist_find_internal)
+ *	list; otherwise it returns (like __db_txnlist_find_internal)
+ * was generated while not in a transaction.
+ *
+ * PUBLIC: int __db_txnlist_find __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *));
+ */
+int
+__db_txnlist_find(env, hp, txnid, statusp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, *statusp;
+{
+ DB_TXNLIST *entry;
+
+ if (txnid == 0)
+ return (DB_NOTFOUND);
+
+ return (__db_txnlist_find_internal(env, hp,
+ TXNLIST_TXNID, txnid, &entry, 0, statusp));
+}
+
+/*
+ * __db_txnlist_update --
+ * Change the status of an existing transaction entry.
+ * Returns DB_NOTFOUND if no such entry exists.
+ *
+ * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
+ */
+int
+__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, status;
+ DB_LSN *lsn;
+ u_int32_t *ret_status;
+ int add_ok;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ if (txnid == 0)
+ return (DB_NOTFOUND);
+
+ ret = __db_txnlist_find_internal(env,
+ hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);
+
+ if (ret == DB_NOTFOUND && add_ok) {
+ *ret_status = status;
+ return (__db_txnlist_add(env, hp, txnid, status, lsn));
+ }
+ if (ret != 0)
+ return (ret);
+
+ if (*ret_status == TXN_IGNORE)
+ return (0);
+
+ elp->u.t.status = status;
+
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ return (ret);
+}
+
+/*
+ * __db_txnlist_find_internal --
+ * Find an entry on the transaction list. If the entry is not there or
+ * the list pointer is not initialized we return DB_NOTFOUND. If the
+ * item is found, we return the status. Currently we always call this
+ * with an initialized list pointer but checking for NULL keeps it general.
+ */
+static int
+__db_txnlist_find_internal(env,
+ hp, type, txnid, txnlistp, del, statusp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ db_txnlist_type type;
+ u_int32_t txnid;
+ DB_TXNLIST **txnlistp;
+ int del;
+ u_int32_t *statusp;
+{
+ struct __db_headlink *head;
+ DB_TXNLIST *p;
+ u_int32_t generation, hash;
+ int ret;
+
+ ret = 0;
+
+ if (hp == NULL)
+ return (DB_NOTFOUND);
+
+ switch (type) {
+ case TXNLIST_TXNID:
+ hash = txnid;
+ FIND_GENERATION(hp, txnid, generation);
+ break;
+ case TXNLIST_DELETE:
+ case TXNLIST_LSN:
+ default:
+ return (__env_panic(env, EINVAL));
+ }
+
+ head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
+ LIST_FOREACH(p, head, links) {
+ if (p->type != type)
+ continue;
+ switch (type) {
+ case TXNLIST_TXNID:
+ if (p->u.t.txnid != txnid ||
+ generation != p->u.t.generation)
+ continue;
+ *statusp = p->u.t.status;
+ break;
+
+ case TXNLIST_DELETE:
+ case TXNLIST_LSN:
+ default:
+ return (__env_panic(env, EINVAL));
+ }
+ if (del == 1) {
+ LIST_REMOVE(p, links);
+ __os_free(env, p);
+ *txnlistp = NULL;
+ } else if (p != LIST_FIRST(head)) {
+ /* Move it to head of list. */
+ LIST_REMOVE(p, links);
+ LIST_INSERT_HEAD(head, p, links);
+ *txnlistp = p;
+ } else
+ *txnlistp = p;
+ return (ret);
+ }
+
+ return (DB_NOTFOUND);
+}
+
+/*
+ * __db_txnlist_gen --
+ * Change the current generation number.
+ *
+ * PUBLIC: int __db_txnlist_gen __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t));
+ */
+int
+__db_txnlist_gen(env, hp, incr, min, max)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ int incr;
+ u_int32_t min, max;
+{
+ int ret;
+
+ /*
+ * During recovery generation numbers keep track of "restart"
+ * checkpoints and recycle records. Restart checkpoints occur
+ * whenever we take a checkpoint and there are no outstanding
+ * transactions. When that happens, we can reset transaction IDs
+	 * back to TXN_MINIMUM.  Currently we only do the reset
+	 * at the end of recovery.  Recycle records occur when txnids
+ * are exhausted during runtime. A free range of ids is identified
+ * and logged. This code maintains a stack of ranges. A txnid
+ * is given the generation number of the first range it falls into
+ * in the stack.
+ */
+ if (incr < 0) {
+ --hp->generation;
+ memmove(hp->gen_array, &hp->gen_array[1],
+ (hp->generation + 1) * sizeof(hp->gen_array[0]));
+ } else {
+ ++hp->generation;
+ if (hp->generation >= hp->gen_alloc) {
+ hp->gen_alloc *= 2;
+ if ((ret = __os_realloc(env, hp->gen_alloc *
+ sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
+ return (ret);
+ }
+ memmove(&hp->gen_array[1], &hp->gen_array[0],
+ hp->generation * sizeof(hp->gen_array[0]));
+ hp->gen_array[0].generation = hp->generation;
+ hp->gen_array[0].txn_min = min;
+ hp->gen_array[0].txn_max = max;
+ }
+ return (0);
+}
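+
+/*
+ * A worked example of the range stack (illustrative values): after a
+ * recycle record is processed with
+ *
+ *	__db_txnlist_gen(env, hp, 1, 0x80001000, 0x80004000);
+ *
+ * generation 1 owns [0x80001000, 0x80004000] in gen_array[0], and the
+ * initial full range from __db_txnlist_init drops to gen_array[1].
+ * FIND_GENERATION then assigns a txnid the generation of the first
+ * (newest) range it falls into, scanning from gen_array[0].
+ */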
+
+/*
+ * __db_txnlist_lsnadd --
+ * Save the prev_lsn from a txn_child record.
+ *
+ * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+int
+__db_txnlist_lsnadd(env, hp, lsnp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ if (IS_ZERO_LSN(*lsnp))
+ return (0);
+
+ LIST_FOREACH(elp, &hp->head[0], links)
+ if (elp->type == TXNLIST_LSN)
+ break;
+
+ if (elp == NULL) {
+ if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
+ return (ret);
+ return (DB_SURPRISE_KID);
+ }
+
+ if (elp->u.l.stack_indx == elp->u.l.stack_size) {
+ elp->u.l.stack_size <<= 1;
+ if ((ret = __os_realloc(env, sizeof(DB_LSN) *
+ elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
+ __db_txnlist_end(env, hp);
+ return (ret);
+ }
+ }
+ elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_lsnget --
+ *	Get the lsn saved from a txn_child record.
+ *
+ * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t));
+ */
+int
+__db_txnlist_lsnget(env, hp, lsnp, flags)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_TXNLIST *elp;
+
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+
+ LIST_FOREACH(elp, &hp->head[0], links)
+ if (elp->type == TXNLIST_LSN)
+ break;
+
+ if (elp == NULL || elp->u.l.stack_indx == 0) {
+ ZERO_LSN(*lsnp);
+ return (0);
+ }
+
+ *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_lsninit --
+ * Initialize a transaction list with an lsn array entry.
+ *
+ * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+int
+__db_txnlist_lsninit(env, hp, lsnp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ elp = NULL;
+
+ if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
+ goto err;
+ LIST_INSERT_HEAD(&hp->head[0], elp, links);
+
+ elp->type = TXNLIST_LSN;
+ if ((ret = __os_malloc(env,
+ sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
+ goto err;
+ elp->u.l.stack_indx = 1;
+ elp->u.l.stack_size = DB_LSN_STACK_SIZE;
+ elp->u.l.lsn_stack[0] = *lsnp;
+
+ return (0);
+
+err: __db_txnlist_end(env, hp);
+ return (ret);
+}
+
+#ifdef DEBUG
+/*
+ * __db_txnlist_print --
+ * Print out the transaction list.
+ *
+ * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
+ */
+void
+__db_txnlist_print(hp)
+ DB_TXNHEAD *hp;
+{
+ DB_TXNLIST *p;
+ u_int32_t i;
+ char *txntype;
+
+ printf("Maxid: %lu Generation: %lu\n",
+ (u_long)hp->maxid, (u_long)hp->generation);
+ for (i = 0; i < hp->nslots; i++)
+ LIST_FOREACH(p, &hp->head[i], links) {
+ if (p->type != TXNLIST_TXNID) {
+ printf("Unrecognized type: %d\n", p->type);
+ continue;
+ }
+ switch (p->u.t.status) {
+ case TXN_OK:
+ txntype = "OK";
+ break;
+ case TXN_COMMIT:
+ txntype = "commit";
+ break;
+ case TXN_PREPARE:
+ txntype = "prepare";
+ break;
+ case TXN_ABORT:
+ txntype = "abort";
+ break;
+ case TXN_IGNORE:
+ txntype = "ignore";
+ break;
+ case TXN_EXPECTED:
+ txntype = "expected";
+ break;
+ case TXN_UNEXPECTED:
+ txntype = "unexpected";
+ break;
+ default:
+ txntype = "UNKNOWN";
+ break;
+ }
+ printf("TXNID: %lx(%lu): %s\n",
+ (u_long)p->u.t.txnid,
+ (u_long)p->u.t.generation, txntype);
+ }
+}
+#endif
diff --git a/src/db/db_dup.c b/src/db/db_dup.c
new file mode 100644
index 00000000..9fd04791
--- /dev/null
+++ b/src/db/db_dup.c
@@ -0,0 +1,214 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * __db_ditem_nolog --
+ * Remove an item from a page without affecting its recoverability.
+ *
+ * PUBLIC: int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__db_ditem_nolog(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ db_indx_t cnt, *inp, offset;
+ u_int8_t *from;
+
+ dbp = dbc->dbp;
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+ DB_ASSERT(dbp->env, indx < NUM_ENT(pagep));
+
+ /*
+ * If there's only a single item on the page, we don't have to
+ * work hard.
+ */
+ if (NUM_ENT(pagep) == 1) {
+ NUM_ENT(pagep) = 0;
+ HOFFSET(pagep) = dbp->pgsize;
+ return (0);
+ }
+
+ inp = P_INP(dbp, pagep);
+ /*
+ * Pack the remaining key/data items at the end of the page. Use
+	 * memmove(3); the regions may overlap.
+ */
+ from = (u_int8_t *)pagep + HOFFSET(pagep);
+ DB_ASSERT(dbp->env, inp[indx] >= HOFFSET(pagep));
+ memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep));
+ HOFFSET(pagep) += nbytes;
+
+ /* Adjust the indices' offsets. */
+ offset = inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt)
+ if (inp[cnt] < offset)
+ inp[cnt] += nbytes;
+
+ /* Shift the indices down. */
+ --NUM_ENT(pagep);
+ if (indx != NUM_ENT(pagep))
+ memmove(&inp[indx], &inp[indx + 1],
+ sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
+
+ return (0);
+}
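+
+/*
+ * A worked example of the compaction above (illustrative numbers): on
+ * a 4096-byte page with three items, inp[] = {4000, 3900, 3800} and
+ * HOFFSET = 3800, deleting indx 1 with nbytes = 100 memmoves the 100
+ * bytes at offsets 3800..3899 (item 2) up to 3900..3999, raises
+ * HOFFSET to 3900, bumps inp[2] from 3800 to 3900 because it sat
+ * below the deleted item, and shifts the index array down, leaving
+ * inp[] = {4000, 3900} and NUM_ENT = 2.
+ */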
+
+/*
+ * __db_ditem --
+ * Remove an item from a page, logging it if enabled.
+ *
+ * PUBLIC: int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__db_ditem(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ DBT ldbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ if (DBC_LOGGING(dbc)) {
+ ldbt.data = P_ENTRY(dbp, pagep, indx);
+ ldbt.size = nbytes;
+ if ((ret = __db_addrem_log(dbp, dbc->txn, &LSN(pagep), 0,
+ OP_SET(DB_REM_DUP, pagep), PGNO(pagep),
+ (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_ditem_nolog(dbc, pagep, indx, nbytes));
+}
+
+/*
+ * __db_pitem_nolog --
+ * Put an item on a page without logging.
+ *
+ * PUBLIC: int __db_pitem_nolog
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ BKEYDATA bk;
+ DB *dbp;
+ DBT thdr;
+ db_indx_t *inp;
+ u_int8_t *p;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+
+ if (nbytes > P_FREESPACE(dbp, pagep)) {
+ DB_ASSERT(dbp->env, nbytes <= P_FREESPACE(dbp, pagep));
+ return (EINVAL);
+ }
+
+ if (hdr == NULL) {
+ B_TSET(bk.type, B_KEYDATA);
+ bk.len = data == NULL ? 0 : data->size;
+
+ thdr.data = &bk;
+ thdr.size = SSZA(BKEYDATA, data);
+ hdr = &thdr;
+ }
+ inp = P_INP(dbp, pagep);
+
+ /* Adjust the index table, then put the item on the page. */
+ if (indx != NUM_ENT(pagep))
+ memmove(&inp[indx + 1], &inp[indx],
+ sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
+ HOFFSET(pagep) -= nbytes;
+ inp[indx] = HOFFSET(pagep);
+ ++NUM_ENT(pagep);
+
+ p = P_ENTRY(dbp, pagep, indx);
+ memcpy(p, hdr->data, hdr->size);
+ if (data != NULL)
+ memcpy(p + hdr->size, data->data, data->size);
+
+ return (0);
+}
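+
+/*
+ * Continuing the deletion example above (illustrative numbers):
+ * re-inserting a 100-byte item at indx 1 on that page shifts inp[1]
+ * up into inp[2], lowers HOFFSET from 3900 back to 3800, stores
+ * inp[1] = 3800 and copies the header (plus data, if non-NULL) into
+ * offsets 3800..3899, giving inp[] = {4000, 3800, 3900} and
+ * NUM_ENT = 3.  Note the new item's bytes land at the bottom of the
+ * data region regardless of its index position.
+ */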
+
+/*
+ * __db_pitem --
+ * Put an item on a page.
+ *
+ * PUBLIC: int __db_pitem
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__db_pitem(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ DB *dbp;
+ MPOOLFILE *mpf;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf->mfp;
+ /*
+ * Put a single item onto a page. The logic figuring out where to
+ * insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling. We cheat a little bit in that
+ * we don't want to copy the dbt on a normal put twice. If hdr is
+ * NULL, we create a BKEYDATA structure on the page, otherwise, just
+ * copy the caller's information onto the page.
+ *
+ * This routine is also used to put entries onto the page where the
+ * entry is pre-built, e.g., during recovery. In this case, the hdr
+ * will point to the entry, and the data argument will be NULL.
+ *
+ * If transactional bulk loading is enabled in this
+ * transaction, and the page is above the file's extension
+ * watermark, skip logging, but do not invoke LSN_NOT_LOGGED.
+ *
+ * !!!
+ * There's a tremendous potential for off-by-one errors here, since
+ * the passed in header sizes must be adjusted for the structure's
+ * placeholder for the trailing variable-length data field.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if (__txn_pg_above_fe_watermark(dbc->txn, mpf, PGNO(pagep))) {
+ mpf->fe_nlws++; /* Note that logging was skipped. */
+ } else if ((ret = __db_addrem_log(dbp, dbc->txn, &LSN(pagep),
+ 0, OP_SET(DB_ADD_DUP, pagep), PGNO(pagep),
+ (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep)))) {
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data));
+}
diff --git a/src/db/db_iface.c b/src/db/db_iface.c
new file mode 100644
index 00000000..59e0ba53
--- /dev/null
+++ b/src/db/db_iface.c
@@ -0,0 +1,3001 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#ifndef HAVE_QUEUE
+#include "dbinc/qam.h" /* For __db_no_queue_am(). */
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_associate_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+static int __dbc_del_arg __P((DBC *, u_int32_t));
+static int __dbc_pget_arg __P((DBC *, DBT *, u_int32_t));
+static int __dbc_put_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_curinval __P((const ENV *));
+static int __db_cursor_arg __P((DB *, u_int32_t));
+static int __db_del_arg __P((DB *, DBT *, u_int32_t));
+static int __db_get_arg __P((const DB *, DBT *, DBT *, u_int32_t));
+static int __db_join_arg __P((DB *, DBC **, u_int32_t));
+static int __db_open_arg __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t));
+static int __db_pget_arg __P((DB *, DBT *, u_int32_t));
+static int __db_put_arg __P((DB *, DBT *, DBT *, u_int32_t));
+static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
+static int __db_compact_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __db_associate_foreign_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ u_int32_t));
+
+/*
+ * These functions implement the Berkeley DB API. They are organized in a
+ * layered fashion. The interface functions (XXX_pp) perform all generic
+ * error checks (for example, PANIC'd region, replication state change
+ * in progress, inconsistent transaction usage), call function-specific
+ * check routines (_arg) to check for proper flag usage, etc., do pre-amble
+ * processing (incrementing handle counts, handling local transactions),
+ * call the function and then do post-amble processing (local transactions,
+ * decrement handle counts).
+ *
+ * The basic structure is:
+ * Check for simple/generic errors (PANIC'd region)
+ * Check if replication is changing state (increment handle count).
+ * Call function-specific argument checking routine
+ * Create internal transaction if necessary
+ * Call underlying worker function
+ * Commit/abort internal transaction if necessary
+ * Decrement handle count
+ */
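+
+/*
+ * A skeletal sketch of the layering just described (illustrative
+ * only; each _pp function below fills it in with method-specific
+ * details):
+ *
+ *	ENV_ENTER(env, ip);
+ *	if (IS_ENV_REPLICATED(env) &&
+ *	    (ret = __db_rep_enter(dbp, ...)) != 0)
+ *		goto err;
+ *	if ((ret = __xxx_arg(dbp, ...)) != 0)	check flags
+ *		goto err;
+ *	if (IS_DB_AUTO_COMMIT(dbp, txn)) {	local transaction
+ *		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ *			goto err;
+ *		txn_local = 1;
+ *	}
+ *	ret = __xxx(dbp, ip, txn, ...);		worker function
+ * err:	resolve the local transaction, release the replication
+ *	block, ENV_LEAVE(env, ip) and return.
+ */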
+
+/*
+ * __db_associate_pp --
+ * DB->associate pre/post processing.
+ *
+ * PUBLIC: int __db_associate_pp __P((DB *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate_pp(dbp, txn, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ DB_TXN *txn;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ DBC *sdbc;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Secondary cursors may have the primary's lock file ID, so we need
+ * to make sure that no older cursors are lying around when we make
+ * the transition.
+ */
+ if (TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_errx(env, DB_STR("0572",
+ "Databases may not become secondary indices while cursors are open"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if ((ret = __db_associate_arg(dbp, sdbp, callback, flags)) != 0)
+ goto err;
+
+ /*
+ * Create a local transaction as necessary, check for consistent
+ * transaction usage, and, if we have no transaction but do have
+ * locking on, acquire a locker id for the handle lock acquisition.
+ */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((ret = __dbc_destroy(sdbc)) != 0)
+ goto err;
+
+ ret = __db_associate(dbp, ip, txn, sdbp, callback, flags);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_associate_arg --
+ * Check DB->associate arguments.
+ */
+static int
+__db_associate_arg(dbp, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (sdbp->type == DB_HEAP) {
+ __db_errx(env,
+ "Heap databases may not be used as secondary databases");
+ return (EINVAL);
+ }
+
+ if (F_ISSET(sdbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0573",
+ "Secondary index handles may not be re-associated"));
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0574",
+ "Secondary indices may not be used as primary databases"));
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR("0575",
+ "Primary databases may not be configured with duplicates"));
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env, DB_STR("0576",
+ "Renumbering recno databases may not be used as primary databases"));
+ return (EINVAL);
+ }
+
+ /*
+ * It's OK for the primary and secondary to not share an environment IFF
+ * the environments are local to the DB handle. (Specifically, cursor
+ * adjustment will work correctly in this case.) The environment being
+ * local implies the environment is not configured for either locking or
+ * transactions, as neither of those could work correctly.
+ */
+ if (dbp->env != sdbp->env &&
+ (!F_ISSET(dbp->env, ENV_DBLOCAL) ||
+ !F_ISSET(sdbp->env, ENV_DBLOCAL))) {
+ __db_errx(env, DB_STR("0577",
+ "The primary and secondary must be opened in the same environment"));
+ return (EINVAL);
+ }
+ if ((DB_IS_THREADED(dbp) && !DB_IS_THREADED(sdbp)) ||
+ (!DB_IS_THREADED(dbp) && DB_IS_THREADED(sdbp))) {
+ __db_errx(env, DB_STR("0578",
+ "The DB_THREAD setting must be the same for primary and secondary"));
+ return (EINVAL);
+ }
+ if (callback == NULL &&
+ (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) {
+ __db_errx(env, DB_STR("0579",
+"Callback function may be NULL only when database handles are read-only"));
+ return (EINVAL);
+ }
+
+ if ((ret = __db_fchk(env, "DB->associate", flags, DB_CREATE |
+ DB_IMMUTABLE_KEY)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __db_close_pp --
+ * DB->close pre/post processing.
+ *
+ * PUBLIC: int __db_close_pp __P((DB *, u_int32_t));
+ */
+int
+__db_close_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Close a DB handle -- as a handle destructor, we can't fail.
+ *
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_NOSYNC)
+ ret = __db_ferr(env, "DB->close", 0);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
+ handle_check = 0;
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if ((t_ret = __db_close(dbp, NULL, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_cursor_pp --
+ * DB->cursor pre/post processing.
+ *
+ * PUBLIC: int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t));
+ */
+int
+__db_cursor_pp(dbp, txn, dbcp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGENV *renv;
+ int rep_blocked, ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ rep_blocked = 0;
+ if (IS_ENV_REPLICATED(env)) {
+ if (!IS_REAL_TXN(txn)) {
+ if ((ret = __op_rep_enter(env, 0, 1)) != 0)
+ goto err;
+ rep_blocked = 1;
+ }
+ renv = env->reginfo->primary;
+ if (dbp->timestamp != renv->rep_timestamp) {
+ __db_errx(env, DB_STR("0580",
+			    "replication recovery unrolled committed transactions; "
+ "open DB and DBcursor handles must be closed"));
+ ret = DB_REP_HANDLE_DEAD;
+ goto err;
+ }
+ }
+ if ((ret = __db_cursor_arg(dbp, flags)) != 0)
+ goto err;
+
+ /*
+ * Check for consistent transaction usage. For now, assume this
+ * cursor might be used for read operations only (in which case
+ * it may not require a txn). We'll check more stringently in
+ * c_del and c_put. (Note this means the read-op txn tests have
+ * to be a subset of the write-op ones.)
+ */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+ goto err;
+
+ ret = __db_cursor(dbp, ip, txn, dbcp, flags);
+
+ /*
+ * Register externally created cursors into the valid transaction.
+ * If a family transaction was passed in, the transaction handle in
+ * the cursor may not match.
+ */
+ txn = (*dbcp)->txn;
+ if (txn != NULL && ret == 0)
+ TAILQ_INSERT_HEAD(&(txn->my_cursors), *dbcp, txn_cursors);
+
+err: /* Release replication block on error. */
+ if (ret != 0 && rep_blocked)
+ (void)__op_rep_exit(env);
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_cursor --
+ * DB->cursor.
+ *
+ * PUBLIC: int __db_cursor __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t));
+ */
+int
+__db_cursor(dbp, ip, txn, dbcp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ db_lockmode_t mode;
+ int ret;
+
+ env = dbp->env;
+
+ if (MULTIVERSION(dbp) && txn == NULL && (LF_ISSET(DB_TXN_SNAPSHOT) ||
+ F_ISSET(env->dbenv, DB_ENV_TXN_SNAPSHOT))) {
+ if ((ret =
+ __txn_begin(env, ip, NULL, &txn, DB_TXN_SNAPSHOT)) != 0)
+ return (ret);
+ F_SET(txn, TXN_PRIVATE);
+ }
+
+ PERFMON5(env, db, cursor, dbp->fname,
+ dbp->dname, txn == NULL ? 0 : txn->txnid, flags, &dbp->fileid[0]);
+
+ if ((ret = __db_cursor_int(dbp, ip, txn, dbp->type, PGNO_INVALID,
+ LF_ISSET(DB_CURSOR_BULK | DB_CURSOR_TRANSIENT | DB_RECOVER),
+ NULL, &dbc)) != 0)
+ return (ret);
+
+ /*
+ * If this is CDB, do all the locking in the interface, which is
+ * right here.
+ */
+ if (CDB_LOCKING(env)) {
+ mode = (LF_ISSET(DB_WRITELOCK)) ? DB_LOCK_WRITE :
+ ((LF_ISSET(DB_WRITECURSOR) || txn != NULL) ?
+ DB_LOCK_IWRITE : DB_LOCK_READ);
+ if ((ret = __lock_get(env, dbc->locker, 0,
+ &dbc->lock_dbt, mode, &dbc->mylock)) != 0)
+ goto err;
+ if (LF_ISSET(DB_WRITECURSOR))
+ F_SET(dbc, DBC_WRITECURSOR);
+ if (LF_ISSET(DB_WRITELOCK))
+ F_SET(dbc, DBC_WRITER);
+ }
+
+ if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
+ F_SET(dbc, DBC_READ_UNCOMMITTED);
+
+ if (LF_ISSET(DB_READ_COMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
+ F_SET(dbc, DBC_READ_COMMITTED);
+
+ *dbcp = dbc;
+ return (0);
+
+err: (void)__dbc_close(dbc);
+ return (ret);
+}
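+
+/*
+ * The CDB lock-mode selection above, tabulated (a restatement of the
+ * code, not new behavior):
+ *
+ *	DB_WRITELOCK			DB_LOCK_WRITE
+ *	DB_WRITECURSOR or txn != NULL	DB_LOCK_IWRITE
+ *	otherwise			DB_LOCK_READ
+ *
+ * An intention-write lock lets a write cursor coexist with readers
+ * until an actual write forces an upgrade to a write lock.
+ */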
+
+/*
+ * __db_cursor_arg --
+ * Check DB->cursor arguments.
+ */
+static int
+__db_cursor_arg(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+
+ /*
+ * DB_READ_COMMITTED and DB_READ_UNCOMMITTED require locking.
+ */
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DB->cursor"));
+ }
+
+ LF_CLR(DB_CURSOR_BULK |
+ DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT);
+
+ /* Check for invalid function flags. */
+ if (LF_ISSET(DB_WRITECURSOR)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ if (!CDB_LOCKING(env))
+ return (__db_ferr(env, "DB->cursor", 0));
+ LF_CLR(DB_WRITECURSOR);
+ } else if (LF_ISSET(DB_WRITELOCK)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ LF_CLR(DB_WRITELOCK);
+ }
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB->cursor", 0));
+
+ return (0);
+}
+
+/*
+ * __db_del_pp --
+ * DB->del pre/post processing.
+ *
+ * PUBLIC: int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_del_pp(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
+
+#ifdef CONFIG_TEST
+ if (IS_REP_MASTER(env))
+ DB_TEST_WAIT(env, env->test_check);
+#endif
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_del_arg(dbp, key, flags)) != 0)
+ goto err;
+
+ /* Create local transaction as necessary. */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ ret = __db_del(dbp, ip, txn, key, flags);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, NULL);
+ return (ret);
+}
+
+/*
+ * __db_del_arg --
+ * Check DB->delete arguments.
+ */
+static int
+__db_del_arg(dbp, key, flags)
+ DB *dbp;
+ DBT *key;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->del"));
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_CONSUME:
+ if (dbp->type != DB_QUEUE)
+ return (__db_ferr(env, "DB->del", 0));
+ goto copy;
+ case DB_MULTIPLE:
+ case DB_MULTIPLE_KEY:
+ if (!F_ISSET(key, DB_DBT_BULK)) {
+ __db_errx(env, DB_STR("0581",
+ "DB->del with DB_MULTIPLE(_KEY) requires multiple key records"));
+ return (EINVAL);
+ }
+ /* FALL THROUGH */
+ case 0:
+copy: if ((ret = __dbt_usercopy(env, key)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_ferr(env, "DB->del", 0));
+ }
+
+ return (0);
+}
+
+/*
+ * __db_exists --
+ * DB->exists implementation.
+ *
+ * PUBLIC: int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_exists(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DBT data;
+ int ret;
+
+ /*
+ * Most flag checking is done in the DB->get call, we only check for
+ * specific incompatibilities here. This saves making __get_arg
+ * aware of the exist method's API constraints.
+ */
+ STRIP_AUTO_COMMIT(flags);
+
+ if ((ret = __db_fchk(dbp->env, "DB->exists", flags,
+ DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) != 0)
+ return (ret);
+
+ /*
+ * Configure a data DBT that returns no bytes so there's no copy
+ * of the data.
+ */
+ memset(&data, 0, sizeof(data));
+ data.dlen = 0;
+ data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ return (dbp->get(dbp, txn, key, &data, flags));
+}
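+
+/*
+ * The same zero-copy probe written at the application level (a
+ * sketch; the dbp, txn and key setup are assumed to exist elsewhere):
+ * a data DBT with dlen = 0 and DB_DBT_PARTIAL | DB_DBT_USERMEM turns
+ * the get into a pure existence test, exactly as DB->exists does
+ * above:
+ *
+ *	DBT data;
+ *
+ *	memset(&data, 0, sizeof(data));
+ *	data.dlen = 0;
+ *	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+ *	ret = dbp->get(dbp, txn, &key, &data, 0);	0 means found
+ */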
+
+/*
+ * __db_fd_pp --
+ * DB->fd pre/post processing.
+ *
+ * PUBLIC: int __db_fd_pp __P((DB *, int *));
+ */
+int
+__db_fd_pp(dbp, fdp)
+ DB *dbp;
+ int *fdp;
+{
+ DB_FH *fhp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->fd");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * There's no argument checking to be done.
+ *
+ * !!!
+ * The actual method call is simple, do it inline.
+ *
+ * XXX
+ * Truly spectacular layering violation.
+ */
+ if ((ret = __mp_xxx_fh(dbp->mpf, &fhp)) == 0) {
+ if (fhp == NULL) {
+ *fdp = -1;
+ __db_errx(env, DB_STR("0582",
+ "Database does not have a valid file handle"));
+ ret = ENOENT;
+ } else
+ *fdp = fhp->fd;
+ }
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_get_pp --
+ * DB->get pre/post processing.
+ *
+ * PUBLIC: int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get_pp(dbp, txn, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t mode;
+ int handle_check, ignore_lease, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ mode = 0;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get");
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+
+ if ((ret = __db_get_arg(dbp, key, data, flags)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ mode = DB_READ_UNCOMMITTED;
+ else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+ (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT) {
+ mode = DB_WRITELOCK;
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID,
+ mode == DB_WRITELOCK || LF_ISSET(DB_RMW) ? 0 : 1)) != 0)
+ goto err;
+
+ ret = __db_get(dbp, ip, txn, key, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __db_get --
+ * DB->get.
+ *
+ * PUBLIC: int __db_get __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get(dbp, ip, txn, key, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ u_int32_t mode;
+ int ret, t_ret;
+
+ /*
+ * The DB_CURSOR_TRANSIENT flag indicates that we're just doing a single
+ * operation with this cursor, and that in case of error we don't need
+ * to restore it to its old position. Thus, we can perform the get
+ * without duplicating the cursor, saving some cycles in this common
+ * case.
+ */
+ mode = DB_CURSOR_TRANSIENT;
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+ mode |= DB_READ_UNCOMMITTED;
+ LF_CLR(DB_READ_UNCOMMITTED);
+ } else if (LF_ISSET(DB_READ_COMMITTED)) {
+ mode |= DB_READ_COMMITTED;
+ LF_CLR(DB_READ_COMMITTED);
+ } else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+ (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT)
+ mode |= DB_WRITELOCK;
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+ return (ret);
+
+ DEBUG_LREAD(dbc, txn, "DB->get", key, NULL, flags);
+
+ /*
+ * The semantics of bulk gets are different for DB->get vs DBC->get.
+ * Mark the cursor so the low-level bulk get routines know which
+ * behavior we want.
+ */
+ F_SET(dbc, DBC_FROM_DB_GET);
+
+ /*
+ * SET_RET_MEM indicates that if key and/or data have no DBT
+ * flags set and DB manages the returned-data memory, that memory
+ * will belong to this handle, not to the underlying cursor.
+ */
+ SET_RET_MEM(dbc, dbp);
+
+ if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0)
+ LF_SET(DB_SET);
+
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ ret = __partc_get(dbc, key, data, flags);
+ else
+#endif
+ ret = __dbc_get(dbc, key, data, flags);
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_get_arg --
+ * DB->get argument checking, used by both DB->get and DB->pget.
+ */
+static int
+__db_get_arg(dbp, key, data, flags)
+ const DB *dbp;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ ENV *env;
+ int dirty, multi, ret;
+
+ env = dbp->env;
+
+ /*
+ * Check for read-modify-write validity. DB_RMW doesn't make sense
+ * with CDB cursors since if you're going to write the cursor, you
+ * had to create it with DB_WRITECURSOR. Regardless, we check for
+ * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+ * If this changes, confirm that DB does not itself set the DB_RMW
+ * flag in a path where CDB may have been configured.
+ */
+ dirty = 0;
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DB->get"));
+ if ((ret = __db_fcchk(env, "DB->get",
+ flags, DB_READ_UNCOMMITTED, DB_READ_COMMITTED)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))
+ dirty = 1;
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ }
+
+ multi = 0;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ if (LF_ISSET(DB_MULTIPLE_KEY))
+ goto multi_err;
+ multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0;
+ LF_CLR(DB_MULTIPLE);
+ }
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_GET_BOTH:
+ if ((ret = __dbt_usercopy(env, data)) != 0)
+ return (ret);
+ /* FALLTHROUGH */
+ case 0:
+ if ((ret = __dbt_usercopy(env, key)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+ break;
+ case DB_SET_RECNO:
+ if (!F_ISSET(dbp, DB_AM_RECNUM))
+ goto err;
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ return (ret);
+ break;
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ if (dirty) {
+ __db_errx(env, DB_STR_A("0583",
+ "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT",
+ "%s"), LF_ISSET(DB_READ_UNCOMMITTED) ?
+ "DB_READ_UNCOMMITTED" : "DB_READ_COMMITTED");
+ return (EINVAL);
+ }
+ if (multi)
+multi_err: return (__db_ferr(env, "DB->get", 1));
+ if (dbp->type == DB_QUEUE)
+ break;
+ /* FALLTHROUGH */
+ default:
+err: return (__db_ferr(env, "DB->get", 0));
+ }
+
+ /*
+ * Check for invalid key/data flags.
+ */
+ if ((ret =
+ __dbt_ferr(dbp, "key", key, DB_RETURNS_A_KEY(dbp, flags))) != 0)
+ return (ret);
+
+ if (F_ISSET(data, DB_DBT_READONLY)) {
+ __db_errx(env, DB_STR("0584",
+ "DB_DBT_READONLY should not be set on data DBT."));
+ return (EINVAL);
+ }
+ if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
+ return (ret);
+
+ if (multi) {
+ if (!F_ISSET(data, DB_DBT_USERMEM)) {
+ __db_errx(env, DB_STR("0585",
+ "DB_MULTIPLE requires DB_DBT_USERMEM be set"));
+ return (EINVAL);
+ }
+ if (F_ISSET(key, DB_DBT_PARTIAL) ||
+ F_ISSET(data, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0586",
+ "DB_MULTIPLE does not support DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+ if (data->ulen < 1024 ||
+ data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+ __db_errx(env, DB_STR("0587",
+ "DB_MULTIPLE buffers must be aligned, "
+ "at least page size and multiples of 1KB"));
+ return (EINVAL);
+ }
+ }
+
+ /* Check invalid partial key. */
+	if (F_ISSET(key, DB_DBT_PARTIAL) && !(LF_ISSET(DB_CONSUME) ||
+	    LF_ISSET(DB_CONSUME_WAIT) || LF_ISSET(DB_SET_RECNO))) {
+ __db_errx(env, DB_STR("0708",
+ "Invalid positioning flag combined with DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
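+
+/*
+ * A worked example of the DB_MULTIPLE buffer rule above (illustrative
+ * sizes): with a 4096-byte page, ulen values of 4096, 8192 or 65536
+ * pass, while 1000 (under 1KB), 2048 (smaller than the page) and
+ * 5000 (not a multiple of 1024) are each rejected with EINVAL.
+ */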
+
+/*
+ * __db_join_pp --
+ * DB->join pre/post processing.
+ *
+ * PUBLIC: int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join_pp(primary, curslist, dbcp, flags)
+ DB *primary;
+ DBC **curslist, **dbcp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = primary->env;
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(
+ primary, 1, 0, IS_REAL_TXN(curslist[0]->txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_join_arg(primary, curslist, flags)) == 0)
+ ret = __db_join(primary, curslist, dbcp, flags);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_join_arg --
+ * Check DB->join arguments.
+ */
+static int
+__db_join_arg(primary, curslist, flags)
+ DB *primary;
+ DBC **curslist;
+ u_int32_t flags;
+{
+ DB_TXN *txn;
+ ENV *env;
+ int i;
+
+ env = primary->env;
+
+ switch (flags) {
+ case 0:
+ case DB_JOIN_NOSORT:
+ break;
+ default:
+ return (__db_ferr(env, "DB->join", 0));
+ }
+
+ if (curslist == NULL || curslist[0] == NULL) {
+ __db_errx(env, DB_STR("0588",
+ "At least one secondary cursor must be specified to DB->join"));
+ return (EINVAL);
+ }
+
+ txn = curslist[0]->txn;
+ for (i = 1; curslist[i] != NULL; i++)
+ if (curslist[i]->txn != txn) {
+ __db_errx(env, DB_STR("0589",
+ "All secondary cursors must share the same transaction"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __db_key_range_pp --
+ * DB->key_range pre/post processing.
+ *
+ * PUBLIC: int __db_key_range_pp
+ * PUBLIC: __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__db_key_range_pp(dbp, txn, key, kr, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ DB_KEY_RANGE *kr;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0)
+ return (__db_ferr(env, "DB->key_range", 0));
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * The actual method call is simple, do it inline.
+ */
+ switch (dbp->type) {
+ case DB_BTREE:
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) {
+ __dbt_userfree(env, key, NULL, NULL);
+ break;
+ }
+
+ DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __part_key_range(dbc, key, kr, flags);
+ else
+#endif
+ ret = __bam_key_range(dbc, key, kr, flags);
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ __dbt_userfree(env, key, NULL, NULL);
+ break;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_RECNO:
+ ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->key_range", dbp->type);
+ break;
+ }
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_open_pp --
+ * DB->open pre/post processing.
+ *
+ * PUBLIC: int __db_open_pp __P((DB *, DB_TXN *,
+ * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int));
+ */
+int
+__db_open_pp(dbp, txn, fname, dname, type, flags, mode)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, nosync, remove_me, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ nosync = 1;
+ handle_check = remove_me = txn_local = 0;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Save the flags. We do this here because we don't pass all of the
+	 * flags down into the actual DB->open method call; we strip
+ * DB_AUTO_COMMIT at this layer.
+ */
+ dbp->open_flags = flags;
+
+ /* Save the current DB handle flags for refresh. */
+ dbp->orig_flags = dbp->flags;
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * A replication client can't create a database, but it's convenient to
+ * allow a repmgr application to specify DB_CREATE anyway. Thus for
+ * such an application the meaning of DB_CREATE becomes "create it if
+ * I'm a master, and otherwise ignore the flag". A repmgr application
+ * running as master can't be sure that it won't spontaneously become a
+ * client, so there's a race condition.
+ */
+ if (IS_REP_CLIENT(env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE))
+ LF_CLR(DB_CREATE);
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+ if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+ goto err;
+ txn_local = 1;
+ } else if (txn != NULL && !TXN_ON(env) &&
+ (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_FAMILY))) {
+ ret = __db_not_txn_env(env);
+ goto err;
+ }
+ LF_CLR(DB_AUTO_COMMIT);
+
+ /*
+ * We check arguments after possibly creating a local transaction,
+ * which is unusual -- the reason is some flags are illegal if any
+ * kind of transaction is in effect.
+ */
+ if ((ret = __db_open_arg(dbp, txn, fname, dname, type, flags)) == 0)
+ if ((ret = __db_open(dbp, ip, txn, fname, dname, type,
+ flags, mode, PGNO_BASE_MD)) != 0)
+ goto txnerr;
+
+ /*
+ * You can open the database that describes the subdatabases in the
+ * rest of the file read-only. The content of each key's data is
+ * unspecified and applications should never be adding new records
+ * or updating existing records. However, during recovery, we need
+ * to open these databases R/W so we can redo/undo changes in them.
+ * Likewise, we need to open master databases read/write during
+ * rename and remove so we can be sure they're fully sync'ed, so
+ * we provide an override flag for the purpose.
+ */
+ if (dname == NULL && !IS_RECOVERING(env) && !LF_ISSET(DB_RDONLY) &&
+ !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env, DB_STR("0590",
+ "files containing multiple databases may only be opened read-only"));
+ ret = EINVAL;
+ goto txnerr;
+ }
+
+ /*
+ * Success: file creations have to be synchronous, otherwise we don't
+ * care.
+ */
+ if (F_ISSET(dbp, DB_AM_CREATED | DB_AM_CREATED_MSTR))
+ nosync = 0;
+
+ /* Success: don't discard the file on close. */
+ F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR);
+
+ /*
+ * If not transactional, remove the database/subdatabase if it is
+ * persistent. If we're transactional, the child transaction abort
+ * cleans up.
+ */
+txnerr: if (ret != 0 && !IS_REAL_TXN(txn)) {
+ remove_me = (F_ISSET(dbp, DB_AM_CREATED) &&
+ (fname != NULL || dname != NULL)) ? 1 : 0;
+ if (F_ISSET(dbp, DB_AM_CREATED_MSTR) ||
+ (dname == NULL && remove_me))
+ /* Remove file. */
+ (void)__db_remove_int(dbp,
+ ip, txn, fname, NULL, DB_FORCE);
+ else if (remove_me)
+ /* Remove subdatabase. */
+ (void)__db_remove_int(dbp,
+ ip, txn, fname, dname, DB_FORCE);
+ }
+
+ if (txn_local && (t_ret =
+ __db_txn_auto_resolve(env, txn, nosync, ret)) && ret == 0)
+ ret = t_ret;
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
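+
+/*
+ * An illustrative sketch, not part of the library source: the pre/post
+ * processing above guards an application-level open such as the
+ * following, where dbenv, dbp and the file name are hypothetical.
+ *
+ *     DB *dbp;
+ *     int ret;
+ *
+ *     if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *         return (ret);
+ *     ret = dbp->open(dbp, NULL,
+ *         "access.db", NULL, DB_BTREE, DB_CREATE | DB_AUTO_COMMIT, 0644);
+ *
+ * With DB_AUTO_COMMIT and no transaction handle, the open runs inside
+ * a local transaction created by __db_txn_auto_init and resolved by
+ * __db_txn_auto_resolve, as coded above.
+ */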
+
+/*
+ * __db_open_arg --
+ * Check DB->open arguments.
+ */
+static int
+__db_open_arg(dbp, txn, fname, dname, type, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ DBTYPE type;
+ u_int32_t flags;
+{
+ ENV *env;
+ u_int32_t ok_flags;
+ int ret;
+
+ env = dbp->env;
+
+ /* Validate arguments. */
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_AUTO_COMMIT | DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING | \
+ DB_MULTIVERSION | DB_NOMMAP | DB_NO_AUTO_COMMIT | DB_RDONLY | \
+ DB_RDWRMASTER | DB_READ_UNCOMMITTED | DB_THREAD | DB_TRUNCATE)
+ if ((ret = __db_fchk(env, "DB->open", flags, OKFLAGS)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE))
+ return (__db_ferr(env, "DB->open", 1));
+ if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE))
+ return (__db_ferr(env, "DB->open", 1));
+
+#ifdef HAVE_VXWORKS
+ if (LF_ISSET(DB_TRUNCATE)) {
+ __db_errx(env, DB_STR("0591",
+ "DB_TRUNCATE not supported on VxWorks"));
+ return (DB_OPNOTSUP);
+ }
+#endif
+ switch (type) {
+ case DB_UNKNOWN:
+ if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) {
+ __db_errx(env, DB_STR("0592",
+ "DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE"));
+ return (EINVAL);
+ }
+ ok_flags = 0;
+ break;
+ case DB_BTREE:
+ ok_flags = DB_OK_BTREE;
+ break;
+ case DB_HASH:
+#ifndef HAVE_HASH
+ return (__db_no_hash_am(env));
+#endif
+ ok_flags = DB_OK_HASH;
+ break;
+ case DB_HEAP:
+ ok_flags = DB_OK_HEAP;
+ break;
+ case DB_QUEUE:
+#ifndef HAVE_QUEUE
+ return (__db_no_queue_am(env));
+#endif
+ ok_flags = DB_OK_QUEUE;
+ break;
+ case DB_RECNO:
+ ok_flags = DB_OK_RECNO;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0593",
+ "unknown type: %lu", "%lu"), (u_long)type);
+ return (EINVAL);
+ }
+ if (ok_flags)
+ DB_ILLEGAL_METHOD(dbp, ok_flags);
+
+ /* The environment may have been created, but never opened. */
+ if (!F_ISSET(env, ENV_DBLOCAL | ENV_OPEN_CALLED)) {
+ __db_errx(env, DB_STR("0594",
+ "database environment not yet opened"));
+ return (EINVAL);
+ }
+
+ /*
+ * Historically, you could pass in an environment that didn't have a
+ * mpool, and DB would create a private one behind the scenes. This
+ * no longer works.
+ */
+ if (!F_ISSET(env, ENV_DBLOCAL) && !MPOOL_ON(env)) {
+ __db_errx(env, DB_STR("0595",
+ "environment did not include a memory pool"));
+ return (EINVAL);
+ }
+
+ /*
+ * You can't specify threads during DB->open if subsystems in the
+ * environment weren't configured with them.
+ */
+ if (LF_ISSET(DB_THREAD) && !F_ISSET(env, ENV_DBLOCAL | ENV_THREAD)) {
+ __db_errx(env, DB_STR("0596",
+ "environment not created using DB_THREAD"));
+ return (EINVAL);
+ }
+
+ /* Exclusive database handles cannot be threaded. */
+ if (LF_ISSET(DB_THREAD) && F2_ISSET(dbp, DB2_AM_EXCL)) {
+ __db_errx(env, DB_STR("0744",
+ "Exclusive database handles cannot be threaded."));
+ return (EINVAL);
+ }
+
+ /* Exclusive database handles require transactional environments. */
+ if (F2_ISSET(dbp, DB2_AM_EXCL) && !TXN_ON(env)) {
+ __db_errx(env, DB_STR("0745",
+ "Exclusive database handles require transactional environments."));
+ return (EINVAL);
+ }
+
+ /* Replication clients cannot open exclusive database handles. */
+ if (F2_ISSET(dbp, DB2_AM_EXCL) && IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("0746",
+"Exclusive database handles cannot be opened on replication clients."));
+ return (EINVAL);
+ }
+
+ /* DB_MULTIVERSION requires a database configured for transactions. */
+ if (LF_ISSET(DB_MULTIVERSION) && !IS_REAL_TXN(txn)) {
+ __db_errx(env, DB_STR("0597",
+ "DB_MULTIVERSION illegal without a transaction specified"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIVERSION) && type == DB_QUEUE) {
+ __db_errx(env, DB_STR("0598",
+ "DB_MULTIVERSION illegal with queue databases"));
+ return (EINVAL);
+ }
+
+ /* DB_TRUNCATE is neither transaction recoverable nor lockable. */
+ if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) {
+ __db_errx(env, DB_STR_A("0599",
+ "DB_TRUNCATE illegal with %s specified", "%s"),
+ LOCKING_ON(env) ? "locking" : "transactions");
+ return (EINVAL);
+ }
+
+ /* Subdatabase checks. */
+ if (dname != NULL) {
+ /* QAM can only be done on in-memory subdatabases. */
+ if (type == DB_QUEUE && fname != NULL) {
+ __db_errx(env, DB_STR("0600",
+ "Queue databases must be one-per-file"));
+ return (EINVAL);
+ }
+
+ /*
+ * Named in-memory databases can't support certain flags,
+ * so check here.
+ */
+ if (fname == NULL)
+ F_CLR(dbp, DB_AM_CHKSUM | DB_AM_ENCRYPT);
+ }
+
+ return (0);
+}
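+
+/*
+ * An illustrative sketch, not part of the library source: two of the
+ * flag combinations __db_open_arg rejects with EINVAL, using a
+ * hypothetical handle dbp.
+ *
+ *     DB_EXCL without DB_CREATE:
+ *         ret = dbp->open(dbp, NULL,
+ *             "a.db", NULL, DB_BTREE, DB_EXCL, 0644);
+ *     DB_RDONLY combined with DB_CREATE:
+ *         ret = dbp->open(dbp, NULL,
+ *             "a.db", NULL, DB_BTREE, DB_RDONLY | DB_CREATE, 0644);
+ */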
+
+/*
+ * __db_pget_pp --
+ * DB->pget pre/post processing.
+ *
+ * PUBLIC: int __db_pget_pp
+ * PUBLIC: __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget_pp(dbp, txn, skey, pkey, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ignore_lease, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget");
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+
+ if ((ret = __db_pget_arg(dbp, pkey, flags)) != 0 ||
+ (ret = __db_get_arg(dbp, skey, data, flags)) != 0) {
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_pget(dbp, ip, txn, skey, pkey, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+}
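+
+/*
+ * An illustrative sketch, not part of the library source: DB->pget on
+ * a secondary index, with hypothetical handles and key contents. The
+ * secondary key is looked up in sdbp and the matching primary
+ * key/data pair is returned.
+ *
+ *     DBT skey, pkey, data;
+ *
+ *     memset(&skey, 0, sizeof(skey));
+ *     memset(&pkey, 0, sizeof(pkey));
+ *     memset(&data, 0, sizeof(data));
+ *     skey.data = "smith";
+ *     skey.size = 5;
+ *     ret = sdbp->pget(sdbp, NULL, &skey, &pkey, &data, 0);
+ */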
+
+/*
+ * __db_pget --
+ * DB->pget.
+ *
+ * PUBLIC: int __db_pget __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget(dbp, ip, txn, skey, pkey, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ u_int32_t mode;
+ int ret, t_ret;
+
+ mode = DB_CURSOR_TRANSIENT;
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+ mode |= DB_READ_UNCOMMITTED;
+ LF_CLR(DB_READ_UNCOMMITTED);
+ } else if (LF_ISSET(DB_READ_COMMITTED)) {
+ mode |= DB_READ_COMMITTED;
+ LF_CLR(DB_READ_COMMITTED);
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+ return (ret);
+
+ SET_RET_MEM(dbc, dbp);
+
+ DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags);
+
+ /*
+ * !!!
+ * The actual method call is simple; do it inline.
+ *
+ * The underlying cursor pget will fill in a default DBT for null
+ * pkeys, and use the cursor's returned-key memory internally to
+ * store any intermediate primary keys. However, we've just set
+ * the returned-key memory to the DB handle's key memory, which
+ * is unsafe to use if the DB handle is threaded. If the pkey
+ * argument is NULL, use the DBC-owned returned-key memory
+ * instead; it'll go away when we close the cursor before we
+ * return, but in this case that's just fine, as we're not
+ * returning the primary key.
+ */
+ if (pkey == NULL)
+ dbc->rkey = &dbc->my_rkey;
+
+ /*
+ * The cursor is just a perfectly ordinary secondary database cursor.
+ * Call its c_pget() method to do the dirty work.
+ */
+ if (flags == 0 || flags == DB_RMW)
+ flags |= DB_SET;
+
+ ret = __dbc_pget(dbc, skey, pkey, data, flags);
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_pget_arg --
+ * Check DB->pget arguments.
+ */
+static int
+__db_pget_arg(dbp, pkey, flags)
+ DB *dbp;
+ DBT *pkey;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0601",
+ "DB->pget may only be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ __db_errx(env,DB_STR("0602",
+"DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ /* DB_CONSUME makes no sense on a secondary index. */
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ switch (flags) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ return (__db_ferr(env, "DB->pget", 0));
+ default:
+ /* __db_get_arg will catch the rest. */
+ break;
+ }
+
+ /*
+ * We allow the pkey field to be NULL, so that we can make the
+ * two-DBT get calls into wrappers for the three-DBT ones.
+ */
+ if (pkey != NULL &&
+ (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0)
+ return (ret);
+
+ /* Check invalid partial pkey. */
+ if (pkey != NULL && F_ISSET(pkey, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0709",
+ "The primary key returned by pget can't be partial"));
+ return (EINVAL);
+ }
+
+ if (flags == DB_GET_BOTH) {
+ /* The pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+ if (pkey == NULL) {
+ __db_errx(env, DB_STR("0603",
+ "DB_GET_BOTH on a secondary index requires a primary key"));
+ return (EINVAL);
+ }
+ if ((ret = __dbt_usercopy(env, pkey)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __db_put_pp --
+ * DB->put pre/post processing.
+ *
+ * PUBLIC: int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put_pp(dbp, txn, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, txn_local, t_ret;
+
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");
+
+ if ((ret = __db_put_arg(dbp, key, data, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Create local transaction as necessary. */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ ret = __db_put(dbp, ip, txn, key, data, flags);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
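+
+/*
+ * An illustrative sketch, not part of the library source: a simple
+ * DB->put guarded by the checks above, with hypothetical handle and
+ * contents. DB_NOOVERWRITE makes the put return DB_KEYEXIST instead
+ * of replacing an existing record.
+ *
+ *     DBT key, data;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     key.data = "fruit";
+ *     key.size = 5;
+ *     data.data = "apple";
+ *     data.size = 5;
+ *     ret = dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE);
+ */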
+
+/*
+ * __db_put_arg --
+ * Check DB->put arguments.
+ */
+static int
+__db_put_arg(dbp, key, data, flags)
+ DB *dbp;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret, returnkey;
+
+ env = dbp->env;
+ returnkey = 0;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->put"));
+
+ /* Check for puts on a secondary. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0604",
+ "DB->put forbidden on secondary indices"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE_KEY | DB_MULTIPLE)) {
+ if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+ goto err;
+
+ switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+ case 0:
+ case DB_OVERWRITE_DUP:
+ break;
+ default:
+ __db_errx(env, DB_STR("0605",
+"DB->put: DB_MULTIPLE(_KEY) can only be combined with DB_OVERWRITE_DUP"));
+ return (EINVAL);
+ }
+
+ if (!F_ISSET(key, DB_DBT_BULK)) {
+ __db_errx(env, DB_STR("0606",
+ "DB->put with DB_MULTIPLE(_KEY) requires a bulk key buffer"));
+ return (EINVAL);
+ }
+ }
+ if (LF_ISSET(DB_MULTIPLE)) {
+ if (!F_ISSET(data, DB_DBT_BULK)) {
+ __db_errx(env, DB_STR("0607",
+ "DB->put with DB_MULTIPLE requires a bulk data buffer"));
+ return (EINVAL);
+ }
+ }
+
+ /* Check for invalid function flags. */
+ switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+ case 0:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ break;
+ case DB_APPEND:
+ if (dbp->type != DB_RECNO &&
+ dbp->type != DB_QUEUE && dbp->type != DB_HEAP)
+ goto err;
+ returnkey = 1;
+ break;
+ case DB_NODUPDATA:
+ if (F_ISSET(dbp, DB_AM_DUPSORT))
+ break;
+ /* FALLTHROUGH */
+ default:
+err: return (__db_ferr(env, "DB->put", 0));
+ }
+
+ /*
+ * Check for invalid key/data flags. The key may reasonably be NULL
+ * if DB_APPEND is set and the application doesn't care about the
+ * returned key.
+ */
+ if (((returnkey && key != NULL) || !returnkey) &&
+ (ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0)
+ return (ret);
+ if (!LF_ISSET(DB_MULTIPLE_KEY) &&
+ (ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+ return (ret);
+
+ /*
+ * The key parameter should not be NULL or have the "partial" flag set
+ * in a put call unless the user doesn't care about a key value we'd
+ * return. The user tells us they don't care about the returned key by
+ * setting the key parameter to NULL or configuring the key DBT to not
+ * return any information. (Returned keys from a put are always record
+ * numbers, and returning part of a record number doesn't make sense:
+ * only accept a partial return if the length returned is 0.)
+ */
+ if ((returnkey &&
+ key != NULL && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0) ||
+ (!returnkey && F_ISSET(key, DB_DBT_PARTIAL)))
+ return (__db_ferr(env, "key DBT", 0));
+
+ /* Check for partial puts in the presence of duplicates. */
+ if (data != NULL && F_ISSET(data, DB_DBT_PARTIAL) &&
+ (F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) {
+ __db_errx(env, DB_STR("0608",
+"a partial put in the presence of duplicates requires a cursor operation"));
+ return (EINVAL);
+ }
+
+ if ((flags != DB_APPEND && (ret = __dbt_usercopy(env, key)) != 0) ||
+ (!LF_ISSET(DB_MULTIPLE_KEY) &&
+ (ret = __dbt_usercopy(env, data)) != 0))
+ return (ret);
+
+ return (0);
+}
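+
+/*
+ * An illustrative sketch, not part of the library source: the bulk-key
+ * check above corresponds to an application buffer built with the
+ * DB_MULTIPLE_WRITE_INIT/DB_MULTIPLE_KEY_WRITE_NEXT macros, which mark
+ * the DBT with DB_DBT_BULK. Buffer size and contents are hypothetical;
+ * the data DBT is ignored for DB_MULTIPLE_KEY.
+ *
+ *     DBT key, data;
+ *     void *p;
+ *     char buf[64 * 1024];
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     key.data = buf;
+ *     key.ulen = sizeof(buf);
+ *     key.flags = DB_DBT_USERMEM;
+ *     DB_MULTIPLE_WRITE_INIT(p, &key);
+ *     DB_MULTIPLE_KEY_WRITE_NEXT(p, &key, "k1", 2, "v1", 2);
+ *     DB_MULTIPLE_KEY_WRITE_NEXT(p, &key, "k2", 2, "v2", 2);
+ *     memset(&data, 0, sizeof(data));
+ *     ret = dbp->put(dbp, NULL, &key, &data, DB_MULTIPLE_KEY);
+ */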
+
+/*
+ * __db_compact_func --
+ * Callback routine to report if the txn has open cursors.
+ */
+static int
+__db_compact_func(dbc, my_dbc, countp, pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ DB_TXN *txn;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(indx, 0);
+
+ txn = (DB_TXN *)args;
+
+ if (txn == dbc->txn)
+ return (EEXIST);
+ return (0);
+}
+
+/*
+ * __db_compact_pp --
+ * DB->compact pre/post processing.
+ *
+ * PUBLIC: int __db_compact_pp __P((DB *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__db_compact_pp(dbp, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DB_COMPACT *dp, l_data;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+ u_int32_t count;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->compact");
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if ((ret = __db_fchk(
+ env, "DB->compact", flags, DB_FREELIST_ONLY | DB_FREE_SPACE)) != 0)
+ return (ret);
+
+ /* Check for changes to a read-only database. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->compact"));
+
+ if (start != NULL && (ret = __dbt_usercopy(env, start)) != 0)
+ return (ret);
+ if (stop != NULL && (ret = __dbt_usercopy(env, stop)) != 0) {
+ __dbt_userfree(env, start, NULL, NULL);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+ IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if (txn != NULL) {
+ if ((ret = __db_walk_cursors(dbp,
+ NULL, __db_compact_func, &count, 0, 0, txn)) != 0) {
+ if (ret == EEXIST) {
+ __db_errx(env, DB_STR("0609",
+"DB->compact may not be called with active cursors in the transaction."));
+ ret = EINVAL;
+ }
+ goto err;
+ }
+ }
+
+ if (c_data == NULL) {
+ dp = &l_data;
+ memset(dp, 0, sizeof(*dp));
+ } else
+ dp = c_data;
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __part_compact(dbp, ip, txn, start, stop, dp, flags, end);
+ else
+#endif
+ switch (dbp->type) {
+ case DB_HASH:
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __db_compact_int(dbp, ip,
+ txn, start, stop, dp, flags, end);
+ break;
+ case DB_HEAP:
+ break;
+ default:
+ ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ }
+
+ /* Release replication block. */
+err: if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, start, stop, NULL);
+ return (ret);
+}
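+
+/*
+ * An illustrative sketch, not part of the library source: a whole-file
+ * compaction that returns freed pages to the filesystem, with a
+ * hypothetical handle.
+ *
+ *     DB_COMPACT c_data;
+ *
+ *     memset(&c_data, 0, sizeof(c_data));
+ *     ret = dbp->compact(dbp,
+ *         NULL, NULL, NULL, &c_data, DB_FREE_SPACE, NULL);
+ *
+ * NULL start and stop keys compact the entire database;
+ * c_data.compact_pages_truncated reports how many pages were returned.
+ */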
+
+/*
+ * __db_associate_foreign_pp --
+ * DB->associate_foreign pre/post processing.
+ *
+ * PUBLIC: int __db_associate_foreign_pp __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_associate_foreign_pp(fdbp, dbp, callback, flags)
+ DB *dbp, *fdbp;
+ int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+ u_int32_t flags;
+{
+ /* Most of this is based on the implementation of associate. */
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ PANIC_CHECK(env);
+ STRIP_AUTO_COMMIT(flags);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_associate_foreign_arg(fdbp, dbp, callback, flags)) != 0)
+ goto err;
+
+ ret = __db_associate_foreign(fdbp, dbp, callback, flags);
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_associate_foreign_arg --
+ * DB->associate_foreign argument checking.
+ */
+static int
+__db_associate_foreign_arg(fdbp, dbp, callback, flags)
+ DB *dbp, *fdbp;
+ int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = fdbp->env;
+
+ if (F_ISSET(fdbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0610",
+ "Secondary indices may not be used as foreign databases"));
+ return (EINVAL);
+ }
+ if (F_ISSET(fdbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR("0611",
+ "Foreign databases may not be configured with duplicates"));
+ return (EINVAL);
+ }
+ if (F_ISSET(fdbp, DB_AM_RENUMBER)) {
+ __db_errx(env, DB_STR("0612",
+ "Renumbering recno databases may not be used as foreign databases"));
+ return (EINVAL);
+ }
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0613",
+ "The associating database must be a secondary index."));
+ return (EINVAL);
+ }
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && callback == NULL) {
+ __db_errx(env, DB_STR("0614",
+ "When specifying a delete action of nullify, a callback "
+ "function needs to be configured"));
+ return (EINVAL);
+ } else if (!LF_ISSET(DB_FOREIGN_NULLIFY) && callback != NULL) {
+ __db_errx(env, DB_STR("0615",
+ "When not specifying a delete action of nullify, a "
+ "callback function cannot be configured"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
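+
+/*
+ * An illustrative sketch, not part of the library source: the pairing
+ * the checks above enforce, with hypothetical handles and callback.
+ * DB_FOREIGN_NULLIFY requires a callback; the other delete actions
+ * forbid one.
+ *
+ *     ret = fdbp->associate_foreign(fdbp, sdbp,
+ *         nullify_cb, DB_FOREIGN_NULLIFY);
+ *     ret = fdbp->associate_foreign(fdbp, sdbp,
+ *         NULL, DB_FOREIGN_CASCADE);
+ */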
+
+/*
+ * __db_sync_pp --
+ * DB->sync pre/post processing.
+ *
+ * PUBLIC: int __db_sync_pp __P((DB *, u_int32_t));
+ */
+int
+__db_sync_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->sync");
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0)
+ return (__db_ferr(env, "DB->sync", 0));
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_sync(dbp);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_close_pp --
+ * DBC->close pre/post processing.
+ *
+ * PUBLIC: int __dbc_close_pp __P((DBC *));
+ */
+int
+__dbc_close_pp(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ DB_TXN *txn;
+ int handle_check, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+ /*
+ * If the cursor is already closed, we have a serious problem, and we
+ * assume that the cursor isn't on the active queue. Don't do any of
+ * the remaining cursor close processing.
+ */
+ if (!F_ISSET(dbc, DBC_ACTIVE)) {
+ __db_errx(env, DB_STR("0616",
+ "Closing already-closed cursor"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = !IS_REAL_TXN(dbc->txn) && IS_ENV_REPLICATED(env);
+
+ /* Unregister the cursor from its transaction, regardless of ret. */
+ if (txn != NULL) {
+ TAILQ_REMOVE(&(txn->my_cursors), dbc, txn_cursors);
+ dbc->txn_cursors.tqe_next = NULL;
+ dbc->txn_cursors.tqe_prev = NULL;
+ } else {
+ DB_ASSERT(env, dbc->txn_cursors.tqe_next == NULL &&
+ dbc->txn_cursors.tqe_prev == NULL);
+ }
+
+ ret = __dbc_close(dbc);
+
+ /* Release replication block. */
+ if (handle_check &&
+ (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_cmp_pp --
+ * DBC->cmp pre/post processing.
+ *
+ * PUBLIC: int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t));
+ */
+int
+__dbc_cmp_pp(dbc, other_cursor, result, flags)
+ DBC *dbc, *other_cursor;
+ int *result;
+ u_int32_t flags;
+{
+ DB *dbp, *odbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (flags != 0)
+ return (__db_ferr(env, "DBcursor->cmp", 0));
+
+ if (other_cursor == NULL) {
+ __db_errx(env, DB_STR("0617",
+ "DBcursor->cmp dbc pointer must not be null"));
+ return (EINVAL);
+ }
+
+ /* Dereference other_cursor only after the NULL check above. */
+ odbp = other_cursor->dbp;
+
+ if (dbp != odbp) {
+ __db_errx(env, DB_STR("0618",
+"DBcursor->cmp both cursors must refer to the same database."));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = __dbc_cmp(dbc, other_cursor, result);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
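+
+/*
+ * An illustrative sketch, not part of the library source: comparing
+ * two cursors over the same database, with hypothetical handles.
+ * result is set to 0 when both cursors reference the same item.
+ *
+ *     int result;
+ *
+ *     ret = dbc->cmp(dbc, other_dbc, &result, 0);
+ *     if (ret == 0 && result == 0)
+ *         ...both cursors are positioned on the same item...
+ */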
+
+/*
+ * __dbc_count_pp --
+ * DBC->count pre/post processing.
+ *
+ * PUBLIC: int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t));
+ */
+int
+__dbc_count_pp(dbc, recnop, flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ *
+ * The cursor must be initialized; return EINVAL for an invalid cursor.
+ */
+ if (flags != 0)
+ return (__db_ferr(env, "DBcursor->count", 0));
+
+ if (!IS_INITIALIZED(dbc))
+ return (__db_curinval(env));
+
+ ENV_ENTER(env, ip);
+ ret = __dbc_count(dbc, recnop);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_del_pp --
+ * DBC->del pre/post processing.
+ *
+ * PUBLIC: int __dbc_del_pp __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del_pp(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __dbc_del_arg(dbc, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+ goto err;
+
+ DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->del", NULL, NULL, flags);
+ ret = __dbc_del(dbc, flags);
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_del_arg --
+ * Check DBC->del arguments.
+ */
+static int
+__dbc_del_arg(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DBcursor->del"));
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case 0:
+ break;
+ case DB_CONSUME:
+ if (dbp->type != DB_QUEUE)
+ return (__db_ferr(env, "DBC->del", 0));
+ break;
+ case DB_UPDATE_SECONDARY:
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_SECONDARY));
+ break;
+ default:
+ return (__db_ferr(env, "DBcursor->del", 0));
+ }
+
+ /*
+ * The cursor must be initialized; return EINVAL for an invalid cursor,
+ * otherwise 0.
+ */
+ if (!IS_INITIALIZED(dbc))
+ return (__db_curinval(env));
+
+ return (0);
+}
+
+/*
+ * __dbc_dup_pp --
+ * DBC->dup pre/post processing.
+ *
+ * PUBLIC: int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup_pp(dbc, dbcp, flags)
+ DBC *dbc, **dbcp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_blocked, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_POSITION)
+ return (__db_ferr(env, "DBcursor->dup", 0));
+
+ ENV_ENTER(env, ip);
+ rep_blocked = 0;
+ if (dbc->txn == NULL && IS_ENV_REPLICATED(env)) {
+ if ((ret = __op_rep_enter(env, 1, 1)) != 0)
+ goto err;
+ rep_blocked = 1;
+ }
+ ret = __dbc_dup(dbc, dbcp, flags);
+
+ /* Register externally created cursors into the valid transaction. */
+ DB_ASSERT(env, (*dbcp)->txn == dbc->txn);
+ if ((*dbcp)->txn != NULL && ret == 0)
+ TAILQ_INSERT_HEAD(&((*dbcp)->txn->my_cursors), *dbcp,
+ txn_cursors);
+err:
+ if (ret != 0 && rep_blocked)
+ (void)__op_rep_exit(env);
+
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __dbc_get_pp --
+ * DBC->get pre/post processing.
+ *
+ * PUBLIC: int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+ ret = __dbc_get(dbc, key, data, flags);
+
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __dbc_get_arg --
+ * Common DBC->get argument checking, used by both DBC->get and DBC->pget.
+ * PUBLIC: int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_arg(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int dirty, multi, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /*
+ * Typically in checking routines that modify the flags, we have
+ * to save them and restore them, because the checking routine
+ * calls the work routine. However, this is a pure-checking
+ * routine which returns to a function that calls the work routine,
+ * so it's OK that we do not save and restore the flags, even though
+ * we modify them.
+ *
+ * Check for read-modify-write validity. DB_RMW doesn't make sense
+ * with CDB cursors since if you're going to write the cursor, you
+ * had to create it with DB_WRITECURSOR. Regardless, we check for
+ * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+ * If this changes, confirm that DB does not itself set the DB_RMW
+ * flag in a path where CDB may have been configured.
+ */
+ dirty = 0;
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DBcursor->get"));
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ dirty = 1;
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ }
+
+ multi = 0;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ multi = 1;
+ if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+ goto multi_err;
+ LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY);
+ }
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ if (dirty) {
+ __db_errx(env, DB_STR("0619",
+"DB_READ_UNCOMMITTED is not supported with DB_CONSUME or DB_CONSUME_WAIT"));
+ return (EINVAL);
+ }
+ if (dbp->type != DB_QUEUE)
+ goto err;
+ break;
+ case DB_CURRENT:
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ if (multi)
+multi_err: return (__db_ferr(env, "DBcursor->get", 1));
+ break;
+ case DB_GET_BOTHC:
+ if (dbp->type == DB_QUEUE)
+ goto err;
+ /* FALLTHROUGH */
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ if ((ret = __dbt_usercopy(env, data)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+ break;
+ case DB_GET_RECNO:
+ /*
+ * The one situation in which this might be legal with a
+ * non-RECNUM dbp is if dbp is a secondary and its primary is
+ * DB_AM_RECNUM.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECNUM) &&
+ (!F_ISSET(dbp, DB_AM_SECONDARY) ||
+ !F_ISSET(dbp->s_primary, DB_AM_RECNUM)))
+ goto err;
+ break;
+ case DB_SET_RECNO:
+ if (!F_ISSET(dbp, DB_AM_RECNUM))
+ goto err;
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+ break;
+ default:
+err: __dbt_userfree(env, key, NULL, data);
+ return (__db_ferr(env, "DBcursor->get", 0));
+ }
+
+ /* Check for invalid key/data flags. */
+ if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+ return (ret);
+ if (F_ISSET(data, DB_DBT_READONLY)) {
+ __db_errx(env, DB_STR("0620",
+ "DB_DBT_READONLY should not be set on data DBT."));
+ return (EINVAL);
+ }
+ if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+ return (ret);
+
+ if (multi) {
+ if (!F_ISSET(data, DB_DBT_USERMEM)) {
+ __db_errx(env, DB_STR("0621",
+ "DB_MULTIPLE/DB_MULTIPLE_KEY require DB_DBT_USERMEM be set"));
+ return (EINVAL);
+ }
+ if (F_ISSET(key, DB_DBT_PARTIAL) ||
+ F_ISSET(data, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0622",
+ "DB_MULTIPLE/DB_MULTIPLE_KEY do not support DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+ if (data->ulen < 1024 ||
+ data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+ __db_errx(env, DB_STR("0623",
+ "DB_MULTIPLE/DB_MULTIPLE_KEY buffers must be "
+ "aligned, at least page size and multiples of 1KB"));
+ return (EINVAL);
+ }
+ }
+
+ /* Check compatible flags for partial key. */
+ if (F_ISSET(key, DB_DBT_PARTIAL) && (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTH_RANGE || flags == DB_SET)) {
+ __db_errx(env, DB_STR("0710",
+ "Invalid positioning flag combined with DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+
+ /*
+ * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO,
+ * DB_PREV_DUP and DB_NEXT_DUP. Return EINVAL for an invalid
+ * cursor, otherwise 0.
+ */
+ if (!IS_INITIALIZED(dbc) && (flags == DB_CURRENT ||
+ flags == DB_GET_RECNO ||
+ flags == DB_NEXT_DUP || flags == DB_PREV_DUP))
+ return (__db_curinval(env));
+
+ /* Check for consistent transaction usage. */
+ if (LF_ISSET(DB_RMW) &&
+ (ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+ return (ret);
+
+ return (0);
+}
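+
+/*
+ * An illustrative sketch, not part of the library source: a bulk-get
+ * buffer satisfying the DB_MULTIPLE checks above -- DB_DBT_USERMEM,
+ * at least the page size and a multiple of 1KB. The size is
+ * hypothetical.
+ *
+ *     DBT key, data;
+ *     char buf[256 * 1024];
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     data.data = buf;
+ *     data.ulen = sizeof(buf);
+ *     data.flags = DB_DBT_USERMEM;
+ *     ret = dbc->get(dbc, &key, &data, DB_MULTIPLE_KEY | DB_NEXT);
+ */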
+
+/*
+ * __db_secondary_close_pp --
+ * DB->close for secondaries
+ *
+ * PUBLIC: int __db_secondary_close_pp __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * As a DB handle destructor, we can't fail.
+ *
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_NOSYNC)
+ ret = __db_ferr(env, "DB->close", 0);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
+ handle_check = 0;
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if ((t_ret = __db_secondary_close(dbp, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_pget_pp --
+ * DBC->pget pre/post processing.
+ *
+ * PUBLIC: int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget_pp(dbc, skey, pkey, data, flags)
+ DBC *dbc;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_pget_arg(dbc, pkey, flags)) != 0 ||
+ (ret = __dbc_get_arg(dbc, skey, data, flags)) != 0) {
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->pget",
+ flags == DB_SET ||
+ flags == DB_SET_RANGE ? skey : NULL, NULL, flags);
+ ret = __dbc_pget(dbc, skey, pkey, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+}
+
+/*
+ * __dbc_pget_arg --
+ * Check DBC->pget arguments.
+ */
+static int
+__dbc_pget_arg(dbc, pkey, flags)
+ DBC *dbc;
+ DBT *pkey;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0624",
+ "DBcursor->pget may only be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ __db_errx(env, DB_STR("0625",
+ "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ /* These flags make no sense on a secondary index. */
+ return (__db_ferr(env, "DBcursor->pget", 0));
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /* BOTH is "get both the primary and the secondary". */
+ if (pkey == NULL) {
+ __db_errx(env, DB_STR_A("0626",
+ "%s requires both a secondary and a primary key",
+ "%s"), LF_ISSET(DB_GET_BOTH) ?
+ "DB_GET_BOTH" : "DB_GET_BOTH_RANGE");
+ return (EINVAL);
+ }
+ if ((ret = __dbt_usercopy(env, pkey)) != 0)
+ return (ret);
+ break;
+ default:
+ /* __dbc_get_arg will catch the rest. */
+ break;
+ }
+
+ /*
+ * We allow the pkey field to be NULL, so that we can make the
+ * two-DBT get calls into wrappers for the three-DBT ones.
+ */
+ if (pkey != NULL &&
+ (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0)
+ return (ret);
+
+ /* Check invalid partial pkey. */
+ if (pkey != NULL && F_ISSET(pkey, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0711",
+ "The primary key returned by pget can't be partial."));
+ return (EINVAL);
+ }
+
+ /* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+ if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) {
+ __db_errx(env, DB_STR("0627",
+ "DB_GET_BOTH on a secondary index requires a primary key"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __dbc_put_pp --
+ * DBC->put pre/post processing.
+ *
+ * PUBLIC: int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __dbc_put_arg(dbc, key, data, flags)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+ goto err;
+
+ DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->put",
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NODUPDATA || flags == DB_UPDATE_SECONDARY ?
+ key : NULL, data, flags);
+ ret = __dbc_put(dbc, key, data, flags);
+
+err: ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __dbc_put_arg --
+ * Check DBC->put arguments.
+ */
+static int
+__dbc_put_arg(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int key_flags, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ key_flags = 0;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DBcursor->put"));
+
+ /* Check for puts on a secondary. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ if (flags == DB_UPDATE_SECONDARY)
+ flags = 0;
+ else {
+ __db_errx(env, DB_STR("0628",
+ "DBcursor->put forbidden on secondary indices"));
+ return (EINVAL);
+ }
+ }
+
+ if ((ret = __dbt_usercopy(env, data)) != 0)
+ return (ret);
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_AFTER:
+ case DB_BEFORE:
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_HASH: /* Only with unsorted duplicates. */
+ if (!F_ISSET(dbp, DB_AM_DUP))
+ goto err;
+ if (dbp->dup_compare != NULL)
+ goto err;
+ break;
+ case DB_QUEUE: /* Not permitted. */
+ goto err;
+ case DB_RECNO: /* Only with mutable record numbers. */
+ if (!F_ISSET(dbp, DB_AM_RENUMBER))
+ goto err;
+ key_flags = key == NULL ? 0 : 1;
+ break;
+ case DB_UNKNOWN:
+ default:
+ goto err;
+ }
+ break;
+ case DB_CURRENT:
+ /*
+ * If there is a comparison function, doing a DB_CURRENT
+ * must not change the part of the data item that is used
+ * for the comparison.
+ */
+ break;
+ case DB_NODUPDATA:
+ if (!F_ISSET(dbp, DB_AM_DUPSORT))
+ goto err;
+ /* FALLTHROUGH */
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_OVERWRITE_DUP:
+ key_flags = 1;
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ return (ret);
+ break;
+ default:
+err: return (__db_ferr(env, "DBcursor->put", 0));
+ }
+
+ /*
+ * Check for invalid key/data flags. The key may reasonably be NULL
+ * if DB_AFTER or DB_BEFORE is set and the application doesn't care
+ * about the returned key, or if the DB_CURRENT flag is set.
+ */
+ if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+ return (ret);
+ if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+ return (ret);
+
+ /*
+ * The key parameter should not be NULL or have the "partial" flag set
+ * in a put call unless the user doesn't care about a key value we'd
+ * return. The user tells us they don't care about the returned key by
+ * setting the key parameter to NULL or configuring the key DBT to not
+ * return any information. (Returned keys from a put are always record
+ * numbers, and returning part of a record number doesn't make sense:
+ * only accept a partial return if the length returned is 0.)
+ */
+ if (key_flags && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0)
+ return (__db_ferr(env, "key DBT", 0));
+
+ /*
+ * The cursor must be initialized for anything other than DB_KEYFIRST,
+ * DB_KEYLAST or zero: return EINVAL for an invalid cursor, otherwise 0.
+ */
+ if (!IS_INITIALIZED(dbc) && flags != 0 && flags != DB_KEYFIRST &&
+ flags != DB_KEYLAST && flags != DB_NODUPDATA &&
+ flags != DB_OVERWRITE_DUP)
+ return (__db_curinval(env));
+
+ return (0);
+}
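+
+/*
+ * An illustrative sketch, not part of the library source: DB_CURRENT
+ * requires an initialized cursor, so this hypothetical snippet
+ * positions with DB_SET before overwriting the record in place.
+ *
+ *     DBT key, data;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     key.data = "fruit";
+ *     key.size = 5;
+ *     if ((ret = dbc->get(dbc, &key, &data, DB_SET)) == 0) {
+ *         data.data = "pear";
+ *         data.size = 4;
+ *         ret = dbc->put(dbc, &key, &data, DB_CURRENT);
+ *     }
+ */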
+
+/*
+ * __dbt_ferr --
+ * Check a DBT for flag errors.
+ */
+static int
+__dbt_ferr(dbp, name, dbt, check_thread)
+ const DB *dbp;
+ const char *name;
+ const DBT *dbt;
+ int check_thread;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * Check for invalid DBT flags. We allow any of the flags to be
+ * specified to any DB or DBcursor call so that applications can
+ * set DB_DBT_MALLOC when retrieving a data item from a secondary
+ * database and then specify that same DBT as a key to a primary
+ * database, without having to clear flags.
+ */
+ if ((ret = __db_fchk(env, name, dbt->flags,
+ DB_DBT_APPMALLOC | DB_DBT_BULK | DB_DBT_DUPOK |
+ DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERCOPY |
+ DB_DBT_USERMEM | DB_DBT_PARTIAL | DB_DBT_READONLY)) != 0)
+ return (ret);
+ switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+ case 0:
+ case DB_DBT_MALLOC:
+ case DB_DBT_REALLOC:
+ case DB_DBT_USERCOPY:
+ case DB_DBT_USERMEM:
+ break;
+ default:
+ return (__db_ferr(env, name, 1));
+ }
+
+ if (F_ISSET(dbt, DB_DBT_BULK) && F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR_A("0629",
+ "Bulk and partial operations cannot be combined on %s DBT",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (check_thread && DB_IS_THREADED(dbp) &&
+ !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_READONLY)) {
+ __db_errx(env, DB_STR_A("0630",
+ "DB_THREAD mandates memory allocation flag on %s DBT",
+ "%s"), name);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __db_curinval --
+ * Report that a cursor is in an invalid state.
+ */
+static int
+__db_curinval(env)
+ const ENV *env;
+{
+ __db_errx(env, DB_STR("0631",
+ "Cursor position must be set before performing this operation"));
+ return (EINVAL);
+}
+
+/*
+ * __db_txn_auto_init --
+ * Handle DB_AUTO_COMMIT initialization.
+ *
+ * PUBLIC: int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **));
+ */
+int
+__db_txn_auto_init(env, ip, txnidp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN **txnidp;
+{
+ /*
+ * Method calls where applications explicitly specify DB_AUTO_COMMIT
+ * require additional validation: the DB_AUTO_COMMIT flag cannot be
+ * specified if a transaction cookie is also specified, nor can the
+ * flag be specified in a non-transactional environment.
+ */
+ if (*txnidp != NULL && !F_ISSET(*txnidp, TXN_FAMILY)) {
+ __db_errx(env, DB_STR("0632",
+ "DB_AUTO_COMMIT may not be specified along with a transaction handle"));
+ return (EINVAL);
+ }
+
+ if (!TXN_ON(env)) {
+ __db_errx(env, DB_STR("0633",
+ "DB_AUTO_COMMIT may not be specified in non-transactional environment"));
+ return (EINVAL);
+ }
+
+ /*
+ * Our caller checked to see if replication is making a state change.
+ * Don't call the user-level API (which would repeat that check).
+ */
+ return (__txn_begin(env, ip, *txnidp, txnidp, 0));
+}
+
+/*
+ * __db_txn_auto_resolve --
+ * Resolve local transactions.
+ *
+ * PUBLIC: int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int));
+ */
+int
+__db_txn_auto_resolve(env, txn, nosync, ret)
+ ENV *env;
+ DB_TXN *txn;
+ int nosync, ret;
+{
+ int t_ret;
+
+ if (ret == 0)
+ return (__txn_commit(txn, nosync ? DB_TXN_NOSYNC : 0));
+
+ if ((t_ret = __txn_abort(txn)) != 0)
+ return (__env_panic(env, t_ret));
+
+ return (ret);
+}
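+
+/*
+ * An illustrative sketch, not part of the library source: together the
+ * two routines above implement auto-commit. A method called with a
+ * NULL transaction in a transactional environment effectively runs as
+ *
+ *     __db_txn_auto_init(env, ip, &txn);        begin a local txn
+ *     ret = ...the work routine...;
+ *     __db_txn_auto_resolve(env, txn, 0, ret);  commit on 0, else abort
+ *
+ * and a failure of the abort itself panics the environment, as coded
+ * in __db_txn_auto_resolve.
+ */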
diff --git a/src/db/db_join.c b/src/db/db_join.c
new file mode 100644
index 00000000..751cf9e2
--- /dev/null
+++ b/src/db/db_join.c
@@ -0,0 +1,940 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_join.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+static int __db_join_close_pp __P((DBC *));
+static int __db_join_cmp __P((const void *, const void *));
+static int __db_join_del __P((DBC *, u_int32_t));
+static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
+static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
+
+/*
+ * Check to see if the Nth secondary cursor of join cursor jc is pointing
+ * to a sorted duplicate set.
+ */
+#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
+
+/*
+ * This is the duplicate-assisted join functionality. Right now we're
+ * going to write it such that we return one item at a time, although
+ * I think we may need to optimize it to return them all at once.
+ * It should be easier to get it working this way, and I believe that
+ * changing it should be fairly straightforward.
+ *
+ * We optimize the join by sorting cursors from smallest to largest
+ * cardinality. In most cases, this is indeed optimal. However, if
+ * a cursor with large cardinality has very few data in common with the
+ * first cursor, it is possible that the join will be made faster by
+ * putting it earlier in the cursor list. Since we have no way to detect
+ * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
+ * the sort order specified by the caller, who may know more about the
+ * structure of the data.
+ *
+ * The first cursor moves sequentially through the duplicate set while
+ * the others search explicitly for the duplicate in question.
+ *
+ */
+
+/*
+ * __db_join --
+ * This is the interface to the duplicate-assisted join functionality.
+ * In the same way that cursors mark a position in a database, a cursor
+ * can mark a position in a join. While most cursors are created by the
+ * cursor method of a DB, join cursors are created through an explicit
+ * call to DB->join.
+ *
+ * The curslist is an array of existing, initialized cursors and primary
+ * is the DB of the primary file. The data item that joins all the
+ * cursors in the curslist is used as the key into the primary and that
+ * key and data are returned. When no more items are left in the join
+ * set, the c_next operation off the join cursor will return DB_NOTFOUND.
+ *
+ * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join(primary, curslist, dbcp, flags)
+ DB *primary;
+ DBC **curslist, **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ JOIN_CURSOR *jc;
+ size_t ncurs, nslots;
+ u_int32_t i;
+ int ret;
+
+ env = primary->env;
+ dbc = NULL;
+ jc = NULL;
+
+ if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+ goto err;
+
+ if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
+ goto err;
+
+ if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
+ goto err;
+ jc->j_key.ulen = 256;
+ F_SET(&jc->j_key, DB_DBT_USERMEM);
+
+ F_SET(&jc->j_rdata, DB_DBT_REALLOC);
+
+ for (jc->j_curslist = curslist;
+ *jc->j_curslist != NULL; jc->j_curslist++)
+ ;
+
+ /*
+ * The number of cursor slots we allocate is one greater than
+ * the number of cursors involved in the join, because the
+ * list is NULL-terminated.
+ */
+ ncurs = (size_t)(jc->j_curslist - curslist);
+ nslots = ncurs + 1;
+
+ /*
+ * !!! -- A note on the various lists hanging off jc.
+ *
+ * j_curslist is the initial NULL-terminated list of cursors passed
+ * into __db_join. The original cursors are not modified; pristine
+ * copies are required because, in databases with unsorted dups, we
+ * must reset all of the secondary cursors after the first each
+ * time the first one is incremented, or else we will lose data
+ * which happen to be sorted differently in two different cursors.
+ *
+ * j_workcurs is where we put those copies that we're planning to
+ * work with. They're lazily c_dup'ed from j_curslist as we need
+ * them, and closed when the join cursor is closed or when we need
+ * to reset them to their original values (in which case we just
+ * c_dup afresh).
+ *
+ * j_fdupcurs is an array of cursors which point to the first
+ * duplicate in the duplicate set that contains the data value
+ * we're currently interested in. We need this to make
+ * __db_join_get correctly return duplicate duplicates; i.e., if a
+ * given data value occurs twice in the set belonging to cursor #2,
+ * and thrice in the set belonging to cursor #3, and once in all
+ * the other cursors, successive calls to __db_join_get need to
+ * return that data item six times. To make this happen, each time
+ * cursor N is allowed to advance to a new datum, all cursors M
+ * such that M > N have to be reset to the first duplicate with
+ * that datum, so __db_join_get will return all the dup-dups again.
+ * We could just reset them to the original cursor from j_curslist,
+ * but that would be a bit slower in the unsorted case and a LOT
+ * slower in the sorted one.
+ *
+ * j_exhausted is a list of boolean values which represent
+ * whether or not their corresponding cursors are "exhausted",
+ * i.e. whether the datum under the corresponding cursor has
+ * been found not to exist in any unreturned combinations of
+ * later secondary cursors, in which case they are ready to be
+ * incremented.
+ */
+
+ /* We don't want to free regions whose callocs have failed. */
+ jc->j_curslist = NULL;
+ jc->j_workcurs = NULL;
+ jc->j_fdupcurs = NULL;
+ jc->j_exhausted = NULL;
+
+ if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+ &jc->j_curslist)) != 0)
+ goto err;
+ if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+ &jc->j_workcurs)) != 0)
+ goto err;
+ if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+ &jc->j_fdupcurs)) != 0)
+ goto err;
+ if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
+ &jc->j_exhausted)) != 0)
+ goto err;
+ for (i = 0; curslist[i] != NULL; i++) {
+ jc->j_curslist[i] = curslist[i];
+ jc->j_workcurs[i] = NULL;
+ jc->j_fdupcurs[i] = NULL;
+ jc->j_exhausted[i] = 0;
+ }
+ jc->j_ncurs = (u_int32_t)ncurs;
+
+ /*
+ * If DB_JOIN_NOSORT is not set, optimize secondary cursors by
+ * sorting in order of increasing cardinality.
+ */
+ if (!LF_ISSET(DB_JOIN_NOSORT))
+ qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
+
+ /*
+ * We never need to reset the 0th cursor, so there's no
+ * solid reason to use workcurs[0] rather than curslist[0] in
+ * join_get. Nonetheless, it feels cleaner to do it for symmetry,
+ * and this is the most logical place to copy it.
+ *
+ * !!!
+ * There's no need to close the new cursor if we goto err only
+ * because this is the last thing that can fail. Modifier of this
+ * function beware!
+ */
+ if ((ret =
+ __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
+ goto err;
+
+ dbc->close = dbc->c_close = __db_join_close_pp;
+ dbc->del = dbc->c_del = __db_join_del;
+ dbc->get = dbc->c_get = __db_join_get_pp;
+ dbc->put = dbc->c_put = __db_join_put;
+ dbc->internal = (DBC_INTERNAL *)jc;
+ dbc->dbp = primary;
+ jc->j_primary = primary;
+
+ /* Stash the first cursor's transaction here for easy access. */
+ dbc->txn = curslist[0]->txn;
+
+ *dbcp = dbc;
+
+ MUTEX_LOCK(env, primary->mutex);
+ TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
+ MUTEX_UNLOCK(env, primary->mutex);
+
+ return (0);
+
+err: if (jc != NULL) {
+ if (jc->j_curslist != NULL)
+ __os_free(env, jc->j_curslist);
+ if (jc->j_workcurs != NULL) {
+ if (jc->j_workcurs[0] != NULL)
+ (void)__dbc_close(jc->j_workcurs[0]);
+ __os_free(env, jc->j_workcurs);
+ }
+ if (jc->j_fdupcurs != NULL)
+ __os_free(env, jc->j_fdupcurs);
+ if (jc->j_exhausted != NULL)
+ __os_free(env, jc->j_exhausted);
+ __os_free(env, jc);
+ }
+ if (dbc != NULL)
+ __os_free(env, dbc);
+ return (ret);
+}
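+
+/*
+ * An illustrative sketch, not part of the library source: a typical
+ * DB->join over two secondary cursors, with hypothetical handles. The
+ * cursor list must be NULL-terminated, as __db_join assumes above.
+ *
+ *     DBC *curslist[3], *join_curs;
+ *     DBT key, data;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     curslist[0] = color_curs;    positioned with DB_SET beforehand
+ *     curslist[1] = size_curs;     positioned with DB_SET beforehand
+ *     curslist[2] = NULL;
+ *     if ((ret = primary->join(primary, curslist, &join_curs, 0)) == 0)
+ *         while ((ret = join_curs->get(join_curs,
+ *             &key, &data, 0)) == 0)
+ *             ...process the matching primary key/data pair...
+ */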
+
+/*
+ * __db_join_close_pp --
+ * DBC->close pre/post processing for join cursors.
+ */
+static int
+__db_join_close_pp(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_join_close(dbc);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__db_join_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+{
+ COMPQUIET(dbc, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ COMPQUIET(flags, 0);
+ return (EINVAL);
+}
+
+static int
+__db_join_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(dbc, NULL);
+ COMPQUIET(flags, 0);
+ return (EINVAL);
+}
+
+/*
+ * __db_join_get_pp --
+ * DBjoin->get pre/post processing.
+ */
+static int
+__db_join_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t handle_check, save_flags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Save the original flags value. */
+ save_flags = flags;
+
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DBC->get"));
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ }
+
+ switch (flags) {
+ case 0:
+ case DB_JOIN_ITEM:
+ break;
+ default:
+ return (__db_ferr(env, "DBC->get", 0));
+ }
+
+ /*
+ * A partial get of the key of a join cursor doesn't make much sense;
+ * the entire key is necessary to query the primary database
+ * and find the datum, and so regardless of the size of the key
+ * it would not be a performance improvement. Since it would require
+ * special handling, we simply disallow it.
+ *
+ * A partial get of the data, however, potentially makes sense (if
+ * all possible data are a predictably large structure, for instance)
+ * and causes us no headaches, so we permit it.
+ */
+ if (F_ISSET(key, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0516",
+ "DB_DBT_PARTIAL may not be set on key during join_get"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Restore the original flags value. */
+ flags = save_flags;
+
+ ret = __db_join_get(dbc, key, data, flags);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, NULL);
+ return (ret);
+}
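+
+/*
+ * An illustrative sketch, not part of the library source: passing
+ * DB_JOIN_ITEM to the join cursor's get returns only the next joined
+ * key and skips the lookup in the primary, so the data DBT is left
+ * untouched. Handles are hypothetical.
+ *
+ *     while ((ret = join_curs->get(join_curs,
+ *         &key, &data, DB_JOIN_ITEM)) == 0)
+ *         ...key holds the next matching primary key...
+ */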
+
+static int
+__db_join_get(dbc, key_arg, data_arg, flags)
+ DBC *dbc;
+ DBT *key_arg, *data_arg;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *cp;
+ DBT *key_n, key_n_mem;
+ ENV *env;
+ JOIN_CURSOR *jc;
+ int db_manage_data, ret;
+ u_int32_t i, j, operation, opmods;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ jc = (JOIN_CURSOR *)dbc->internal;
+
+ operation = LF_ISSET(DB_OPFLAGS_MASK);
+
+ /* !!!
+ * If the set of flags here changes, check that __db_join_primget
+ * is updated to handle them properly.
+ */
+ opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+
+ /*
+ * Since we are fetching the key as a datum in the secondary indices,
+ * we must be careful of caller-specified DB_DBT_* memory
+ * management flags. If necessary, use a stack-allocated DBT;
+ * we'll appropriately copy and/or allocate the data later.
+ */
+ if (F_ISSET(key_arg,
+ DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+ /* We just use the default buffer; no need to go malloc. */
+ key_n = &key_n_mem;
+ memset(key_n, 0, sizeof(DBT));
+ } else {
+ /*
+ * Either DB_DBT_REALLOC or the default buffer will work
+ * fine if we have to reuse it, as we do.
+ */
+ key_n = key_arg;
+ }
+ if (F_ISSET(key_arg, DB_DBT_USERCOPY))
+ key_arg->data = NULL;
+
+ /*
+ * If our last attempt to do a get on the primary key failed,
+ * short-circuit the join and try again with the same key.
+ */
+ if (F_ISSET(jc, JOIN_RETRY))
+ goto samekey;
+ F_CLR(jc, JOIN_RETRY);
+
+retry: ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
+ opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
+
+ if (ret == DB_BUFFER_SMALL) {
+ jc->j_key.ulen <<= 1;
+ if ((ret = __os_realloc(env,
+ jc->j_key.ulen, &jc->j_key.data)) != 0)
+ goto mem_err;
+ goto retry;
+ }
+
+ /*
+ * If ret == DB_NOTFOUND, we're out of elements of the first
+ * secondary cursor. This is how we finally finish the join
+ * if all goes well.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
+ * and we're going to want to advance all the cursors that point to
+ * the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
+ * Close all the cursors in j_fdupcurs; we'll reopen them the
+ * first time through the upcoming loop.
+ */
+ for (i = 1; i < jc->j_ncurs; i++) {
+ if (jc->j_fdupcurs[i] != NULL &&
+ (ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+ goto err;
+ jc->j_fdupcurs[i] = NULL;
+ }
+
+ /*
+ * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
+ * Thus, we can safely increment that one cursor on each call
+ * to __db_join_get, and we signal this by setting jc->j_exhausted[0]
+ * right away.
+ *
+ * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
+ * increment it until we know we're ready to.
+ */
+ if (jc->j_curslist[1] == NULL)
+ jc->j_exhausted[0] = 1;
+ else
+ jc->j_exhausted[0] = 0;
+
+ /* We have the first element; now look for it in the other cursors. */
+ for (i = 1; i < jc->j_ncurs; i++) {
+ DB_ASSERT(env, jc->j_curslist[i] != NULL);
+ if (jc->j_workcurs[i] == NULL)
+ /* If this is NULL, we need to dup curslist into it. */
+ if ((ret = __dbc_dup(jc->j_curslist[i],
+ &jc->j_workcurs[i], DB_POSITION)) != 0)
+ goto err;
+
+retry2: cp = jc->j_workcurs[i];
+
+ if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
+ jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
+ /*
+ * jc->j_workcurs[i] has no more of the datum we're
+ * interested in. Go back one cursor and get
+ * a new dup. We can't just move to a new
+ * element of the outer relation, because that way
+ * we might miss duplicate duplicates in cursor i-1.
+ *
+ * If this takes us back to the first cursor,
+ * -then- we can move to a new element of the outer
+ * relation.
+ */
+ --i;
+ jc->j_exhausted[i] = 1;
+
+ if (i == 0) {
+ for (j = 1; jc->j_workcurs[j] != NULL; j++) {
+ /*
+ * We're moving to a new element of
+ * the first secondary cursor. If
+ * that cursor is sorted, then any
+ * other sorted cursors can be safely
+ * reset to the first duplicate
+ * duplicate in the current set if we
+ * have a pointer to it (we can't just
+ * leave them be, or we'll miss
+ * duplicate duplicates in the outer
+ * relation).
+ *
+ * If the first cursor is unsorted, or
+ * if cursor j is unsorted, we can
+ * make no assumptions about what
+ * we're looking for next or where it
+ * will be, so we reset to the very
+ * beginning (setting workcurs NULL
+ * will achieve this next go-round).
+ *
+ * XXX: This is likely to break
+ * horribly if any two cursors are
+ * both sorted, but have different
+ * specified sort functions. For
+ * now, we dismiss this as pathology
+ * and let strange things happen--we
+ * can't make rope childproof.
+ */
+ if ((ret = __dbc_close(
+ jc->j_workcurs[j])) != 0)
+ goto err;
+ if (!SORTED_SET(jc, 0) ||
+ !SORTED_SET(jc, j) ||
+ jc->j_fdupcurs[j] == NULL)
+ /*
+ * Unsafe conditions;
+ * reset fully.
+ */
+ jc->j_workcurs[j] = NULL;
+ else
+ /* Partial reset suffices. */
+ if ((ret = __dbc_dup(
+ jc->j_fdupcurs[j],
+ &jc->j_workcurs[j],
+ DB_POSITION)) != 0)
+ goto err;
+ jc->j_exhausted[j] = 0;
+ }
+ goto retry;
+ /* NOTREACHED */
+ }
+
+ /*
+ * We're about to advance the cursor and need to
+ * reset all of the workcurs[j] where j>i, so that
+ * we don't miss any duplicate duplicates.
+ */
+ for (j = i + 1;
+ jc->j_workcurs[j] != NULL;
+ j++) {
+ if ((ret =
+ __dbc_close(jc->j_workcurs[j])) != 0)
+ goto err;
+ jc->j_exhausted[j] = 0;
+ if (jc->j_fdupcurs[j] == NULL)
+ jc->j_workcurs[j] = NULL;
+ else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
+ &jc->j_workcurs[j], DB_POSITION)) != 0)
+ goto err;
+ }
+ goto retry2;
+ /* NOTREACHED */
+ }
+
+ if (ret == DB_BUFFER_SMALL) {
+ jc->j_key.ulen <<= 1;
+ if ((ret = __os_realloc(env, jc->j_key.ulen,
+ &jc->j_key.data)) != 0) {
+mem_err: __db_errx(env, DB_STR_A("0517",
+ "Allocation failed for join key, len = %lu",
+ "%lu"), (u_long)jc->j_key.ulen);
+ goto err;
+ }
+ goto retry2;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we made it this far, we've found a matching
+ * datum in cursor i. Mark the current cursor
+ * unexhausted, so we don't miss any duplicate
+ * duplicates the next go-round--unless this is the
+ * very last cursor, in which case there are none to
+ * miss, and we'll need that exhausted flag to finally
+ * get a DB_NOTFOUND and move on to the next datum in
+ * the outermost cursor.
+ */
+ if (i + 1 != jc->j_ncurs)
+ jc->j_exhausted[i] = 0;
+ else
+ jc->j_exhausted[i] = 1;
+
+ /*
+ * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
+ * sorted, then we're here for the first time since advancing
+ * cursor 0, and we have a new datum of interest.
+ * jc->j_workcurs[i] points to the beginning of a set of
+ * duplicate duplicates; store this into jc->j_fdupcurs[i].
+ */
+ if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
+ __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
+ goto err;
+ }
+
+err: if (ret != 0)
+ return (ret);
+
+ if (0) {
+samekey: /*
+ * Get the key we tried and failed to return last time;
+ * it should be the current datum of all the secondary cursors.
+ */
+ if ((ret = __dbc_get(jc->j_workcurs[0],
+ &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
+ return (ret);
+ F_CLR(jc, JOIN_RETRY);
+ }
+
+ /*
+ * ret == 0; we have a key to return.
+ *
+ * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
+ * back into the dbt we were given for the key; call __db_retcopy.
+ * Otherwise, assert that we do not need to copy anything and proceed.
+ */
+ DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+ DB_DBT_USERCOPY) || key_n == key_arg);
+
+ if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+ DB_DBT_USERCOPY)) &&
+ (ret = __db_retcopy(env,
+ key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
+ /*
+ * The retcopy failed, most commonly because we have a user
+ * buffer for the key which is too small. Set things up to
+ * retry next time, and return.
+ */
+ F_SET(jc, JOIN_RETRY);
+ return (ret);
+ }
+
+ /*
+ * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup
+ * in the primary and then return.
+ */
+ if (operation == DB_JOIN_ITEM)
+ return (0);
+
+ /*
+ * If data_arg->flags == 0--that is, if DB is managing the
+ * data DBT's memory--it's not safe to just pass the DBT
+ * through to the primary get call, since we don't want that
+ * memory to belong to the primary DB handle (and if the primary
+ * is free-threaded, it can't anyway).
+ *
+ * Instead, use memory that is managed by the join cursor, in
+ * jc->j_rdata.
+ */
+ if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_USERMEM | DB_DBT_USERCOPY))
+ db_manage_data = 1;
+ else
+ db_manage_data = 0;
+ if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
+ jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
+ db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+ (jc->j_curslist[0]->txn != NULL && F_ISSET(
+ jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
+ goto retry;
+ /*
+ * If ret == DB_NOTFOUND, the primary and secondary
+ * are out of sync; every item in each secondary
+ * should correspond to something in the primary,
+ * or we shouldn't have done the join this way.
+ * Wail.
+ */
+ ret = __db_secondary_corrupt(jc->j_primary);
+ } else
+ /*
+ * The get on the primary failed for some other
+ * reason, most commonly because we're using a user
+ * buffer that's not big enough. Flag our failure
+ * so we can return the same key next time.
+ */
+ F_SET(jc, JOIN_RETRY);
+ }
+ if (db_manage_data && ret == 0) {
+ data_arg->data = jc->j_rdata.data;
+ data_arg->size = jc->j_rdata.size;
+ }
+
+ return (ret);
+}
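+
+/*
+ * A minimal sketch of how an application might drive the join cursor
+ * implemented above through the public API. The pers_db primary and
+ * the job_curs/name_curs secondary cursors are hypothetical and
+ * assumed to be open and positioned with DB_SET.
+ */
+#if 0
+ DBC *curs[3], *join_curs;
+ DBT key, data;
+ int ret;
+
+ curs[0] = job_curs;
+ curs[1] = name_curs;
+ curs[2] = NULL; /* The cursor list is NULL-terminated. */
+
+ if ((ret = pers_db->join(pers_db, curs, &join_curs, 0)) != 0)
+ goto err;
+
+ /* Each get returns a primary key/data pair matching every cursor. */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ while ((ret = join_curs->get(join_curs, &key, &data, 0)) == 0)
+ ; /* Process the key/data pair. */
+ if (ret == DB_NOTFOUND) /* Normal end of the join. */
+ ret = 0;
+ (void)join_curs->close(join_curs);
+#endif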
+
+/*
+ * __db_join_close --
+ * DBC->close for join cursors.
+ *
+ * PUBLIC: int __db_join_close __P((DBC *));
+ */
+int
+__db_join_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ JOIN_CURSOR *jc;
+ int ret, t_ret;
+ u_int32_t i;
+
+ jc = (JOIN_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = 0;
+
+ /*
+ * Remove from active list of join cursors. Note that this
+ * must happen before any action that can fail and return, or else
+ * __db_close may loop indefinitely.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_REMOVE(&dbp->join_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ ENV_ENTER(env, ip);
+ /*
+ * Close any open scratch cursors. In each case, there may
+ * not be as many outstanding as there are cursors in
+ * curslist, but we want to close whatever's there.
+ *
+ * If any close fails, there's no reason not to close everything else;
+ * we'll just return the error code of the last one to fail. There's
+ * not much the caller can do anyway, since these cursors only exist
+ * hanging off a db-internal data structure that they shouldn't be
+ * mucking with.
+ */
+ for (i = 0; i < jc->j_ncurs; i++) {
+ if (jc->j_workcurs[i] != NULL &&
+ (t_ret = __dbc_close(jc->j_workcurs[i])) != 0)
+ ret = t_ret;
+ if (jc->j_fdupcurs[i] != NULL &&
+ (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+ ret = t_ret;
+ }
+ ENV_LEAVE(env, ip);
+
+ __os_free(env, jc->j_exhausted);
+ __os_free(env, jc->j_curslist);
+ __os_free(env, jc->j_workcurs);
+ __os_free(env, jc->j_fdupcurs);
+ __os_free(env, jc->j_key.data);
+ if (jc->j_rdata.data != NULL)
+ __os_ufree(env, jc->j_rdata.data);
+ __os_free(env, jc);
+ __os_free(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __db_join_getnext --
+ * This function replaces the DBC_CONTINUE and DBC_KEYSET
+ * functionality inside the various cursor get routines.
+ *
+ * If exhausted == 0, we're not done with the current datum;
+ * return it if it matches "matching", otherwise search
+ * using DB_GET_BOTHC (which is faster than iteratively doing
+ * DB_NEXT_DUP) forward until we find one that does.
+ *
+ * If exhausted == 1, we are done with the current datum, so just
+ * leap forward to searching NEXT_DUPs.
+ *
+ * If no matching datum exists, returns DB_NOTFOUND, else 0.
+ */
+static int
+__db_join_getnext(dbc, key, data, exhausted, opmods)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t exhausted, opmods;
+{
+ int ret, cmp;
+ DB *dbp;
+ DBT ldata;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ dbp = dbc->dbp;
+ func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+
+ switch (exhausted) {
+ case 0:
+ /*
+ * We don't want to step on data->data; use a new
+ * DBT and malloc so we don't step on dbc's rdata memory.
+ */
+ memset(&ldata, 0, sizeof(DBT));
+ F_SET(&ldata, DB_DBT_MALLOC);
+ if ((ret = __dbc_get(dbc,
+ key, &ldata, opmods | DB_CURRENT)) != 0)
+ break;
+ cmp = func(dbp, data, &ldata);
+ if (cmp == 0) {
+ /*
+ * We have to return the real data value. Copy
+ * it into data, then free the buffer we malloc'ed
+ * above.
+ */
+ if ((ret = __db_retcopy(dbp->env, data, ldata.data,
+ ldata.size, &data->data, &data->size)) != 0)
+ return (ret);
+ __os_ufree(dbp->env, ldata.data);
+ return (0);
+ }
+
+ /*
+ * Didn't match--we want to fall through and search future
+ * dups. We just forget about ldata and free
+ * its buffer--data contains the value we're searching for.
+ */
+ __os_ufree(dbp->env, ldata.data);
+ /* FALLTHROUGH */
+ case 1:
+ ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_join_cmp --
+ * Comparison function for sorting DBCs in cardinality order.
+ */
+static int
+__db_join_cmp(a, b)
+ const void *a, *b;
+{
+ DBC *dbca, *dbcb;
+ db_recno_t counta, countb;
+
+ dbca = *((DBC * const *)a);
+ dbcb = *((DBC * const *)b);
+
+ if (__dbc_count(dbca, &counta) != 0 ||
+ __dbc_count(dbcb, &countb) != 0)
+ return (0);
+
+ return ((long)counta - (long)countb);
+}
+
+/*
+ * __db_join_primget --
+ * Perform a DB->get in the primary, being careful not to use a new
+ * locker ID if we're doing CDB locking.
+ */
+static int
+__db_join_primget(dbp, ip, txn, locker, key, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_LOCKER *locker;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ u_int32_t rmw;
+ int ret, t_ret;
+
+ if ((ret = __db_cursor_int(dbp, ip,
+ txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0)
+ return (ret);
+
+ /*
+ * The only allowable flags here are the flags copied into "opmods" in
+ * __db_join_get: DB_RMW, DB_READ_COMMITTED and DB_READ_UNCOMMITTED.
+ * DB_RMW is an op on the get call; the isolation flags are set on the
+ * cursor itself. It's a DB bug if we allow any other flags down in here.
+ */
+ rmw = LF_ISSET(DB_RMW);
+ if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
+ F_SET(dbc, DBC_READ_UNCOMMITTED);
+
+ if (LF_ISSET(DB_READ_COMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
+ F_SET(dbc, DBC_READ_COMMITTED);
+
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ DB_ASSERT(dbp->env, flags == 0);
+
+ F_SET(dbc, DBC_TRANSIENT);
+
+ /*
+ * This shouldn't be necessary, thanks to the fact that join cursors
+ * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
+ * sake, we mirror what __db_get does.
+ */
+ SET_RET_MEM(dbc, dbp);
+
+ ret = __dbc_get(dbc, key, data, DB_SET | rmw);
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_secondary_corrupt --
+ * Report primary/secondary inconsistencies.
+ *
+ * PUBLIC: int __db_secondary_corrupt __P((DB *));
+ */
+int
+__db_secondary_corrupt(dbp)
+ DB *dbp;
+{
+ __db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s",
+ dbp->fname == NULL ? "unnamed" : dbp->fname,
+ dbp->dname == NULL ? "" : "/",
+ dbp->dname == NULL ? "" : dbp->dname);
+ return (DB_SECONDARY_BAD);
+}
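+
+/*
+ * A sketch of the DB_JOIN_ITEM path through __db_join_get above: the
+ * matching primary key is returned without the final lookup in the
+ * primary, so the data DBT is not filled in. The join_curs handle is
+ * hypothetical, as in the sketch following __db_join_get.
+ */
+#if 0
+ while ((ret = join_curs->get(join_curs,
+ &key, &data, DB_JOIN_ITEM)) == 0)
+ ; /* key.data holds a matching primary key. */
+#endif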
diff --git a/src/db/db_meta.c b/src/db/db_meta.c
new file mode 100644
index 00000000..8f97ebd8
--- /dev/null
+++ b/src/db/db_meta.c
@@ -0,0 +1,1428 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+
+static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+static int __db_pglistcmp __P((const void *, const void *));
+static int __db_truncate_freelist __P((DBC *, DBMETA *,
+ PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __db_init_meta --
+ * Helper function for __db_new that initializes the important fields in
+ * a meta-data page (used instead of P_INIT). We need to make sure that we
+ * retain the page number and LSN of the existing page.
+ */
+static void
+__db_init_meta(dbp, p, pgno, pgtype)
+ DB *dbp;
+ void *p;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+{
+ DBMETA *meta;
+ DB_LSN save_lsn;
+
+ meta = (DBMETA *)p;
+ save_lsn = meta->lsn;
+ memset(meta, 0, sizeof(DBMETA));
+ meta->lsn = save_lsn;
+ meta->pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->metaflags, DBMETA_CHKSUM);
+ meta->pgno = pgno;
+ meta->type = (u_int8_t)pgtype;
+}
+
+/*
+ * __db_new --
+ * Get a new page, preferably from the freelist.
+ *
+ * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+ */
+int
+__db_new(dbc, type, lockp, pagepp)
+ DBC *dbc;
+ u_int32_t type;
+ DB_LOCK *lockp;
+ PAGE **pagepp;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t last, *list, pgno, newnext;
+ int extend, hash, ret;
+
+ meta = NULL;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ newnext = PGNO_INVALID;
+ if (lockp != NULL)
+ LOCK_INIT(*lockp);
+
+ hash = 0;
+ ret = 0;
+ LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ }
+
+ last = meta->last_pgno;
+ if (meta->free == PGNO_INVALID) {
+ if (FLD_ISSET(type, P_DONTEXTEND)) {
+ *pagepp = NULL;
+ goto err;
+ }
+ last = pgno = meta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ extend = 1;
+ } else {
+ pgno = meta->free;
+ /*
+ * Lock the new page. Do this here because we must do it
+ * before getting the page and the caller may need the lock
+ * to keep readers from seeing the page before the transaction
+ * commits. We can do this because no one will hold a free
+ * page locked.
+ */
+ if (lockp != NULL && (ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ /*
+ * We want to take the first page off the free list and
+ * then set meta->free to that page's next_pgno, but
+ * we need to log the change first.
+ */
+ newnext = h->next_pgno;
+ lsn = h->lsn;
+ extend = 0;
+ DB_ASSERT(env, TYPE(h) == P_INVALID);
+
+ if (TYPE(h) != P_INVALID) {
+ __db_errx(env, DB_STR_A("0689",
+ "%s page %lu is on free list with type %lu",
+ "%s %lu %lu"), dbp->fname, (u_long)PGNO(h),
+ (u_long)TYPE(h));
+ return (__env_panic(env, EINVAL));
+ }
+
+ }
+
+ FLD_CLR(type, P_DONTEXTEND);
+
+ /*
+ * Log the allocation before fetching the new page. If we
+ * don't have room in the log then we don't want to tell
+ * mpool to extend the file.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
+ &LSN(meta), PGNO_BASE_MD, &lsn,
+ pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ meta->free = newnext;
+
+ if (extend == 1) {
+ if (lockp != NULL && (ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_NEW, &h)) != 0)
+ goto err;
+ DB_ASSERT(env, last == pgno);
+ meta->last_pgno = pgno;
+ ZERO_LSN(h->lsn);
+ h->pgno = pgno;
+
+ /*
+ * If the file was extended for the first time in this
+ * transaction, set the MPOOLFILE's file extension
+ * watermark.
+ */
+ __txn_add_fe_watermark(dbc->txn, dbp, h->pgno);
+
+ }
+ LSN(h) = LSN(meta);
+
+ if (hash == 0 && (ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
+ switch (type) {
+ case P_BTREEMETA:
+ case P_HASHMETA:
+ case P_QAMMETA:
+ __db_init_meta(dbp, h, h->pgno, type);
+ break;
+ default:
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+ break;
+ }
+
+ /* Fix up the sorted free list if necessary. */
+#ifdef HAVE_FTRUNCATE
+ if (extend == 0) {
+ u_int32_t nelems = 0;
+
+ if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
+ goto err;
+ if (nelems != 0) {
+ DB_ASSERT(env, h->pgno == list[0]);
+ memmove(list, &list[1], (nelems - 1) * sizeof(*list));
+ if ((ret = __memp_extend_freelist(
+ dbp->mpf, nelems - 1, &list)) != 0)
+ goto err;
+ }
+ }
+#else
+ COMPQUIET(list, NULL);
+#endif
+
+ if ((ret = __TLPUT(dbc, metalock)) != 0)
+ return (ret);
+ *pagepp = h;
+ PERFMON6(env, alloc, new, dbp->fname, dbp->dname, pgno, type, h, 0);
+ return (0);
+
+err: if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (meta != NULL && hash == 0)
+ (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ (void)__TLPUT(dbc, metalock);
+ if (lockp != NULL)
+ (void)__LPUT(dbc, *lockp);
+ /* Failure return - report 0 pgno, null page address. */
+ PERFMON6(env, alloc, new, dbp->fname, dbp->dname, 0, type, NULL, ret);
+ return (ret);
+}
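+
+/*
+ * A distilled sketch of the two allocation paths in __db_new, with
+ * locking, logging and error handling omitted: either pop the head
+ * of the free list, or extend the file one page past last_pgno.
+ */
+#if 0
+ if (meta->free != PGNO_INVALID) {
+ pgno = meta->free; /* Reuse the first free page... */
+ meta->free = h->next_pgno; /* ...its successor heads the list. */
+ } else
+ pgno = ++meta->last_pgno; /* Grow the file by one page. */
+#endif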
+
+/*
+ * __db_free --
+ * Add a page to the head of the freelist.
+ *
+ * PUBLIC: int __db_free __P((DBC *, PAGE *, u_int32_t));
+ */
+int
+__db_free(dbc, h, flags)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DBT ddbt, ldbt;
+ DB_LOCK metalock;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *prev;
+ db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
+ u_int32_t lflag;
+ int hash, ret, t_ret;
+#ifdef HAVE_FTRUNCATE
+ db_pgno_t *list, *lp;
+ u_int32_t nelem, position, start;
+ int do_truncate;
+#endif
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ prev_pgno = PGNO_INVALID;
+ meta = NULL;
+ prev = NULL;
+ LOCK_INIT(metalock);
+#ifdef HAVE_FTRUNCATE
+ lp = NULL;
+ nelem = 0;
+ do_truncate = 0;
+#endif
+
+ /*
+ * Retrieve the metadata page. If we are not keeping a sorted
+ * free list, put the page at the head of the free list.
+ * If we are keeping a sorted free list (to support truncation),
+ * figure out where this page belongs and either
+ * link it in or truncate the file as much as possible.
+ * If either the lock get or page get routines
+ * fail, then we need to put the page with which we were called
+ * back because our caller assumes we take care of it.
+ */
+ hash = 0;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc,
+#ifdef HAVE_FTRUNCATE
+ 0,
+#else
+ DB_MPOOL_DIRTY,
+#endif
+ &meta)) != 0)
+ goto err;
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ /* If we support truncate, we might not dirty the meta page. */
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+#ifdef HAVE_FTRUNCATE
+ 0,
+#else
+ DB_MPOOL_DIRTY,
+#endif
+ &meta)) != 0)
+ goto err1;
+ }
+
+ last_pgno = meta->last_pgno;
+ next_pgno = meta->free;
+ /*
+ * Assign lsnp here so it is always initialized when
+ * HAVE_FTRUNCATE is not defined.
+ */
+ lsnp = &LSN(meta);
+
+ DB_ASSERT(dbp->env, h->pgno != next_pgno);
+
+#ifdef HAVE_FTRUNCATE
+ /*
+ * If we are maintaining a sorted free list, see if we either have a
+ * new truncation point or the page goes somewhere in the middle of
+ * the list. If it goes in the middle of the list, we will drop the
+ * meta page and get the previous page.
+ */
+ COMPQUIET(position, 0);
+ if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+ goto err1;
+ if (list == NULL)
+ goto no_sort;
+
+ if (h->pgno != last_pgno) {
+ /*
+ * Put the page number in the sorted list. Find its
+ * position and the previous page. After logging we
+ * will extend the list, make room and insert the page in
+ * the list.
+ */
+ position = 0;
+ if (nelem != 0) {
+ __db_freelist_pos(h->pgno, list, nelem, &position);
+
+ DB_ASSERT(dbp->env, h->pgno != list[position]);
+
+ /* Get the previous page if this is not the smallest. */
+ if (position != 0 || h->pgno > list[0])
+ prev_pgno = list[position];
+ }
+
+ } else if (nelem != 0) {
+ /* Find the truncation point. */
+ for (lp = &list[nelem - 1]; lp >= list; lp--)
+ if (--last_pgno != *lp)
+ break;
+ if (lp < list || last_pgno < h->pgno - 1)
+ do_truncate = 1;
+ last_pgno = meta->last_pgno;
+ }
+
+no_sort:
+ if (prev_pgno == PGNO_INVALID) {
+#ifdef HAVE_HASH
+ if (hash) {
+ if ((ret =
+ __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err1;
+ } else
+#endif
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ lsnp = &LSN(meta);
+ } else {
+ pgno = prev_pgno;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
+ goto err1;
+ next_pgno = NEXT_PGNO(prev);
+ lsnp = &LSN(prev);
+ }
+#endif
+
+ /*
+ * Log the change.
+ * We are either logging an update to the metapage or to the
+ * previous page in the sorted list.
+ */
+ if (DBC_LOGGING(dbc)) {
+ memset(&ldbt, 0, sizeof(ldbt));
+ ldbt.data = h;
+ ldbt.size = P_OVERHEAD(dbp);
+ /*
+ * If we are removing pages from the file, we need to make
+ * sure the logging happens before the truncation. If we
+ * are truncating multiple pages we don't need to flush the
+ * log here as it will be flushed by __db_truncate_freelist.
+ */
+ lflag = 0;
+
+#ifdef HAVE_FTRUNCATE
+ if (h->pgno == last_pgno && do_truncate == 0)
+ lflag = DB_FLUSH;
+#endif
+ switch (h->type) {
+ case P_HASH:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ if (h->entries > 0 && (h->pgno == last_pgno ||
+ !LF_ISSET(DB_LOG_NO_DATA))) {
+ ldbt.size += h->entries * sizeof(db_indx_t);
+ ddbt.data = (u_int8_t *)h + HOFFSET(h);
+ ddbt.size = dbp->pgsize - HOFFSET(h);
+ if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
+ lsnp, lflag,
+ h->pgno, lsnp, pgno,
+ &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
+ goto err1;
+ goto logged;
+ }
+ break;
+ case P_HASHMETA:
+ ldbt.size = sizeof(HMETA);
+ break;
+ case P_BTREEMETA:
+ ldbt.size = sizeof(BTMETA);
+ break;
+ case P_OVERFLOW:
+ ldbt.size += OV_LEN(h);
+ break;
+ default:
+ DB_ASSERT(dbp->env, h->type != P_QAMDATA);
+ }
+
+ if ((ret = __db_pg_free_log(dbp,
+ dbc->txn, lsnp, lflag, h->pgno,
+ lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
+ goto err1;
+ } else
+ LSN_NOT_LOGGED(*lsnp);
+
+logged:
+#ifdef HAVE_FTRUNCATE
+ if (do_truncate) {
+ start = (u_int32_t) (lp - list) + 1;
+ meta->last_pgno--;
+ ret = __db_truncate_freelist(
+ dbc, meta, h, list, start, nelem);
+ h = NULL;
+ } else if (h->pgno == last_pgno) {
+ /*
+ * We are going to throw this page away, but if we are
+ * using MVCC then this version may stick around and we
+ * might have to make a copy.
+ */
+ if (atomic_read(&mpf->mfp->multiversion) &&
+ (ret = __memp_dirty(mpf,
+ &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ LSN(h) = *lsnp;
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err1;
+ h = NULL;
+ /* Give the page back to the OS. */
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ last_pgno, 0)) != 0)
+ goto err1;
+ DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
+ meta->last_pgno--;
+ } else {
+ if (list != NULL) {
+ /* Put the page number into the list. */
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+ goto err1;
+ if (prev_pgno != PGNO_INVALID)
+ lp = &list[position + 1];
+ else
+ lp = list;
+ if (nelem != 0 && position != nelem)
+ memmove(lp + 1, lp, (size_t)
+ ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
+ *lp = h->pgno;
+ }
+#else
+ {
+#endif
+ /*
+ * If we are not truncating the page then we
+ * reinitialize it and put it at the head of
+ * the free list.
+ */
+ if ((ret = __memp_dirty(mpf,
+ &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ LSN(h) = *lsnp;
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+#ifdef DIAGNOSTIC
+ memset((u_int8_t *) h + P_OVERHEAD(dbp),
+ CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
+#endif
+ if (prev_pgno == PGNO_INVALID)
+ meta->free = h->pgno;
+ else
+ NEXT_PGNO(prev) = h->pgno;
+ }
+
+ /* Discard the metadata or previous page. */
+err1: if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the caller's page reference. */
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ PERFMON4(dbp->env, alloc, free, dbp->fname, dbp->dname, pgno, ret);
+ /*
+ * XXX
+ * We have to unlock the caller's page in the caller!
+ */
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_freelist_pos -- find the position of a page in the freelist.
+ * The list is sorted, so we do a binary search.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
+ * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *));
+ * PUBLIC: #endif
+ */
+void
+__db_freelist_pos(pgno, list, nelem, posp)
+ db_pgno_t pgno;
+ db_pgno_t *list;
+ u_int32_t nelem;
+ u_int32_t *posp;
+{
+ u_int32_t base, indx, lim;
+
+ indx = 0;
+ for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
+ indx = base + (lim >> 1);
+ if (pgno == list[indx]) {
+ *posp = indx;
+ return;
+ }
+ if (pgno > list[indx]) {
+ base = indx + 1;
+ --lim;
+ }
+ }
+ if (base != 0)
+ base--;
+ *posp = base;
+ return;
+}
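+
+/*
+ * A sketch of the position contract above: an exact match returns its
+ * index; otherwise the index of the largest page number smaller than
+ * pgno is returned, or 0, which the caller must check against list[0].
+ */
+#if 0
+ db_pgno_t list[] = { 2, 5, 9, 12 };
+ u_int32_t pos;
+
+ __db_freelist_pos(9, list, 4, &pos); /* pos == 2, exact match. */
+ __db_freelist_pos(7, list, 4, &pos); /* pos == 1, list[1] == 5 < 7. */
+ __db_freelist_pos(1, list, 4, &pos); /* pos == 0, but list[0] > 1. */
+#endif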
+
+static int
+__db_pglistcmp(a, b)
+ const void *a, *b;
+{
+ db_pglist_t *ap, *bp;
+
+ ap = (db_pglist_t *)a;
+ bp = (db_pglist_t *)b;
+
+ return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
+}
+
+/*
+ * __db_freelist_sort -- sort a list of free pages.
+ * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+ */
+void
+__db_freelist_sort(list, nelems)
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
+}
+
+/*
+ * __db_pg_truncate -- find the truncation point in a sorted freelist.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
+ * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *,
+ * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_pg_truncate(dbc, txn,
+ list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
+ DBC *dbc;
+ DB_TXN *txn;
+ db_pglist_t *list;
+ DB_COMPACT *c_data;
+ u_int32_t *nelemp;
+ db_pgno_t free_pgno, *last_pgno;
+ DB_LSN *lsnp;
+ int in_recovery;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pglist_t *lp, *slp;
+ db_pgno_t lpgno, pgno;
+ u_int32_t elems, log_size, tpoint;
+ int last, ret;
+
+ ret = 0;
+ h = NULL;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ elems = tpoint = *nelemp;
+
+ /*
+ * Figure out what (if any) pages can be truncated immediately and
+ * record the place from which we can truncate, so we can do the
+ * memp_ftruncate below. We also use this to avoid ever putting
+ * these pages on the freelist, which we are about to relink.
+ */
+ pgno = *last_pgno;
+ lp = &list[elems - 1];
+ last = 1;
+ while (tpoint != 0) {
+ if (lp->pgno != pgno)
+ break;
+ pgno--;
+ tpoint--;
+ lp--;
+ }
+
+ lp = list;
+ slp = &list[elems];
+ /*
+ * Log the sorted list. We log the whole list so it can be rebuilt.
+ * Don't overflow the log file.
+ */
+again: if (DBC_LOGGING(dbc)) {
+ last = 1;
+ lpgno = *last_pgno;
+ ddbt.size = elems * sizeof(*lp);
+ ddbt.data = lp;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ if (ddbt.size > log_size / 2) {
+ elems = (log_size / 2) / sizeof(*lp);
+ ddbt.size = elems * sizeof(*lp);
+ last = 0;
+ /*
+ * If we stopped after the truncation point
+ * then we need to truncate from here.
+ */
+ if (lp + elems >= &list[tpoint])
+ lpgno = lp[elems - 1].pgno;
+ }
+ /*
+ * If this is not the beginning of the list, fetch the end
+ * of the previous segment. This page becomes the last_free
+ * page and will link to this segment if it is not truncated.
+ */
+ if (lp != list) {
+ if ((ret = __memp_fget(mpf, &lp[-1].pgno,
+ dbc->thread_info, txn, 0, &h)) != 0)
+ goto err;
+ }
+
+ slp = &lp[elems];
+
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
+ lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
+ h != NULL ? &LSN(h) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (h != NULL) {
+ LSN(h) = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ }
+ h = NULL;
+ } else if (!in_recovery)
+ LSN_NOT_LOGGED(*lsnp);
+
+ for (; lp < slp && lp < &list[tpoint]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
+ txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
+ /* Page may have been truncated later. */
+ if (in_recovery && ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if (in_recovery) {
+ if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
+ if ((ret = __memp_dirty(mpf, &h,
+ dbc->thread_info,
+ txn, dbp->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority);
+ goto err;
+ }
+ } else
+ goto skip;
+ }
+
+ if (lp == &list[tpoint - 1])
+ NEXT_PGNO(h) = PGNO_INVALID;
+ else
+ NEXT_PGNO(h) = lp[1].pgno;
+ DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);
+
+ LSN(h) = *lsnp;
+skip: if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /*
+ * If we did not log everything, try again. We start from slp and
+ * try to go to the end of the list.
+ */
+ if (last == 0) {
+ elems = (u_int32_t)(&list[*nelemp] - slp);
+ lp = slp;
+ goto again;
+ }
+
+ /*
+ * Truncate the file. It's possible that the last page is the
+ * only one that got truncated, and that's done in the caller.
+ */
+ if (pgno != *last_pgno) {
+ if (tpoint != *nelemp &&
+ (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
+ goto err;
+ if (c_data)
+ c_data->compact_pages_truncated += *last_pgno - pgno;
+ *last_pgno = pgno;
+ }
+ *nelemp = tpoint;
+
+ if (0) {
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ }
+ return (ret);
+}
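+
+/*
+ * A worked example of the segmenting above, assuming (hypothetically)
+ * a 10MB log file and 16-byte db_pglist_t entries: at most half the
+ * log is consumed per __db_pg_trunc_log record, and longer free lists
+ * are logged in further segments via the "again" loop.
+ */
+#if 0
+ log_size = 10 * 1024 * 1024; /* Hypothetical 10MB log. */
+ elems = (log_size / 2) / 16; /* 327,680 entries per record. */
+#endif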
+
+/*
+ * __db_free_truncate --
+ * Build a sorted free list and truncate free pages at the end
+ * of the file.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
+ * PUBLIC: db_pgno_t *));
+ * PUBLIC: #endif
+ */
+int
+__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ DB_COMPACT *c_data;
+ db_pglist_t **listp;
+ u_int32_t *nelemp;
+ db_pgno_t *last_pgnop;
+{
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pglist_t *list, *lp;
+ db_pgno_t pgno;
+ u_int32_t nelems;
+ int ret, t_ret;
+ size_t size;
+
+ COMPQUIET(flags, 0);
+ list = NULL;
+ meta = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ nelems = 0;
+ if (listp != NULL) {
+ *listp = NULL;
+ DB_ASSERT(env, nelemp != NULL);
+ *nelemp = 0;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
+ &meta)) != 0)
+ goto err;
+
+ if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+ if ((pgno = meta->free) == PGNO_INVALID)
+ goto done;
+
+ size = 128;
+ if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = list;
+
+ do {
+ if (lp == &list[size]) {
+ size *= 2;
+ if ((ret = __os_realloc(env,
+ size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = &list[size / 2];
+ }
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ lp->pgno = pgno;
+ lp->next_pgno = NEXT_PGNO(h);
+ lp->lsn = LSN(h);
+ pgno = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ lp++;
+ } while (pgno != PGNO_INVALID);
+ nelems = (u_int32_t)(lp - list);
+
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Sort the list */
+ __db_freelist_sort(list, nelems);
+
+ if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
+ &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
+ goto err;
+
+ if (nelems == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = list[0].pgno;
+
+done: if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+
+ /*
+ * The truncate point is the number of pages in the free
+ * list back from the last page. The number of pages
+ * in the free list is the number that we can swap in.
+ * Adjust it down slightly, so that if we find higher-numbered
+ * pages early and then free other pages later, we can still
+ * truncate them.
+ */
+ if (c_data) {
+ c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
+ if (c_data->compact_truncate > nelems >> 2)
+ c_data->compact_truncate -= nelems >> 2;
+ }
+
+ if (nelems != 0 && listp != NULL) {
+ *listp = list;
+ *nelemp = nelems;
+ list = NULL;
+ }
+
+err: if (list != NULL)
+ __os_free(env, list);
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
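+
+/*
+ * A worked example of the truncate-point arithmetic above: with
+ * last_pgno == 1000 and nelems == 100, compact_truncate starts at
+ * 900 and, since 900 > 100 / 4, is lowered by 25 to 875, leaving
+ * headroom to truncate pages freed later during compaction.
+ */
+#if 0
+ c_data->compact_truncate = 1000 - 100; /* 900 */
+ if (c_data->compact_truncate > 100 >> 2) /* 900 > 25 */
+ c_data->compact_truncate -= 100 >> 2; /* 875 */
+#endif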
+
+static int
+__db_truncate_freelist(dbc, meta, h, list, start, nelem)
+ DBC *dbc;
+ DBMETA *meta;
+ PAGE *h;
+ db_pgno_t *list;
+ u_int32_t start, nelem;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *last_free, *pg;
+ db_pgno_t *lp, free_pgno, lpgno;
+ db_pglist_t *plist, *pp, *spp;
+ u_int32_t elem, log_size;
+ int last, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ plist = NULL;
+ last_free = NULL;
+ pg = NULL;
+
+ if (start != 0 &&
+ (ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __os_malloc(dbp->env,
+ (nelem - start) * sizeof(*pp), &plist)) != 0)
+ goto err;
+
+ pp = plist;
+ for (lp = &list[start]; lp < &list[nelem]; lp++) {
+ pp->pgno = *lp;
+ if ((ret = __memp_fget(mpf, lp,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ pp->lsn = LSN(pg);
+ pp->next_pgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ pp++;
+ }
+ ZERO_LSN(null_lsn);
+ pp = plist;
+ elem = nelem - start;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+again: ddbt.data = spp = pp;
+ free_pgno = pp->pgno;
+ lpgno = meta->last_pgno;
+ ddbt.size = elem * sizeof(*pp);
+ if (ddbt.size > log_size / 2) {
+ elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
+ ddbt.size = elem * sizeof(*pp);
+ pp += elem;
+ elem = (nelem - start) - (u_int32_t)(pp - plist);
+ lpgno = pp[-1].pgno;
+ last = 0;
+ } else
+ last = 1;
+ /*
+ * Get the page which will link to this section if we abort.
+ * If this is the first segment then its last_free.
+ */
+ if (spp == plist)
+ pg = last_free;
+ else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ &LSN(meta), last == 1 ? DB_FLUSH : 0,
+ PGNO(meta), &LSN(meta),
+ pg != NULL ? PGNO(pg) : PGNO_INVALID,
+ pg != NULL ? &LSN(pg) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (pg != NULL) {
+ LSN(pg) = LSN(meta);
+ if (pg != last_free && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ }
+ if (last == 0)
+ goto again;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ h = NULL;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ list[start], 0)) != 0)
+ goto err;
+ meta->last_pgno = list[start] - 1;
+
+ if (start == 0)
+ meta->free = PGNO_INVALID;
+ else {
+ NEXT_PGNO(last_free) = PGNO_INVALID;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority)) != 0)
+ goto err;
+ last_free = NULL;
+ }
+
+ /* Shrink the number of elements in the list. */
+ ret = __memp_extend_freelist(mpf, start, &list);
+
+err: if (plist != NULL)
+ __os_free(dbp->env, plist);
+
+ /* We need to put the page on error. */
+ if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (pg != NULL && pg != last_free)
+ (void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
+ if (last_free != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority);
+
+ return (ret);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * __db_lprint --
+ * Print out the list of locks currently held by a cursor.
+ *
+ * PUBLIC: int __db_lprint __P((DBC *));
+ */
+int
+__db_lprint(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_LOCKREQ req;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (LOCKING_ON(env)) {
+ req.op = DB_LOCK_DUMP;
+ (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __db_lget --
+ * The standard lock get call.
+ *
+ * PUBLIC: int __db_lget __P((DBC *,
+ * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+ */
+int
+__db_lget(dbc, action, pgno, mode, lkflags, lockp)
+ DBC *dbc;
+ int action;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t lkflags;
+ DB_LOCK *lockp;
+{
+ DB *dbp;
+ DB_LOCKREQ couple[3], *reqp;
+ DB_TXN *txn;
+ ENV *env;
+ int has_timeout, i, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+ /*
+ * We do not always check if we're configured for locking before
+ * calling __db_lget to acquire the lock.
+ */
+ if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
+ (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
+ dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
+ F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
+ (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
+ (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+
+ /*
+ * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
+ * pass that along to the lock call.
+ */
+ if (DB_NONBLOCK(dbc))
+ lkflags |= DB_LOCK_NOWAIT;
+
+ /*
+ * If we're trying to run in exclusive mode, attempt to get an
+ * exclusive database lock. If it is not available then wait
+ * for the lock on the database and clear the exclusive bit.
+ *
+ * If we get an exclusive lock on the database, mark the cursor
+ * with DBC_DONTLOCK to avoid any further locking.
+ */
+ if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
+ dbc->lock.type = DB_DATABASE_LOCK;
+ dbc->lock.pgno = PGNO_BASE_MD;
+ if ((ret = __lock_get(env, dbc->locker, DB_LOCK_NOWAIT,
+ &dbc->lock_dbt, F_ISSET(dbp, DB_AM_RDONLY) ?
+ DB_LOCK_READ : DB_LOCK_WRITE, lockp)) == 0) {
+ if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
+ F_SET(dbc, DBC_DONTLOCK);
+ if (!IS_REAL_TXN(txn))
+ dbc->mylock = *lockp;
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+ } else if (ret == DB_LOCK_NOTGRANTED &&
+ (lkflags & DB_LOCK_NOWAIT) == 0) {
+ if ((ret = __lock_get(env, dbc->locker, 0,
+ &dbc->lock_dbt, DB_LOCK_WRITE, lockp)) != 0)
+ return (ret);
+ F_CLR(dbp->mpf->mfp, MP_DATABASE_LOCKING);
+ if ((ret = __lock_put(env, lockp)) != 0)
+ return (ret);
+ LOCK_INIT(*lockp);
+ } else if (ret != 0)
+ return (ret);
+ }
+
+ dbc->lock.pgno = pgno;
+ if (lkflags & DB_LOCK_RECORD)
+ dbc->lock.type = DB_RECORD_LOCK;
+ else
+ dbc->lock.type = DB_PAGE_LOCK;
+ lkflags &= ~DB_LOCK_RECORD;
+
+ if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
+ mode = DB_LOCK_READ_UNCOMMITTED;
+
+ has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
+ (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
+
+ /*
+ * Transactional locking.
+ * Hold on to the previous read lock only if we are in full isolation.
+ * COUPLE_ALWAYS indicates we are holding an interior node which need
+ * not be isolated.
+ * Downgrade write locks if we are supporting dirty readers and the
+ * update did not have an error.
+ */
+ if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
+ !LOCK_ISSET(*lockp))
+ action = 0;
+ else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else
+ action = 0;
+
+ i = 0;
+ switch (action) {
+ default:
+ if (has_timeout)
+ goto do_couple;
+ ret = __lock_get(env,
+ dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
+ break;
+
+ case LCK_DOWNGRADE:
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].lock = *lockp;
+ couple[0].mode = DB_LOCK_WWRITE;
+ UMRW_SET(couple[0].timeout);
+ i++;
+ /* FALLTHROUGH */
+ case LCK_COUPLE:
+do_couple: couple[i].op = has_timeout ? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
+ couple[i].obj = &dbc->lock_dbt;
+ couple[i].mode = mode;
+ UMRW_SET(couple[i].timeout);
+ i++;
+ if (has_timeout)
+ couple[0].timeout =
+ F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
+ if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
+ couple[i].op = DB_LOCK_PUT;
+ couple[i].lock = *lockp;
+ i++;
+ }
+
+ ret = __lock_vec(env,
+ dbc->locker, lkflags, couple, i, &reqp);
+ if (ret == 0 || reqp == &couple[i - 1])
+ *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
+ break;
+ }
+
+ if (txn != NULL && ret == DB_LOCK_DEADLOCK)
+ F_SET(txn, TXN_DEADLOCK);
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
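+
+/*
+ * A distilled sketch of the LCK_COUPLE request built above: a single
+ * __lock_vec call atomically acquires the lock on the new page and
+ * releases the old one, as when descending from a parent page to a
+ * child page.
+ */
+#if 0
+ DB_LOCKREQ couple[2], *reqp;
+
+ couple[0].op = DB_LOCK_GET; /* Get the new page lock... */
+ couple[0].obj = &dbc->lock_dbt;
+ couple[0].mode = mode;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT; /* ...then release the old one. */
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, lkflags, couple, 2, &reqp);
+#endif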
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_haslock --
+ * Determine if this locker holds a particular lock.
+ * Returns 0 if lock is held, non-zero otherwise.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: int __db_haslock __P((ENV *, DB_LOCKER *,
+ * PUBLIC: DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__db_haslock(env, locker, dbmfp, pgno, mode, type)
+ ENV *env;
+ DB_LOCKER *locker;
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t type;
+{
+ DBT lkdata;
+ DB_LOCK lock;
+ DB_LOCK_ILOCK ilock;
+
+ memset(&lkdata, 0, sizeof(lkdata));
+ lkdata.data = &ilock;
+ lkdata.size = sizeof(ilock);
+
+ memcpy(ilock.fileid, dbmfp->fileid, DB_FILE_ID_LEN);
+ ilock.pgno = pgno;
+ ilock.type = type;
+
+ return (__lock_get(env, locker, DB_LOCK_CHECK, &lkdata, mode, &lock));
+}
+/*
+ * __db_has_pagelock --
+ * Determine if this locker holds a particular page lock.
+ * Returns 0 if lock is held, non-zero otherwise.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: int __db_has_pagelock __P((ENV *, DB_LOCKER *,
+ * PUBLIC: DB_MPOOLFILE *, PAGE *, db_lockmode_t));
+ * PUBLIC: #endif
+ */
+int
+__db_has_pagelock(env, locker, dbmfp, pagep, mode)
+ ENV *env;
+ DB_LOCKER *locker;
+ DB_MPOOLFILE *dbmfp;
+ PAGE *pagep;
+ db_lockmode_t mode;
+{
+ int ret;
+
+ switch (pagep->type) {
+ case P_OVERFLOW:
+ case P_INVALID:
+ case P_QAMDATA:
+ case P_QAMMETA:
+ case P_IHEAP:
+ return (0);
+ case P_HASH:
+ if (PREV_PGNO(pagep) != PGNO_INVALID)
+ return (0);
+ break;
+ default:
+ break;
+ }
+ if ((ret = __db_haslock(env,
+ locker, dbmfp, pagep->pgno, mode, DB_PAGE_LOCK)) != 0)
+ ret = __db_haslock(env,
+ locker, dbmfp, PGNO_BASE_MD, mode, DB_DATABASE_LOCK);
+ return (ret);
+}
+#endif
+
+/*
+ * __db_lput --
+ * The standard lock put call.
+ *
+ * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
+ */
+int
+__db_lput(dbc, lockp)
+ DBC *dbc;
+ DB_LOCK *lockp;
+{
+ DB_LOCKREQ couple[2], *reqp;
+ ENV *env;
+ int action, ret;
+
+ /*
+ * Transactional locking.
+ * Hold on to the read locks only if we are in full isolation.
+ * Downgrade write locks if we are supporting dirty readers unless
+ * there was an error.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else if (dbc->txn == NULL)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else
+ action = 0;
+
+ env = dbc->env;
+ switch (action) {
+ case LCK_COUPLE:
+ ret = __lock_put(env, lockp);
+ break;
+ case LCK_DOWNGRADE:
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].mode = DB_LOCK_WWRITE;
+ couple[0].lock = *lockp;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT;
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
+ if (ret == 0 || reqp == &couple[1])
+ *lockp = couple[0].lock;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/src/db/db_method.c b/src/db/db_method.c
new file mode 100644
index 00000000..82d03e5f
--- /dev/null
+++ b/src/db/db_method.c
@@ -0,0 +1,1117 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_get_byteswapped __P((DB *, int *));
+static int __db_get_dbname __P((DB *, const char **, const char **));
+static DB_ENV *__db_get_env __P((DB *));
+static void __db_get_msgcall
+ __P((DB *, void (**)(const DB_ENV *, const char *)));
+static DB_MPOOLFILE *__db_get_mpf __P((DB *));
+static int __db_get_multiple __P((DB *));
+static int __db_get_transactional __P((DB *));
+static int __db_get_type __P((DB *, DBTYPE *dbtype));
+static int __db_init __P((DB *, u_int32_t));
+static int __db_get_alloc __P((DB *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+static int __db_set_alloc __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+static int __db_get_append_recno __P((DB *,
+ int (**)(DB *, DBT *, db_recno_t)));
+static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *));
+static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int));
+static int __db_get_create_dir __P((DB *, const char **));
+static int __db_set_create_dir __P((DB *, const char *));
+static int __db_get_dup_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __db_set_dup_compare
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+static int __db_get_encrypt_flags __P((DB *, u_int32_t *));
+static int __db_set_encrypt __P((DB *, const char *, u_int32_t));
+static int __db_get_feedback __P((DB *, void (**)(DB *, int, int)));
+static int __db_set_feedback __P((DB *, void (*)(DB *, int, int)));
+static int __db_get_lk_exclusive __P((DB *, int *, int *));
+static int __db_set_lk_exclusive __P((DB *, int));
+static void __db_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+static int __db_get_pagesize __P((DB *, u_int32_t *));
+static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int)));
+static int __db_set_priority __P((DB *, DB_CACHE_PRIORITY));
+static int __db_get_priority __P((DB *, DB_CACHE_PRIORITY *));
+static void __db_get_errcall __P((DB *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+static void __db_set_errcall
+ __P((DB *, void (*)(const DB_ENV *, const char *, const char *)));
+static void __db_get_errfile __P((DB *, FILE **));
+static void __db_set_errfile __P((DB *, FILE *));
+static void __db_get_errpfx __P((DB *, const char **));
+static void __db_set_errpfx __P((DB *, const char *));
+static void __db_set_msgcall
+ __P((DB *, void (*)(const DB_ENV *, const char *)));
+static void __db_get_msgfile __P((DB *, FILE **));
+static void __db_set_msgfile __P((DB *, FILE *));
+static int __db_get_assoc_flags __P((DB *, u_int32_t *));
+static void __dbh_err __P((DB *, int, const char *, ...));
+static void __dbh_errx __P((DB *, const char *, ...));
+
+/*
+ * db_create --
+ * DB constructor.
+ *
+ * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t));
+ */
+int
+db_create(dbpp, dbenv, flags)
+ DB **dbpp;
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ ip = NULL;
+ env = dbenv == NULL ? NULL : dbenv->env;
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case 0:
+ break;
+ case DB_XA_CREATE:
+ if (dbenv != NULL) {
+ __db_errx(env, DB_STR("0504",
+ "XA applications may not specify an environment to db_create"));
+ return (EINVAL);
+ }
+
+ /*
+ * If it's an XA database, open it within the XA environment,
+ * taken from the global list of environments. (When the XA
+ * transaction manager called our xa_start() routine the
+ * "current" environment was moved to the start of the list.
+ */
+ env = TAILQ_FIRST(&DB_GLOBAL(envq));
+ if (env == NULL) {
+ __db_errx(env, DB_STR("0505",
+ "Cannot open XA database before XA is enabled"));
+ return (EINVAL);
+ }
+ break;
+ default:
+ return (__db_ferr(env, "db_create", 0));
+ }
+
+ if (env != NULL)
+ ENV_ENTER(env, ip);
+
+ /*
+ * If we are opening an XA database, make sure we don't have a global XA
+ * transaction running.
+ */
+ if (LF_ISSET(DB_XA_CREATE)) {
+ XA_NO_TXN(ip, ret);
+ if (ret != 0)
+ goto err;
+ }
+
+ ret = __db_create_internal(dbpp, env, flags);
+err: if (env != NULL)
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
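+
+/*
+ * A minimal usage sketch for the constructor above: create a handle
+ * with no environment, open a btree and close it. The file name
+ * "example.db" is hypothetical.
+ */
+#if 0
+ DB *dbp;
+ int ret;
+
+ if ((ret = db_create(&dbp, NULL, 0)) != 0)
+ return (ret);
+ if ((ret = dbp->open(dbp,
+ NULL, "example.db", NULL, DB_BTREE, DB_CREATE, 0664)) != 0) {
+ (void)dbp->close(dbp, 0);
+ return (ret);
+ }
+ /* ... reads and writes on dbp ... */
+ ret = dbp->close(dbp, 0);
+#endif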
+
+/*
+ * __db_create_internal --
+ * DB constructor internal routine.
+ *
+ * PUBLIC: int __db_create_internal __P((DB **, ENV *, u_int32_t));
+ */
+int
+__db_create_internal(dbpp, env, flags)
+ DB **dbpp;
+ ENV *env;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_ENV *dbenv;
+ DB_REP *db_rep;
+ int ret;
+
+ *dbpp = NULL;
+
+ /* If we don't have an environment yet, allocate a local one. */
+ if (env == NULL) {
+ if ((ret = db_env_create(&dbenv, 0)) != 0)
+ return (ret);
+ env = dbenv->env;
+ F_SET(env, ENV_DBLOCAL);
+ } else
+ dbenv = env->dbenv;
+
+ /* Allocate and initialize the DB handle. */
+ if ((ret = __os_calloc(env, 1, sizeof(*dbp), &dbp)) != 0)
+ goto err;
+
+ dbp->dbenv = env->dbenv;
+ dbp->env = env;
+ if ((ret = __db_init(dbp, flags)) != 0)
+ goto err;
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ ++env->db_ref;
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Set the replication timestamp; it's 0 if we're not in a replicated
+ * environment. Don't acquire a lock to read the value, even though
+ * it's opaque: all we check later is value equality, nothing else.
+ */
+ dbp->timestamp = REP_ON(env) ?
+ ((REGENV *)env->reginfo->primary)->rep_timestamp : 0;
+ /*
+ * Set the replication generation number for fid management; valid
+ * replication generations start at 1. Don't acquire a lock to
+ * read the value. All we check later is value equality.
+ */
+ db_rep = env->rep_handle;
+ dbp->fid_gen = REP_ON(env) ? ((REP *)db_rep->region)->gen : 0;
+
+ /* Open a backing DB_MPOOLFILE handle in the memory pool. */
+ if ((ret = __memp_fcreate(env, &dbp->mpf)) != 0)
+ goto err;
+
+ dbp->type = DB_UNKNOWN;
+
+ *dbpp = dbp;
+ return (0);
+
+err: if (dbp != NULL) {
+ if (dbp->mpf != NULL)
+ (void)__memp_fclose(dbp->mpf, 0);
+ __os_free(env, dbp);
+ }
+
+ if (F_ISSET(env, ENV_DBLOCAL))
+ (void)__env_close(dbenv, 0);
+
+ return (ret);
+}
+
+/*
+ * __db_init --
+ * Initialize a DB structure.
+ */
+static int
+__db_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ int ret;
+
+ dbp->locker = NULL;
+ dbp->alt_close = NULL;
+ LOCK_INIT(dbp->handle_lock);
+
+ TAILQ_INIT(&dbp->free_queue);
+ TAILQ_INIT(&dbp->active_queue);
+ TAILQ_INIT(&dbp->join_queue);
+ LIST_INIT(&dbp->s_secondaries);
+
+ FLD_SET(dbp->am_ok,
+ DB_OK_BTREE | DB_OK_HASH | DB_OK_HEAP | DB_OK_QUEUE | DB_OK_RECNO);
+
+ /* DB PUBLIC HANDLE LIST BEGIN */
+ dbp->associate = __db_associate_pp;
+ dbp->associate_foreign = __db_associate_foreign_pp;
+ dbp->close = __db_close_pp;
+ dbp->compact = __db_compact_pp;
+ dbp->cursor = __db_cursor_pp;
+ dbp->del = __db_del_pp;
+ dbp->dump = __db_dump_pp;
+ dbp->err = __dbh_err;
+ dbp->errx = __dbh_errx;
+ dbp->exists = __db_exists;
+ dbp->fd = __db_fd_pp;
+ dbp->get = __db_get_pp;
+ dbp->get_alloc = __db_get_alloc;
+ dbp->get_append_recno = __db_get_append_recno;
+ dbp->get_assoc_flags = __db_get_assoc_flags;
+ dbp->get_byteswapped = __db_get_byteswapped;
+ dbp->get_cachesize = __db_get_cachesize;
+ dbp->get_create_dir = __db_get_create_dir;
+ dbp->get_dbname = __db_get_dbname;
+ dbp->get_dup_compare = __db_get_dup_compare;
+ dbp->get_encrypt_flags = __db_get_encrypt_flags;
+ dbp->get_env = __db_get_env;
+ dbp->get_errcall = __db_get_errcall;
+ dbp->get_errfile = __db_get_errfile;
+ dbp->get_errpfx = __db_get_errpfx;
+ dbp->get_feedback = __db_get_feedback;
+ dbp->get_flags = __db_get_flags;
+ dbp->get_lorder = __db_get_lorder;
+ dbp->get_mpf = __db_get_mpf;
+ dbp->get_msgcall = __db_get_msgcall;
+ dbp->get_msgfile = __db_get_msgfile;
+ dbp->get_multiple = __db_get_multiple;
+ dbp->get_open_flags = __db_get_open_flags;
+ dbp->get_partition_dirs = __partition_get_dirs;
+ dbp->get_partition_callback = __partition_get_callback;
+ dbp->get_partition_keys = __partition_get_keys;
+ dbp->get_pagesize = __db_get_pagesize;
+ dbp->get_priority = __db_get_priority;
+ dbp->get_transactional = __db_get_transactional;
+ dbp->get_type = __db_get_type;
+ dbp->join = __db_join_pp;
+ dbp->key_range = __db_key_range_pp;
+ dbp->get_lk_exclusive = __db_get_lk_exclusive;
+ dbp->set_lk_exclusive = __db_set_lk_exclusive;
+ dbp->open = __db_open_pp;
+ dbp->pget = __db_pget_pp;
+ dbp->put = __db_put_pp;
+ dbp->remove = __db_remove_pp;
+ dbp->rename = __db_rename_pp;
+ dbp->set_alloc = __db_set_alloc;
+ dbp->set_append_recno = __db_set_append_recno;
+ dbp->set_cachesize = __db_set_cachesize;
+ dbp->set_create_dir = __db_set_create_dir;
+ dbp->set_dup_compare = __db_set_dup_compare;
+ dbp->set_encrypt = __db_set_encrypt;
+ dbp->set_errcall = __db_set_errcall;
+ dbp->set_errfile = __db_set_errfile;
+ dbp->set_errpfx = __db_set_errpfx;
+ dbp->set_feedback = __db_set_feedback;
+ dbp->set_flags = __db_set_flags;
+ dbp->set_lorder = __db_set_lorder;
+ dbp->set_msgcall = __db_set_msgcall;
+ dbp->set_msgfile = __db_set_msgfile;
+ dbp->set_pagesize = __db_set_pagesize;
+ dbp->set_paniccall = __db_set_paniccall;
+ dbp->set_partition = __partition_set;
+ dbp->set_partition_dirs = __partition_set_dirs;
+ dbp->set_priority = __db_set_priority;
+ dbp->sort_multiple = __db_sort_multiple;
+ dbp->stat = __db_stat_pp;
+ dbp->stat_print = __db_stat_print_pp;
+ dbp->sync = __db_sync_pp;
+ dbp->truncate = __db_truncate_pp;
+ dbp->upgrade = __db_upgrade_pp;
+ dbp->verify = __db_verify_pp;
+ /* DB PUBLIC HANDLE LIST END */
+
+ /* Access method specific. */
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __heap_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+ COMPQUIET(flags, 0);
+
+ return (0);
+}
+
+/*
+ * __dbh_am_chk --
+ * Error if an unreasonable method is called.
+ *
+ * PUBLIC: int __dbh_am_chk __P((DB *, u_int32_t));
+ */
+int
+__dbh_am_chk(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ /*
+ * We start out allowing any access methods to be called, and as the
+ * application calls the methods the options become restricted. The
+ * idea is to quit as soon as an illegal method combination is called.
+ */
+ if ((LF_ISSET(DB_OK_BTREE) && FLD_ISSET(dbp->am_ok, DB_OK_BTREE)) ||
+ (LF_ISSET(DB_OK_HASH) && FLD_ISSET(dbp->am_ok, DB_OK_HASH)) ||
+ (LF_ISSET(DB_OK_HEAP) && FLD_ISSET(dbp->am_ok, DB_OK_HEAP)) ||
+ (LF_ISSET(DB_OK_QUEUE) && FLD_ISSET(dbp->am_ok, DB_OK_QUEUE)) ||
+ (LF_ISSET(DB_OK_RECNO) && FLD_ISSET(dbp->am_ok, DB_OK_RECNO))) {
+ FLD_CLR(dbp->am_ok, ~flags);
+ return (0);
+ }
+
+ __db_errx(dbp->env, DB_STR("0506",
+"call implies an access method which is inconsistent with previous calls"));
+ return (EINVAL);
+}
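+
+/*
+ * Editor's illustration of the narrowing above (a sketch; "my_cmp" and
+ * "my_append" are hypothetical callbacks): DB->set_dup_compare is legal
+ * only for btree and hash, so after it succeeds am_ok has shrunk to
+ * DB_OK_BTREE | DB_OK_HASH. A later call to a queue/recno-only method
+ * such as DB->set_append_recno then fails with EINVAL:
+ *
+ *	ret = dbp->set_dup_compare(dbp, my_cmp);	-- am_ok narrowed
+ *	ret = dbp->set_append_recno(dbp, my_append);	-- EINVAL
+ */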
+
+/*
+ * __dbh_err --
+ * Db.err method.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_err(DB *dbp, int error, const char *fmt, ...)
+#else
+__dbh_err(dbp, error, fmt, va_alist)
+ DB *dbp;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message with error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, error, DB_ERROR_SET, 1, fmt);
+}
+
+/*
+ * __dbh_errx --
+ * Db.errx method.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_errx(DB *dbp, const char *fmt, ...)
+#else
+__dbh_errx(dbp, fmt, va_alist)
+ DB *dbp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message without error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
+}
+
+/*
+ * __db_get_byteswapped --
+ * Return if database requires byte swapping.
+ */
+static int
+__db_get_byteswapped(dbp, isswapped)
+ DB *dbp;
+ int *isswapped;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_byteswapped");
+
+ *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __db_get_dbname --
+ * Get the name of the database as passed to DB->open.
+ */
+static int
+__db_get_dbname(dbp, fnamep, dnamep)
+ DB *dbp;
+ const char **fnamep, **dnamep;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_dbname");
+
+ if (fnamep != NULL)
+ *fnamep = dbp->fname;
+ if (dnamep != NULL)
+ *dnamep = dbp->dname;
+ return (0);
+}
+
+/*
+ * __db_get_env --
+ * Get the DB_ENV handle that was passed to db_create.
+ */
+static DB_ENV *
+__db_get_env(dbp)
+ DB *dbp;
+{
+ return (dbp->dbenv);
+}
+
+/*
+ * __db_get_mpf --
+ * Get the underlying DB_MPOOLFILE handle.
+ */
+static DB_MPOOLFILE *
+__db_get_mpf(dbp)
+ DB *dbp;
+{
+ return (dbp->mpf);
+}
+
+/*
+ * __db_get_multiple --
+ * Return whether this DB handle references a physical file with multiple
+ * databases.
+ */
+static int
+__db_get_multiple(dbp)
+ DB *dbp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_multiple");
+
+ /*
+ * Only return TRUE if the handle is for the master database, not for
+ * any subdatabase in the physical file. If it's a Btree, with the
+ * subdatabases flag set, and the meta-data page has the right value,
+ * return TRUE. (We don't need to check it's a Btree, I suppose, but
+ * it doesn't hurt.)
+ */
+ return (dbp->type == DB_BTREE &&
+ F_ISSET(dbp, DB_AM_SUBDB) &&
+ dbp->meta_pgno == PGNO_BASE_MD ? 1 : 0);
+}
+
+/*
+ * __db_get_transactional --
+ * Return whether this database was created in a transaction.
+ */
+static int
+__db_get_transactional(dbp)
+ DB *dbp;
+{
+ return (F_ISSET(dbp, DB_AM_TXN) ? 1 : 0);
+}
+
+/*
+ * __db_get_type --
+ * Return type of underlying database.
+ */
+static int
+__db_get_type(dbp, dbtype)
+ DB *dbp;
+ DBTYPE *dbtype;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_type");
+
+ *dbtype = dbp->type;
+ return (0);
+}
+
+/*
+ * __db_get_append_recno --
+ * Get record number append routine.
+ */
+static int
+__db_get_append_recno(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+ if (funcp)
+ *funcp = dbp->db_append_recno;
+
+ return (0);
+}
+
+/*
+ * __db_set_append_recno --
+ * Set record number append routine.
+ */
+static int
+__db_set_append_recno(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_append_recno");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ dbp->db_append_recno = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_cachesize --
+ * Get underlying cache size.
+ */
+static int
+__db_get_cachesize(dbp, cache_gbytesp, cache_bytesp, ncachep)
+ DB *dbp;
+ u_int32_t *cache_gbytesp, *cache_bytesp;
+ int *ncachep;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_cachesize");
+
+ return (__memp_get_cachesize(dbp->dbenv,
+ cache_gbytesp, cache_bytesp, ncachep));
+}
+
+/*
+ * __db_set_cachesize --
+ * Set underlying cache size.
+ */
+static int
+__db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache)
+ DB *dbp;
+ u_int32_t cache_gbytes, cache_bytes;
+ int ncache;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_cachesize");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_cachesize");
+
+ return (__memp_set_cachesize(
+ dbp->dbenv, cache_gbytes, cache_bytes, ncache));
+}
+
+static int
+__db_set_create_dir(dbp, dir)
+ DB *dbp;
+ const char *dir;
+{
+ DB_ENV *dbenv;
+ int i;
+
+ dbenv = dbp->dbenv;
+
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+
+ if (i == dbenv->data_next) {
+ __db_errx(dbp->env, DB_STR_A("0507",
+ "Directory %s not in environment list.", "%s"), dir);
+ return (EINVAL);
+ }
+
+ dbp->dirname = dbenv->db_data_dir[i];
+ return (0);
+}
+
+static int
+__db_get_create_dir(dbp, dirp)
+ DB *dbp;
+ const char **dirp;
+{
+ *dirp = dbp->dirname;
+ return (0);
+}
+
+/*
+ * __db_get_dup_compare --
+ * Get duplicate comparison routine.
+ */
+static int
+__db_get_dup_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if (funcp != NULL) {
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ *funcp =
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare;
+ } else
+#endif
+ *funcp = dbp->dup_compare;
+ }
+
+ return (0);
+}
+
+/*
+ * __db_set_dup_compare --
+ * Set duplicate comparison routine.
+ */
+static int
+__db_set_dup_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ int ret;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_dup_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ dbp->dup_compare = __bam_compress_dupcmp;
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare = func;
+ } else
+#endif
+ dbp->dup_compare = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_encrypt_flags --
+ *	Get the database encryption flags.
+ */
+static int
+__db_get_encrypt_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_encrypt_flags");
+
+ return (__env_get_encrypt_flags(dbp->dbenv, flagsp));
+}
+
+/*
+ * __db_set_encrypt --
+ * Set database passwd.
+ */
+static int
+__db_set_encrypt(dbp, passwd, flags)
+ DB *dbp;
+ const char *passwd;
+ u_int32_t flags;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_encrypt");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_encrypt");
+
+ if ((ret = __env_set_encrypt(dbp->dbenv, passwd, flags)) != 0)
+ return (ret);
+
+ /*
+ * In a real env, this gets initialized with the region. In a local
+ * env, we must do it here.
+ */
+ db_cipher = dbp->env->crypto_handle;
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ (ret = db_cipher->init(dbp->env, db_cipher)) != 0)
+ return (ret);
+
+ return (__db_set_flags(dbp, DB_ENCRYPT));
+}
+
+static void
+__db_get_errcall(dbp, errcallp)
+ DB *dbp;
+ void (**errcallp) __P((const DB_ENV *, const char *, const char *));
+{
+ __env_get_errcall(dbp->dbenv, errcallp);
+}
+
+static void
+__db_set_errcall(dbp, errcall)
+ DB *dbp;
+ void (*errcall) __P((const DB_ENV *, const char *, const char *));
+{
+ __env_set_errcall(dbp->dbenv, errcall);
+}
+
+static void
+__db_get_errfile(dbp, errfilep)
+ DB *dbp;
+ FILE **errfilep;
+{
+ __env_get_errfile(dbp->dbenv, errfilep);
+}
+
+static void
+__db_set_errfile(dbp, errfile)
+ DB *dbp;
+ FILE *errfile;
+{
+ __env_set_errfile(dbp->dbenv, errfile);
+}
+
+static void
+__db_get_errpfx(dbp, errpfxp)
+ DB *dbp;
+ const char **errpfxp;
+{
+ __env_get_errpfx(dbp->dbenv, errpfxp);
+}
+
+static void
+__db_set_errpfx(dbp, errpfx)
+ DB *dbp;
+ const char *errpfx;
+{
+ __env_set_errpfx(dbp->dbenv, errpfx);
+}
+
+static int
+__db_get_feedback(dbp, feedbackp)
+ DB *dbp;
+ void (**feedbackp) __P((DB *, int, int));
+{
+ if (feedbackp != NULL)
+ *feedbackp = dbp->db_feedback;
+ return (0);
+}
+
+static int
+__db_set_feedback(dbp, feedback)
+ DB *dbp;
+ void (*feedback) __P((DB *, int, int));
+{
+ dbp->db_feedback = feedback;
+ return (0);
+}
+
+static int
+__db_get_lk_exclusive(dbp, onoff, nowait)
+ DB *dbp;
+ int *onoff;
+ int *nowait;
+{
+ *onoff = (F2_ISSET(dbp, DB2_AM_EXCL) ? 1 : 0);
+ *nowait = (F2_ISSET(dbp, DB2_AM_NOWAIT) ? 1 : 0);
+ return (0);
+}
+
+static int
+__db_set_lk_exclusive(dbp, nowait)
+ DB *dbp;
+ int nowait;
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lk_exclusive");
+
+ F2_CLR(dbp, DB2_AM_NOWAIT);
+ F2_SET(dbp, (nowait ? DB2_AM_NOWAIT|DB2_AM_EXCL :
+ DB2_AM_EXCL));
+ return (0);
+}
+
+/*
+ * __db_map_flags --
+ * Maps between public and internal flag values.
+ * This function doesn't check for validity, so it can't fail.
+ */
+static void
+__db_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_CHKSUM)) {
+ FLD_SET(*outflagsp, DB_AM_CHKSUM);
+ FLD_CLR(*inflagsp, DB_CHKSUM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_ENCRYPT)) {
+ FLD_SET(*outflagsp, DB_AM_ENCRYPT | DB_AM_CHKSUM);
+ FLD_CLR(*inflagsp, DB_ENCRYPT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_TXN_NOT_DURABLE)) {
+ FLD_SET(*outflagsp, DB_AM_NOT_DURABLE);
+ FLD_CLR(*inflagsp, DB_TXN_NOT_DURABLE);
+ }
+}
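+
+/*
+ * Editor's note: the mapping is consuming -- each public flag is cleared
+ * from *inflagsp as its internal DB_AM_* equivalent is set in *outflagsp,
+ * so whatever remains in *inflagsp afterward is unrecognized. A sketch:
+ *
+ *	u_int32_t in = DB_ENCRYPT, out = 0;
+ *
+ *	__db_map_flags(dbp, &in, &out);
+ *	-- now in == 0 and out == (DB_AM_ENCRYPT | DB_AM_CHKSUM),
+ *	-- since encrypted databases always carry checksums.
+ */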
+
+/*
+ * __db_get_assoc_flags --
+ *	Get the flags passed to DB->associate.
+ */
+static int
+__db_get_assoc_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_assoc_flags");
+
+ *flagsp = dbp->s_assoc_flags;
+ return (0);
+}
+
+/*
+ * __db_get_flags --
+ * The DB->get_flags method.
+ *
+ * PUBLIC: int __db_get_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ static const u_int32_t db_flags[] = {
+ DB_CHKSUM,
+ DB_DUP,
+ DB_DUPSORT,
+ DB_ENCRYPT,
+#ifdef HAVE_QUEUE
+ DB_INORDER,
+#endif
+ DB_RECNUM,
+ DB_RENUMBER,
+ DB_REVSPLITOFF,
+ DB_SNAPSHOT,
+ DB_TXN_NOT_DURABLE,
+ 0
+ };
+ u_int32_t f, flags, mapped_flag;
+ int i;
+
+ flags = 0;
+ for (i = 0; (f = db_flags[i]) != 0; i++) {
+ mapped_flag = 0;
+ __db_map_flags(dbp, &f, &mapped_flag);
+ __bam_map_flags(dbp, &f, &mapped_flag);
+ __ram_map_flags(dbp, &f, &mapped_flag);
+#ifdef HAVE_QUEUE
+ __qam_map_flags(dbp, &f, &mapped_flag);
+#endif
+ DB_ASSERT(dbp->env, f == 0);
+ if (F_ISSET(dbp, mapped_flag) == mapped_flag)
+ LF_SET(db_flags[i]);
+ }
+
+ *flagsp = flags;
+ return (0);
+}
+
+/*
+ * __db_set_flags --
+ * DB->set_flags.
+ *
+ * PUBLIC: int __db_set_flags __P((DB *, u_int32_t));
+ */
+int
+__db_set_flags(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (LF_ISSET(DB_ENCRYPT) && !CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0508",
+ "Database environment not configured for encryption"));
+ return (EINVAL);
+ }
+ if (LF_ISSET(DB_TXN_NOT_DURABLE))
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN);
+
+ __db_map_flags(dbp, &flags, &dbp->flags);
+
+ if ((ret = __bam_set_flags(dbp, &flags)) != 0)
+ return (ret);
+ if ((ret = __ram_set_flags(dbp, &flags)) != 0)
+ return (ret);
+#ifdef HAVE_QUEUE
+ if ((ret = __qam_set_flags(dbp, &flags)) != 0)
+ return (ret);
+#endif
+
+ return (flags == 0 ? 0 : __db_ferr(env, "DB->set_flags", 0));
+}
+
+/*
+ * __db_get_lorder --
+ *	Get the database byte order (1234 or 4321).
+ *
+ * PUBLIC: int __db_get_lorder __P((DB *, int *));
+ */
+int
+__db_get_lorder(dbp, db_lorderp)
+ DB *dbp;
+ int *db_lorderp;
+{
+ int ret;
+
+ /* Flag if the specified byte order requires swapping. */
+ switch (ret = __db_byteorder(dbp->env, 1234)) {
+ case 0:
+ *db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 4321 : 1234;
+ break;
+ case DB_SWAPBYTES:
+ *db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 1234 : 4321;
+ break;
+ default:
+ return (ret);
+ /* NOTREACHED */
+ }
+
+ return (0);
+}
+
+/*
+ * __db_set_lorder --
+ *	Set the database byte order.
+ *
+ * PUBLIC: int __db_set_lorder __P((DB *, int));
+ */
+int
+__db_set_lorder(dbp, db_lorder)
+ DB *dbp;
+ int db_lorder;
+{
+ int ret;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lorder");
+
+ /* Flag if the specified byte order requires swapping. */
+ switch (ret = __db_byteorder(dbp->env, db_lorder)) {
+ case 0:
+ F_CLR(dbp, DB_AM_SWAP);
+ break;
+ case DB_SWAPBYTES:
+ F_SET(dbp, DB_AM_SWAP);
+ break;
+ default:
+ return (ret);
+ /* NOTREACHED */
+ }
+ return (0);
+}
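+
+/*
+ * Editor's example: __db_byteorder returns DB_SWAPBYTES exactly when the
+ * requested order differs from the host's native order, so on a
+ * little-endian host:
+ *
+ *	dbp->set_lorder(dbp, 1234);	-- native order, clears DB_AM_SWAP
+ *	dbp->set_lorder(dbp, 4321);	-- foreign order, sets DB_AM_SWAP
+ */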
+
+static int
+__db_get_alloc(dbp, mal_funcp, real_funcp, free_funcp)
+ DB *dbp;
+ void *(**mal_funcp) __P((size_t));
+ void *(**real_funcp) __P((void *, size_t));
+ void (**free_funcp) __P((void *));
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_alloc");
+
+ return (__env_get_alloc(dbp->dbenv, mal_funcp,
+ real_funcp, free_funcp));
+}
+
+static int
+__db_set_alloc(dbp, mal_func, real_func, free_func)
+ DB *dbp;
+ void *(*mal_func) __P((size_t));
+ void *(*real_func) __P((void *, size_t));
+ void (*free_func) __P((void *));
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_alloc");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_alloc");
+
+ return (__env_set_alloc(dbp->dbenv, mal_func, real_func, free_func));
+}
+
+static void
+__db_get_msgcall(dbp, msgcallp)
+ DB *dbp;
+ void (**msgcallp) __P((const DB_ENV *, const char *));
+{
+ __env_get_msgcall(dbp->dbenv, msgcallp);
+}
+
+static void
+__db_set_msgcall(dbp, msgcall)
+ DB *dbp;
+ void (*msgcall) __P((const DB_ENV *, const char *));
+{
+ __env_set_msgcall(dbp->dbenv, msgcall);
+}
+
+static void
+__db_get_msgfile(dbp, msgfilep)
+ DB *dbp;
+ FILE **msgfilep;
+{
+ __env_get_msgfile(dbp->dbenv, msgfilep);
+}
+
+static void
+__db_set_msgfile(dbp, msgfile)
+ DB *dbp;
+ FILE *msgfile;
+{
+ __env_set_msgfile(dbp->dbenv, msgfile);
+}
+
+static int
+__db_get_pagesize(dbp, db_pagesizep)
+ DB *dbp;
+ u_int32_t *db_pagesizep;
+{
+ *db_pagesizep = dbp->pgsize;
+ return (0);
+}
+
+/*
+ * __db_set_pagesize --
+ * DB->set_pagesize
+ *
+ * PUBLIC: int __db_set_pagesize __P((DB *, u_int32_t));
+ */
+int
+__db_set_pagesize(dbp, db_pagesize)
+ DB *dbp;
+ u_int32_t db_pagesize;
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_pagesize");
+
+ if (db_pagesize < DB_MIN_PGSIZE) {
+ __db_errx(dbp->env, DB_STR_A("0509",
+ "page sizes may not be smaller than %lu", "%lu"),
+ (u_long)DB_MIN_PGSIZE);
+ return (EINVAL);
+ }
+ if (db_pagesize > DB_MAX_PGSIZE) {
+ __db_errx(dbp->env, DB_STR_A("0510",
+ "page sizes may not be larger than %lu", "%lu"),
+ (u_long)DB_MAX_PGSIZE);
+ return (EINVAL);
+ }
+
+ /*
+ * We don't want anything that's not a power-of-2, as we rely on that
+ * for alignment of various types on the pages.
+ */
+ if (!POWER_OF_TWO(db_pagesize)) {
+ __db_errx(dbp->env, DB_STR("0511",
+ "page sizes must be a power-of-2"));
+ return (EINVAL);
+ }
+
+ /*
+ * XXX
+ * Should we be checking for a page size that's not a multiple of 512,
+ * so that we never try to write less than a disk sector?
+ */
+ dbp->pgsize = db_pagesize;
+
+ return (0);
+}
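+
+/*
+ * Editor's sketch of the accepted range (DB_MIN_PGSIZE and DB_MAX_PGSIZE
+ * are 512 bytes and 64KB, respectively):
+ *
+ *	dbp->set_pagesize(dbp, 4096);	-- ok, power of two in range
+ *	dbp->set_pagesize(dbp, 4000);	-- EINVAL, not a power of two
+ *	dbp->set_pagesize(dbp, 256);	-- EINVAL, below DB_MIN_PGSIZE
+ */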
+
+static int
+__db_set_paniccall(dbp, paniccall)
+ DB *dbp;
+ void (*paniccall) __P((DB_ENV *, int));
+{
+ return (__env_set_paniccall(dbp->dbenv, paniccall));
+}
+
+static int
+__db_set_priority(dbp, priority)
+ DB *dbp;
+ DB_CACHE_PRIORITY priority;
+{
+ dbp->priority = priority;
+ return (0);
+}
+
+static int
+__db_get_priority(dbp, priority)
+ DB *dbp;
+ DB_CACHE_PRIORITY *priority;
+{
+ if (dbp->priority == DB_PRIORITY_UNCHANGED)
+ return (__memp_get_priority(dbp->mpf, priority));
+ else
+ *priority = dbp->priority;
+
+ return (0);
+}
diff --git a/src/db/db_open.c b/src/db/db_open.c
new file mode 100644
index 00000000..fefda48f
--- /dev/null
+++ b/src/db/db_open.c
@@ -0,0 +1,857 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_handle_lock __P((DB *));
+
+/*
+ * __db_open --
+ * DB->open method.
+ *
+ * This routine gets called in six different ways:
+ *
+ * 1. It can be called to open a file/database. In this case, subdb will
+ * be NULL and meta_pgno will be PGNO_BASE_MD.
+ * 2. It can be called to open a subdatabase during normal operation. In
+ * this case, name and subname will both be non-NULL and meta_pgno will
+ * be PGNO_BASE_MD (also PGNO_INVALID).
+ * 3. It can be called to open an in-memory database (name == NULL;
+ * subname = name).
+ * 4. It can be called during recovery to open a file/database, in which case
+ * name will be non-NULL, subname will be NULL, and meta-pgno will be
+ * PGNO_BASE_MD.
+ * 5. It can be called during recovery to open a subdatabase, in which case
+ * name will be non-NULL, subname may be NULL and meta-pgno will be
+ * a valid pgno (i.e., not PGNO_BASE_MD).
+ * 6. It can be called during recovery to open an in-memory database.
+ *
+ * PUBLIC: int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
+ */
+int
+__db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode;
+ db_pgno_t meta_pgno;
+{
+ DB *tdbp;
+ ENV *env;
+ int ret;
+ u_int32_t id;
+
+ env = dbp->env;
+ id = TXN_INVALID;
+
+ /*
+ * We must flush any existing pages before truncating the file
+ * since they could age out of mpool and overwrite new pages.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ if ((ret = __db_create_internal(&tdbp, dbp->env, 0)) != 0)
+ goto err;
+ ret = __db_open(tdbp, ip, txn, fname, dname, DB_UNKNOWN,
+ DB_NOERROR | (flags & ~(DB_TRUNCATE|DB_CREATE)),
+ mode, meta_pgno);
+ if (ret == 0)
+ ret = __memp_ftruncate(tdbp->mpf, txn, ip, 0, 0);
+ (void)__db_close(tdbp, txn, DB_NOSYNC);
+ if (ret != 0 && ret != ENOENT && ret != EINVAL)
+ goto err;
+ ret = 0;
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, fname);
+
+ /*
+ * If the environment was configured with threads, the DB handle
+ * must also be free-threaded, so we force the DB_THREAD flag on.
+ * (See SR #2033 for why this is a requirement--recovery needs
+ * to be able to grab a dbp using __db_fileid_to_dbp, and it has
+ * no way of knowing which dbp goes with which thread, so whichever
+ * one it finds has to be usable in any of them.)
+ */
+ if (F_ISSET(env, ENV_THREAD))
+ LF_SET(DB_THREAD);
+
+ /* Convert any DB->open flags. */
+ if (LF_ISSET(DB_RDONLY))
+ F_SET(dbp, DB_AM_RDONLY);
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ F_SET(dbp, DB_AM_READ_UNCOMMITTED);
+
+ if (IS_REAL_TXN(txn))
+ F_SET(dbp, DB_AM_TXN);
+
+ /* Fill in the type. */
+ dbp->type = type;
+
+ /* Save the file and database names. */
+ if ((fname != NULL &&
+ (ret = __os_strdup(env, fname, &dbp->fname)) != 0))
+ goto err;
+ if ((dname != NULL &&
+ (ret = __os_strdup(env, dname, &dbp->dname)) != 0))
+ goto err;
+
+ /*
+	 * If both fname and dname are NULL, it's always a create, so make
+	 * sure that we have both DB_CREATE and a type specified. It would
+	 * be nice if this checking were done with the rest of the interface
+	 * checking in DB->open's pre/post-processing, but this internal
+	 * routine is also used by the recovery and limbo system, so we need
+	 * to safeguard it as well.
+ */
+ if (fname == NULL) {
+ if (dbp->p_internal != NULL) {
+ __db_errx(env, DB_STR("0634",
+ "Partitioned databases may not be in memory."));
+ return (ENOENT);
+ }
+ if (dname == NULL) {
+ if (!LF_ISSET(DB_CREATE)) {
+ __db_errx(env, DB_STR("0635",
+ "DB_CREATE must be specified to create databases."));
+ return (ENOENT);
+ }
+
+ F_SET(dbp, DB_AM_INMEM);
+ F_SET(dbp, DB_AM_CREATED);
+
+ if (dbp->type == DB_UNKNOWN) {
+ __db_errx(env, DB_STR("0636",
+ "DBTYPE of unknown without existing file"));
+ return (EINVAL);
+ }
+
+ if (dbp->pgsize == 0)
+ dbp->pgsize = DB_DEF_IOSIZE;
+
+ /*
+ * If the file is a temporary file and we're
+ * doing locking, then we have to create a
+ * unique file ID. We can't use our normal
+ * dev/inode pair (or whatever this OS uses
+ * in place of dev/inode pairs) because no
+ * backing file will be created until the
+ * mpool cache is filled forcing the buffers
+ * to disk. Grab a random locker ID to use
+ * as a file ID. The created ID must never
+ * match a potential real file ID -- we know
+ * it won't because real file IDs contain a
+ * time stamp after the dev/inode pair, and
+ * we're simply storing a 4-byte value.
+	 *
+ * !!!
+ * Store the locker in the file id structure
+ * -- we can get it from there as necessary,
+ * and it saves having two copies.
+ */
+ if (LOCKING_ON(env) && (ret = __lock_id(env,
+ (u_int32_t *)dbp->fileid, NULL)) != 0)
+ return (ret);
+ } else
+ MAKE_INMEM(dbp);
+
+ /*
+ * Normally we would do handle locking here, however, with
+ * in-memory files, we cannot do any database manipulation
+ * until the mpool is open, so it happens later.
+ */
+ } else if (dname == NULL && meta_pgno == PGNO_BASE_MD) {
+ /* Open/create the underlying file. Acquire locks. */
+ if ((ret = __fop_file_setup(dbp, ip,
+ txn, fname, mode, flags, &id)) != 0)
+ return (ret);
+ /*
+ * If we are creating the first sub-db then this is the
+ * call to create the master db and we tried to open it
+		 * read-only. The create will force it to be read/write,
+		 * so clear the RDONLY flag if we just created it.
+ */
+ if (!F_ISSET(dbp, DB_AM_RDONLY))
+ LF_CLR(DB_RDONLY);
+ } else {
+ if (dbp->p_internal != NULL) {
+ __db_errx(env, DB_STR("0637",
+ "Partitioned databases may not be included with multiple databases."));
+ return (ENOENT);
+ }
+ if ((ret = __fop_subdb_setup(dbp, ip,
+ txn, fname, dname, mode, flags)) != 0)
+ return (ret);
+ meta_pgno = dbp->meta_pgno;
+ }
+
+ /* Set up the underlying environment. */
+ if ((ret = __env_setup(dbp, txn, fname, dname, id, flags)) != 0)
+ return (ret);
+
+ /* For in-memory databases, we now need to open/create the database. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (dname == NULL)
+ ret = __db_new_file(dbp, ip, txn, NULL, NULL);
+ else {
+ id = TXN_INVALID;
+ ret = __fop_file_setup(dbp,
+ ip, txn, dname, mode, flags, &id);
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Internal exclusive databases need to use the shared
+ * memory pool to lock out existing database handles before
+ * it gets its handle lock. So getting the lock is delayed
+ * until after the memory pool is allocated.
+ */
+ if (F2_ISSET(dbp, DB2_AM_INTEXCL) &&
+ (ret = __db_handle_lock(dbp)) != 0)
+ goto err;
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ ret = __bam_open(dbp, ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_open(dbp, ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_HEAP:
+ ret = __heap_open(dbp,
+ ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_RECNO:
+ ret = __ram_open(dbp, ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_open(
+ dbp, ip, txn, fname, meta_pgno, mode, flags);
+ break;
+ case DB_UNKNOWN:
+ return (
+ __db_unknown_type(env, "__db_dbopen", dbp->type));
+ }
+ if (ret != 0)
+ goto err;
+
+#ifdef HAVE_PARTITION
+ if (dbp->p_internal != NULL && (ret =
+ __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0)
+ goto err;
+#endif
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, fname);
+
+ /*
+ * Temporary files don't need handle locks, so we only have to check
+ * for a handle lock downgrade or lockevent in the case of named
+ * files.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECOVER) && (fname != NULL || dname != NULL) &&
+ LOCK_ISSET(dbp->handle_lock)) {
+ if (IS_REAL_TXN(txn))
+ ret = __txn_lockevent(env,
+ txn, dbp, &dbp->handle_lock, dbp->locker);
+ else if (LOCKING_ON(env) && !F2_ISSET(dbp, DB2_AM_EXCL))
+ /*
+ * Trade write handle lock for read handle lock,
+ * unless this is an exclusive database handle.
+ */
+ ret = __lock_downgrade(env,
+ &dbp->handle_lock, DB_LOCK_READ, 0);
+ }
+DB_TEST_RECOVERY_LABEL
+err:
+ PERFMON4(env,
+ db, open, (char *) fname, (char *) dname, flags, &dbp->fileid[0]);
+ return (ret);
+}
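+
+/*
+ * Editor's sketch of an application-level open that reaches this routine
+ * as case 1 above (dname == NULL, meta_pgno == PGNO_BASE_MD); "dbenv" and
+ * the file name "data.db" are hypothetical:
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbp->open(dbp, NULL, "data.db", NULL,
+ *	    DB_BTREE, DB_CREATE | DB_AUTO_COMMIT, 0644)) != 0) {
+ *		(void)dbp->close(dbp, 0);
+ *		return (ret);
+ *	}
+ */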
+
+/*
+ * __db_get_open_flags --
+ * Accessor for flags passed into DB->open call
+ *
+ * PUBLIC: int __db_get_open_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_open_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_open_flags");
+
+ *flagsp = dbp->open_flags;
+ return (0);
+}
+
+/*
+ * __db_new_file --
+ * Create a new database file.
+ *
+ * PUBLIC: int __db_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__db_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ int ret;
+
+ /*
+	 * In-memory databases are created by mpool and don't take any
+	 * locks, so temporarily turn off lock checking here.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ LOCK_CHECK_OFF(ip);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_HASH:
+ ret = __ham_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_HEAP:
+ ret = __heap_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_QUEUE:
+ ret = __qam_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_UNKNOWN:
+ default:
+ __db_errx(dbp->env, DB_STR_A("0638",
+ "%s: Invalid type %d specified", "%s %d"),
+ name, dbp->type);
+ ret = EINVAL;
+ break;
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+ /* Sync the file in preparation for moving it into place. */
+ if (ret == 0 && fhp != NULL)
+ ret = __os_fsync(dbp->env, fhp);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
+
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ LOCK_CHECK_ON(ip);
+
+DB_TEST_RECOVERY_LABEL
+ return (ret);
+}
+
+/*
+ * __db_init_subdb --
+ * Initialize the dbp for a subdb.
+ *
+ * PUBLIC: int __db_init_subdb __P((DB *,
+ * PUBLIC: DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__db_init_subdb(mdbp, dbp, name, ip, txn)
+ DB *mdbp, *dbp;
+ const char *name;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ ret = 0;
+ if (!F_ISSET(dbp, DB_AM_CREATED)) {
+ /* Subdb exists; read meta-data page and initialize. */
+ mpf = mdbp->mpf;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+ ip, txn, 0, &meta)) != 0)
+ goto err;
+ ret = __db_meta_setup(mdbp->env, dbp, name, meta, 0, 0);
+ if ((t_ret = __memp_fput(mpf,
+ ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If __db_meta_setup found that the meta-page hadn't
+ * been written out during recovery, we can just return.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ goto err;
+ }
+
+ /* Handle the create case here. */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_new_subdb(mdbp, dbp, ip, txn);
+ break;
+ case DB_HASH:
+ ret = __ham_new_subdb(mdbp, dbp, ip, txn);
+ break;
+ case DB_QUEUE:
+ ret = EINVAL;
+ break;
+ case DB_UNKNOWN:
+ default:
+ __db_errx(dbp->env, DB_STR_A("0639",
+ "Invalid subdatabase type %d specified", "%d"),
+ dbp->type);
+ return (EINVAL);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __db_chk_meta --
+ *	Take a buffer containing a meta-data page, check its LSN and checksum
+ *	(verifying the checksum if necessary), and possibly decrypt it.
+ *
+ * Return 0 on success, or a nonzero error value (e.g. DB_CHKSUM_FAIL).
+ *
+ * PUBLIC: int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
+ */
+int
+__db_chk_meta(env, dbp, meta, flags)
+ ENV *env;
+ DB *dbp;
+ DBMETA *meta;
+ u_int32_t flags;
+{
+ DB_LSN swap_lsn;
+ int is_hmac, ret, swapped;
+ u_int32_t magic, orig_chk;
+ u_int8_t *chksum;
+
+ ret = 0;
+ swapped = 0;
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
+ if (dbp != NULL)
+ F_SET(dbp, DB_AM_CHKSUM);
+
+ is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
+ chksum = ((BTMETA *)meta)->chksum;
+
+ /*
+ * If we need to swap, the checksum function overwrites the
+ * original checksum with 0, so we need to save a copy of the
+ * original for swapping later.
+ */
+ orig_chk = *(u_int32_t *)chksum;
+
+ /*
+ * We cannot add this to __db_metaswap because that gets done
+ * later after we've verified the checksum or decrypted.
+ */
+ if (LF_ISSET(DB_CHK_META)) {
+ swapped = 0;
+chk_retry: if ((ret =
+ __db_check_chksum(env, NULL, env->crypto_handle,
+ chksum, meta, DBMETASIZE, is_hmac)) != 0) {
+ if (is_hmac || swapped)
+ return (DB_CHKSUM_FAIL);
+
+ M_32_SWAP(orig_chk);
+ swapped = 1;
+ *(u_int32_t *)chksum = orig_chk;
+ goto chk_retry;
+ }
+ }
+ } else if (dbp != NULL)
+ F_CLR(dbp, DB_AM_CHKSUM);
+
+#ifdef HAVE_CRYPTO
+ if (__crypto_decrypt_meta(env,
+ dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META)) != 0)
+ ret = DB_CHKSUM_FAIL;
+ else
+#endif
+
+ /* Now that we're decrypted, we can check LSN. */
+ if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
+ /*
+ * This gets called both before and after swapping, so we
+ * need to check ourselves. If we already swapped it above,
+ * we'll know that here.
+ */
+
+ swap_lsn = meta->lsn;
+ magic = meta->magic;
+lsn_retry:
+ if (swapped) {
+ M_32_SWAP(swap_lsn.file);
+ M_32_SWAP(swap_lsn.offset);
+ M_32_SWAP(magic);
+ }
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ case DB_RENAMEMAGIC:
+ break;
+ default:
+ if (swapped)
+ return (EINVAL);
+ swapped = 1;
+ goto lsn_retry;
+ }
+ if (!IS_REP_CLIENT(env) &&
+ !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn))
+			/* We need to check the page LSN. */
+ ret = __log_check_page_lsn(env, dbp, &swap_lsn);
+ }
+ return (ret);
+}
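+
+/*
+ * Editor's note on the checksum retry above: the stored checksum is a
+ * 4-byte value, so a metadata page written on an opposite-endian host
+ * fails the first verification. M_32_SWAP(orig_chk) reverses its byte
+ * order (0x11223344 becomes 0x44332211) and the check is retried once
+ * before DB_CHKSUM_FAIL is returned.
+ */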
+
+/*
+ * __db_meta_setup --
+ *
+ * Take a buffer containing a meta-data page and figure out if it's
+ * valid, and if so, initialize the dbp from the meta-data page.
+ *
+ * PUBLIC: int __db_meta_setup __P((ENV *,
+ * PUBLIC: DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
+ */
+int
+__db_meta_setup(env, dbp, name, meta, oflags, flags)
+ ENV *env;
+ DB *dbp;
+ const char *name;
+ DBMETA *meta;
+ u_int32_t oflags;
+ u_int32_t flags;
+{
+ u_int32_t magic;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * Figure out what access method we're dealing with, and then
+ * call access method specific code to check error conditions
+ * based on conflicts between the found file and application
+ * arguments. A found file overrides some user information --
+ * we don't consider it an error, for example, if the user set
+ * an expected byte order and the found file doesn't match it.
+ */
+ F_CLR(dbp, DB_AM_SWAP | DB_AM_IN_RENAME);
+ magic = meta->magic;
+
+swap_retry:
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ case DB_RENAMEMAGIC:
+ break;
+ case 0:
+ /*
+ * The only time this should be 0 is if we're in the
+ * midst of opening a subdb during recovery and that
+ * subdatabase had its meta-data page allocated, but
+ * not yet initialized.
+ */
+ if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(env) &&
+ F_ISSET(env->lg_handle, DBLOG_FORCE_OPEN)) ||
+ meta->pgno != PGNO_INVALID))
+ return (ENOENT);
+
+ goto bad_format;
+ default:
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ goto bad_format;
+
+ M_32_SWAP(magic);
+ F_SET(dbp, DB_AM_SWAP);
+ goto swap_retry;
+ }
+
+ /*
+ * We can only check the meta page if we are sure we have a meta page.
+ * If it is random data, then this check can fail. So only now can we
+ * checksum and decrypt. Don't distinguish between configuration and
+ * checksum match errors here, because we haven't opened the database
+ * and even a checksum error isn't a reason to panic the environment.
+ * If DB_SKIP_CHK is set, it means the checksum was already checked
+ * and the page was already decrypted.
+ */
+ if (!LF_ISSET(DB_SKIP_CHK) &&
+ (ret = __db_chk_meta(env, dbp, meta, flags)) != 0) {
+ if (ret == DB_CHKSUM_FAIL)
+ __db_errx(env, DB_STR_A("0640",
+ "%s: metadata page checksum error", "%s"), name);
+ goto bad_format;
+ }
+
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ if (dbp->type != DB_UNKNOWN &&
+ dbp->type != DB_RECNO && dbp->type != DB_BTREE)
+ goto bad_format;
+
+ flags = meta->flags;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(flags);
+ if (LF_ISSET(BTM_RECNO))
+ dbp->type = DB_RECNO;
+ else
+ dbp->type = DB_BTREE;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __bam_metachk(dbp, name, (BTMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_HASHMAGIC:
+ if (dbp->type != DB_UNKNOWN && dbp->type != DB_HASH)
+ goto bad_format;
+
+ dbp->type = DB_HASH;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __ham_metachk(dbp, name, (HMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_HEAPMAGIC:
+ if (dbp->type != DB_UNKNOWN && dbp->type != DB_HEAP)
+ goto bad_format;
+
+ dbp->type = DB_HEAP;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __heap_metachk(dbp, name, (HEAPMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_QAMMAGIC:
+ if (dbp->type != DB_UNKNOWN && dbp->type != DB_QUEUE)
+ goto bad_format;
+ dbp->type = DB_QUEUE;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __qam_metachk(dbp, name, (QMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_RENAMEMAGIC:
+ F_SET(dbp, DB_AM_IN_RENAME);
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+
+ break;
+ default:
+ goto bad_format;
+ }
+
+ if (FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK))
+ if ((ret =
+ __partition_init(dbp, meta->metaflags)) != 0)
+ return (ret);
+ return (0);
+
+bad_format:
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ ret = ENOENT;
+ else
+ __db_errx(env, DB_STR_A("0641",
+ "__db_meta_setup: %s: unexpected file type or format",
+ "%s"), name);
+ return (ret == 0 ? EINVAL : ret);
+}
+
+/*
+ * __db_reopen --
+ * Reopen a subdatabase if its meta/root pages move.
+ * PUBLIC: int __db_reopen __P((DBC *));
+ */
+int
+__db_reopen(arg_dbc)
+ DBC *arg_dbc;
+{
+ BTREE *bt;
+ DBC *dbc;
+ DB_TXN *txn;
+ HASH *ht;
+ DB *dbp, *mdbp;
+ DB_LOCK new_lock, old_lock;
+ PAGE *new_page, *old_page;
+ db_pgno_t newpgno, oldpgno;
+ int ret, t_ret;
+
+ dbc = arg_dbc;
+ dbp = dbc->dbp;
+ old_page = new_page = NULL;
+ mdbp = NULL;
+
+ COMPQUIET(bt, NULL);
+ COMPQUIET(ht, NULL);
+ COMPQUIET(txn, NULL);
+ LOCK_INIT(new_lock);
+ LOCK_INIT(old_lock);
+
+ /*
+ * This must be done in the context of a transaction. If the
+ * requester does not have a transaction, create one.
+ */
+
+ if (TXN_ON(dbp->env) && (txn = dbc->txn) == NULL) {
+ if ((ret = __txn_begin(dbp->env,
+ dbc->thread_info, NULL, &txn, 0)) != 0)
+ return (ret);
+ if ((ret = __db_cursor(dbp,
+ dbc->thread_info, txn, &dbc, 0)) != 0) {
+ (void)__txn_abort(txn);
+ return (ret);
+ }
+ }
+
+ /*
+ * Lock and latch the old metadata page before re-opening the
+ * database so that the information is stable. Then lock
+ * and latch the new page before getting the revision so that
+ * it cannot change.
+ */
+
+ if (dbp->type == DB_HASH) {
+ ht = (HASH*)dbp->h_internal;
+ oldpgno = ht->meta_pgno;
+ } else {
+ bt = (BTREE *)dbp->bt_internal;
+ oldpgno = bt->bt_root;
+ }
+ if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
+ 0, oldpgno, DB_LOCK_READ, 0, &old_lock)) != 0)
+ goto err;
+
+ if ((ret = __memp_fget(dbp->mpf, &oldpgno,
+ dbc->thread_info, dbc->txn, 0, &old_page)) != 0 &&
+ ret != DB_PAGE_NOTFOUND)
+ goto err;
+
+ /* If the page is free we must not hold its lock. */
+ if (ret == DB_PAGE_NOTFOUND || TYPE(old_page) == P_INVALID) {
+ if ((ret = __LPUT(dbc, old_lock)) != 0)
+ goto err;
+ /* Drop the latch too. */
+ if (old_page != NULL && (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, old_page, dbc->priority)) != 0)
+ goto err;
+ old_page = NULL;
+ }
+
+ if ((ret = __db_master_open(dbp,
+ dbc->thread_info, dbc->txn, dbp->fname, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, dbc->thread_info,
+ dbc->txn, dbp->dname, dbp->type, MU_OPEN, NULL, 0)) != 0)
+ goto err;
+
+ if (dbp->type == DB_HASH)
+ newpgno = ht->meta_pgno = dbp->meta_pgno;
+ else {
+ bt->bt_meta = dbp->meta_pgno;
+ if ((ret = __bam_read_root(dbp,
+ dbc->thread_info, dbc->txn, bt->bt_meta, 0)) != 0)
+ goto err;
+ newpgno = bt->bt_root;
+ }
+
+ if (oldpgno == newpgno)
+ goto done;
+
+ if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
+ 0, newpgno, DB_LOCK_READ, 0, &new_lock)) != 0)
+ goto err;
+
+ if ((ret = __memp_fget(dbp->mpf, &newpgno,
+ dbc->thread_info, dbc->txn, 0, &new_page)) != 0)
+ goto err;
+
+done: if (dbp->type == DB_HASH)
+ ht->revision = dbp->mpf->mfp->revision;
+ else
+ bt->revision = dbp->mpf->mfp->revision;
+
+err: if (old_page != NULL && (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, old_page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (new_page != NULL && (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, new_page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL &&
+ (t_ret = __db_close(mdbp, dbc->txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != arg_dbc) {
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
+static int
+__db_handle_lock(dbp)
+ DB *dbp;
+{
+ ENV *env;
+ int ret;
+ u_int32_t old_flags;
+
+ env = dbp->env;
+ ret = 0;
+ old_flags = dbp->flags;
+
+ /*
+ * Internal exclusive database handles need to get and hold
+ * their own handle locks so that the client cannot open any
+ * external handles on that database.
+ */
+ F_CLR(dbp, DB_AM_RECOVER);
+ F_SET(dbp, DB_AM_NOT_DURABLE);
+
+ /* Begin exclusive handle lockout. */
+ dbp->mpf->mfp->excl_lockout = 1;
+
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ LOCK_INIT(dbp->handle_lock);
+ if ((ret = __fop_lock_handle(env, dbp, dbp->locker, DB_LOCK_WRITE,
+ NULL, 0))!= 0)
+ goto err;
+
+err: /* End exclusive handle lockout. */
+ dbp->mpf->mfp->excl_lockout = 0;
+ dbp->flags = old_flags;
+
+ return (ret);
+}
diff --git a/src/db/db_overflow.c b/src/db/db_overflow.c
new file mode 100644
index 00000000..d992ec0d
--- /dev/null
+++ b/src/db/db_overflow.c
@@ -0,0 +1,705 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * Big key/data code.
+ *
+ * Big key and data entries are stored on linked lists of pages. The initial
+ * reference is a structure with the total length of the item and the page
+ * number where it begins. Each entry in the linked list contains a pointer
+ * to the next page of data, and so on.
+ */
+
+/*
+ * __db_goff --
+ * Get an offpage item.
+ *
+ * PUBLIC: int __db_goff __P((DBC *,
+ * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+ */
+int
+__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t tlen;
+ db_pgno_t pgno;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+ DBC_INTERNAL *cp;
+ ENV *env;
+ PAGE *h;
+ DB_THREAD_INFO *ip;
+ db_indx_t bytes;
+ u_int32_t curoff, needed, start;
+ u_int8_t *p, *src;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = dbc->internal;
+ env = dbp->env;
+ ip = dbc->thread_info;
+ mpf = dbp->mpf;
+ txn = dbc->txn;
+
+ /*
+ * Check if the buffer is big enough; if it is not and we are
+ * allowed to malloc space, then we'll malloc it. If we are
+ * not (DB_DBT_USERMEM), then we'll set the dbt and return
+ * appropriately.
+ */
+ if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ start = dbt->doff;
+ if (start > tlen)
+ needed = 0;
+ else if (dbt->dlen > tlen - start)
+ needed = tlen - start;
+ else
+ needed = dbt->dlen;
+ } else {
+ start = 0;
+ needed = tlen;
+ }
+
+ /*
+ * If the caller has not requested any data, return success. This
+ * "early-out" also avoids setting up the streaming optimization when
+	 * no page would be retrieved. If this early return were removed, the
+	 * streaming setup below would have to run only when needed is non-zero.
+ */
+ if (needed == 0) {
+ dbt->size = 0;
+ return (0);
+ }
+
+ if (F_ISSET(dbt, DB_DBT_USERCOPY))
+ goto skip_alloc;
+
+ /* Allocate any necessary memory. */
+ if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ if (needed > dbt->ulen) {
+ dbt->size = needed;
+ return (DB_BUFFER_SMALL);
+ }
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+ if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) {
+ if ((ret = __os_realloc(env, needed, bpp)) != 0)
+ return (ret);
+ *bpsz = needed;
+ dbt->data = *bpp;
+ } else if (bpp != NULL)
+ dbt->data = *bpp;
+ else {
+ DB_ASSERT(env,
+ F_ISSET(dbt,
+ DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) ||
+ bpsz != NULL);
+ return (DB_BUFFER_SMALL);
+ }
+
+skip_alloc:
+ /* Set up a start page in the overflow chain if streaming. */
+ if (cp->stream_start_pgno != PGNO_INVALID &&
+ pgno == cp->stream_start_pgno && start >= cp->stream_off &&
+ start < cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ pgno = cp->stream_curr_pgno;
+ curoff = cp->stream_off;
+ } else {
+ cp->stream_start_pgno = cp->stream_curr_pgno = pgno;
+ cp->stream_off = curoff = 0;
+ }
+
+ /*
+ * Step through the linked list of pages, copying the data on each
+ * one into the buffer. Never copy more than the total data length.
+ */
+ dbt->size = needed;
+ for (p = dbt->data; pgno != PGNO_INVALID && needed > 0;) {
+ if ((ret = __memp_fget(mpf,
+ &pgno, ip, txn, 0, &h)) != 0)
+ return (ret);
+ DB_ASSERT(env, TYPE(h) == P_OVERFLOW);
+
+ /* Check if we need any bytes from this page. */
+ if (curoff + OV_LEN(h) >= start) {
+ bytes = OV_LEN(h);
+ src = (u_int8_t *)h + P_OVERHEAD(dbp);
+ if (start > curoff) {
+ src += start - curoff;
+ bytes -= start - curoff;
+ }
+ if (bytes > needed)
+ bytes = needed;
+ if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+ /*
+ * The offset into the DBT is the total size
+ * less the amount of data still needed. Care
+ * needs to be taken if doing a partial copy
+ * beginning at an offset other than 0.
+ */
+ if ((ret = env->dbt_usercopy(
+ dbt, dbt->size - needed,
+ src, bytes, DB_USERCOPY_SETDATA)) != 0) {
+ (void)__memp_fput(mpf,
+ ip, h, dbp->priority);
+ return (ret);
+ }
+ } else
+ memcpy(p, src, bytes);
+ p += bytes;
+ needed -= bytes;
+ }
+ cp->stream_off = curoff;
+ curoff += OV_LEN(h);
+ cp->stream_curr_pgno = pgno;
+ pgno = h->next_pgno;
+ (void)__memp_fput(mpf, ip, h, dbp->priority);
+ }
+
+ return (0);
+}
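+
+/*
+ * Editor's example of the DB_DBT_PARTIAL arithmetic above: with
+ * tlen == 1000, a caller asking for 200 bytes at offset 900 gets only
+ * the needed = tlen - start = 100 bytes that exist past the offset,
+ * while an offset beyond the item (start > tlen) yields needed == 0 and
+ * an empty DBT without any page being fetched. The request would be set
+ * up as:
+ *
+ *	memset(&dbt, 0, sizeof(dbt));
+ *	dbt.flags = DB_DBT_PARTIAL | DB_DBT_MALLOC;
+ *	dbt.doff = 900;
+ *	dbt.dlen = 200;
+ */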
+
+/*
+ * __db_poff --
+ * Put an offpage item.
+ *
+ * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
+ */
+int
+__db_poff(dbc, dbt, pgnop)
+ DBC *dbc;
+ const DBT *dbt;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep, *lastp;
+ db_indx_t pagespace;
+ db_pgno_t pgno;
+ u_int32_t space, sz, tlen;
+ u_int8_t *p;
+ int ret, t_ret;
+
+ /*
+ * Allocate pages and copy the key/data item into them. Calculate the
+ * number of bytes we get for pages we fill completely with a single
+ * item.
+ */
+ dbp = dbc->dbp;
+ lastp = NULL;
+ mpf = dbp->mpf;
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ p = dbt->data;
+ sz = dbt->size;
+
+ /*
+ * Check whether we are streaming at the end of the overflow item.
+ * If so, the last pgno and offset will be cached in the cursor.
+ */
+ if (F_ISSET(dbt, DB_DBT_STREAMING)) {
+ tlen = dbt->size - dbt->dlen;
+ pgno = dbc->internal->stream_curr_pgno;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &lastp)) != 0)
+ return (ret);
+
+ /*
+ * Calculate how much we can write on the last page of the
+ * overflow item.
+ */
+ DB_ASSERT(dbp->env,
+ OV_LEN(lastp) == (tlen - dbc->internal->stream_off));
+ space = pagespace - OV_LEN(lastp);
+
+ /* Only copy as much data as we have. */
+ if (space > dbt->dlen)
+ space = dbt->dlen;
+
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = dbt->data;
+ tmp_dbt.size = space;
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_big_log(dbp, dbc->txn, &LSN(lastp), 0,
+ OP_SET(DB_APPEND_BIG, lastp), pgno,
+ PGNO_INVALID, PGNO_INVALID, &tmp_dbt,
+ &LSN(lastp), &null_lsn, &null_lsn)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(lastp));
+
+ memcpy((u_int8_t *)lastp + P_OVERHEAD(dbp) + OV_LEN(lastp),
+ dbt->data, space);
+ OV_LEN(lastp) += space;
+ sz -= space + dbt->doff;
+ p += space;
+ *pgnop = dbc->internal->stream_start_pgno;
+ }
+
+ ret = 0;
+ for (; sz > 0; p += pagespace, sz -= pagespace) {
+ /*
+ * Reduce pagespace so we terminate the loop correctly and
+ * don't copy too much data.
+ */
+ if (sz < pagespace)
+ pagespace = sz;
+
+ /*
+ * Allocate and initialize a new page and copy all or part of
+ * the item onto the page. If sz is less than pagespace, we
+ * have a partial record.
+ */
+ if ((ret = __db_new(dbc, P_OVERFLOW, NULL, &pagep)) != 0)
+ break;
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = p;
+ tmp_dbt.size = pagespace;
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_big_log(dbp, dbc->txn, &LSN(pagep), 0,
+ OP_SET(DB_ADD_BIG, pagep),
+ PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID,
+ PGNO_INVALID, &tmp_dbt, &LSN(pagep),
+ lastp == NULL ? &null_lsn : &LSN(lastp),
+ &null_lsn)) != 0) {
+ (void)__memp_fput(mpf, dbc->thread_info,
+ pagep, dbc->priority);
+ goto err;
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ /* Move LSN onto page. */
+ if (lastp != NULL)
+ LSN(lastp) = LSN(pagep);
+
+ OV_LEN(pagep) = pagespace;
+ OV_REF(pagep) = 1;
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace);
+
+ /*
+ * If this is the first entry, update the user's info and
+ * initialize the cursor to allow for streaming of subsequent
+ * updates. Otherwise, update the entry on the last page
+ * filled in and release that page.
+ */
+ if (lastp == NULL) {
+ *pgnop = PGNO(pagep);
+ dbc->internal->stream_start_pgno =
+ dbc->internal->stream_curr_pgno = *pgnop;
+ dbc->internal->stream_off = 0;
+ } else {
+ lastp->next_pgno = PGNO(pagep);
+ pagep->prev_pgno = PGNO(lastp);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, lastp, dbc->priority)) != 0) {
+ lastp = NULL;
+ goto err;
+ }
+ }
+ lastp = pagep;
+ }
+err: if (lastp != NULL) {
+ if (ret == 0) {
+ dbc->internal->stream_curr_pgno = PGNO(lastp);
+ dbc->internal->stream_off = dbt->size - OV_LEN(lastp);
+ }
+
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info, lastp,
+ dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
+/*
+ * __db_ovref --
+ * Decrement the reference count on an overflow page.
+ *
+ * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t));
+ */
+int
+__db_ovref(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &h)) != 0)
+ return (ret);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_ovref_log(dbp,
+ dbc->txn, &LSN(h), 0, h->pgno, -1, &LSN(h))) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ /*
+ * In BDB releases before 4.5, the overflow reference counts were
+ * incremented when an overflow item was split onto an internal
+ * page. There was a lock race in that code, and rather than fix
+ * the race, we changed BDB to copy overflow items when splitting
+ * them onto internal pages. The code to decrement reference
+ * counts remains so databases already in the field continue to
+ * work.
+ */
+ --OV_REF(h);
+
+ return (__memp_fput(mpf, dbc->thread_info, h, dbc->priority));
+}
+
+/*
+ * __db_doff --
+ * Delete an offpage chain of overflow pages.
+ *
+ * PUBLIC: int __db_doff __P((DBC *, db_pgno_t));
+ */
+int
+__db_doff(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+
+ do {
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &pagep)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbp->env, TYPE(pagep) == P_OVERFLOW);
+ /*
+ * If it's referenced by more than one key/data item,
+ * decrement the reference count and return.
+ */
+ if (OV_REF(pagep) > 1) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pagep, dbc->priority);
+ return (__db_ovref(dbc, pgno));
+ }
+
+ if ((ret = __memp_dirty(mpf, &pagep,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ if (pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pagep, dbc->priority);
+ return (ret);
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+ tmp_dbt.size = OV_LEN(pagep);
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_big_log(dbp, dbc->txn, &LSN(pagep), 0,
+ OP_SET(DB_REM_BIG, pagep), PGNO(pagep),
+ PREV_PGNO(pagep), NEXT_PGNO(pagep), &tmp_dbt,
+ &LSN(pagep), &null_lsn, &null_lsn)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pagep, dbc->priority);
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+ pgno = pagep->next_pgno;
+ OV_LEN(pagep) = 0;
+ if ((ret = __db_free(dbc, pagep, 0)) != 0)
+ return (ret);
+ } while (pgno != PGNO_INVALID);
+
+ return (0);
+}
+
+/*
+ * __db_moff --
+ * Match on overflow pages.
+ *
+ * Given a starting page number and a key, return <0, 0, >0 to indicate if the
+ * key on the page is less than, equal to or greater than the key specified.
+ * We optimize this by doing chunk at a time comparison unless the user has
+ * specified a comparison function. In this case, we need to materialize
+ * the entire object and call their comparison routine.
+ *
+ * __db_moff and __db_coff are generic functions useful in searching and
+ * ordering off page items. __db_moff matches an overflow DBT with an offpage
+ * item. __db_coff compares two offpage items for lexicographic sort order.
+ *
+ * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
+ DBC *dbc;
+ const DBT *dbt;
+ db_pgno_t pgno;
+ u_int32_t tlen;
+ int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+ DB *dbp;
+ DBT local_dbt;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ void *buf;
+ u_int32_t bufsize, cmp_bytes, key_left;
+ u_int8_t *p1, *p2;
+ int ret;
+
+ dbp = dbc->dbp;
+ ip = dbc->thread_info;
+ mpf = dbp->mpf;
+
+ /*
+ * If there is a user-specified comparison function, build a
+ * contiguous copy of the key, and call it.
+ */
+ if (cmpfunc != NULL) {
+ memset(&local_dbt, 0, sizeof(local_dbt));
+ buf = NULL;
+ bufsize = 0;
+
+ if ((ret = __db_goff(dbc,
+ &local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
+ return (ret);
+		/* Pass the key as the first argument. */
+ *cmpp = cmpfunc(dbp, dbt, &local_dbt);
+ __os_free(dbp->env, buf);
+ return (0);
+ }
+
+ /* While there are both keys to compare. */
+ for (*cmpp = 0, p1 = dbt->data,
+ key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
+ if ((ret =
+ __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0)
+ return (ret);
+
+ cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
+ tlen -= cmp_bytes;
+ key_left -= cmp_bytes;
+ for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+ cmp_bytes-- > 0; ++p1, ++p2)
+ if (*p1 != *p2) {
+ *cmpp = (long)*p1 - (long)*p2;
+ break;
+ }
+ pgno = NEXT_PGNO(pagep);
+ if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0)
+ return (ret);
+ if (*cmpp != 0)
+ return (0);
+ }
+ if (key_left > 0) /* DBT is longer than the page key. */
+ *cmpp = 1;
+ else if (tlen > 0) /* DBT is shorter than the page key. */
+ *cmpp = -1;
+ else
+ *cmpp = 0;
+
+ return (0);
+}
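+
+/*
+ * Editor's example of the trailing-length rule above: comparing the DBT
+ * "abc" against a two-byte overflow key "ab" exhausts the page data
+ * first (tlen reaches 0 while key_left == 1), so *cmpp is set to 1 and
+ * the longer DBT sorts after its prefix -- the usual compare-then-length
+ * semantics.
+ */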
+
+/*
+ * __db_coff --
+ * Match two offpage dbts.
+ *
+ * The DBTs must both refer to offpage items.
+ * The match happens a chunk (page) at a time unless a user defined comparison
+ * function exists. It is not possible to optimize this comparison away when
+ * a lexicographic sort order is required on mismatch.
+ *
+ * NOTE: For now this function only works for H_OFFPAGE type items. It would
+ * be simple to extend it for use with B_OVERFLOW type items. It would only
+ * require extracting the total length and page number according to the
+ * DBT type.
+ *
+ * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
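+/*
+ * Note (describing the code below): on success *cmpp is set to <0, 0 or >0,
+ * ordering dbt relative to match; when all shared bytes are equal, the tie
+ * is broken by the total lengths taken from the H_OFFPAGE headers.
+ */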
+int
+__db_coff(dbc, dbt, match, cmpfunc, cmpp)
+ DBC *dbc;
+ const DBT *dbt, *match;
+ int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+ DBT local_key, local_match;
+ PAGE *dbt_pagep, *match_pagep;
+ db_pgno_t dbt_pgno, match_pgno;
+ u_int32_t cmp_bytes, dbt_bufsz, dbt_len, match_bufsz;
+ u_int32_t match_len, max_data, page_space;
+ u_int8_t *p1, *p2;
+ int ret;
+ void *dbt_buf, *match_buf;
+
+ dbp = dbc->dbp;
+ ip = dbc->thread_info;
+ txn = dbc->txn;
+ mpf = dbp->mpf;
+ page_space = P_MAXSPACE(dbp, dbp->pgsize);
+ *cmpp = 0;
+ dbt_buf = match_buf = NULL;
+
+ DB_ASSERT(dbp->env, HPAGE_PTYPE(dbt->data) == H_OFFPAGE);
+ DB_ASSERT(dbp->env, HPAGE_PTYPE(match->data) == H_OFFPAGE);
+
+ /* Extract potentially unaligned length and pgno fields from DBTs */
+ memcpy(&dbt_len, HOFFPAGE_TLEN(dbt->data), sizeof(u_int32_t));
+ memcpy(&dbt_pgno, HOFFPAGE_PGNO(dbt->data), sizeof(db_pgno_t));
+ memcpy(&match_len, HOFFPAGE_TLEN(match->data), sizeof(u_int32_t));
+ memcpy(&match_pgno, HOFFPAGE_PGNO(match->data), sizeof(db_pgno_t));
+ max_data = (dbt_len < match_len ? dbt_len : match_len);
+
+ /*
+ * If there is a custom comparator, fully resolve both DBTs.
+ * Then call the user's comparator.
+ */
+ if (cmpfunc != NULL) {
+ memset(&local_key, 0, sizeof(local_key));
+ memset(&local_match, 0, sizeof(local_match));
+ dbt_buf = match_buf = NULL;
+ dbt_bufsz = match_bufsz = 0;
+
+ if ((ret = __db_goff(dbc, &local_key, dbt_len,
+ dbt_pgno, &dbt_buf, &dbt_bufsz)) != 0)
+ goto err1;
+ if ((ret = __db_goff(dbc, &local_match, match_len,
+ match_pgno, &match_buf, &match_bufsz)) != 0)
+ goto err1;
+ /* The key needs to be the first argument for sort order */
+ *cmpp = cmpfunc(dbp, &local_key, &local_match);
+
+err1: if (dbt_buf != NULL)
+ __os_free(dbp->env, dbt_buf);
+ if (match_buf != NULL)
+ __os_free(dbp->env, match_buf);
+ return (ret);
+ }
+
+ /* Match the offpage DBTs a page at a time. */
+ while (dbt_pgno != PGNO_INVALID && match_pgno != PGNO_INVALID) {
+ if ((ret =
+ __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0)
+ return (ret);
+ if ((ret =
+ __memp_fget(mpf, &match_pgno,
+ ip, txn, 0, &match_pagep)) != 0) {
+ (void)__memp_fput(
+ mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED);
+ return (ret);
+ }
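+ /* Compare no more bytes than a page holds, or than the shorter item has left. */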
+ cmp_bytes = page_space < max_data ? page_space : max_data;
+ for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp),
+ p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp);
+ cmp_bytes-- > 0; ++p1, ++p2)
+ if (*p1 != *p2) {
+ *cmpp = (long)*p1 - (long)*p2;
+ break;
+ }
+
+ dbt_pgno = NEXT_PGNO(dbt_pagep);
+ match_pgno = NEXT_PGNO(match_pagep);
+ max_data -= page_space;
+ if ((ret = __memp_fput(mpf,
+ ip, dbt_pagep, DB_PRIORITY_UNCHANGED)) != 0) {
+ (void)__memp_fput(mpf,
+ ip, match_pagep, DB_PRIORITY_UNCHANGED);
+ return (ret);
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, match_pagep, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ if (*cmpp != 0)
+ return (0);
+ }
+
+ /*
+ * If a lexicographic mismatch was found, then the result has already
+ * been returned. If the DBTs matched, consider the lengths of the
+ * items, and return appropriately.
+ */
+ if (dbt_len > match_len) /* DBT is longer than the match key. */
+ *cmpp = 1;
+ else if (match_len > dbt_len) /* DBT is shorter than the match key. */
+ *cmpp = -1;
+ else
+ *cmpp = 0;
+
+ return (0);
+}
diff --git a/src/db/db_ovfl_vrfy.c b/src/db/db_ovfl_vrfy.c
new file mode 100644
index 00000000..fa630f7b
--- /dev/null
+++ b/src/db/db_ovfl_vrfy.c
@@ -0,0 +1,410 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_vrfy_overflow --
+ * Verify overflow page.
+ *
+ * PUBLIC: int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_vrfy_overflow(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ int isbad, ret, t_ret;
+
+ isbad = 0;
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ pip->refcount = OV_REF(h);
+ if (pip->refcount < 1) {
+ EPRINT((dbp->env, DB_STR_A("0676",
+ "Page %lu: overflow page has zero reference count", "%lu"),
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ /* Just store for now. */
+ pip->olen = HOFFSET(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_ovfl_structure --
+ * Walk a list of overflow pages, avoiding cycles and marking
+ * pages seen.
+ *
+ * PUBLIC: int __db_vrfy_ovfl_structure
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t tlen;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t next, prev;
+ int isbad, ret, seen_cnt, t_ret;
+ u_int32_t refcount;
+
+ env = dbp->env;
+ pgset = vdp->pgset;
+ DB_ASSERT(env, pgset != NULL);
+ isbad = 0;
+
+ /* This shouldn't happen, but just to be sure. */
+ if (!IS_VALID_PGNO(pgno))
+ return (DB_VERIFY_BAD);
+
+ /*
+ * Check the first prev_pgno; it ought to be PGNO_INVALID,
+ * since there's no prev page.
+ */
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /* The refcount is stored on the first overflow page. */
+ refcount = pip->refcount;
+
+ if (pip->type != P_OVERFLOW) {
+ EPRINT((env, DB_STR_A("0677",
+ "Page %lu: overflow page of invalid type %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)pip->type));
+ ret = DB_VERIFY_BAD;
+ goto err; /* Unsafe to continue. */
+ }
+
+ prev = pip->prev_pgno;
+ if (prev != PGNO_INVALID) {
+ EPRINT((env, DB_STR_A("0678",
+ "Page %lu: first page in overflow chain has a prev_pgno %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)prev));
+ isbad = 1;
+ }
+
+ for (;;) {
+ /*
+ * We may have seen this page elsewhere, if the overflow entry
+ * has been promoted to an internal page; we just want to
+ * make sure that each overflow page is seen exactly as many
+ * times as its refcount dictates.
+ *
+ * Note that this code also serves to keep us from looping
+ * infinitely if there's a cycle in an overflow chain.
+ */
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &seen_cnt)) != 0)
+ goto err;
+ if ((u_int32_t)seen_cnt > refcount) {
+ EPRINT((env, DB_STR_A("0679",
+ "Page %lu: encountered too many times in overflow traversal",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+
+ /*
+ * Each overflow page can be referenced multiple times,
+ * because it's possible for overflow Btree keys to get
+ * promoted to internal pages. We want to make sure that
+ * each page is referenced from a Btree leaf (or Hash data
+ * page, which we consider a "leaf" here) exactly once; if
+ * the parent was a leaf, set a flag to indicate that we've
+ * seen this page in a leaf context.
+ *
+ * If the parent is not a leaf--in which case it's a Btree
+ * internal page--we don't need to bother doing any further
+ * verification, as we'll do it when we hit the leaf (or
+ * complain that we never saw the leaf). Only the first
+ * page in an overflow chain should ever have a refcount
+ * greater than 1, and the combination of the LEAFSEEN check
+ * and the fact that we bail after the first page for
+ * non-leaves should ensure this.
+ *
+ * Note that each "child" of a page, such as an overflow page,
+ * is stored and verified in a structure check exactly once,
+ * so this code does not need to contend with the fact that
+ * overflow chains used as Btree duplicate keys may be
+ * referenced multiply from a single Btree leaf page.
+ */
+ if (LF_ISSET(DB_ST_OVFL_LEAF)) {
+ if (F_ISSET(pip, VRFY_OVFL_LEAFSEEN)) {
+ EPRINT((env, DB_STR_A("0680",
+ "Page %lu: overflow page linked twice from leaf or data page",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ F_SET(pip, VRFY_OVFL_LEAFSEEN);
+ }
+
+ /*
+ * We want to verify each overflow chain only once, and
+ * although no chain should be linked more than once from a
+ * leaf page, we can't guarantee that it'll be linked that
+ * once if it's linked from an internal page and the key
+ * is gone.
+ *
+ * seen_cnt is the number of times we'd encountered this page
+ * before calling this function.
+ */
+ if (seen_cnt == 0) {
+ /*
+ * Keep a running tab on how much of the item we've
+ * seen.
+ */
+ tlen -= pip->olen;
+
+ /* Send the application feedback about our progress. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+ } else
+ goto done;
+
+ next = pip->next_pgno;
+
+ /* Are we there yet? */
+ if (next == PGNO_INVALID)
+ break;
+
+ /*
+ * We've already checked this when we saved it, but just
+ * to be sure...
+ */
+ if (!IS_VALID_PGNO(next)) {
+ EPRINT((env, DB_STR_A("0681",
+ "Page %lu: bad next_pgno %lu on overflow page",
+ "%lu %lu"), (u_long)pgno, (u_long)next));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 ||
+ (ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0)
+ return (ret);
+ if (pip->prev_pgno != pgno) {
+ EPRINT((env, DB_STR_A("0682",
+ "Page %lu: bad prev_pgno %lu on overflow page (should be %lu)",
+ "%lu %lu %lu"), (u_long)next,
+ (u_long)pip->prev_pgno, (u_long)pgno));
+ isbad = 1;
+ /*
+ * It's safe to continue because we have separate
+ * cycle detection.
+ */
+ }
+
+ pgno = next;
+ }
+
+ if (tlen > 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0683",
+ "Page %lu: overflow item incomplete", "%lu"),
+ (u_long)pgno));
+ }
+
+done:
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_safe_goff --
+ * Get an overflow item, very carefully, from an untrusted database,
+ * in the context of the salvager.
+ *
+ * PUBLIC: int __db_safe_goff __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
+ */
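+/*
+ * Note (describing the code below): despite its void * type, buf is really
+ * the address of the caller's buffer pointer; the buffer may be grown with
+ * __os_realloc here, so on return the caller owns *buf and *bufsz reflects
+ * its allocated size.
+ */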
+int
+__db_safe_goff(dbp, vdp, pgno, dbt, buf, bufsz, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ DBT *dbt;
+ void *buf;
+ u_int32_t *bufsz;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+ u_int32_t bytesgot, bytes;
+ u_int8_t *src, *dest;
+
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = t_ret = 0;
+ bytesgot = bytes = 0;
+
+ DB_ASSERT(dbp->env, bufsz != NULL);
+
+ /*
+ * Back up to the start of the overflow chain (if necessary) via the
+ * prev pointer of the overflow page. This guarantees we traverse the
+ * longest possible chains of overflow pages and won't be called again
+ * with a pgno earlier in the chain, stepping on ourselves.
+ */
+ for (;;) {
+ if ((ret = __memp_fget(
+ mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ return (ret);
+
+ if (PREV_PGNO(h) == PGNO_INVALID ||
+ !IS_VALID_PGNO(PREV_PGNO(h)))
+ break;
+
+ pgno = PREV_PGNO(h);
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ }
+ if ((ret = __memp_fput(
+ mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+
+ h = NULL;
+
+ while (pgno != PGNO_INVALID && IS_VALID_PGNO(pgno)) {
+ /*
+ * Mark that we're looking at this page; if we've seen it
+ * already, quit.
+ */
+ if ((ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ break;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ break;
+
+ /*
+ * Make sure it's really an overflow page, unless we're
+ * being aggressive, in which case we pretend it is.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) && TYPE(h) != P_OVERFLOW) {
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+ src = (u_int8_t *)h + P_OVERHEAD(dbp);
+ bytes = OV_LEN(h);
+
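+ /* Clamp a corrupt length so the copy cannot run past the page. */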
+ if (bytes + P_OVERHEAD(dbp) > dbp->pgsize)
+ bytes = dbp->pgsize - P_OVERHEAD(dbp);
+
+ /* Realloc if the buffer is too small. */
+ if (bytesgot + bytes > *bufsz) {
+ if ((ret =
+ __os_realloc(dbp->env, bytesgot + bytes, buf)) != 0)
+ break;
+ *bufsz = bytesgot + bytes;
+ }
+
+ dest = *(u_int8_t **)buf + bytesgot;
+ bytesgot += bytes;
+
+ memcpy(dest, src, bytes);
+
+ pgno = NEXT_PGNO(h);
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ break;
+ h = NULL;
+ }
+
+ /*
+ * If we're being aggressive, salvage a partial datum if there
+ * was an error somewhere along the way.
+ */
+ if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) {
+ dbt->size = bytesgot;
+ dbt->data = *(void **)buf;
+ }
+
+ /* If we broke out on error, don't leave pages pinned. */
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/db/db_pr.c b/src/db/db_pr.c
new file mode 100644
index 00000000..d95440f9
--- /dev/null
+++ b/src/db/db_pr.c
@@ -0,0 +1,1956 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/db_verify.h"
+
+static int __db_bmeta __P((ENV *, DB *, BTMETA *, u_int32_t));
+static int __db_heapmeta __P((ENV *, DB *, HEAPMETA *, u_int32_t));
+static int __db_heapint __P((DB *, HEAPPG *, u_int32_t));
+static int __db_hmeta __P((ENV *, DB *, HMETA *, u_int32_t));
+static void __db_meta __P((ENV *, DB *, DBMETA *, FN const *, u_int32_t));
+static void __db_proff __P((ENV *, DB_MSGBUF *, void *));
+static int __db_qmeta __P((ENV *, DB *, QMETA *, u_int32_t));
+#ifdef HAVE_STATISTICS
+static void __db_prdb __P((DB *, u_int32_t));
+static int __db_prtree __P((DB *, DB_TXN *,
+ u_int32_t, db_pgno_t, db_pgno_t));
+#endif
+
+/*
+ * __db_loadme --
+ * A nice place to put a breakpoint.
+ *
+ * PUBLIC: void __db_loadme __P((void));
+ */
+void
+__db_loadme()
+{
+ pid_t pid;
+
+ __os_id(NULL, &pid, NULL);
+}
+
+#ifdef HAVE_STATISTICS
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *,
+ * PUBLIC: char *, char *, db_pgno_t, db_pgno_t));
+ */
+int
+__db_dumptree(dbp, txn, op, name, first, last)
+ DB *dbp;
+ DB_TXN *txn;
+ char *op, *name;
+ db_pgno_t first, last;
+{
+ ENV *env;
+ FILE *fp, *orig_fp;
+ u_int32_t flags;
+ int ret;
+
+ env = dbp->env;
+
+ for (flags = 0; *op != '\0'; ++op)
+ switch (*op) {
+ case 'a':
+ LF_SET(DB_PR_PAGE);
+ break;
+ case 'h':
+ break;
+ case 'r':
+ LF_SET(DB_PR_RECOVERYTEST);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if (name != NULL) {
+ if ((fp = fopen(name, "w")) == NULL)
+ return (__os_get_errno());
+
+ orig_fp = dbp->dbenv->db_msgfile;
+ dbp->dbenv->db_msgfile = fp;
+ } else
+ fp = orig_fp = NULL;
+
+ __db_prdb(dbp, flags);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+ ret = __db_prtree(dbp, txn, flags, first, last);
+
+ if (fp != NULL) {
+ (void)fclose(fp);
+ env->dbenv->db_msgfile = orig_fp;
+ }
+
+ return (ret);
+}
+
+static const FN __db_flags_fn[] = {
+ { DB_AM_CHKSUM, "checksumming" },
+ { DB_AM_COMPENSATE, "created by compensating transaction" },
+ { DB_AM_CREATED, "database created" },
+ { DB_AM_CREATED_MSTR, "encompassing file created" },
+ { DB_AM_DBM_ERROR, "dbm/ndbm error" },
+ { DB_AM_DELIMITER, "variable length" },
+ { DB_AM_DISCARD, "discard cached pages" },
+ { DB_AM_DUP, "duplicates" },
+ { DB_AM_DUPSORT, "sorted duplicates" },
+ { DB_AM_ENCRYPT, "encrypted" },
+ { DB_AM_FIXEDLEN, "fixed-length records" },
+ { DB_AM_INMEM, "in-memory" },
+ { DB_AM_IN_RENAME, "file is being renamed" },
+ { DB_AM_NOT_DURABLE, "changes not logged" },
+ { DB_AM_OPEN_CALLED, "open called" },
+ { DB_AM_PAD, "pad value" },
+ { DB_AM_PGDEF, "default page size" },
+ { DB_AM_RDONLY, "read-only" },
+ { DB_AM_READ_UNCOMMITTED, "read-uncommitted" },
+ { DB_AM_RECNUM, "Btree record numbers" },
+ { DB_AM_RECOVER, "opened for recovery" },
+ { DB_AM_RENUMBER, "renumber" },
+ { DB_AM_REVSPLITOFF, "no reverse splits" },
+ { DB_AM_SECONDARY, "secondary" },
+ { DB_AM_SNAPSHOT, "load on open" },
+ { DB_AM_SUBDB, "subdatabases" },
+ { DB_AM_SWAP, "needswap" },
+ { DB_AM_TXN, "transactional" },
+ { DB_AM_VERIFYING, "verifier" },
+ { 0, NULL }
+};
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+ return (__db_flags_fn);
+}
+
+/*
+ * __db_prdb --
+ * Print out the DB structure information.
+ */
+static void
+__db_prdb(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ BTREE *bt;
+ DB_MSGBUF mb;
+ ENV *env;
+ HASH *h;
+ QUEUE *q;
+ HEAP *hp;
+
+ env = dbp->env;
+
+ DB_MSGBUF_INIT(&mb);
+ __db_msg(env, "In-memory DB structure:");
+ __db_msgadd(env, &mb, "%s: %#lx",
+ __db_dbtype_to_string(dbp->type), (u_long)dbp->flags);
+ __db_prflags(env, &mb, dbp->flags, __db_flags_fn, " (", ")");
+ DB_MSGBUF_FLUSH(env, &mb);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ bt = dbp->bt_internal;
+ __db_msg(env, "bt_meta: %lu bt_root: %lu",
+ (u_long)bt->bt_meta, (u_long)bt->bt_root);
+ __db_msg(env, "bt_minkey: %lu", (u_long)bt->bt_minkey);
+ if (!LF_ISSET(DB_PR_RECOVERYTEST))
+ __db_msg(env, "bt_compare: %#lx bt_prefix: %#lx",
+ P_TO_ULONG(bt->bt_compare),
+ P_TO_ULONG(bt->bt_prefix));
+#ifdef HAVE_COMPRESSION
+ if (!LF_ISSET(DB_PR_RECOVERYTEST))
+ __db_msg(env, "bt_compress: %#lx bt_decompress: %#lx",
+ P_TO_ULONG(bt->bt_compress),
+ P_TO_ULONG(bt->bt_decompress));
+#endif
+ __db_msg(env, "bt_lpgno: %lu", (u_long)bt->bt_lpgno);
+ if (dbp->type == DB_RECNO) {
+ __db_msg(env,
+ "re_pad: %#lx re_delim: %#lx re_len: %lu re_source: %s",
+ (u_long)bt->re_pad, (u_long)bt->re_delim,
+ (u_long)bt->re_len,
+ bt->re_source == NULL ? "" : bt->re_source);
+ __db_msg(env,
+ "re_modified: %d re_eof: %d re_last: %lu",
+ bt->re_modified, bt->re_eof, (u_long)bt->re_last);
+ }
+ break;
+ case DB_HASH:
+ h = dbp->h_internal;
+ __db_msg(env, "meta_pgno: %lu", (u_long)h->meta_pgno);
+ __db_msg(env, "h_ffactor: %lu", (u_long)h->h_ffactor);
+ __db_msg(env, "h_nelem: %lu", (u_long)h->h_nelem);
+ if (!LF_ISSET(DB_PR_RECOVERYTEST))
+ __db_msg(env, "h_hash: %#lx", P_TO_ULONG(h->h_hash));
+ break;
+ case DB_QUEUE:
+ q = dbp->q_internal;
+ __db_msg(env, "q_meta: %lu", (u_long)q->q_meta);
+ __db_msg(env, "q_root: %lu", (u_long)q->q_root);
+ __db_msg(env, "re_pad: %#lx re_len: %lu",
+ (u_long)q->re_pad, (u_long)q->re_len);
+ __db_msg(env, "rec_page: %lu", (u_long)q->rec_page);
+ __db_msg(env, "page_ext: %lu", (u_long)q->page_ext);
+ break;
+ case DB_HEAP:
+ hp = dbp->heap_internal;
+ __db_msg(env, "gbytes: %lu", (u_long)hp->gbytes);
+ __db_msg(env, "bytes: %lu", (u_long)hp->bytes);
+ __db_msg(env, "curregion: %lu", (u_long)hp->curregion);
+ __db_msg(env, "region_size: %lu", (u_long)hp->region_size);
+ __db_msg(env, "maxpgno: %lu", (u_long)hp->maxpgno);
+ break;
+ case DB_UNKNOWN:
+ default:
+ break;
+ }
+}
+
+/*
+ * __db_prtree --
+ * Print out the entire tree.
+ */
+static int
+__db_prtree(dbp, txn, flags, first, last)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+ db_pgno_t first, last;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t i;
+ int ret;
+
+ mpf = dbp->mpf;
+
+ if (dbp->type == DB_QUEUE)
+ return (__db_prqueue(dbp, flags));
+
+ /*
+ * Find out the page number of the last page in the database, then
+ * dump each page.
+ */
+ if (last == PGNO_INVALID &&
+ (ret = __memp_get_last_pgno(mpf, &last)) != 0)
+ return (ret);
+ for (i = first; i <= last; ++i) {
+ if ((ret = __memp_fget(mpf, &i, NULL, txn, 0, &h)) != 0)
+ return (ret);
+ (void)__db_prpage(dbp, h, flags);
+ if ((ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __db_prnpage
+ * -- Print out a specific page.
+ *
+ * PUBLIC: int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
+ */
+int
+__db_prnpage(dbp, txn, pgno)
+ DB *dbp;
+ DB_TXN *txn;
+ db_pgno_t pgno;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_fget(mpf, &pgno, NULL, txn, 0, &h)) != 0)
+ return (ret);
+
+ ret = __db_prpage(dbp, h, DB_PR_PAGE);
+
+ if ((t_ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_prpage
+ * -- Print out a page.
+ *
+ * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t));
+ */
+int
+__db_prpage(dbp, h, flags)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb;
+ u_int32_t pagesize;
+ /*
+ * !!!
+ * Find out the page size. We don't want to do it the "right" way,
+ * by reading the value from the meta-data page, because that would
+ * be slow. Reach down into the mpool region instead.
+ */
+ pagesize = (u_int32_t)dbp->mpf->mfp->pagesize;
+ DB_MSGBUF_INIT(&mb);
+ return (__db_prpage_int(dbp->env,
+ &mb, dbp, "", h, pagesize, NULL, flags));
+}
+
+/*
+ * __db_lockmode_to_string --
+ * Return the name of the lock mode.
+ *
+ * PUBLIC: const char * __db_lockmode_to_string __P((db_lockmode_t));
+ */
+const char *
+__db_lockmode_to_string(mode)
+ db_lockmode_t mode;
+{
+ switch (mode) {
+ case DB_LOCK_NG:
+ return ("Not granted");
+ case DB_LOCK_READ:
+ return ("Shared/read");
+ case DB_LOCK_WRITE:
+ return ("Exclusive/write");
+ case DB_LOCK_WAIT:
+ return ("Wait for event");
+ case DB_LOCK_IWRITE:
+ return ("Intent exclusive/write");
+ case DB_LOCK_IREAD:
+ return ("Intent shared/read");
+ case DB_LOCK_IWR:
+ return ("Intent to read/write");
+ case DB_LOCK_READ_UNCOMMITTED:
+ return ("Read uncommitted");
+ case DB_LOCK_WWRITE:
+ return ("Was written");
+ default:
+ break;
+ }
+ return ("UNKNOWN LOCK MODE");
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *,
+ * PUBLIC: char *, char *, db_pgno_t, db_pgno_t));
+ */
+int
+__db_dumptree(dbp, txn, op, name, first, last)
+ DB *dbp;
+ DB_TXN *txn;
+ char *op, *name;
+ db_pgno_t first, last;
+{
+ COMPQUIET(txn, NULL);
+ COMPQUIET(op, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(first, last);
+
+ return (__db_stat_not_built(dbp->env));
+}
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+ /*
+ * !!!
+ * The Tcl API uses this interface, stub it off.
+ */
+ return (NULL);
+}
+#endif
+
+/*
+ * __db_meta --
+ * Print out common metadata information.
+ */
+static void
+__db_meta(env, dbp, dbmeta, fn, flags)
+ ENV *env;
+ DB *dbp;
+ DBMETA *dbmeta;
+ FN const *fn;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ DB_MSGBUF mb;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int8_t *p;
+ int cnt, ret;
+ const char *sep;
+
+ DB_MSGBUF_INIT(&mb);
+
+ __db_msg(env, "\tmagic: %#lx", (u_long)dbmeta->magic);
+ __db_msg(env, "\tversion: %lu", (u_long)dbmeta->version);
+ __db_msg(env, "\tpagesize: %lu", (u_long)dbmeta->pagesize);
+ __db_msg(env, "\ttype: %lu", (u_long)dbmeta->type);
+ __db_msg(env, "\tmetaflags %#lx", (u_long)dbmeta->metaflags);
+ __db_msg(env, "\tkeys: %lu\trecords: %lu",
+ (u_long)dbmeta->key_count, (u_long)dbmeta->record_count);
+ if (dbmeta->nparts)
+ __db_msg(env, "\tnparts: %lu", (u_long)dbmeta->nparts);
+
+ /*
+ * If we're doing recovery testing, don't display the free list;
+ * it may have changed, which would break the dump diff.
+ */
+ if (dbp != NULL && !LF_ISSET(DB_PR_RECOVERYTEST)) {
+ mpf = dbp->mpf;
+ __db_msgadd(
+ env, &mb, "\tfree list: %lu", (u_long)dbmeta->free);
+ for (pgno = dbmeta->free,
+ cnt = 0, sep = ", "; pgno != PGNO_INVALID;) {
+ if ((ret = __memp_fget(mpf,
+ &pgno, NULL, NULL, 0, &h)) != 0) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msg(env,
+ "Unable to retrieve free-list page: %lu: %s",
+ (u_long)pgno, db_strerror(ret));
+ break;
+ }
+ pgno = h->next_pgno;
+ (void)__memp_fput(mpf, NULL, h, dbp->priority);
+ __db_msgadd(env, &mb, "%s%lu", sep, (u_long)pgno);
+ if (++cnt % 10 == 0) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ cnt = 0;
+ sep = "\t";
+ } else
+ sep = ", ";
+ }
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msg(env, "\tlast_pgno: %lu", (u_long)dbmeta->last_pgno);
+ }
+
+ if (fn != NULL) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msgadd(env, &mb, "\tflags: %#lx", (u_long)dbmeta->flags);
+ __db_prflags(env, &mb, dbmeta->flags, fn, " (", ")");
+ }
+
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msgadd(env, &mb, "\tuid: ");
+ for (p = (u_int8_t *)dbmeta->uid,
+ cnt = 0; cnt < DB_FILE_ID_LEN; ++cnt) {
+ __db_msgadd(env, &mb, "%x", *p++);
+ if (cnt < DB_FILE_ID_LEN - 1)
+ __db_msgadd(env, &mb, " ");
+ }
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __db_bmeta --
+ * Print out the btree meta-data page.
+ */
+static int
+__db_bmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ BTMETA *h;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { BTM_DUP, "duplicates" },
+ { BTM_RECNO, "recno" },
+ { BTM_RECNUM, "btree:recnum" },
+ { BTM_FIXEDLEN, "recno:fixed-length" },
+ { BTM_RENUMBER, "recno:renumber" },
+ { BTM_SUBDB, "multiple-databases" },
+ { BTM_DUPSORT, "sorted duplicates" },
+ { BTM_COMPRESS, "compressed" },
+ { 0, NULL }
+ };
+
+ __db_meta(env, dbp, (DBMETA *)h, fn, flags);
+
+ __db_msg(env, "\tminkey: %lu", (u_long)h->minkey);
+ if (F_ISSET(&h->dbmeta, BTM_RECNO))
+ __db_msg(env, "\tre_len: %#lx re_pad: %#lx",
+ (u_long)h->re_len, (u_long)h->re_pad);
+ __db_msg(env, "\troot: %lu", (u_long)h->root);
+
+ return (0);
+}
+
+/*
+ * __db_hmeta --
+ * Print out the hash meta-data page.
+ */
+static int
+__db_hmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ HMETA *h;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_HASH_DUP, "duplicates" },
+ { DB_HASH_SUBDB, "multiple-databases" },
+ { DB_HASH_DUPSORT, "sorted duplicates" },
+ { 0, NULL }
+ };
+ DB_MSGBUF mb;
+ int i;
+
+ DB_MSGBUF_INIT(&mb);
+
+ __db_meta(env, dbp, (DBMETA *)h, fn, flags);
+
+ __db_msg(env, "\tmax_bucket: %lu", (u_long)h->max_bucket);
+ __db_msg(env, "\thigh_mask: %#lx", (u_long)h->high_mask);
+ __db_msg(env, "\tlow_mask: %#lx", (u_long)h->low_mask);
+ __db_msg(env, "\tffactor: %lu", (u_long)h->ffactor);
+ __db_msg(env, "\tnelem: %lu", (u_long)h->nelem);
+ __db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey);
+ __db_msgadd(env, &mb, "\tspare points:\n\t");
+ for (i = 0; i < NCACHED; i++) {
+ __db_msgadd(env, &mb, "%lu (%lu) ", (u_long)h->spares[i],
+ (u_long)(h->spares[i] == 0 ?
+ 0 : h->spares[i] + (i == 0 ? 0 : 1 << (i-1))));
+ if ((i + 1) % 8 == 0)
+ __db_msgadd(env, &mb, "\n\t");
+ }
+ DB_MSGBUF_FLUSH(env, &mb);
+
+ return (0);
+}
+
+/*
+ * __db_qmeta --
+ * Print out the queue meta-data page.
+ */
+static int
+__db_qmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ QMETA *h;
+ u_int32_t flags;
+{
+ __db_meta(env, dbp, (DBMETA *)h, NULL, flags);
+
+ __db_msg(env, "\tfirst_recno: %lu", (u_long)h->first_recno);
+ __db_msg(env, "\tcur_recno: %lu", (u_long)h->cur_recno);
+ __db_msg(env, "\tre_len: %#lx re_pad: %lu",
+ (u_long)h->re_len, (u_long)h->re_pad);
+ __db_msg(env, "\trec_page: %lu", (u_long)h->rec_page);
+ __db_msg(env, "\tpage_ext: %lu", (u_long)h->page_ext);
+
+ return (0);
+}
+
+/*
+ * __db_heapmeta --
+ * Print out the heap meta-data page.
+ */
+static int
+__db_heapmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ HEAPMETA *h;
+ u_int32_t flags;
+{
+ __db_meta(env, dbp, (DBMETA *)h, NULL, flags);
+
+ __db_msg(env, "\tcurregion: %lu", (u_long)h->curregion);
+ __db_msg(env, "\tregion_size: %lu", (u_long)h->region_size);
+ __db_msg(env, "\tnregions: %lu", (u_long)h->nregions);
+ __db_msg(env, "\tgbytes: %lu", (u_long)h->gbytes);
+ __db_msg(env, "\tbytes: %lu", (u_long)h->bytes);
+
+ return (0);
+}
+
+/*
+ * __db_heapint --
+ * Print out the heap internal-data page.
+ */
+static int
+__db_heapint(dbp, h, flags)
+ DB *dbp;
+ HEAPPG *h;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb;
+ ENV *env;
+ int count, printed;
+ u_int32_t i, max;
+ u_int8_t avail;
+
+ env = dbp->env;
+ DB_MSGBUF_INIT(&mb);
+ count = printed = 0;
+ COMPQUIET(flags, 0);
+
+ __db_msgadd(env, &mb, "\thigh: %4lu\n", (u_long)h->high_pgno);
+ /* How many entries could there be on a page? */
+ max = HEAP_REGION_SIZE(dbp);
+
+ for (i = 0; i < max; i++, count++) {
+ avail = HEAP_SPACE(dbp, h, i);
+ if (avail != 0) {
+ __db_msgadd(env, &mb,
+ "%5lu:%1lu ", (u_long)i, (u_long)avail);
+ printed = 1;
+ }
+ /* We get 10 entries per line this way */
+ if (count == 9) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ count = -1;
+ }
+ }
+ /* All pages were less than 33% full */
+ if (printed == 0)
+ __db_msgadd(env, &mb,
+ "All pages in this region less than 33 percent full");
+
+ DB_MSGBUF_FLUSH(env, &mb);
+ return (0);
+}
+
+/*
+ * For printing pages from the log we may be passed the data segment
+ * separate from the header, if so then it starts at HOFFSET.
+ */
+#define PR_ENTRY(dbp, h, i, data) \
+ (data == NULL ? P_ENTRY(dbp, h, i) : \
+ (u_int8_t *)data + P_INP(dbp, h)[i] - HOFFSET(h))
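+/*
+ * For example (illustrative numbers): if HOFFSET(h) is 900 and
+ * P_INP(dbp, h)[i] is 940, the entry begins 40 bytes into the separately
+ * passed data segment.
+ */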
+/*
+ * __db_prpage_int
+ * -- Print out a page.
+ *
+ * PUBLIC: int __db_prpage_int __P((ENV *, DB_MSGBUF *,
+ * PUBLIC: DB *, char *, PAGE *, u_int32_t, u_int8_t *, u_int32_t));
+ */
+int
+__db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB *dbp;
+ char *lead;
+ PAGE *h;
+ u_int32_t pagesize;
+ u_int8_t *data;
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ HOFFPAGE a_hkd;
+ QAMDATA *qp, *qep;
+ RINTERNAL *ri;
+ HEAPHDR *hh;
+ HEAPSPLITHDR *hs;
+ db_indx_t dlen, len, i, *inp, max;
+ db_pgno_t pgno;
+ db_recno_t recno;
+ u_int32_t qlen;
+ u_int8_t *ep, *hk, *p;
+ int deleted, ret;
+ const char *s;
+ void *hdata, *sp;
+
+ /*
+ * If we're doing recovery testing and this page is P_INVALID,
+ * assume it's a page that's on the free list, and don't display it.
+ */
+ if (LF_ISSET(DB_PR_RECOVERYTEST) && TYPE(h) == P_INVALID)
+ return (0);
+
+ if ((s = __db_pagetype_to_string(TYPE(h))) == NULL) {
+ __db_msg(env, "%sILLEGAL PAGE TYPE: page: %lu type: %lu",
+ lead, (u_long)h->pgno, (u_long)TYPE(h));
+ return (EINVAL);
+ }
+
+ /* Page number, page type. */
+ __db_msgadd(env, mbp, "%spage %lu: %s:", lead, (u_long)h->pgno, s);
+
+ /*
+ * LSNs on a metadata page will be different from the original after an
+ * abort, in some cases. Don't display them if we're testing recovery.
+ */
+ if (!LF_ISSET(DB_PR_RECOVERYTEST) ||
+ (TYPE(h) != P_BTREEMETA && TYPE(h) != P_HASHMETA &&
+ TYPE(h) != P_QAMMETA && TYPE(h) != P_QAMDATA &&
+ TYPE(h) != P_HEAPMETA))
+ __db_msgadd(env, mbp, " LSN [%lu][%lu]:",
+ (u_long)LSN(h).file, (u_long)LSN(h).offset);
+
+ /*
+ * Page level (only applicable to Btree/Recno, but we always display
+ * it, for no particular reason, except for Heap).
+ */
+ if (!HEAPTYPE(h))
+ __db_msgadd(env, mbp, " level %lu", (u_long)h->level);
+
+ /* Record count. */
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO ||
+ (dbp != NULL && TYPE(h) == P_LRECNO &&
+ h->pgno == ((BTREE *)dbp->bt_internal)->bt_root))
+ __db_msgadd(env, mbp, " records: %lu", (u_long)RE_NREC(h));
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ return (__db_bmeta(env, dbp, (BTMETA *)h, flags));
+ case P_HASHMETA:
+ return (__db_hmeta(env, dbp, (HMETA *)h, flags));
+ case P_QAMMETA:
+ return (__db_qmeta(env, dbp, (QMETA *)h, flags));
+ case P_QAMDATA: /* Should be meta->start. */
+ if (!LF_ISSET(DB_PR_PAGE) || dbp == NULL)
+ return (0);
+
+ qlen = ((QUEUE *)dbp->q_internal)->re_len;
+ recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
+ i = 0;
+ qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
+ for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
+ recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
+ if (!F_ISSET(qp, QAM_SET))
+ continue;
+
+ __db_msgadd(env, mbp, "%s",
+ F_ISSET(qp, QAM_VALID) ? "\t" : " D");
+ __db_msgadd(env, mbp, "[%03lu] %4lu ", (u_long)recno,
+ (u_long)((u_int8_t *)qp - (u_int8_t *)h));
+ __db_prbytes(env, mbp, qp->data, qlen);
+ }
+ return (0);
+ case P_HEAPMETA:
+ return (__db_heapmeta(env, dbp, (HEAPMETA *)h, flags));
+ case P_IHEAP:
+ if (!LF_ISSET(DB_PR_PAGE) || dbp == NULL)
+ return (0);
+ return (__db_heapint(dbp, (HEAPPG *)h, flags));
+ default:
+ break;
+ }
+
+ s = "\t";
+ if (!HEAPTYPE(h) && TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
+ __db_msgadd(env, mbp, "%sprev: %4lu next: %4lu",
+ s, (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
+ s = " ";
+ }
+
+ if (HEAPTYPE(h)) {
+ __db_msgadd(env, mbp, "%shigh indx: %4lu free indx: %4lu", s,
+ (u_long)HEAP_HIGHINDX(h), (u_long)HEAP_FREEINDX(h));
+ s = " ";
+ }
+
+ if (TYPE(h) == P_OVERFLOW) {
+ __db_msgadd(env, mbp,
+ "%sref cnt: %4lu ", s, (u_long)OV_REF(h));
+ if (dbp == NULL)
+ __db_msgadd(env, mbp,
+ " len: %4lu ", (u_long)OV_LEN(h));
+ else
+ __db_prbytes(env,
+ mbp, (u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h));
+ return (0);
+ }
+ __db_msgadd(env, mbp, "%sentries: %4lu", s, (u_long)NUM_ENT(h));
+ __db_msgadd(env, mbp, " offset: %4lu", (u_long)HOFFSET(h));
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ if (dbp == NULL || TYPE(h) == P_INVALID || !LF_ISSET(DB_PR_PAGE))
+ return (0);
+
+ if (data != NULL)
+ pagesize += HOFFSET(h);
+ else if (pagesize < HOFFSET(h))
+ return (0);
+
+ ret = 0;
+ inp = P_INP(dbp, h);
+ max = TYPE(h) == P_HEAP ? HEAP_HIGHINDX(h) + 1 : NUM_ENT(h);
+ for (i = 0; i < max; i++) {
+ if (TYPE(h) == P_HEAP && inp[i] == 0)
+ continue;
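+ /* Sanity check: the entry offset must land inside the page proper. */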
+ if ((uintptr_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) <
+ (uintptr_t)(P_OVERHEAD(dbp)) ||
+ (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) {
+ __db_msg(env,
+ "ILLEGAL PAGE OFFSET: indx: %lu of %lu",
+ (u_long)i, (u_long)inp[i]);
+ ret = EINVAL;
+ continue;
+ }
+ deleted = 0;
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ case P_IBTREE:
+ case P_IRECNO:
+ sp = PR_ENTRY(dbp, h, i, data);
+ break;
+ case P_HEAP:
+ sp = P_ENTRY(dbp, h, i);
+ break;
+ case P_LBTREE:
+ sp = PR_ENTRY(dbp, h, i, data);
+ deleted = i % 2 == 0 &&
+ B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type);
+ break;
+ case P_LDUP:
+ case P_LRECNO:
+ sp = PR_ENTRY(dbp, h, i, data);
+ deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type);
+ break;
+ default:
+ goto type_err;
+ }
+ __db_msgadd(env, mbp, "%s", deleted ? " D" : "\t");
+ __db_msgadd(
+ env, mbp, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]);
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ hk = sp;
+ switch (HPAGE_PTYPE(hk)) {
+ case H_OFFDUP:
+ memcpy(&pgno,
+ HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
+ __db_msgadd(env, mbp,
+ "%4lu [offpage dups]", (u_long)pgno);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ case H_DUPLICATE:
+ /*
+ * If this is the first item on a page, then
+ * we cannot figure out how long it is, so
+ * we only print the first one in the duplicate
+ * set.
+ */
+ if (i != 0)
+ len = LEN_HKEYDATA(dbp, h, 0, i);
+ else
+ len = 1;
+
+ __db_msgadd(env, mbp, "Duplicates:");
+ DB_MSGBUF_FLUSH(env, mbp);
+ for (p = HKEYDATA_DATA(hk),
+ ep = p + len; p < ep;) {
+ memcpy(&dlen, p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ __db_msgadd(env, mbp, "\t\t");
+ __db_prbytes(env, mbp, p, dlen);
+ p += sizeof(db_indx_t) + dlen;
+ }
+ break;
+ case H_KEYDATA:
+ __db_prbytes(env, mbp, HKEYDATA_DATA(hk),
+ LEN_HKEYDATA(dbp, h, i == 0 ?
+ pagesize : 0, i));
+ break;
+ case H_OFFPAGE:
+ memcpy(&a_hkd, hk, HOFFPAGE_SIZE);
+ __db_msgadd(env, mbp,
+ "overflow: total len: %4lu page: %4lu",
+ (u_long)a_hkd.tlen, (u_long)a_hkd.pgno);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ default:
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu",
+ (u_long)HPAGE_PTYPE(hk));
+ ret = EINVAL;
+ break;
+ }
+ break;
+ case P_IBTREE:
+ bi = sp;
+
+ if (F_ISSET(dbp, DB_AM_RECNUM))
+ __db_msgadd(env, mbp,
+ "count: %4lu ", (u_long)bi->nrecs);
+ __db_msgadd(env, mbp,
+ "pgno: %4lu type: %lu ",
+ (u_long)bi->pgno, (u_long)bi->type);
+ switch (B_TYPE(bi->type)) {
+ case B_KEYDATA:
+ __db_prbytes(env, mbp, bi->data, bi->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ __db_proff(env, mbp, bi->data);
+ break;
+ default:
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu",
+ (u_long)B_TYPE(bi->type));
+ ret = EINVAL;
+ break;
+ }
+ break;
+ case P_IRECNO:
+ ri = sp;
+ __db_msgadd(env, mbp, "entries %4lu pgno %4lu",
+ (u_long)ri->nrecs, (u_long)ri->pgno);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = sp;
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ __db_prbytes(env, mbp, bk->data, bk->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ __db_proff(env, mbp, bk);
+ break;
+ default:
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env,
+ "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu",
+ (u_long)B_TYPE(bk->type));
+ ret = EINVAL;
+ break;
+ }
+ break;
+ case P_HEAP:
+ hh = sp;
+ if (!F_ISSET(hh, HEAP_RECSPLIT))
+ hdata = (u_int8_t *)hh + sizeof(HEAPHDR);
+ else {
+ hs = sp;
+ __db_msgadd(env, mbp,
+ "split: 0x%02x tsize: %lu next: %lu.%lu ",
+ hh->flags, (u_long)hs->tsize,
+ (u_long)hs->nextpg, (u_long)hs->nextindx);
+
+ hdata = (u_int8_t *)hh + sizeof(HEAPSPLITHDR);
+ }
+ __db_prbytes(env, mbp, hdata, hh->size);
+ break;
+ default:
+type_err: DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env,
+ "ILLEGAL PAGE TYPE: %lu", (u_long)TYPE(h));
+ ret = EINVAL;
+ continue;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __db_prbytes --
+ * Print out a data element.
+ *
+ * PUBLIC: void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
+ */
+void
+__db_prbytes(env, mbp, bytes, len)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ u_int8_t *bytes;
+ u_int32_t len;
+{
+ u_int8_t *p;
+ u_int32_t i, not_printable;
+ int msg_truncated;
+
+ __db_msgadd(env, mbp, "len: %3lu", (u_long)len);
+ if (len != 0) {
+ __db_msgadd(env, mbp, " data: ");
+
+ /*
+ * Print the first N bytes of the data. If that
+ * chunk is at least 3/4 printable characters, print
+ * it as text, else print it in hex. We have this
+ * heuristic because we're displaying things like
+ * lock objects that could be either text or data.
+ */
+ if (len > env->data_len) {
+ len = env->data_len;
+ msg_truncated = 1;
+ } else
+ msg_truncated = 0;
+ not_printable = 0;
+ for (p = bytes, i = 0; i < len; ++i, ++p) {
+ if (!isprint((int)*p) && *p != '\t' && *p != '\n') {
+ if (i == len - 1 && *p == '\0')
+ break;
+ if (++not_printable >= (len >> 2))
+ break;
+ }
+ }
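+ /* Print as text only if fewer than a quarter of the bytes are unprintable. */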
+ if (not_printable < (len >> 2))
+ for (p = bytes, i = len; i > 0; --i, ++p) {
+ if (isprint((int)*p))
+ __db_msgadd(env, mbp, "%c", *p);
+ else
+ __db_msgadd(env,
+ mbp, "\\%x", (u_int)*p);
+ }
+ else
+ for (p = bytes, i = len; i > 0; --i, ++p)
+ __db_msgadd(env, mbp, "%.2x", (u_int)*p);
+ if (msg_truncated)
+ __db_msgadd(env, mbp, "...");
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_proff --
+ * Print out an off-page element.
+ */
+static void
+__db_proff(env, mbp, vp)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ void *vp;
+{
+ BOVERFLOW *bo;
+
+ bo = vp;
+ switch (B_TYPE(bo->type)) {
+ case B_OVERFLOW:
+ __db_msgadd(env, mbp, "overflow: total len: %4lu page: %4lu",
+ (u_long)bo->tlen, (u_long)bo->pgno);
+ break;
+ case B_DUPLICATE:
+ __db_msgadd(
+ env, mbp, "duplicate: page: %4lu", (u_long)bo->pgno);
+ break;
+ default:
+ /* NOTREACHED */
+ break;
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_prflags --
+ * Print out flags values.
+ *
+ * PUBLIC: void __db_prflags __P((ENV *, DB_MSGBUF *,
+ * PUBLIC: u_int32_t, const FN *, const char *, const char *));
+ */
+void
+__db_prflags(env, mbp, flags, fn, prefix, suffix)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ u_int32_t flags;
+ FN const *fn;
+ const char *prefix, *suffix;
+{
+ DB_MSGBUF mb;
+ const FN *fnp;
+ int found, standalone;
+ const char *sep;
+
+ if (fn == NULL)
+ return;
+
+ /*
+ * If it's a standalone message, output the suffix (which will be the
+ * label), regardless of whether we found anything or not, and flush
+ * the line.
+ */
+ if (mbp == NULL) {
+ standalone = 1;
+ mbp = &mb;
+ DB_MSGBUF_INIT(mbp);
+ } else
+ standalone = 0;
+
+ sep = prefix == NULL ? "" : prefix;
+ for (found = 0, fnp = fn; fnp->mask != 0; ++fnp)
+ if (LF_ISSET(fnp->mask)) {
+ __db_msgadd(env, mbp, "%s%s", sep, fnp->name);
+ sep = ", ";
+ found = 1;
+ }
+
+ if ((standalone || found) && suffix != NULL)
+ __db_msgadd(env, mbp, "%s", suffix);
+ if (standalone)
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_name_to_val --
+ * Return the integral value associated with the string, or -1 if missing.
+ * It is intended for looking up string names of enums and single-bit
+ * flags in order to get a numeric value.
+ *
+ * PUBLIC: int __db_name_to_val __P((FN const *, char *));
+ */
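+/*
+ * For example (illustrative): __db_name_to_val(__db_flags_fn, "duplicates")
+ * returns (int)DB_AM_DUP, while an unrecognized name returns -1.
+ */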
+int
+__db_name_to_val(strtable, s)
+ FN const *strtable;
+ char *s;
+{
+ if (s != NULL) {
+ do {
+ if (strcasecmp(strtable->name, s) == 0)
+ return ((int)strtable->mask);
+ } while ((++strtable)->name != NULL);
+ }
+ return (-1);
+}
+
+/*
+ * __db_pagetype_to_string --
+ * Return the name of the specified page type.
+ * PUBLIC: const char *__db_pagetype_to_string __P((u_int32_t));
+ */
+const char *
+__db_pagetype_to_string(type)
+ u_int32_t type;
+{
+ char *s;
+
+ s = NULL;
+ switch (type) {
+ case P_BTREEMETA:
+ s = "btree metadata";
+ break;
+ case P_LDUP:
+ s = "duplicate";
+ break;
+ case P_HASH_UNSORTED:
+ s = "hash unsorted";
+ break;
+ case P_HASH:
+ s = "hash";
+ break;
+ case P_HASHMETA:
+ s = "hash metadata";
+ break;
+ case P_IBTREE:
+ s = "btree internal";
+ break;
+ case P_INVALID:
+ s = "invalid";
+ break;
+ case P_IRECNO:
+ s = "recno internal";
+ break;
+ case P_LBTREE:
+ s = "btree leaf";
+ break;
+ case P_LRECNO:
+ s = "recno leaf";
+ break;
+ case P_OVERFLOW:
+ s = "overflow";
+ break;
+ case P_QAMMETA:
+ s = "queue metadata";
+ break;
+ case P_QAMDATA:
+ s = "queue";
+ break;
+ case P_HEAPMETA:
+ s = "heap metadata";
+ break;
+ case P_HEAP:
+ s = "heap data";
+ break;
+ case P_IHEAP:
+ s = "heap internal";
+ break;
+ default:
+ /* Just return NULL. */
+ break;
+ }
+ return (s);
+}
+
+/*
+ * __db_dump_pp --
+ * DB->dump pre/post processing.
+ *
+ * PUBLIC: int __db_dump_pp __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump_pp(dbp, subname, callback, handle, pflag, keyflag)
+ DB *dbp;
+ const char *subname;
+ int (*callback) __P((void *, const void *));
+ void *handle;
+ int pflag, keyflag;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->dump");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 1)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_dump(dbp, subname, callback, handle, pflag, keyflag);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_dump --
+ * DB->dump.
+ *
+ * PUBLIC: int __db_dump __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump(dbp, subname, callback, handle, pflag, keyflag)
+ DB *dbp;
+ const char *subname;
+ int (*callback) __P((void *, const void *));
+ void *handle;
+ int pflag, keyflag;
+{
+ DBC *dbcp;
+ DBT key, data;
+ DBT keyret, dataret;
+ DB_HEAP_RID rid;
+ ENV *env;
+ db_recno_t recno;
+ int is_recno, is_heap, ret, t_ret;
+ void *pointer;
+
+ env = dbp->env;
+ is_heap = 0;
+
+ if ((ret = __db_prheader(
+ dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0)
+ return (ret);
+
+ /*
+ * Get a cursor and step through the database, printing out each
+ * key/data pair.
+ */
+ if ((ret = __db_cursor(dbp, NULL, NULL, &dbcp, 0)) != 0)
+ return (ret);
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ if ((ret = __os_malloc(env, 1024 * 1024, &data.data)) != 0)
+ goto err;
+ data.ulen = 1024 * 1024;
+ data.flags = DB_DBT_USERMEM;
+ is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE);
+ keyflag = is_recno ? keyflag : 1;
+ if (is_recno) {
+ keyret.data = &recno;
+ keyret.size = sizeof(recno);
+ }
+
+ if (dbp->type == DB_HEAP) {
+ is_heap = 1;
+ key.data = &rid;
+ key.size = key.ulen = sizeof(DB_HEAP_RID);
+ key.flags = DB_DBT_USERMEM;
+ }
+
+retry: while ((ret =
+ __dbc_get(dbcp, &key, &data,
+ !is_heap ? DB_NEXT | DB_MULTIPLE_KEY : DB_NEXT)) == 0) {
+ if (is_heap) {
+ /* Never dump keys for HEAP */
+ if ((ret = __db_prdbt(
+ &data, pflag, " ", handle, callback, 0, 0)) != 0)
+ goto err;
+ continue;
+ }
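+ /* Walk every key/data pair returned by the bulk get. */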
+ DB_MULTIPLE_INIT(pointer, &data);
+ for (;;) {
+ if (is_recno)
+ DB_MULTIPLE_RECNO_NEXT(pointer, &data,
+ recno, dataret.data, dataret.size);
+ else
+ DB_MULTIPLE_KEY_NEXT(pointer, &data,
+ keyret.data, keyret.size,
+ dataret.data, dataret.size);
+
+ if (dataret.data == NULL)
+ break;
+
+ if ((keyflag &&
+ (ret = __db_prdbt(&keyret, pflag, " ",
+ handle, callback, is_recno, 0)) != 0) ||
+ (ret = __db_prdbt(&dataret, pflag, " ",
+ handle, callback, 0, 0)) != 0)
+ goto err;
+ }
+ }
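+ /* The bulk buffer was too small: round it up and retry the get. */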
+ if (ret == DB_BUFFER_SMALL) {
+ data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
+ if ((ret = __os_realloc(env, data.size, &data.data)) != 0)
+ goto err;
+ data.ulen = data.size;
+ goto retry;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if ((t_ret = __db_prfooter(handle, callback)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (data.data != NULL)
+ __os_free(env, data.data);
+
+ return (ret);
+}
+
+/*
+ * __db_prdbt --
+ * Print out a DBT data element.
+ *
+ * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC: int (*)(void *, const void *), int, int));
+ */
+int
+__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap)
+ DBT *dbtp;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ int is_recno;
+ int is_heap;
+{
+ static const u_char hex[] = "0123456789abcdef";
+ db_recno_t recno;
+ DB_HEAP_RID rid;
+ size_t len;
+ int ret;
+#define DBTBUFLEN 100
+ u_int8_t *p, *hp;
+ char buf[DBTBUFLEN], hbuf[DBTBUFLEN];
+
+ /*
+ * !!!
+ * This is the routine that dumps out items in the format
+ * used by db_dump(1) and db_load(1). This means that the format
+ * cannot change.
+ */
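+/*
+ * For example (derived from the loops below): in bytevalue format each byte
+ * becomes two lowercase hex digits, so the two-byte datum {0x01, 'a'} is
+ * written as the line "0161".
+ */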
+ if (prefix != NULL && (ret = callback(handle, prefix)) != 0)
+ return (ret);
+ if (is_recno) {
+ /*
+ * We're printing a record number, and this has to be done
+ * in a platform-independent way. So we use the numeral in
+ * straight ASCII.
+ */
+ (void)__ua_memcpy(&recno, dbtp->data, sizeof(recno));
+ snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno);
+
+ /* If we're printing data as hex, print keys as hex too. */
+ if (!checkprint) {
+ for (len = strlen(buf), p = (u_int8_t *)buf,
+ hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
+ *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
+ *hp++ = hex[*p & 0x0f];
+ }
+ *hp = '\0';
+ ret = callback(handle, hbuf);
+ } else
+ ret = callback(handle, buf);
+
+ if (ret != 0)
+ return (ret);
+ } else if (is_heap) {
+ /*
+ * We're printing a heap record number, and this has to be
+ * done in a platform-independent way. So we use the numeral
+ * in straight ASCII.
+ */
+ (void)__ua_memcpy(&rid, dbtp->data, sizeof(rid));
+ snprintf(buf, DBTBUFLEN, "%lu %hu",
+ (u_long)rid.pgno, (u_short)rid.indx);
+
+ /* If we're printing data as hex, print keys as hex too. */
+ if (!checkprint) {
+ for (len = strlen(buf), p = (u_int8_t *)buf,
+ hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
+ *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
+ *hp++ = hex[*p & 0x0f];
+ }
+ *hp = '\0';
+ ret = callback(handle, hbuf);
+ } else
+ ret = callback(handle, buf);
+
+ if (ret != 0)
+ return (ret);
+ } else if (checkprint) {
+ for (len = dbtp->size, p = dbtp->data; len--; ++p)
+ if (isprint((int)*p)) {
+ if (*p == '\\' &&
+ (ret = callback(handle, "\\")) != 0)
+ return (ret);
+ snprintf(buf, DBTBUFLEN, "%c", *p);
+ if ((ret = callback(handle, buf)) != 0)
+ return (ret);
+ } else {
+ snprintf(buf, DBTBUFLEN, "\\%c%c",
+ hex[(u_int8_t)(*p & 0xf0) >> 4],
+ hex[*p & 0x0f]);
+ if ((ret = callback(handle, buf)) != 0)
+ return (ret);
+ }
+ } else
+ for (len = dbtp->size, p = dbtp->data; len--; ++p) {
+ snprintf(buf, DBTBUFLEN, "%c%c",
+ hex[(u_int8_t)(*p & 0xf0) >> 4],
+ hex[*p & 0x0f]);
+ if ((ret = callback(handle, buf)) != 0)
+ return (ret);
+ }
+
+ return (callback(handle, "\n"));
+}
+
+/*
+ * __db_prheader --
+ * Write out header information in the format expected by db_load.
+ *
+ * PUBLIC: int __db_prheader __P((DB *, const char *, int, int, void *,
+ * PUBLIC: int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
+ DB *dbp;
+ const char *subname;
+ int pflag, keyflag;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+{
+ DBT dbt;
+ DBTYPE dbtype;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t flags, tmp_u_int32;
+ size_t buflen;
+ char *buf;
+ int using_vdp, ret, t_ret, tmp_int;
+#ifdef HAVE_HEAP
+ u_int32_t tmp2_u_int32;
+#endif
+
+ ret = 0;
+ buf = NULL;
+ COMPQUIET(buflen, 0);
+
+ /*
+ * If dbp is NULL, then pip is guaranteed to be non-NULL; we only ever
+ * call __db_prheader with a NULL dbp from one case inside __db_prdbt,
+ * and this is a special subdatabase for "lost" items. In this case
+ * we have a vdp (from which we'll get a pip). In all other cases, we
+ * will have a non-NULL dbp (and vdp may or may not be NULL depending
+ * on whether we're salvaging).
+ */
+ if (dbp == NULL)
+ env = NULL;
+ else
+ env = dbp->env;
+ DB_ASSERT(env, dbp != NULL || vdp != NULL);
+
+ /*
+ * If we've been passed a verifier statistics object, use that; we're
+ * being called in a context where dbp->stat is unsafe.
+ *
+ * Also, the verifier may set the pflag on a per-salvage basis. If so,
+ * respect that.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+ return (ret);
+
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ pflag = 1;
+ using_vdp = 1;
+ } else {
+ pip = NULL;
+ using_vdp = 0;
+ }
+
+ /*
+ * If dbp is NULL, make it a btree. Otherwise, set dbtype to whatever
+ * appropriate type for the specified meta page, or the type of the dbp.
+ */
+ if (dbp == NULL)
+ dbtype = DB_BTREE;
+ else if (using_vdp)
+ switch (pip->type) {
+ case P_BTREEMETA:
+ if (F_ISSET(pip, VRFY_IS_RECNO))
+ dbtype = DB_RECNO;
+ else
+ dbtype = DB_BTREE;
+ break;
+ case P_HASHMETA:
+ dbtype = DB_HASH;
+ break;
+ case P_HEAPMETA:
+ dbtype = DB_HEAP;
+ break;
+ case P_QAMMETA:
+ dbtype = DB_QUEUE;
+ break;
+ default:
+ /*
+ * If the meta page is of a bogus type, it's because
+ * we have a badly corrupt database. (We must be in
+ * the verifier for pip to be non-NULL.) Pretend we're
+ * a Btree and salvage what we can.
+ */
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_VERIFYING));
+ dbtype = DB_BTREE;
+ break;
+ }
+ else
+ dbtype = dbp->type;
+
+ if ((ret = callback(handle, "VERSION=3\n")) != 0)
+ goto err;
+ if (pflag) {
+ if ((ret = callback(handle, "format=print\n")) != 0)
+ goto err;
+ } else if ((ret = callback(handle, "format=bytevalue\n")) != 0)
+ goto err;
+
+ /*
+ * 64 bytes is long enough, as a minimum bound, for any of the
+ * fields besides subname. Subname uses __db_prdbt and therefore
+ * does not need buffer space here.
+ */
+ buflen = 64;
+ if ((ret = __os_malloc(env, buflen, &buf)) != 0)
+ goto err;
+ if (subname != NULL) {
+ snprintf(buf, buflen, "database=");
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ DB_INIT_DBT(dbt, subname, strlen(subname));
+ if ((ret = __db_prdbt(&dbt, 1,
+ NULL, handle, callback, 0, 0)) != 0)
+ goto err;
+ }
+ switch (dbtype) {
+ case DB_BTREE:
+ if ((ret = callback(handle, "type=btree\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_HAS_RECNUMS) ? 1 : 0;
+ else {
+ if ((ret = __db_get_flags(dbp, &flags)) != 0) {
+ __db_err(env, ret, "DB->get_flags");
+ goto err;
+ }
+ tmp_int = F_ISSET(dbp, DB_AM_RECNUM) ? 1 : 0;
+ }
+ if (tmp_int && (ret = callback(handle, "recnum=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_u_int32 = pip->bt_minkey;
+ else
+ if ((ret =
+ __bam_get_bt_minkey(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_bt_minkey");
+ goto err;
+ }
+ if (tmp_u_int32 != 0 && tmp_u_int32 != DEFMINKEYPAGE) {
+ snprintf(buf, buflen,
+ "bt_minkey=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ if ((ret = callback(handle, "type=hash\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = pip->h_ffactor;
+ else
+ if ((ret =
+ __ham_get_h_ffactor(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_ffactor");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "h_ffactor=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = pip->h_nelem;
+ else
+ if ((ret = __ham_get_h_nelem(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_nelem");
+ goto err;
+ }
+ /*
+ * Hash databases have an h_nelem field of 0 or 1; neither
+ * of those values is interesting.
+ */
+ if (tmp_u_int32 > 1) {
+ snprintf(buf, buflen,
+ "h_nelem=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_hash_am(env);
+ goto err;
+#endif
+ case DB_HEAP:
+#ifdef HAVE_HEAP
+ if ((ret = callback(handle, "type=heap\n")) != 0)
+ goto err;
+
+ if ((ret = __heap_get_heapsize(
+ dbp, &tmp_u_int32, &tmp2_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_heapsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "heap_gbytes=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ if (tmp2_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "heap_bytes=%lu\n", (u_long)tmp2_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if ((ret =
+ __heap_get_heap_regionsize(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_heap_regionsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "heap_regionsize=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_heap_am(env);
+ goto err;
+#endif
+ case DB_QUEUE:
+#ifdef HAVE_QUEUE
+ if ((ret = callback(handle, "type=queue\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = vdp->re_len;
+ else
+ if ((ret = __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen, "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)vdp->re_pad;
+ else
+ if ((ret = __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf, buflen, "re_pad=%#x\n", tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = vdp->page_ext;
+ else
+ if ((ret =
+ __qam_get_extentsize(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_q_extentsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "extentsize=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_queue_am(env);
+ goto err;
+#endif
+ case DB_RECNO:
+ if ((ret = callback(handle, "type=recno\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_RRECNO) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_RENUMBER) ? 1 : 0;
+ if (tmp_int != 0 &&
+ (ret = callback(handle, "renumber=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_FIXEDLEN) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 1 : 0;
+ if (tmp_int) {
+ if (using_vdp)
+ tmp_u_int32 = pip->re_len;
+ else
+ if ((ret =
+ __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen,
+ "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)pip->re_pad;
+ else
+ if ((ret =
+ __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf,
+ buflen, "re_pad=%#x\n", (u_int)tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+ break;
+ case DB_UNKNOWN: /* Impossible. */
+ ret = __db_unknown_path(env, "__db_prheader");
+ goto err;
+ }
+
+ if (using_vdp) {
+ if (F_ISSET(pip, VRFY_HAS_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPS))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(pip, VRFY_HAS_COMPRESS))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ /*
+ * !!!
+ * We don't know if the page size was the default if we're
+ * salvaging. It doesn't seem that interesting to have, so
+ * we ignore it for now.
+ */
+ } else {
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ if (!F_ISSET(dbp, DB_AM_PGDEF)) {
+ snprintf(buf, buflen,
+ "db_pagesize=%lu\n", (u_long)dbp->pgsize);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+
+#ifdef HAVE_PARTITION
+ if (dbp != NULL && DB_IS_PARTITIONED(dbp) &&
+ F_ISSET((DB_PARTITION *)dbp->p_internal, PART_RANGE)) {
+ DBT *keys;
+ u_int32_t i;
+
+ if ((ret = __partition_get_keys(dbp, &tmp_u_int32, &keys)) != 0)
+ goto err;
+ if (tmp_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "nparts=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
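+			/* n partitions are separated by n - 1 range keys. */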
+ for (i = 0; i < tmp_u_int32 - 1; i++)
+ if ((ret = __db_prdbt(&keys[i],
+ pflag, " ", handle, callback, 0, 0)) != 0)
+ goto err;
+ }
+ }
+#endif
+
+ if (keyflag && (ret = callback(handle, "keys=1\n")) != 0)
+ goto err;
+
+ ret = callback(handle, "HEADER=END\n");
+
+err: if (using_vdp &&
+ (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+
+ return (ret);
+}
+
+/*
+ * __db_prfooter --
+ * Print the footer that marks the end of a DB dump. This is trivial,
+ * but for consistency's sake we don't want to put its literal contents
+ * in multiple places.
+ *
+ * PUBLIC: int __db_prfooter __P((void *, int (*)(void *, const void *)));
+ */
+int
+__db_prfooter(handle, callback)
+ void *handle;
+ int (*callback) __P((void *, const void *));
+{
+ return (callback(handle, "DATA=END\n"));
+}
+
+/*
+ * __db_pr_callback --
+ * Callback function for using pr_* functions from C.
+ *
+ * PUBLIC: int __db_pr_callback __P((void *, const void *));
+ */
+int
+__db_pr_callback(handle, str_arg)
+ void *handle;
+ const void *str_arg;
+{
+ char *str;
+ FILE *f;
+
+ str = (char *)str_arg;
+ f = (FILE *)handle;
+
+ if (fprintf(f, "%s", str) != (int)strlen(str))
+ return (EIO);
+
+ return (0);
+}
+
+/*
+ * __db_dbtype_to_string --
+ * Return the name of the database type.
+ *
+ * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE));
+ */
+const char *
+__db_dbtype_to_string(type)
+ DBTYPE type;
+{
+ switch (type) {
+ case DB_BTREE:
+ return ("btree");
+ case DB_HASH:
+ return ("hash");
+ case DB_RECNO:
+ return ("recno");
+ case DB_QUEUE:
+ return ("queue");
+ case DB_HEAP:
+ return ("heap");
+ case DB_UNKNOWN:
+ default:
+ break;
+ }
+ return ("UNKNOWN TYPE");
+}
diff --git a/src/db/db_rec.c b/src/db/db_rec.c
new file mode 100644
index 00000000..8ba1124e
--- /dev/null
+++ b/src/db/db_rec.c
@@ -0,0 +1,2796 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/lock.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+
+static int __db_pg_free_recover_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_args *, DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+static int __db_pg_free_recover_42_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_42_args *,
+ DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+
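+/*
+ * A note on the LSN checks throughout this file: cmp_p compares a page's
+ * current LSN with the LSN the log record says the page had before the
+ * change, so cmp_p == 0 during roll-forward means the change has not yet
+ * been applied; cmp_n compares the page's LSN with this record's own LSN,
+ * so cmp_n == 0 during roll-back means the change is on the page and must
+ * be undone.
+ */
+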
+/*
+ * PUBLIC: int __db_addrem_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * This log message is generated whenever we add or remove a duplicate
+ * to/from a duplicate page. On recovery, we just do the opposite.
+ */
+int
+__db_addrem_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_addrem_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t opcode;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_addrem_print);
+ REC_INTRO(__db_addrem_read, ip, 1);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+
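+	/* Extract the operation from the packed opcode field. */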
+ opcode = OP_MODE_GET(argp->opcode);
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_DUP) ||
+ (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_DUP)) {
+ /* Need to redo an add, or undo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
+ argp->hdr.size == 0 ? NULL : &argp->hdr,
+ argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+
+ } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_DUP) ||
+ (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_DUP)) {
+ /* Need to undo an add, or redo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_ditem(dbc,
+ pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
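+	/*
+	 * If we changed the page, roll its LSN forward to this record on
+	 * redo, or back to the logged pre-update LSN on undo.
+	 */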
+ if (modified) {
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * PUBLIC: int __db_addrem_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * This log message is generated whenever we add or remove a duplicate
+ * to/from a duplicate page. On recovery, we just do the opposite.
+ */
+int
+__db_addrem_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_addrem_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_addrem_print);
+ REC_INTRO(__db_addrem_42_read, ip, 1);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_DUP) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_DUP)) {
+ /* Need to redo an add, or undo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
+ argp->hdr.size == 0 ? NULL : &argp->hdr,
+ argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_DUP) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_DUP)) {
+ /* Need to undo an add, or redo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_ditem(dbc,
+ pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
+ if (modified) {
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * PUBLIC: int __db_big_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_big_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t opcode;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_big_print);
+ REC_INTRO(__db_big_read, ip, 0);
+
+ opcode = OP_MODE_GET(argp->opcode);
+ REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
+ modified = 0;
+
+ /*
+	 * There are three pages we need to check: the one on which we are
+	 * adding data, the previous one whose next_pointer may have
+ * been updated, and the next one whose prev_pointer may have
+ * been updated.
+ */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_BIG) ||
+ (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_BIG)) {
+ /* We are either redo-ing an add, or undoing a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
+ argp->next_pgno, 0, P_OVERFLOW);
+ OV_LEN(pagep) = argp->dbt.size;
+ OV_REF(pagep) = 1;
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
+ argp->dbt.size);
+ PREV_PGNO(pagep) = argp->prev_pgno;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_BIG) ||
+ (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_BIG)) {
+ /*
+ * We are either undo-ing an add or redo-ing a delete.
+ * The page is about to be reclaimed in either case, so
+ * there really isn't anything to do here.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ modified = 1;
+ } else if (cmp_p == 0 && DB_REDO(op) && opcode == DB_APPEND_BIG) {
+ /* We are redoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
+ OV_LEN(pagep) += argp->dbt.size;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op) && opcode == DB_APPEND_BIG) {
+ /* We are undoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_LEN(pagep) -= argp->dbt.size;
+ memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), 0, argp->dbt.size);
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+
+ /*
+ * We only delete a whole chain of overflow items, and appends only
+ * apply to a single page. Adding a page is the only case that
+ * needs to update the chain.
+ */
+ppage: if (opcode != DB_ADD_BIG)
+ goto done;
+
+ /* Now check the previous page. */
+ if (argp->prev_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_BIG) {
+ /* Redo add, undo delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ } else if (cmp_n == 0 &&
+ DB_UNDO(op) && opcode == DB_ADD_BIG) {
+ /* Redo delete, undo add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->next_pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+ /* Now check the next page. Can only be set on a delete. */
+npage: if (argp->next_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = PGNO_INVALID;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * PUBLIC: int __db_big_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_big_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_big_print);
+ REC_INTRO(__db_big_42_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
+ modified = 0;
+
+ /*
+	 * There are three pages we need to check: the one on which we are
+	 * adding data, the previous one whose next_pointer may have
+ * been updated, and the next one whose prev_pointer may have
+ * been updated.
+ */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
+ /* We are either redo-ing an add, or undoing a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
+ argp->next_pgno, 0, P_OVERFLOW);
+ OV_LEN(pagep) = argp->dbt.size;
+ OV_REF(pagep) = 1;
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
+ argp->dbt.size);
+ PREV_PGNO(pagep) = argp->prev_pgno;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
+ /*
+ * We are either undo-ing an add or redo-ing a delete.
+ * The page is about to be reclaimed in either case, so
+ * there really isn't anything to do here.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ modified = 1;
+ } else if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_APPEND_BIG) {
+ /* We are redoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
+ OV_LEN(pagep) += argp->dbt.size;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_APPEND_BIG) {
+ /* We are undoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_LEN(pagep) -= argp->dbt.size;
+ memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), 0, argp->dbt.size);
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+
+ /*
+ * We only delete a whole chain of overflow items, and appends only
+ * apply to a single page. Adding a page is the only case that
+ * needs to update the chain.
+ */
+ppage: if (argp->opcode != DB_ADD_BIG)
+ goto done;
+
+ /* Now check the previous page. */
+ if (argp->prev_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
+ /* Redo add, undo delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ } else if (cmp_n == 0 &&
+ DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
+ /* Redo delete, undo add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->next_pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+ /* Now check the next page. Can only be set on a delete. */
+npage: if (argp->next_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = PGNO_INVALID;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_ovref_recover --
+ * Recovery function for __db_ovref().
+ *
+ * PUBLIC: int __db_ovref_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_ovref_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_ovref_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_ovref_print);
+ REC_INTRO(__db_ovref_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp, &LSN(pagep), &argp->lsn);
+ if (cmp == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_REF(pagep) += argp->adjust;
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_REF(pagep) -= argp->adjust;
+ pagep->lsn = argp->lsn;
+ }
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_debug_recover --
+ * Recovery function for debug.
+ *
+ * PUBLIC: int __db_debug_recover __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_debug_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_debug_args *argp;
+ int ret;
+
+ COMPQUIET(op, DB_TXN_ABORT);
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__db_debug_print);
+ REC_NOOP_INTRO(__db_debug_read);
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __db_noop_recover --
+ *	Recovery function for noop records, which log no change other
+ *	than a page LSN update.
+ *
+ * PUBLIC: int __db_noop_recover __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_noop_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_noop_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_noop_print);
+ REC_INTRO(__db_noop_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->prevlsn;
+ }
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf,
+ ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __db_pg_alloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_alloc_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, created, level, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ pagep = NULL;
+ created = 0;
+ REC_PRINT(__db_pg_alloc_print);
+ REC_INTRO(__db_pg_alloc_read, ip, 0);
+
+ /*
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = *lsnp;
+ meta->free = argp->next;
+ if (argp->pgno > meta->last_pgno)
+ meta->last_pgno = argp->pgno;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = argp->meta_lsn;
+ /*
+		 * If the page has a zero LSN then it was newly created and
+		 * will be truncated rather than put on the free list.
+ */
+ if (!IS_ZERO_LSN(argp->page_lsn))
+ meta->free = argp->pgno;
+ meta->last_pgno = argp->last_pgno;
+ }
+
+#ifdef HAVE_FTRUNCATE
+ /*
+	 * Check to see if we are keeping a sorted freelist; if so, put
+	 * this page back in the in-memory list. It must be the first element.
+ */
+ if (op == DB_TXN_ABORT && !IS_ZERO_LSN(argp->page_lsn)) {
+ db_pgno_t *list;
+ u_int32_t nelem;
+
+ if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+ goto out;
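+		/* Insert the page at the head unless it is already there. */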
+ if (list != NULL && (nelem == 0 || *list != argp->pgno)) {
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+ goto out;
+ if (nelem != 0)
+ memmove(list + 1, list, nelem * sizeof(*list));
+ *list = argp->pgno;
+ }
+ }
+#endif
+
+ /*
+ * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it, then don't create it.
+ * Otherwise if we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ /*
+ * We have to be able to identify if a page was newly
+ * created so we can recover it properly. We cannot simply
+ * look for an empty header, because hash uses a pgin
+ * function that will set the header. Instead, we explicitly
+ * try for the page without CREATE and if that fails, then
+ * create it.
+ */
+ if (DB_UNDO(op))
+ goto do_truncate;
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE, &pagep)) != 0) {
+ if (DB_UNDO(op) && ret == ENOSPC)
+ goto do_truncate;
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ created = 1;
+ }
+
+ /* Fix up the allocated page. */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+ /*
+ * If an initial allocation is aborted and then reallocated during
+ * an archival restore the log record will have an LSN for the page
+ * but the page will be empty.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)))
+ cmp_p = 0;
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ /*
+ * Another special case we have to handle is if we ended up with a
+ * page of all 0's which can happen if we abort between allocating a
+ * page in mpool and initializing it. In that case, even if we're
+ * undoing, we need to re-initialize the page.
+ */
+ if (DB_REDO(op) && cmp_p == 0) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ switch (argp->ptype) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ level = LEAFLEVEL;
+ break;
+ default:
+ level = 0;
+ break;
+ }
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ } else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+ /*
+ * This is where we handle the case of a 0'd page (pagep->pgno
+ * is equal to PGNO_INVALID).
+ * Undo the allocation, reinitialize the page and
+ * link its next pointer to the free list.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ }
+
+do_truncate:
+ /*
+ * If the page was newly created, give it back.
+ */
+ if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+ IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+ /* Discard the page. */
+ if (pagep != NULL) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+ /* Give the page back to the OS. */
+ if (meta->last_pgno <= argp->pgno && (ret = __memp_ftruncate(
+ mpf, NULL, ip, argp->pgno, MP_TRUNC_RECOVER)) != 0)
+ goto out;
+ }
+
+ if (pagep != NULL) {
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+
+ ret = __memp_fput(mpf, ip, meta, file_dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_int --
+ */
+static int
+__db_pg_free_recover_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __db_pg_freedata_args *argp;
+ DB *file_dbp;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ db_recops op;
+ int data;
+{
+ DBMETA *meta;
+ DB_LSN copy_lsn;
+ PAGE *pagep, *prevp;
+ int cmp_n, cmp_p, is_meta, ret;
+
+ meta = NULL;
+ pagep = prevp = NULL;
+
+ /*
+ * Get the "metapage". This will either be the metapage
+ * or the previous page in the free list if we are doing
+ * sorted allocations. If its a previous page then
+ * we will not be truncating.
+ */
+ is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+ REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+ if (argp->meta_pgno != PGNO_BASE_MD)
+ prevp = (PAGE *)meta;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN, last and free pointer.
+ */
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ /*
+ * If we are at the end of the file truncate, otherwise
+ * put on the free list.
+ */
+#ifdef HAVE_FTRUNCATE
+ if (argp->pgno == argp->last_pgno)
+ meta->last_pgno = argp->pgno - 1;
+ else
+#endif
+ if (is_meta)
+ meta->free = argp->pgno;
+ else
+ NEXT_PGNO(prevp) = argp->pgno;
+ LSN(meta) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ if (is_meta) {
+ if (meta->last_pgno < argp->pgno)
+ meta->last_pgno = argp->pgno;
+ meta->free = argp->next;
+ } else
+ NEXT_PGNO(prevp) = argp->next;
+ LSN(meta) = argp->meta_lsn;
+ }
+
+check_meta:
+ if (ret != 0 && is_meta) {
+ /* The metadata page must always exist. */
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ }
+
+ /*
+ * Get the freed page. Don't create the page if we are going to
+ * free it. If we're redoing the operation we get the page and
+ * explicitly discard its contents, then update its LSN. If we're
+ * undoing the operation, we get the page and restore its header.
+ */
+ if (DB_REDO(op) || (is_meta && meta->last_pgno < argp->pgno)) {
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+#ifdef HAVE_FTRUNCATE
+ if (is_meta &&
+ DB_REDO(op) && meta->last_pgno <= argp->pgno)
+ goto trunc;
+#endif
+ goto done;
+ }
+ } else if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
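+	/*
+	 * Treat a zero page LSN as matching this record so that a freshly
+	 * created page can still be restored on undo.
+	 */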
+ cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+ /*
+ * This page got extended by a later allocation,
+ * but its allocation was not in the scope of this
+ * recovery pass.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)))
+ cmp_p = 0;
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ /*
+	 * We need to check that the page could have the current LSN,
+	 * which was copied before it was truncated, in addition to
+	 * the usual check for the previous LSN.
+ */
+ if (DB_REDO(op) &&
+ (cmp_p == 0 || cmp_n == 0 ||
+ (IS_ZERO_LSN(copy_lsn) &&
+ LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo the deallocation. */
+ /*
+ * The page can be truncated if it was truncated at runtime
+ * and the current metapage reflects the truncation.
+ */
+#ifdef HAVE_FTRUNCATE
+ if (is_meta && meta->last_pgno <= argp->pgno &&
+ argp->last_pgno <= argp->pgno) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+trunc: if ((ret = __memp_ftruncate(mpf, NULL, ip,
+ argp->pgno, MP_TRUNC_RECOVER)) != 0)
+ goto out;
+ } else if (argp->last_pgno == argp->pgno) {
+ /* The page was truncated at runtime, zero it out. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, 0, PGNO_INVALID,
+ PGNO_INVALID, PGNO_INVALID, 0, P_INVALID);
+ ZERO_LSN(pagep->lsn);
+ } else
+#endif
+ if (cmp_p == 0 || IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+
+ }
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to reallocate the page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->header.data, argp->header.size);
+ if (data)
+ memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+ argp->data.data, argp->data.size);
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+ pagep = NULL;
+#ifdef HAVE_FTRUNCATE
+ /*
+	 * If we are keeping an in-memory free list, remove this
+	 * element from the list.
+ */
+ if (op == DB_TXN_ABORT && argp->pgno != argp->last_pgno) {
+ db_pgno_t *lp;
+ u_int32_t nelem, pos;
+
+ if ((ret = __memp_get_freelist(mpf, &nelem, &lp)) != 0)
+ goto out;
+ if (lp != NULL) {
+ pos = 0;
+ if (!is_meta) {
+ __db_freelist_pos(argp->pgno, lp, nelem, &pos);
+
+ /*
+ * If we aborted after logging but before
+ * updating the free list don't do anything.
+ */
+ if (argp->pgno != lp[pos]) {
+ DB_ASSERT(env,
+ argp->meta_pgno == lp[pos]);
+ goto done;
+ }
+ DB_ASSERT(env,
+ argp->meta_pgno == lp[pos - 1]);
+ } else if (nelem != 0 && argp->pgno != lp[pos])
+ goto done;
+
+ if (pos < nelem)
+ memmove(&lp[pos], &lp[pos + 1],
+ ((nelem - pos) - 1) * sizeof(*lp));
+
+ /* Shrink the list */
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem - 1, &lp)) != 0)
+ goto out;
+ }
+ }
+#endif
+done:
+ if (meta != NULL &&
+ (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+ return (ret);
+}
+
+/*
+ * __db_pg_free_recover --
+ * Recovery function for pg_free.
+ *
+ * PUBLIC: int __db_pg_free_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_free_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_free_print);
+ REC_INTRO(__db_pg_free_read, ip, 0);
+
+ if ((ret = __db_pg_free_recover_int(env, ip,
+ (__db_pg_freedata_args *)argp, file_dbp, lsnp, mpf, op, 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_recover --
+ * Recovery function for pg_freedata.
+ *
+ * PUBLIC: int __db_pg_freedata_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_freedata_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_freedata_print);
+ REC_INTRO(__db_pg_freedata_read, ip, 0);
+
+ if ((ret = __db_pg_free_recover_int(env,
+ ip, argp, file_dbp, lsnp, mpf, op, 1)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_cksum_recover --
+ * Recovery function for checksum failure log record.
+ *
+ * PUBLIC: int __db_cksum_recover __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_cksum_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_cksum_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
+
+ REC_PRINT(__db_cksum_print);
+
+ if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * We had a checksum failure -- the only option is to run catastrophic
+ * recovery.
+ */
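+	/* Under catastrophic (fatal) recovery this record is expected. */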
+ if (F_ISSET(env, ENV_RECOVER_FATAL))
+ ret = 0;
+ else {
+ __db_errx(env, DB_STR("0642",
+ "Checksum failure requires catastrophic recovery"));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ }
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * __db_pg_init_recover --
+ * Recovery function to reinit pages after truncation.
+ *
+ * PUBLIC: int __db_pg_init_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_init_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_init_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret, type;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_init_print);
+ REC_INTRO(__db_pg_init_read, ip, 0);
+
+ mpf = file_dbp->mpf;
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ /*
+ * This page was truncated and may simply not have
+ * had an item written to it yet. This should only
+ * happen on hash databases, so confirm that.
+ */
+ DB_ASSERT(env, file_dbp->type == DB_HASH);
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
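+	/*
+	 * On redo, reinitialize the page as empty, preserving a hash page's
+	 * type; any other page becomes a leaf of the access method's type.
+	 */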
+ if (cmp_p == 0 && DB_REDO(op)) {
+ if (TYPE(pagep) == P_HASH)
+ type = P_HASH;
+ else
+ type = file_dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, PGNO(pagep), PGNO_INVALID,
+ PGNO_INVALID, TYPE(pagep) == P_HASH ? 0 : 1, type);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Put the data back on the page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->header.data, argp->header.size);
+ if (argp->data.size > 0)
+ memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+ argp->data.data, argp->data.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_trunc_recover --
+ * Recovery function for pg_trunc.
+ *
+ * PUBLIC: int __db_pg_trunc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_trunc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+#ifdef HAVE_FTRUNCATE
+ __db_pg_trunc_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pglist_t *pglist, *lp;
+ db_pgno_t last_pgno, *list;
+ u_int32_t felem, nelem, pos;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_trunc_print);
+ REC_INTRO(__db_pg_trunc_read, ip, 1);
+
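+	/* The log record carries the list of freed pages being truncated. */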
+ pglist = (db_pglist_t *) argp->list.data;
+ nelem = argp->list.size / sizeof(db_pglist_t);
+ if (DB_REDO(op)) {
+ /*
+ * First call __db_pg_truncate to find the truncation
+ * point, truncate the file and return the new last_pgno.
+ */
+ last_pgno = argp->last_pgno;
+ if ((ret = __db_pg_truncate(dbc, NULL, pglist,
+ NULL, &nelem, argp->next_free, &last_pgno, lsnp, 1)) != 0)
+ goto out;
+
+ if (argp->last_free != PGNO_INVALID) {
+ /*
+ * Update the next pointer of the last page in
+			 * the freelist. If the truncation point is
+			 * beyond next_free then this page is still in the
+			 * freelist; otherwise the last_free page is at the end.
+ */
+ if ((ret = __memp_fget(mpf,
+ &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta),
+ &argp->last_lsn) == 0) {
+ REC_DIRTY(mpf,
+ ip, dbc->priority, &meta);
+ if (pglist->pgno > last_pgno)
+ NEXT_PGNO(meta) = PGNO_INVALID;
+ else
+ NEXT_PGNO(meta) = pglist->pgno;
+ LSN(meta) = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+ 0, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ if (argp->last_free == PGNO_INVALID) {
+ if (nelem == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = pglist->pgno;
+ }
+ /*
+			 * If this is part of a multi-record truncate,
+			 * this could be just the last page of this record;
+			 * don't move the meta->last_pgno forward.
+ */
+ if (meta->last_pgno > last_pgno)
+ meta->last_pgno = last_pgno;
+ LSN(meta) = *lsnp;
+ }
+ } else {
+ /* Put the free list back in its original order. */
+ for (lp = pglist; lp < &pglist[nelem]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+ NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ if (IS_ZERO_LSN(LSN(pagep)) ||
+ LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+ PGNO_INVALID, lp->next_pgno, 0, P_INVALID);
+ LSN(pagep) = lp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ }
+ /*
+ * Link the truncated part back into the free list.
+		 * It's either after the last_free page or directly
+ * linked to the metadata page.
+ */
+ if (argp->last_free != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->last_free,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ NEXT_PGNO(meta) = argp->next_free;
+ LSN(meta) = argp->last_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ meta = NULL;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ /*
+			 * If we had to break up the list, last_pgno
+			 * may represent only the end of this block.
+ */
+ if (meta->last_pgno < argp->last_pgno)
+ meta->last_pgno = argp->last_pgno;
+ if (argp->last_free == PGNO_INVALID)
+ meta->free = argp->next_free;
+ LSN(meta) = argp->meta_lsn;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+
+ if (op == DB_TXN_ABORT) {
+ /*
+		 * Put the pages back on the in-memory free list.
+		 * If this is part of a multi-record truncate then
+		 * we need to find this batch; it may not be at the end.
+ * If we aborted while writing one of the log records
+ * then this set may still be in the list.
+ */
+ if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+ goto out;
+ if (list != NULL) {
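+			/*
+			 * Find where this batch belongs in the sorted list;
+			 * skip the insert if the batch is already present.
+			 */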
+ if (felem != 0 && list[felem - 1] > pglist->pgno) {
+ __db_freelist_pos(
+ pglist->pgno, list, felem, &pos);
+ DB_ASSERT(env, pos < felem);
+ if (pglist->pgno == list[pos])
+ goto done;
+ pos++;
+ } else if (felem != 0 &&
+ list[felem - 1] == pglist->pgno)
+ goto done;
+ else
+ pos = felem;
+ if ((ret = __memp_extend_freelist(
+ mpf, felem + nelem, &list)) != 0)
+ goto out;
+ if (pos != felem)
+ memmove(&list[nelem + pos], &list[pos],
+ sizeof(*list) * (felem - pos));
+ for (lp = pglist; lp < &pglist[nelem]; lp++)
+ list[pos++] = lp->pgno;
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+#else
+ /*
+ * If HAVE_FTRUNCATE is not defined, we'll never see pg_trunc records
+ * to recover.
+ */
+ COMPQUIET(env, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
+ COMPQUIET(info, NULL);
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __db_realloc_recover --
+ * Recovery function for realloc.
+ *
+ * PUBLIC: int __db_realloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_realloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_realloc_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ db_pglist_t *pglist, *lp;
+#ifdef HAVE_FTRUNCATE
+ db_pgno_t *list;
+ u_int32_t felem, pos;
+#endif
+ u_int32_t nelem;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+
+ REC_PRINT(__db_realloc_print);
+ REC_INTRO(__db_realloc_read, ip, 1);
+ mpf = file_dbp->mpf;
+
+ /*
+ * First, iterate over all the pages and make sure they are all in
+ * their prior or new states (according to the op).
+ */
+ pglist = (db_pglist_t *) argp->list.data;
+ nelem = argp->list.size / sizeof(db_pglist_t);
+ for (lp = pglist; lp < &pglist[nelem]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+ NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ if (DB_REDO(op) && LOG_COMPARE(&LSN(pagep), &lp->lsn) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+ PGNO_INVALID, PGNO_INVALID, 0, argp->ptype);
+ LSN(pagep) = *lsnp;
+ } else if (DB_UNDO(op) && (IS_ZERO_LSN(LSN(pagep)) ||
+ LOG_COMPARE(&LSN(pagep), lsnp) == 0)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+ PGNO_INVALID, lp->next_pgno, 0, P_INVALID);
+ LSN(pagep) = lp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ }
+
+ /* Now, fix up the free list. */
+ if ((ret = __memp_fget(mpf,
+ &argp->prev_pgno, ip, NULL, 0, &pagep)) != 0)
+ goto out;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (DB_REDO(op) && cmp_p == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->prev_pgno == PGNO_BASE_MD)
+ ((DBMETA *)pagep)->free = argp->next_free;
+ else
+ NEXT_PGNO(pagep) = argp->next_free;
+ LSN(pagep) = *lsnp;
+ } else if (DB_UNDO(op) && cmp_n == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->prev_pgno == PGNO_BASE_MD)
+ ((DBMETA *)pagep)->free = pglist->pgno;
+ else
+ NEXT_PGNO(pagep) = pglist->pgno;
+ LSN(pagep) = argp->page_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+#ifdef HAVE_FTRUNCATE
+ if (op == DB_TXN_ABORT) {
+ /* Put the pages back in the sorted list. */
+ if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+ goto out;
+ if (list != NULL) {
+ __db_freelist_pos(pglist->pgno, list, felem, &pos);
+ if (pglist->pgno == list[pos])
+ goto done;
+ if ((ret = __memp_extend_freelist(
+ mpf, felem + nelem, &list)) != 0)
+ goto out;
+ pos++;
+ if (pos != felem)
+ memmove(&list[pos+nelem],
+ &list[pos], nelem * sizeof(*list));
+ for (lp = pglist; lp < &pglist[nelem]; lp++)
+ list[pos++] = lp->pgno;
+ }
+ }
+#endif
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __db_pg_sort_44_recover --
+ * Recovery function for pg_sort.
+ * This is deprecated and kept for replication upgrades.
+ *
+ * PUBLIC: int __db_pg_sort_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_sort_44_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+#ifdef HAVE_FTRUNCATE
+ __db_pg_sort_44_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pglist_t *pglist, *lp;
+ db_pgno_t pgno, *list;
+ u_int32_t felem, nelem;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_sort_44_print);
+ REC_INTRO(__db_pg_sort_44_read, ip, 1);
+
+ pglist = (db_pglist_t *) argp->list.data;
+ nelem = argp->list.size / sizeof(db_pglist_t);
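+	/* On redo, sort the logged list and re-apply the truncation. */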
+ if (DB_REDO(op)) {
+ pgno = argp->last_pgno;
+ __db_freelist_sort(pglist, nelem);
+ if ((ret = __db_pg_truncate(dbc, NULL,
+ pglist, NULL, &nelem, PGNO_INVALID, &pgno, lsnp, 1)) != 0)
+ goto out;
+
+ if (argp->last_free != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf,
+ &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta),
+ &argp->last_lsn) == 0) {
+ REC_DIRTY(mpf,
+ ip, dbc->priority, &meta);
+ NEXT_PGNO(meta) = PGNO_INVALID;
+ LSN(meta) = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+ 0, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ if (argp->last_free == PGNO_INVALID) {
+ if (nelem == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = pglist->pgno;
+ }
+ meta->last_pgno = pgno;
+ LSN(meta) = *lsnp;
+ }
+ } else {
+ /* Put the free list back in its original order. */
+ for (lp = pglist; lp < &pglist[nelem]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+ NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ if (IS_ZERO_LSN(LSN(pagep)) ||
+ LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (lp == &pglist[nelem - 1])
+ pgno = PGNO_INVALID;
+ else
+ pgno = lp[1].pgno;
+
+ P_INIT(pagep, file_dbp->pgsize,
+ lp->pgno, PGNO_INVALID, pgno, 0, P_INVALID);
+ LSN(pagep) = lp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ }
+ if (argp->last_free != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->last_free,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ NEXT_PGNO(meta) = pglist->pgno;
+ LSN(meta) = argp->last_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ meta = NULL;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->last_pgno = argp->last_pgno;
+ if (argp->last_free == PGNO_INVALID)
+ meta->free = pglist->pgno;
+ LSN(meta) = argp->meta_lsn;
+ }
+ }
+ if (op == DB_TXN_ABORT) {
+ if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+ goto out;
+ if (list != NULL) {
+ DB_ASSERT(env, felem == 0 ||
+ argp->last_free == list[felem - 1]);
+ if ((ret = __memp_extend_freelist(
+ mpf, felem + nelem, &list)) != 0)
+ goto out;
+ for (lp = pglist; lp < &pglist[nelem]; lp++)
+ list[felem++] = lp->pgno;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+#else
+ /*
+ * If HAVE_FTRUNCATE is not defined, we'll never see pg_sort records
+ * to recover.
+ */
+ COMPQUIET(env, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
+ COMPQUIET(info, NULL);
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __db_pg_alloc_42_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __db_pg_alloc_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_alloc_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, created, level, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ pagep = NULL;
+ created = 0;
+ REC_PRINT(__db_pg_alloc_42_print);
+ REC_INTRO(__db_pg_alloc_42_read, ip, 0);
+
+ /*
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = *lsnp;
+ meta->free = argp->next;
+ if (argp->pgno > meta->last_pgno)
+ meta->last_pgno = argp->pgno;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ goto no_rollback;
+ }
+
+ /*
+ * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it, then don't create it.
+ * Otherwise if we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list, or truncate it.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ /*
+ * We have to be able to identify if a page was newly
+ * created so we can recover it properly. We cannot simply
+ * look for an empty header, because hash uses a pgin
+ * function that will set the header. Instead, we explicitly
+ * try for the page without CREATE and if that fails, then
+ * create it.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ if (DB_UNDO(op) && ret == ENOSPC)
+ goto do_truncate;
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ created = 1;
+ }
+
+ /* Fix up the allocated page. */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+ /*
+ * If an initial allocation is aborted and then reallocated during
+ * an archival restore the log record will have an LSN for the page
+ * but the page will be empty.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) ||
+ (IS_ZERO_LSN(argp->page_lsn) && IS_INIT_LSN(LSN(pagep))))
+ cmp_p = 0;
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ /*
+ * Another special case we have to handle is if we ended up with a
+ * page of all 0's which can happen if we abort between allocating a
+ * page in mpool and initializing it. In that case, even if we're
+ * undoing, we need to re-initialize the page.
+ */
+ if (DB_REDO(op) && cmp_p == 0) {
+ /* Need to redo update described. */
+ switch (argp->ptype) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ level = LEAFLEVEL;
+ break;
+ default:
+ level = 0;
+ break;
+ }
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ } else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+ /*
+ * This is where we handle the case of a 0'd page (pagep->pgno
+ * is equal to PGNO_INVALID).
+ * Undo the allocation, reinitialize the page and
+ * link its next pointer to the free list.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ }
+
+do_truncate:
+ /*
+	 * We cannot undo things from 4.2 land, because we no longer
+	 * have limbo processing.
+ */
+ if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+ IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+no_rollback: __db_errx(env, DB_STR("0643",
+"Cannot replicate prepared transactions from master running release 4.2 "));
+ ret = __env_panic(env, EINVAL);
+ }
+
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_42_int --
+ */
+static int
+__db_pg_free_recover_42_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __db_pg_freedata_42_args *argp;
+ DB *file_dbp;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ db_recops op;
+ int data;
+{
+ DBMETA *meta;
+ DB_LSN copy_lsn;
+ PAGE *pagep, *prevp;
+ int cmp_n, cmp_p, is_meta, ret;
+
+ meta = NULL;
+ pagep = NULL;
+ prevp = NULL;
+
+ /*
+	 * Get the "metapage". This will either be the metapage or the
+	 * previous page in the free list if we are doing sorted
+	 * allocations. If it's a previous page, then we will not be
+	 * truncating.
+ */
+ is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+ REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+ if (argp->meta_pgno != PGNO_BASE_MD)
+ prevp = (PAGE *)meta;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN, last and free pointer.
+ */
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ if (prevp == NULL)
+ meta->free = argp->pgno;
+ else
+ NEXT_PGNO(prevp) = argp->pgno;
+ /*
+ * If this was a compensating transaction and
+ * we are a replica, then we never executed the
+ * original allocation which incremented meta->free.
+ */
+ if (prevp == NULL && meta->last_pgno < meta->free)
+ meta->last_pgno = meta->free;
+ LSN(meta) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ if (prevp == NULL)
+ meta->free = argp->next;
+ else
+ NEXT_PGNO(prevp) = argp->next;
+ LSN(meta) = argp->meta_lsn;
+ if (prevp == NULL && meta->last_pgno < argp->pgno)
+ meta->last_pgno = argp->pgno;
+ }
+
+check_meta:
+ if (ret != 0 && is_meta) {
+ /* The metadata page must always exist. */
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ }
+
+ /*
+ * Get the freed page. If we support truncate then don't
+ * create the page if we are going to free it. If we're
+ * redoing the operation we get the page and explicitly discard
+ * its contents, then update its LSN. If we're undoing the
+ * operation, we get the page and restore its header.
+ * If we don't support truncate, then we must create the page
+ * and roll it back.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+
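+	/*
+	 * The page header logged in the record need not be properly
+	 * aligned, so pull its LSN out with an unaligned copy.
+	 */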
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+ cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ if (DB_REDO(op) &&
+ (cmp_p == 0 ||
+ (IS_ZERO_LSN(copy_lsn) &&
+ LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to reallocate the page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->header.data, argp->header.size);
+ if (data)
+ memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+ argp->data.data, argp->data.size);
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+ pagep = NULL;
+ if (meta != NULL &&
+ (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+ return (ret);
+}
+
+/*
+ * __db_pg_free_42_recover --
+ * Recovery function for pg_free.
+ *
+ * PUBLIC: int __db_pg_free_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_free_42_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_free_42_print);
+ REC_INTRO(__db_pg_free_42_read, ip, 0);
+
+ ret = __db_pg_free_recover_42_int(env, ip,
+ (__db_pg_freedata_42_args *)argp, file_dbp, lsnp, mpf, op, 0);
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_42_recover --
+ * Recovery function for pg_freedata.
+ *
+ * PUBLIC: int __db_pg_freedata_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_freedata_42_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_freedata_42_print);
+ REC_INTRO(__db_pg_freedata_42_read, ip, 0);
+
+ ret = __db_pg_free_recover_42_int(
+ env, ip, argp, file_dbp, lsnp, mpf, op, 1);
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_relink_42_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __db_relink_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_relink_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_relink_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_relink_42_print);
+ REC_INTRO(__db_relink_42_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ goto next2;
+ }
+ if (argp->opcode == DB_ADD_PAGE_COMPAT)
+ goto next1;
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ pagep->prev_pgno = argp->prev;
+ pagep->lsn = argp->lsn;
+ }
+next1: if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, argp->next, ret);
+ goto out;
+ }
+ goto prev;
+ }
+ modified = 0;
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ if ((argp->opcode == DB_REM_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op)) ||
+ (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_n == 0 && DB_UNDO(op))) {
+ /* Redo the remove or undo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev;
+ modified = 1;
+ } else if ((argp->opcode == DB_REM_PAGE_COMPAT &&
+ cmp_n == 0 && DB_UNDO(op)) ||
+ (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op))) {
+ /* Undo the remove or redo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_next;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ if (argp->opcode == DB_ADD_PAGE_COMPAT)
+ goto done;
+
+prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, argp->prev, ret);
+ goto out;
+ }
+ goto done;
+ }
+ modified = 0;
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ modified = 1;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_prev;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_relink_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __db_relink_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_relink_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_relink_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_relink_print);
+ REC_INTRO(__db_relink_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if (argp->next_pgno == PGNO_INVALID)
+ goto prev;
+ if ((ret = __memp_fget(mpf,
+ &argp->next_pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->next_pgno, ret);
+ goto out;
+ } else
+ goto prev;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
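+	/*
+	 * A new_pgno of PGNO_INVALID means the page is simply being
+	 * removed from the chain; otherwise it is being replaced by
+	 * new_pgno.
+	 */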
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the remove or replace. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->new_pgno == PGNO_INVALID)
+ pagep->prev_pgno = argp->prev_pgno;
+ else
+ pagep->prev_pgno = argp->new_pgno;
+
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the remove or replace. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+
+ pagep->lsn = argp->lsn_next;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+prev: if (argp->prev_pgno == PGNO_INVALID)
+ goto done;
+ if ((ret = __memp_fget(mpf,
+ &argp->prev_pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->prev_pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->new_pgno == PGNO_INVALID)
+ pagep->next_pgno = argp->next_pgno;
+ else
+ pagep->next_pgno = argp->new_pgno;
+
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ pagep->lsn = argp->lsn_prev;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_merge_recover --
+ * Recovery function for merge.
+ *
+ * PUBLIC: int __db_merge_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_merge_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_merge_args *argp;
+ BTREE *bt;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LOCK handle_lock;
+ DB_LOCKREQ request;
+ DB_MPOOLFILE *mpf;
+ HASH *ht;
+ PAGE *pagep;
+ db_indx_t indx, *ninp, *pinp;
+ u_int32_t size;
+ u_int8_t *bp;
+ int cmp_n, cmp_p, i, ret, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_merge_print);
+ REC_INTRO(__db_merge_read, ip, op != DB_TXN_APPLY);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if (op == DB_TXN_APPLY && (ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * When pg_copy is set, we are copying onto a new page.
+ */
+ DB_ASSERT(env, !argp->pg_copy || NUM_ENT(pagep) == 0);
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->pg_copy) {
+ if (argp->data.size == 0) {
+ memcpy(pagep, argp->hdr.data, argp->hdr.size);
+ pagep->pgno = argp->pgno;
+ goto do_lsn;
+ }
+ P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
+ PREV_PGNO(argp->hdr.data),
+ NEXT_PGNO(argp->hdr.data),
+ LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
+ }
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ /* Copy the data segment. */
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+			/*
+			 * Copy the index table in past the current
+			 * entries, adjusting each logged offset for the
+			 * data already on the page.
+			 */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = P_INP(file_dbp, argp->hdr.data);
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+ *pinp++ = *ninp++
+ - (file_dbp->pgsize - HOFFSET(pagep));
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+do_lsn: pagep->lsn = *lsnp;
+ if (op == DB_TXN_APPLY) {
+ /*
+ * If applying to an active system we must bump
+ * the revision number so that the db will get
+ * reopened. We also need to move the handle
+ * locks. Note that the dbp will not have a
+ * locker in a replication client apply thread.
+ */
+ if (file_dbp->type == DB_HASH) {
+ if (argp->npgno == file_dbp->meta_pgno)
+ file_dbp->mpf->mfp->revision++;
+ } else {
+ bt = file_dbp->bt_internal;
+ if (argp->npgno == bt->bt_meta ||
+ argp->npgno == bt->bt_root)
+ file_dbp->mpf->mfp->revision++;
+ }
+ if (argp->npgno == file_dbp->meta_pgno) {
+ F_CLR(file_dbp, DB_AM_RECOVER);
+ if ((ret = __fop_lock_handle(file_dbp->env,
+ file_dbp, dbc->locker, DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto err;
+ handle_lock = file_dbp->handle_lock;
+
+ file_dbp->meta_pgno = argp->pgno;
+ if ((ret = __fop_lock_handle(file_dbp->env,
+ file_dbp, dbc->locker, DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto err;
+
+ /* Move the other handles to the new lock. */
+ ret = __lock_change(file_dbp->env,
+ &handle_lock, &file_dbp->handle_lock);
+
+err: memset(&request, 0, sizeof(request));
+ request.op = DB_LOCK_PUT_ALL;
+ if ((t_ret = __lock_vec(
+ file_dbp->env, dbc->locker,
+ 0, &request, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+ F_SET(file_dbp, DB_AM_RECOVER);
+ if (ret != 0)
+ goto out;
+ }
+ }
+
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ HOFFSET(pagep) = file_dbp->pgsize;
+ goto setlsn;
+ }
+
+ if (argp->pg_copy) {
+ /* The page was empty when we started. */
+ P_INIT(pagep, file_dbp->pgsize,
+ pagep->pgno, PGNO_INVALID,
+ PGNO_INVALID, 0, TYPE(argp->hdr.data));
+ goto setlsn;
+ }
+
+ /*
+ * Since logging is logical at the page level we cannot just
+ * truncate the data space. Delete the proper number of items
+ * from the logical end of the page.
+ */
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++) {
+ indx = NUM_ENT(pagep) - 1;
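+			/*
+			 * On a btree leaf a key shared with the previous
+			 * pair appears twice in the index table; drop the
+			 * extra reference without deleting any item.
+			 */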
+ if (TYPE(pagep) == P_LBTREE && indx != 0 &&
+ P_INP(file_dbp, pagep)[indx] ==
+ P_INP(file_dbp, pagep)[indx - P_INDX]) {
+ NUM_ENT(pagep)--;
+ continue;
+ }
+ switch (TYPE(pagep)) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ bk = GET_BKEYDATA(file_dbp, pagep, indx);
+ size = BITEM_SIZE(bk);
+ break;
+
+ case P_IBTREE:
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp, pagep, indx)->len);
+ break;
+ case P_IRECNO:
+ size = RINTERNAL_SIZE;
+ break;
+ case P_HASH:
+ size = LEN_HITEM(file_dbp,
+ pagep, file_dbp->pgsize, indx);
+ break;
+ default:
+ ret = __db_pgfmt(env, PGNO(pagep));
+ goto out;
+ }
+ if ((ret = __db_ditem(dbc, pagep, indx, size)) != 0)
+ goto out;
+ }
+setlsn: pagep->lsn = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to truncate the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ HOFFSET(pagep) = file_dbp->pgsize;
+ NUM_ENT(pagep) = 0;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /* Need to put the data back on the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ if (argp->pg_copy)
+ memcpy(pagep, argp->hdr.data, argp->hdr.size);
+ else {
+ /* Copy index table. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = P_INP(file_dbp, argp->hdr.data);
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+ *pinp++ = *ninp++;
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ }
+ pagep->lsn = argp->nlsn;
+ if (op == DB_TXN_ABORT) {
+ /*
+ * If we are undoing a meta/root page move we must
+ * bump the revision number. Put the handle
+ * locks back to their original state if we
+ * moved the metadata page.
+ */
+ i = 0;
+ if (file_dbp->type == DB_HASH) {
+ ht = file_dbp->h_internal;
+ if (argp->pgno == ht->meta_pgno) {
+ ht->meta_pgno = argp->npgno;
+ file_dbp->mpf->mfp->revision++;
+ i = 1;
+ }
+ } else {
+ bt = file_dbp->bt_internal;
+ if (argp->pgno == bt->bt_meta) {
+ file_dbp->mpf->mfp->revision++;
+ bt->bt_meta = argp->npgno;
+ i = 1;
+ } else if (argp->pgno == bt->bt_root) {
+ file_dbp->mpf->mfp->revision++;
+ bt->bt_root = argp->npgno;
+ }
+ }
+ if (argp->pgno == file_dbp->meta_pgno)
+ file_dbp->meta_pgno = argp->npgno;
+
+ /*
+ * If we detected a metadata page above, move
+ * the handle locks to the new page.
+ */
+ if (i == 1) {
+ handle_lock = file_dbp->handle_lock;
+ if ((ret = __fop_lock_handle(file_dbp->env,
+ file_dbp, file_dbp->locker, DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto out;
+
+ /* Move the other handles to the new lock. */
+ if ((ret = __lock_change(file_dbp->env,
+ &handle_lock, &file_dbp->handle_lock)) != 0)
+ goto out;
+ }
+ }
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ goto out;
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __db_pgno_recover --
+ *	Recovery function for page number replacement.
+ *
+ * PUBLIC: int __db_pgno_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pgno_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ BINTERNAL *bi;
+ __db_pgno_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep, *npagep;
+ db_pgno_t pgno, *pgnop;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pgno_print);
+ REC_INTRO(__db_pgno_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && !DB_REDO(op))) {
+ switch (TYPE(pagep)) {
+ case P_IBTREE:
+ /*
+			 * An internal record can have both an overflow
+			 * and a child pointer. Fetch the page to see
+ * which it is.
+ */
+ bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ REC_FGET(mpf, ip, argp->npgno, &npagep, out);
+
+ if (TYPE(npagep) == P_OVERFLOW)
+ pgnop =
+ &((BOVERFLOW *)(bi->data))->pgno;
+ else
+ pgnop = &bi->pgno;
+ if ((ret = __memp_fput(mpf, ip,
+ npagep, file_dbp->priority)) != 0)
+ goto out;
+ break;
+ }
+ pgnop = &bi->pgno;
+ break;
+ case P_IRECNO:
+ pgnop =
+ &GET_RINTERNAL(file_dbp, pagep, argp->indx)->pgno;
+ break;
+ case P_HASH:
+ pgnop = &pgno;
+ break;
+ default:
+ pgnop =
+ &GET_BOVERFLOW(file_dbp, pagep, argp->indx)->pgno;
+ break;
+ }
+
+ if (DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ *pgnop = argp->npgno;
+ pagep->lsn = *lsnp;
+ } else {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ *pgnop = argp->opgno;
+ pagep->lsn = argp->lsn;
+ }
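+		/*
+		 * Hash entries need not be aligned, so for P_HASH the
+		 * new page number above was built in a local (pgno) and
+		 * is copied into the entry byte-by-byte here.
+		 */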
+ if (TYPE(pagep) == P_HASH)
+ memcpy(HOFFDUP_PGNO(P_ENTRY(file_dbp,
+ pagep, argp->indx)), pgnop, sizeof(db_pgno_t));
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __db_pglist_swap -- swap a list of freelist pages.
+ * PUBLIC: void __db_pglist_swap __P((u_int32_t, void *));
+ */
+void
+__db_pglist_swap(size, list)
+ u_int32_t size;
+ void *list;
+{
+ db_pglist_t *lp;
+ u_int32_t nelem;
+
+ nelem = size / sizeof(db_pglist_t);
+
+ lp = (db_pglist_t *)list;
+ while (nelem-- > 0) {
+ P_32_SWAP(&lp->pgno);
+ P_32_SWAP(&lp->lsn.file);
+ P_32_SWAP(&lp->lsn.offset);
+ lp++;
+ }
+}
+
+/*
+ * __db_pglist_print -- print a list of freelist pages.
+ * PUBLIC: void __db_pglist_print __P((ENV *, DB_MSGBUF *, DBT *));
+ */
+void
+__db_pglist_print(env, mbp, list)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DBT *list;
+{
+ db_pglist_t *lp;
+ u_int32_t nelem;
+
+ nelem = list->size / sizeof(db_pglist_t);
+ lp = (db_pglist_t *)list->data;
+ __db_msgadd(env, mbp, "\t");
+ while (nelem-- > 0) {
+ __db_msgadd(env, mbp, "%lu [%lu][%lu]", (u_long)lp->pgno,
+ (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+ if (nelem % 4 == 0)
+ __db_msgadd(env, mbp, "\n\t");
+ else
+ __db_msgadd(env, mbp, " ");
+ lp++;
+ }
+}
diff --git a/src/db/db_reclaim.c b/src/db/db_reclaim.c
new file mode 100644
index 00000000..b902769a
--- /dev/null
+++ b/src/db/db_reclaim.c
@@ -0,0 +1,245 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_traverse_big
+ * Traverse a chain of overflow pages and call the callback routine
+ * on each one. The calling convention for the callback is:
+ * callback(dbc, page, cookie, did_put),
+ * where did_put is a return value indicating if the page in question has
+ * already been returned to the mpool.
+ *
+ * PUBLIC: int __db_traverse_big __P((DBC *, db_pgno_t,
+ * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__db_traverse_big(dbc, pgno, callback, cookie)
+ DBC *dbc;
+ db_pgno_t pgno;
+ int (*callback) __P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *p;
+ int did_put, ret;
+
+ mpf = dbc->dbp->mpf;
+
+ do {
+ did_put = 0;
+ if ((ret = __memp_fget(mpf,
+ &pgno, dbc->thread_info, dbc->txn, 0, &p)) != 0)
+ return (ret);
+ /*
+ * If we are freeing pages only process the overflow
+ * chain if the head of the chain has a refcount of 1.
+ */
+ pgno = NEXT_PGNO(p);
+ if (callback == __db_truncate_callback && OV_REF(p) != 1)
+ pgno = PGNO_INVALID;
+ if ((ret = callback(dbc, p, cookie, &did_put)) == 0 &&
+ !did_put)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, p, dbc->priority);
+ } while (ret == 0 && pgno != PGNO_INVALID);
+
+ return (ret);
+}
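+
+/*
+ * An illustrative sketch (not part of this change): a minimal callback
+ * matching the convention above, counting the pages in a chain.  It
+ * assumes the cookie points at a u_int32_t and leaves every page for
+ * __db_traverse_big to return to the mpool, so *putp stays zero.
+ *
+ *	static int
+ *	__count_pages_callback(dbc, p, cookie, putp)
+ *		DBC *dbc;
+ *		PAGE *p;
+ *		void *cookie;
+ *		int *putp;
+ *	{
+ *		COMPQUIET(dbc, NULL);
+ *		COMPQUIET(p, NULL);
+ *		++*(u_int32_t *)cookie;
+ *		*putp = 0;
+ *		return (0);
+ *	}
+ */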
+
+/*
+ * __db_reclaim_callback
+ * This is the callback routine used during a delete of a subdatabase.
+ * we are traversing a btree or hash table and trying to free all the
+ * pages. Since they share common code for duplicates and overflow
+ * items, we traverse them identically and use this routine to do the
+ * actual free. The reason that this is callback is because hash uses
+ * the same traversal code for statistics gathering.
+ *
+ * PUBLIC: int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_reclaim_callback(dbc, p, cookie, putp)
+ DBC *dbc;
+ PAGE *p;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ /*
+ * We don't want to log the free of the root with the subdb.
+ * If we abort then the subdb may not be openable to undo
+ * the free.
+ */
+ if ((dbp->type == DB_BTREE || dbp->type == DB_RECNO) &&
+ PGNO(p) == ((BTREE *)dbp->bt_internal)->bt_root)
+ return (0);
+ if ((ret = __db_free(dbc, p, *(u_int32_t *)cookie)) != 0)
+ return (ret);
+ *putp = 1;
+
+ return (0);
+}
+
+/*
+ * __db_truncate_callback
+ * This is the callback routine used during a truncate.
+ * We are traversing a btree or hash table and trying to free all the
+ * pages.
+ *
+ * PUBLIC: int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_truncate_callback(dbc, p, cookie, putp)
+ DBC *dbc;
+ PAGE *p;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DBT ddbt, ldbt;
+ DB_MPOOLFILE *mpf;
+ db_indx_t indx, len, off, tlen, top;
+ u_int8_t *hk, type;
+ u_int32_t *countp;
+ int ret;
+
+ top = NUM_ENT(p);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ countp = cookie;
+ *putp = 1;
+
+ switch (TYPE(p)) {
+ case P_LBTREE:
+ /* Skip for off-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type;
+ if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
+ ++*countp;
+ }
+ /* FALLTHROUGH */
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_INVALID:
+ if (dbp->type != DB_HASH &&
+ ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+ type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+ goto reinit;
+ }
+ break;
+ case P_OVERFLOW:
+ if ((ret = __memp_dirty(mpf,
+ &p, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_ovref_log(dbp, dbc->txn,
+ &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(p));
+ if (--OV_REF(p) != 0)
+ *putp = 0;
+ break;
+ case P_LRECNO:
+ for (indx = 0; indx < top; indx += O_INDX) {
+ type = GET_BKEYDATA(dbp, p, indx)->type;
+ if (!B_DISSET(type))
+ ++*countp;
+ }
+
+ if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+ type = P_LRECNO;
+ goto reinit;
+ }
+ break;
+ case P_LDUP:
+ /* Correct for deleted items. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type))
+ ++*countp;
+
+ break;
+ case P_HASH:
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ switch (*H_PAIRDATA(dbp, p, indx)) {
+ case H_OFFDUP:
+ break;
+ case H_OFFPAGE:
+ case H_KEYDATA:
+ ++*countp;
+ break;
+ case H_DUPLICATE:
+ tlen = LEN_HDATA(dbp, p, 0, indx);
+ hk = H_PAIRDATA(dbp, p, indx);
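+				/*
+				 * Each element of an on-page duplicate
+				 * set is stored as a length, the data
+				 * and a trailing length, so step by the
+				 * data length plus two size fields.
+				 */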
+ for (off = 0; off < tlen;
+ off += len + 2 * sizeof(db_indx_t)) {
+ ++*countp;
+ memcpy(&len,
+ HKEYDATA_DATA(hk)
+ + off, sizeof(db_indx_t));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, p->pgno));
+ }
+ }
+ /* Don't free the head of the bucket. */
+ if (PREV_PGNO(p) == PGNO_INVALID) {
+ type = P_HASH;
+
+reinit: if ((ret = __memp_dirty(mpf, &p,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ *putp = 0;
+ if (DBC_LOGGING(dbc)) {
+ memset(&ldbt, 0, sizeof(ldbt));
+ memset(&ddbt, 0, sizeof(ddbt));
+ ldbt.data = p;
+ ldbt.size = P_OVERHEAD(dbp);
+ ldbt.size += p->entries * sizeof(db_indx_t);
+ ddbt.data = (u_int8_t *)p + HOFFSET(p);
+ ddbt.size = dbp->pgsize - HOFFSET(p);
+ if ((ret = __db_pg_init_log(dbp,
+ dbc->txn, &LSN(p), 0,
+ p->pgno, &ldbt, &ddbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(p));
+
+ P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID,
+ PGNO_INVALID, type == P_HASH ? 0 : 1, type);
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, p->pgno));
+ }
+
+ if (*putp == 1) {
+ if ((ret = __db_free(dbc, p, 0)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __memp_fput(mpf, dbc->thread_info, p,
+ dbc->priority)) != 0)
+ return (ret);
+ *putp = 1;
+ }
+
+ return (0);
+}
diff --git a/src/db/db_remove.c b/src/db/db_remove.c
new file mode 100644
index 00000000..591a29b2
--- /dev/null
+++ b/src/db/db_remove.c
@@ -0,0 +1,515 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_dbtxn_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+static int __db_subdb_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+
+/*
+ * __env_dbremove_pp
+ * ENV->dbremove pre/post processing.
+ *
+ * PUBLIC: int __env_dbremove_pp __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbremove_pp(dbenv, txn, name, subdb, flags)
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ dbp = NULL;
+ env = dbenv->env;
+ txn_local = 0;
+ handle_check = 0;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbremove");
+
+ /*
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if ((ret = __db_fchk(env, "DB->remove", flags,
+ DB_AUTO_COMMIT | DB_LOG_NO_DATA |
+ DB_NOSYNC | DB_TXN_NOT_DURABLE)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_NO_TXN(ip, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+ if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+ goto err;
+ txn_local = 1;
+ } else if (txn != NULL && !TXN_ON(env) &&
+ (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_FAMILY))) {
+ ret = __db_not_txn_env(env);
+ goto err;
+ } else if (txn != NULL && LF_ISSET(DB_LOG_NO_DATA)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("0690",
+ "DB_LOG_NO_DATA may not be specified within a transaction."));
+ goto err;
+ }
+ LF_CLR(DB_AUTO_COMMIT);
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if (LF_ISSET(DB_TXN_NOT_DURABLE) &&
+ (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ LF_CLR(DB_TXN_NOT_DURABLE);
+
+ ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+ if (txn_local) {
+ /*
+ * We created the DBP here and when we commit/abort, we'll
+ * release all the transactional locks, including the handle
+ * lock; mark the handle cleared explicitly.
+ */
+ LOCK_INIT(dbp->handle_lock);
+ dbp->locker = NULL;
+ } else if (IS_REAL_TXN(txn)) {
+ /*
+ * We created this handle locally so we need to close it
+ * and clean it up. Unfortunately, it's holding transactional
+ * locks that need to persist until the end of transaction.
+ * If we invalidate the locker id (dbp->locker), then the close
+ * won't free these locks prematurely.
+ */
+ dbp->locker = NULL;
+ }
+
+err: if (txn_local && (t_ret =
+ __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * We never opened this dbp for real, so don't include a transaction
+ * handle, and use NOSYNC to avoid calling into mpool.
+ *
+ * !!!
+ * Note we're reversing the order of operations: we started the txn and
+ * then opened the DB handle; we're resolving the txn and then closing
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- a DB handle cannot be closed before
+ */
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_remove_pp
+ * DB->remove pre/post processing.
+ *
+ * PUBLIC: int __db_remove_pp
+ * PUBLIC: __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_pp(dbp, name, subdb, flags)
+ DB *dbp;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ /*
+ * Validate arguments, continuing to destroy the handle on failure.
+ *
+ * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+ *
+ * !!!
+ * We have a serious problem if we're here with a handle used to open
+ * a database -- we'll destroy the handle, and the application won't
+ * ever be able to close the database.
+ */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ return (__db_mi_open(env, "DB->remove", 1));
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB->remove", flags, DB_NOSYNC)) != 0)
+ return (ret);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Remove the file. */
+ ret = __db_remove(dbp, ip, NULL, name, subdb, flags);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_remove
+ * DB->remove method.
+ *
+ * PUBLIC: int __db_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+
+ ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+ if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_remove_int
+ * Worker function for the DB->remove method.
+ *
+ * PUBLIC: int __db_remove_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_int(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+ char *real_name, *tmpname;
+
+ env = dbp->env;
+ real_name = tmpname = NULL;
+
+ if (name == NULL && subdb == NULL) {
+ __db_errx(env, DB_STR("0691",
+ "Remove on temporary files invalid"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (name == NULL) {
+ MAKE_INMEM(dbp);
+ real_name = (char *)subdb;
+ } else if (subdb != NULL) {
+ ret = __db_subdb_remove(dbp, ip, txn, name, subdb, flags);
+ goto err;
+ }
+
+ /* Handle transactional file removes separately. */
+ if (IS_REAL_TXN(txn)) {
+ ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb);
+ goto err;
+ }
+
+ /*
+ * The remaining case is a non-transactional file remove.
+ *
+ * Find the real name of the file.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /*
+ * If this is a file and force is set, remove the temporary file, which
+ * may have been left around. Ignore errors because the temporary file
+ * might not exist.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && LF_ISSET(DB_FORCE) &&
+ (ret = __db_backup_name(env, real_name, NULL, &tmpname)) == 0)
+ (void)__os_unlink(env, tmpname, 0);
+
+ if ((ret = __fop_remove_setup(dbp, NULL, real_name, 0)) != 0)
+ goto err;
+
+ if (dbp->db_am_remove != NULL &&
+ (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0)
+ goto err;
+
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __db_inmem_remove(dbp, NULL, real_name) :
+ __fop_remove(env,
+ NULL, dbp->fileid, name, &dbp->dirname, DB_APP_DATA,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+err: if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+ __os_free(env, real_name);
+ if (tmpname != NULL)
+ __os_free(env, tmpname);
+
+ return (ret);
+}
+
+/*
+ * __db_inmem_remove --
+ * Removal of a named in-memory database.
+ *
+ * PUBLIC: int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
+ */
+int
+__db_inmem_remove(dbp, txn, name)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+{
+ DBT fid_dbt, name_dbt;
+ DB_LOCKER *locker;
+ DB_LSN lsn;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+ locker = NULL;
+
+ DB_ASSERT(env, name != NULL);
+
+ /* This had better exist if we are trying to do a remove. */
+ (void)__memp_set_flags(dbp->mpf, DB_MPOOL_NOFILE, 1);
+ if ((ret = __memp_fopen(dbp->mpf, NULL,
+ name, &dbp->dirname, 0, 0, 0)) != 0)
+ return (ret);
+ if ((ret = __memp_get_fileid(dbp->mpf, dbp->fileid)) != 0)
+ return (ret);
+ dbp->preserve_fid = 1;
+
+ if (LOCKING_ON(env)) {
+ if (dbp->locker == NULL &&
+ (ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ return (ret);
+ if (!CDB_LOCKING(env) &&
+ txn != NULL && F_ISSET(txn, TXN_INFAMILY)) {
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ return (ret);
+ txn = NULL;
+ }
+ locker = txn == NULL ? dbp->locker : txn->locker;
+ }
+
+ /*
+ * In a transactional environment, we'll play the same game we play
+ * for databases in the file system -- create a temporary database
+ * and put it in with the current name and then rename this one to
+ * another name. We'll then use a commit-time event to remove the
+ * entry.
+ */
+ if ((ret =
+ __fop_lock_handle(env, dbp, locker, DB_LOCK_WRITE, NULL, 0)) != 0)
+ return (ret);
+
+ if (!IS_REAL_TXN(txn))
+ ret = __memp_nameop(env, dbp->fileid, NULL, name, NULL, 1);
+ else if (LOGGING_ON(env)) {
+ if (txn != NULL && (ret =
+ __txn_remevent(env, txn, name, dbp->fileid, 1)) != 0)
+ return (ret);
+
+ DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+ DB_INIT_DBT(fid_dbt, dbp->fileid, DB_FILE_ID_LEN);
+ ret = __crdel_inmem_remove_log(
+ env, txn, &lsn, 0, &name_dbt, &fid_dbt);
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_subdb_remove --
+ * Remove a subdatabase.
+ */
+static int
+__db_subdb_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ DB *mdbp, *sdbp;
+ int ret, t_ret;
+
+ mdbp = sdbp = NULL;
+
+ /* Open the subdatabase. */
+ if ((ret = __db_create_internal(&sdbp, dbp->env, 0)) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+ (ret = __db_set_flags(sdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ if ((ret = __db_open(sdbp, ip,
+ txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);
+
+ /* Have the handle locked so we will not lock pages. */
+ LOCK_CHECK_OFF(ip);
+
+ /* Free up the pages in the subdatabase. */
+ switch (sdbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bam_reclaim(sdbp, ip, txn, flags)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __ham_reclaim(sdbp, ip, txn, flags)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ sdbp->env, "__db_subdb_remove", sdbp->type);
+ goto err;
+ }
+
+ /*
+ * Remove the entry from the main database and free the subdatabase
+ * metadata page.
+ */
+ if ((ret = __db_master_open(sdbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp,
+ sdbp, ip, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ /* Close the main and subdatabases. */
+ if ((t_ret = __db_close(sdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL && (t_ret = __db_close(mdbp, txn,
+ (LF_ISSET(DB_NOSYNC) || txn != NULL) ? DB_NOSYNC : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ LOCK_CHECK_ON(ip);
+ return (ret);
+}
+
+static int
+__db_dbtxn_remove(dbp, ip, txn, name, subdb)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+{
+ ENV *env;
+ int ret;
+ char *tmpname;
+
+ env = dbp->env;
+ tmpname = NULL;
+
+ /*
+ * This is a transactional remove, so we have to keep the name
+ * of the file locked until the transaction commits. As a result,
+ * we implement remove by renaming the file to some other name
+ * (which creates a dummy named file as a placeholder for the
+	 * file being renamed/removed) and then deleting that file as
+ * a delayed remove at commit.
+ */
+ if ((ret = __db_backup_name(env,
+ F_ISSET(dbp, DB_AM_INMEM) ? subdb : name, txn, &tmpname)) != 0)
+ return (ret);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+ if ((ret = __db_rename_int(dbp,
+ txn->thread_info, txn, name, subdb, tmpname, DB_NOSYNC)) != 0)
+ goto err;
+
+ /*
+ * The internal removes will also translate into delayed removes.
+ */
+ if (dbp->db_am_remove != NULL &&
+ (ret = dbp->db_am_remove(dbp, ip, txn, tmpname, NULL, 0)) != 0)
+ goto err;
+
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __db_inmem_remove(dbp, txn, tmpname) :
+ __fop_remove(env,
+ txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+err:
+DB_TEST_RECOVERY_LABEL
+ if (tmpname != NULL)
+ __os_free(env, tmpname);
+
+ return (ret);
+}
diff --git a/src/db/db_rename.c b/src/db/db_rename.c
new file mode 100644
index 00000000..2812b948
--- /dev/null
+++ b/src/db/db_rename.c
@@ -0,0 +1,383 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *, u_int32_t));
+static int __db_subdb_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *, u_int32_t));
+
+/*
+ * __env_dbrename_pp
+ * ENV->dbrename pre/post processing.
+ *
+ * PUBLIC: int __env_dbrename_pp __P((DB_ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbrename_pp(dbenv, txn, name, subdb, newname, flags)
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbenv->env;
+ dbp = NULL;
+ txn_local = 0;
+ handle_check = 0;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbrename");
+
+ /*
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if ((ret = __db_fchk(env, "DB->rename", flags,
+ DB_AUTO_COMMIT | DB_NOSYNC)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_NO_TXN(ip, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+ if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+ goto err;
+ txn_local = 1;
+ } else
+ if (txn != NULL && !TXN_ON(env) &&
+ (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_FAMILY))) {
+ ret = __db_not_txn_env(env);
+ goto err;
+ }
+
+ LF_CLR(DB_AUTO_COMMIT);
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+
+ ret = __db_rename_int(dbp, ip, txn, name, subdb, newname, flags);
+
+ if (txn_local) {
+ /*
+ * We created the DBP here and when we commit/abort, we'll
+ * release all the transactional locks, including the handle
+ * lock; mark the handle cleared explicitly.
+ */
+ LOCK_INIT(dbp->handle_lock);
+ dbp->locker = NULL;
+ } else if (IS_REAL_TXN(txn)) {
+ /*
+ * We created this handle locally so we need to close it and
+ * clean it up. Unfortunately, it's holding transactional
+ * or CDS group locks that need to persist until the end of
+ * transaction. If we invalidate the locker (dbp->locker),
+ * then the close won't free these locks prematurely.
+ */
+ dbp->locker = NULL;
+ }
+
+err: if (txn_local && (t_ret =
+ __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * We never opened this dbp for real, so don't include a transaction
+ * handle, and use NOSYNC to avoid calling into mpool.
+ *
+ * !!!
+ * Note we're reversing the order of operations: we started the txn and
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- it's safer.
+ */
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_rename_pp
+ * DB->rename pre/post processing.
+ *
+ * PUBLIC: int __db_rename_pp __P((DB *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename_pp(dbp, name, subdb, newname, flags)
+ DB *dbp;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+ handle_check = 0;
+
+ /*
+ * Validate arguments, continuing to destroy the handle on failure.
+ *
+ * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+ *
+ * !!!
+ * We have a serious problem if we're here with a handle used to open
+ * a database -- we'll destroy the handle, and the application won't
+ * ever be able to close the database.
+ */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ return (__db_mi_open(env, "DB->rename", 1));
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB->rename", flags, DB_NOSYNC)) != 0)
+ return (ret);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Rename the file. */
+ ret = __db_rename(dbp, ip, NULL, name, subdb, newname, flags);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_rename
+ * DB->rename method.
+ *
+ */
+static int
+__db_rename(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+
+ ret = __db_rename_int(dbp, ip, txn, name, subdb, newname, flags);
+
+ if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_rename_int
+ * Worker function for DB->rename method; the close of the dbp is
+ * left in the wrapper routine.
+ *
+ * PUBLIC: int __db_rename_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename_int(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+ char *old, *real_name;
+
+ env = dbp->env;
+ real_name = NULL;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+ if (name == NULL && subdb == NULL) {
+ __db_errx(env, DB_STR("0503",
+ "Rename on temporary files invalid"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (name == NULL)
+ MAKE_INMEM(dbp);
+ else if (subdb != NULL) {
+ ret = __db_subdb_rename(dbp, ip,
+ txn, name, subdb, newname, flags);
+ goto err;
+ }
+
+ /*
+ * From here on down, this pertains to files or in-memory databases.
+ *
+ * Find the real name of the file.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ old = (char *)subdb;
+ real_name = (char *)subdb;
+ } else {
+ if ((ret = __db_appname(env, DB_APP_DATA,
+ name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+ old = (char *)name;
+ }
+ DB_ASSERT(env, old != NULL);
+
+ if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0)
+ goto err;
+
+ if (dbp->db_am_rename != NULL &&
+ (ret = dbp->db_am_rename(dbp, ip, txn, name, subdb, newname)) != 0)
+ goto err;
+
+ /*
+ * The transactional case and non-transactional case are
+ * quite different. In the non-transactional case, we simply
+ * do the rename. In the transactional case, since we need
+ * the ability to back out and maintain locking, we have to
+ * create a temporary object as a placeholder. This is all
+ * taken care of in the fop layer.
+ */
+ if (IS_REAL_TXN(txn)) {
+ if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0)
+ goto err;
+ } else {
+ if ((ret = __fop_dbrename(dbp, old, newname)) != 0)
+ goto err;
+ }
+
+ /*
+ * I am pretty sure that we haven't gotten a dbreg id, so calling
+ * dbreg_filelist_update is not necessary.
+ */
+ DB_ASSERT(env, dbp->log_filename == NULL ||
+ dbp->log_filename->id == DB_LOGFILEID_INVALID);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname);
+
+DB_TEST_RECOVERY_LABEL
+err: if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+ __os_free(env, real_name);
+
+ return (ret);
+}
+
+/*
+ * __db_subdb_rename --
+ * Rename a subdatabase.
+ */
+static int
+__db_subdb_rename(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ ENV *env;
+ PAGE *meta;
+ int ret, t_ret;
+
+ mdbp = NULL;
+ meta = NULL;
+ env = dbp->env;
+
+ /*
+ * We have not opened this dbp so it isn't marked as a subdb,
+ * but it ought to be.
+ */
+ F_SET(dbp, DB_AM_SUBDB);
+
+ /*
+ * Rename the entry in the main database. We need to first
+ * get the meta-data page number (via MU_OPEN) so that we can
+ * read the meta-data page and obtain a handle lock. Once we've
+ * done that, we can proceed to do the rename in the master.
+ */
+ if ((ret = __db_master_open(dbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, ip, txn, subdb, dbp->type,
+ MU_OPEN, NULL, 0)) != 0)
+ goto err;
+
+ if ((ret = __memp_fget(mdbp->mpf, &dbp->meta_pgno,
+ ip, txn, 0, &meta)) != 0)
+ goto err;
+ memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+ if ((ret = __fop_lock_handle(env, dbp,
+ (mdbp->cur_locker != NULL) ? mdbp->cur_locker : mdbp->locker,
+ DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn))) != 0)
+ goto err;
+
+ ret = __memp_fput(mdbp->mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, ip, txn,
+ subdb, dbp->type, MU_RENAME, newname, 0)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ if (meta != NULL && (t_ret =
+ __memp_fput(mdbp->mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL && (t_ret = __db_close(mdbp, txn,
+ (LF_ISSET(DB_NOSYNC) || txn != NULL) ? DB_NOSYNC : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/db/db_ret.c b/src/db/db_ret.c
new file mode 100644
index 00000000..709605f6
--- /dev/null
+++ b/src/db/db_ret.c
@@ -0,0 +1,169 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+
+/*
+ * __db_ret --
+ * Build return DBT.
+ *
+ * PUBLIC: int __db_ret __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
+ */
+int
+__db_ret(dbc, h, indx, dbt, memp, memsize)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *dbt;
+ void **memp;
+ u_int32_t *memsize;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DB *dbp;
+ HEAPHDR *hdr;
+ HOFFPAGE ho;
+ u_int32_t len;
+ u_int8_t *hk;
+ void *data;
+
+ if (F_ISSET(dbt, DB_DBT_READONLY))
+ return (0);
+ dbp = dbc->dbp;
+
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ hk = P_ENTRY(dbp, h, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+ memcpy(&ho, hk, sizeof(HOFFPAGE));
+ return (__db_goff(dbc, dbt,
+ ho.tlen, ho.pgno, memp, memsize));
+ }
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
+ data = HKEYDATA_DATA(hk);
+ break;
+ case P_HEAP:
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx);
+ if (F_ISSET(hdr,(HEAP_RECSPLIT | HEAP_RECFIRST)))
+ return (__heapc_gsplit(dbc, dbt, memp, memsize));
+ len = hdr->size;
+ data = (u_int8_t *)hdr + sizeof(HEAPHDR);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ return (__db_goff(dbc, dbt,
+ bo->tlen, bo->pgno, memp, memsize));
+ }
+ len = bk->len;
+ data = bk->data;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+
+ return (__db_retcopy(dbp->env, dbt, data, len, memp, memsize));
+}
+
+/*
+ * __db_retcopy --
+ * Copy the returned data into the user's DBT, handling special flags.
+ *
+ * PUBLIC: int __db_retcopy __P((ENV *, DBT *,
+ * PUBLIC: void *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__db_retcopy(env, dbt, data, len, memp, memsize)
+ ENV *env;
+ DBT *dbt;
+ void *data;
+ u_int32_t len;
+ void **memp;
+ u_int32_t *memsize;
+{
+ int ret;
+
+ if (F_ISSET(dbt, DB_DBT_READONLY))
+ return (0);
+ ret = 0;
+
+ /* If returning a partial record, reset the length. */
+ if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ data = (u_int8_t *)data + dbt->doff;
+ if (len > dbt->doff) {
+ len -= dbt->doff;
+ if (len > dbt->dlen)
+ len = dbt->dlen;
+ } else
+ len = 0;
+ }
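+	/*
+	 * For example (illustrative only): for a 10-byte record with
+	 * doff == 4 and dlen == 3, bytes 4 through 6 are returned; if
+	 * doff is past the end of the record, nothing is returned.
+	 */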
+
+ /*
+ * Allocate memory to be owned by the application: DB_DBT_MALLOC,
+ * DB_DBT_REALLOC.
+ *
+ * !!!
+ * We always allocate memory, even if we're copying out 0 bytes. This
+ * guarantees consistency, i.e., the application can always free memory
+ * without concern as to how many bytes of the record were requested.
+ *
+ * Use the memory specified by the application: DB_DBT_USERMEM.
+ *
+ * !!!
+ * If the length we're going to copy is 0, the application-supplied
+ * memory pointer is allowed to be NULL.
+ */
+ if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+ dbt->size = len;
+ return (len == 0 ? 0 : env->dbt_usercopy(dbt, 0, data,
+ len, DB_USERCOPY_SETDATA));
+
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC))
+ ret = __os_umalloc(env, len, &dbt->data);
+ else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if (dbt->data == NULL || dbt->size == 0 || dbt->size < len)
+ ret = __os_urealloc(env, len, &dbt->data);
+ } else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
+ ret = DB_BUFFER_SMALL;
+ } else if (memp == NULL || memsize == NULL)
+ ret = EINVAL;
+ else {
+ if (len != 0 && (*memsize == 0 || *memsize < len)) {
+ if ((ret = __os_realloc(env, len, memp)) == 0)
+ *memsize = len;
+ else
+ *memsize = 0;
+ }
+ if (ret == 0)
+ dbt->data = *memp;
+ }
+
+ if (ret == 0 && len != 0)
+ memcpy(dbt->data, data, len);
+
+ /*
+ * Return the length of the returned record in the DBT size field.
+ * This satisfies the requirement that if we're using user memory
+ * and insufficient memory was provided, return the amount necessary
+ * in the size field.
+ */
+ dbt->size = len;
+
+ return (ret);
+}
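+
+/*
+ * An illustrative caller-side sketch (not part of this change) of the
+ * DB_DBT_USERMEM contract enforced above: on DB_BUFFER_SMALL, the size
+ * field holds the length required, so a caller can grow its buffer and
+ * retry.  The names key, buf and buflen are hypothetical.
+ *
+ *	DBT data;
+ *
+ *	memset(&data, 0, sizeof(data));
+ *	data.flags = DB_DBT_USERMEM;
+ *	data.data = buf;
+ *	data.ulen = buflen;
+ *	if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) ==
+ *	    DB_BUFFER_SMALL) {
+ *		if ((buf = realloc(buf, data.size)) == NULL)
+ *			return (ENOMEM);
+ *		data.data = buf;
+ *		data.ulen = data.size;
+ *		ret = dbp->get(dbp, NULL, &key, &data, 0);
+ *	}
+ */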
diff --git a/src/db/db_setid.c b/src/db/db_setid.c
new file mode 100644
index 00000000..697c3ff7
--- /dev/null
+++ b/src/db/db_setid.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * __env_fileid_reset_pp --
+ * ENV->fileid_reset pre/post processing.
+ *
+ * PUBLIC: int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_fileid_reset_pp(dbenv, name, flags)
+ DB_ENV *dbenv;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->fileid_reset");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_ENCRYPT)
+ return (__db_ferr(env, "DB_ENV->fileid_reset", 0));
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__env_fileid_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+ 1, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
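+/*
+ * Typical use, as a sketch: an application that has physically copied a
+ * database file resets the copy's file ID before opening it in the same
+ * environment, so the copy is not confused with the original in the cache:
+ *
+ *	if ((ret = dbenv->fileid_reset(dbenv, "copy.db", 0)) != 0)
+ *		goto err;
+ *
+ * ("copy.db" is a hypothetical file name.)
+ */
+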
+/*
+ * __env_fileid_reset --
+ * Reset the file IDs for every database in the file.
+ * PUBLIC: int __env_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, int));
+ */
+int
+__env_fileid_reset(env, ip, name, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *name;
+ int encrypted;
+{
+ DB *dbp;
+ DBC *dbcp;
+ DBMETA *meta;
+ DBT key, data;
+ DB_FH *fhp;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO cookie;
+ db_pgno_t pgno;
+ int subdb, t_ret, ret;
+ size_t n;
+ char *real_name;
+ u_int8_t fileid[DB_FILE_ID_LEN], mbuf[DBMETASIZE];
+ void *pagep;
+
+ dbp = NULL;
+ dbcp = NULL;
+ fhp = NULL;
+ real_name = NULL;
+
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, NULL, &real_name)) != 0)
+ return (ret);
+
+ /* Get a new file ID. */
+ if ((ret = __os_fileid(env, real_name, 1, fileid)) != 0)
+ goto err;
+
+ /*
+ * The user may have physically copied a file currently open in the
+ * cache, which means if we open this file through the cache before
+ * updating the file ID on page 0, we might connect to the file from
+ * which the copy was made.
+ */
+ if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+ __db_err(env, ret, "%s", real_name);
+ goto err;
+ }
+ if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+ goto err;
+
+ if (n != sizeof(mbuf)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR_A("0675",
+ "__env_fileid_reset: %s: unexpected file type or format",
+ "%s"), real_name);
+ goto err;
+ }
+
+ /*
+ * Create the DB object.
+ */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+
+ /* If configured with a password, the databases are encrypted. */
+ if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+ goto err;
+
+ if ((ret = __db_meta_setup(env,
+ dbp, real_name, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0)
+ goto err;
+
+ meta = (DBMETA *)mbuf;
+ if (FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && (ret =
+ __part_fileid_reset(env, ip, name, meta->nparts, encrypted)) != 0)
+ goto err;
+
+ subdb = meta->type == P_BTREEMETA && F_ISSET(meta, BTM_SUBDB);
+
+ memcpy(meta->uid, fileid, DB_FILE_ID_LEN);
+ cookie.db_pagesize = sizeof(mbuf);
+ cookie.flags = dbp->flags;
+ cookie.type = dbp->type;
+ key.data = &cookie;
+
+ if ((ret = __db_pgout(env->dbenv, 0, mbuf, &key)) != 0)
+ goto err;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+ goto err;
+ if ((ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+
+ /*
+ * Page 0 of the file has an updated file ID, and we can open it in
+ * the cache without connecting to a different, existing file. Open
+ * the file in the cache, and update the file IDs for subdatabases.
+ */
+
+ /*
+ * If the database file doesn't support subdatabases, we only have
+ * to update a single metadata page. Otherwise, we have to open a
+ * cursor and step through the master database, and update all of
+ * the subdatabases' metadata pages.
+ */
+ if (!subdb)
+ goto err;
+
+ /*
+ * Open the DB file.
+ *
+ * !!!
+ * Note DB_RDWRMASTER flag, we need to open the master database file
+ * for writing in this case.
+ */
+ if ((ret = __db_open(dbp, ip, NULL,
+ name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ mpf = dbp->mpf;
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbcp, 0)) != 0)
+ goto err;
+ while ((ret = __dbc_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+ /*
+ * XXX
+ * We're handling actual data, not on-page meta-data, so it
+ * hasn't been converted to/from opposite endian architectures.
+ * Do it explicitly, now.
+ */
+ memcpy(&pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL_SWAP(env, &pgno);
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_DIRTY, &pagep)) != 0)
+ goto err;
+ memcpy(((DBMETA *)pagep)->uid, fileid, DB_FILE_ID_LEN);
+ if ((ret = __memp_fput(mpf, ip, pagep, dbcp->priority)) != 0)
+ goto err;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err: if (dbcp != NULL && (t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (real_name != NULL)
+ __os_free(env, real_name);
+
+ return (ret);
+}
diff --git a/src/db/db_setlsn.c b/src/db/db_setlsn.c
new file mode 100644
index 00000000..1a3280ed
--- /dev/null
+++ b/src/db/db_setlsn.c
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+
+static int __env_lsn_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
+
+/*
+ * __env_lsn_reset_pp --
+ * ENV->lsn_reset pre/post processing.
+ *
+ * PUBLIC: int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_lsn_reset_pp(dbenv, name, flags)
+ DB_ENV *dbenv;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->lsn_reset");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_ENCRYPT)
+ return (__db_ferr(env, "DB_ENV->lsn_reset", 0));
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__env_lsn_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+ 1, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
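+/*
+ * Typical use, as a sketch: DB_ENV->lsn_reset is run on a database file
+ * being moved into a different environment, so its pages' LSNs no longer
+ * refer to the old environment's log:
+ *
+ *	if ((ret = dbenv->lsn_reset(dbenv, "moved.db", 0)) != 0)
+ *		goto err;
+ *
+ * ("moved.db" is a hypothetical file name.)
+ */
+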
+/*
+ * __env_lsn_reset --
+ * Reset the LSNs for every page in the file.
+ */
+static int
+__env_lsn_reset(env, ip, name, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *name;
+ int encrypted;
+{
+ DB *dbp;
+ int t_ret, ret;
+
+ /* Create the DB object. */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ /* If configured with a password, the databases are encrypted. */
+ if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+ goto err;
+
+ /*
+ * Open the DB file.
+ *
+ * !!!
+ * Note DB_RDWRMASTER flag, we need to open the master database file
+ * for writing in this case.
+ */
+ if ((ret = __db_open(dbp, ip, NULL,
+ name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0) {
+ __db_err(env, ret, "%s", name);
+ goto err;
+ }
+
+ ret = __db_lsn_reset(dbp->mpf, ip);
+#ifdef HAVE_PARTITION
+ if (ret == 0 && DB_IS_PARTITIONED(dbp))
+ ret = __part_lsn_reset(dbp, ip);
+ else
+#endif
+ if (ret == 0 && dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+ ret = __qam_lsn_reset(dbp, ip);
+#else
+ ret = __db_no_queue_am(env);
+#endif
+
+err: if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_lsn_reset -- reset the lsn for a db mpool handle.
+ * PUBLIC: int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *));
+ */
+int
+__db_lsn_reset(mpf, ip)
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+{
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int ret;
+
+ /* Reset the LSN on every page of the database file. */
+ for (pgno = 0;
+ (ret = __memp_fget(mpf,
+ &pgno, ip, NULL, DB_MPOOL_DIRTY, &pagep)) == 0;
+ ++pgno) {
+ LSN_NOT_LOGGED(pagep->lsn);
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_UNCHANGED)) != 0)
+ break;
+ }
+
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
diff --git a/src/db/db_sort_multiple.c b/src/db/db_sort_multiple.c
new file mode 100644
index 00000000..c5e2e941
--- /dev/null
+++ b/src/db/db_sort_multiple.c
@@ -0,0 +1,327 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_quicksort __P((DB *, DBT *, DBT *, u_int32_t *, u_int32_t *,
+ u_int32_t *, u_int32_t *, u_int32_t));
+
+/*
+ * __db_compare_both --
+ *	Use the comparison functions from db to compare akey and bkey and,
+ *	if DB_DUPSORT is set, adata and bdata.
+ *
+ * PUBLIC: int __db_compare_both __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *));
+ */
+int
+__db_compare_both(db, akey, adata, bkey, bdata)
+ DB *db;
+ const DBT *akey;
+ const DBT *adata;
+ const DBT *bkey;
+ const DBT *bdata;
+{
+ BTREE *t;
+ int cmp;
+
+ t = (BTREE *)db->bt_internal;
+
+	cmp = t->bt_compare(db, akey, bkey);
+	if (cmp != 0)
+		return (cmp);
+	if (!F_ISSET(db, DB_AM_DUPSORT))
+		return (0);
+
+	if (adata == NULL)
+		return (bdata == NULL ? 0 : -1);
+	if (bdata == NULL)
+		return (1);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(db))
+		return (t->compress_dup_compare(db, adata, bdata));
+#endif
+	return (db->dup_compare(db, adata, bdata));
+}
+
+#define DB_SORT_SWAP(a, ad, b, bd) \
+do { \
+ tmp = (a)[0]; (a)[0] = (b)[0]; (b)[0] = tmp; \
+ tmp = (a)[-1]; (a)[-1] = (b)[-1]; (b)[-1] = tmp; \
+ if (data != NULL) { \
+ tmp = (ad)[0]; (ad)[0] = (bd)[0]; (bd)[0] = tmp; \
+ tmp = (ad)[-1]; (ad)[-1] = (bd)[-1]; (bd)[-1] = tmp; \
+ } \
+} while (0)
+
+#define DB_SORT_LOAD_DBT(a, ad, aptr, adptr) \
+do { \
+ (a).data = (u_int8_t*)key->data + (aptr)[0]; \
+ (a).size = (aptr)[-1]; \
+ if (data != NULL) { \
+ (ad).data = (u_int8_t*)data->data + (adptr)[0]; \
+ (ad).size = (adptr)[-1]; \
+ } \
+} while (0)
+
+#define DB_SORT_COMPARE(a, ad, b, bd) (data != NULL ? \
+ __db_compare_both(db, &(a), &(ad), &(b), &(bd)) : \
+ __db_compare_both(db, &(a), 0, &(b), 0))
+
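+/*
+ * The macros above assume the DB_MULTIPLE bulk-buffer layout: an array of
+ * u_int32_t index entries grows down from the end of the buffer, each item
+ * described by an (offset, length) pair -- ptr[0] is the item's byte offset
+ * in the buffer and ptr[-1] its length -- terminated by a (u_int32_t)-1
+ * offset.  DB_MULTIPLE_KEY interleaves key and data entries, which is why
+ * __db_sort_multiple() below walks with a stride of 4 in that case.
+ */
+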
+#define DB_SORT_STACKSIZE 32
+
+/*
+ * __db_quicksort --
+ * The quicksort implementation for __db_sort_multiple() and
+ * __db_sort_multiple_key().
+ */
+static int
+__db_quicksort(db, key, data, kstart, kend, dstart, dend, size)
+ DB *db;
+ DBT *key, *data;
+ u_int32_t *kstart, *kend, *dstart, *dend;
+ u_int32_t size;
+{
+ int ret, cmp;
+ u_int32_t tmp, len;
+ u_int32_t *kptr, *dptr, *kl, *dl, *kr, *dr;
+ DBT a, ad, b, bd, m, md;
+ ENV *env;
+
+ struct DB_SORT_quicksort_stack {
+ u_int32_t *kstart;
+ u_int32_t *kend;
+ u_int32_t *dstart;
+ u_int32_t *dend;
+ } stackbuf[DB_SORT_STACKSIZE], *stack;
+ u_int32_t soff, slen;
+
+ ret = 0;
+ env = db->env;
+
+ memset(&a, 0, sizeof(DBT));
+ memset(&ad, 0, sizeof(DBT));
+ memset(&b, 0, sizeof(DBT));
+ memset(&bd, 0, sizeof(DBT));
+ memset(&m, 0, sizeof(DBT));
+ memset(&md, 0, sizeof(DBT));
+
+	/*
+	 * NB: the index arrays grow downward, so "end" is a lower address
+	 * than "start".
+	 */
+
+ stack = stackbuf;
+ soff = 0;
+ slen = DB_SORT_STACKSIZE;
+
+ start:
+ if (kend >= kstart) goto pop;
+
+ /* If there's only one value, it's already sorted */
+ len = (u_int32_t)(kstart - kend) / size;
+ if (len == 1) goto pop;
+
+ DB_SORT_LOAD_DBT(a, ad, kstart, dstart);
+ DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+
+ if (len == 2) {
+		/* Special-case the sorting of two-value sequences. */
+ if (DB_SORT_COMPARE(a, ad, b, bd) > 0) {
+ DB_SORT_SWAP(kstart, dstart, kend + size,
+ dend + size);
+ }
+ goto pop;
+ }
+
+ kptr = kstart - (len / 2) * size;
+ dptr = dstart - (len / 2) * size;
+ DB_SORT_LOAD_DBT(m, md, kptr, dptr);
+
+ /* Find the median of three */
+ if (DB_SORT_COMPARE(a, ad, b, bd) < 0) {
+ if (DB_SORT_COMPARE(m, md, a, ad) < 0) {
+ /* m < a < b */
+ if (len == 3) {
+ DB_SORT_SWAP(kstart, dstart, kptr, dptr);
+ goto pop;
+ }
+ DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+ } else if (DB_SORT_COMPARE(m, md, b, bd) < 0) {
+ /* a <= m < b */
+ if (len == 3) {
+ goto pop;
+ }
+ DB_SORT_SWAP(kptr, dptr, kend + size, dend + size);
+ } else {
+ /* a < b <= m */
+ if (len == 3) {
+ DB_SORT_SWAP(kptr, dptr, kend + size,
+ dend + size);
+ goto pop;
+ }
+ /* Do nothing */
+ }
+ } else {
+ if (DB_SORT_COMPARE(a, ad, m, md) < 0) {
+ /* b <= a < m */
+ DB_SORT_SWAP(kstart, dstart, kend + size,
+ dend + size);
+ if (len == 3) {
+ DB_SORT_SWAP(kptr, dptr, kend + size,
+ dend + size);
+ goto pop;
+ }
+ } else if (DB_SORT_COMPARE(b, bd, m, md) < 0) {
+ /* b < m <= a */
+ if (len == 3) {
+ DB_SORT_SWAP(kstart, dstart, kend + size,
+ dend + size);
+ goto pop;
+ }
+ DB_SORT_SWAP(kptr, dptr, kend + size, dend + size);
+ } else {
+ /* m <= b <= a */
+ if (len == 3) {
+ DB_SORT_SWAP(kstart, dstart, kptr, dptr);
+ DB_SORT_SWAP(kptr, dptr, kend + size,
+ dend + size);
+ goto pop;
+ }
+ /* Do nothing */
+ }
+ }
+
+ /* partition */
+ DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+ kl = kstart;
+ dl = dstart;
+ kr = kend + size;
+ dr = dend + size;
+ kptr = kstart;
+ dptr = dstart;
+ while (kptr >= kr) {
+ DB_SORT_LOAD_DBT(a, ad, kptr, dptr);
+ cmp = DB_SORT_COMPARE(a, ad, b, bd);
+ if (cmp < 0) {
+ DB_SORT_SWAP(kl, dl, kptr, dptr);
+ kl -= size;
+ dl -= size;
+ kptr -= size;
+ dptr -= size;
+ } else if (cmp > 0) {
+ DB_SORT_SWAP(kr, dr, kptr, dptr);
+ kr += size;
+ dr += size;
+ } else {
+ kptr -= size;
+ dptr -= size;
+ }
+ }
+
+ if (soff == slen) {
+ /* Grow the stack */
+ slen = slen * 2;
+ if (stack == stackbuf) {
+ ret = __os_malloc(env, slen *
+ sizeof(struct DB_SORT_quicksort_stack), &stack);
+ if (ret != 0) goto error;
+ memcpy(stack, stackbuf, soff *
+ sizeof(struct DB_SORT_quicksort_stack));
+ } else {
+ ret = __os_realloc(env, slen *
+ sizeof(struct DB_SORT_quicksort_stack), &stack);
+ if (ret != 0) goto error;
+ }
+ }
+
+ /* divide and conquer */
+ stack[soff].kstart = kr - size;
+ stack[soff].kend = kend;
+ stack[soff].dstart = dr - size;
+ stack[soff].dend = dend;
+ ++soff;
+
+ kend = kl;
+ dend = dl;
+
+ goto start;
+
+ pop:
+ if (soff != 0) {
+ --soff;
+ kstart = stack[soff].kstart;
+ kend = stack[soff].kend;
+ dstart = stack[soff].dstart;
+ dend = stack[soff].dend;
+ goto start;
+ }
+
+ error:
+ if (stack != stackbuf)
+ __os_free(env, stack);
+
+ return (ret);
+}
+
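+/*
+ * Design note: __db_quicksort eliminates recursion with an explicit stack.
+ * After partitioning, one side is pushed and the other is iterated on
+ * directly (the "goto start"); the stack begins as a fixed-size local
+ * array of DB_SORT_STACKSIZE frames and is moved to heap memory only if
+ * the input is large enough to overflow it.
+ */
+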
+#undef DB_SORT_SWAP
+#undef DB_SORT_LOAD_DBT
+
+/*
+ * __db_sort_multiple --
+ * If flags == DB_MULTIPLE_KEY, sorts a DB_MULTIPLE_KEY format DBT using
+ * the BTree comparison function and duplicate comparison function.
+ *
+ * If flags == DB_MULTIPLE, sorts one or two DB_MULTIPLE format DBTs using
+ * the BTree comparison function and duplicate comparison function. Will
+ * assume key and data specifies pairs of key/data to sort together. If
+ * data is NULL, will just sort key according to the btree comparison
+ * function.
+ *
+ * Uses an in-place quicksort algorithm, with median of three for the pivot
+ * point.
+ *
+ * PUBLIC: int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_sort_multiple(db, key, data, flags)
+ DB *db;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ u_int32_t *kstart, *kend, *dstart, *dend;
+
+ /* TODO: sanity checks on the DBTs */
+ /* DB_ILLEGAL_METHOD(db, DB_OK_BTREE); */
+
+ kstart = (u_int32_t*)((u_int8_t *)key->data + key->ulen) - 1;
+
+ switch (flags) {
+ case DB_MULTIPLE:
+ if (data != NULL)
+ dstart = (u_int32_t*)((u_int8_t *)data->data +
+ data->ulen) - 1;
+ else
+ dstart = kstart;
+
+ /* Find the end */
+ for (kend = kstart, dend = dstart;
+ *kend != (u_int32_t)-1 && *dend != (u_int32_t)-1;
+ kend -= 2, dend -= 2)
+ ;
+
+ return (__db_quicksort(db, key, data, kstart, kend, dstart,
+ dend, 2));
+ case DB_MULTIPLE_KEY:
+ /* Find the end */
+ for (kend = kstart; *kend != (u_int32_t)-1; kend -= 4)
+ ;
+
+ return (__db_quicksort(db, key, key, kstart, kend, kstart - 2,
+ kend - 2, 4));
+ default:
+ return (__db_ferr(db->env, "DB->sort_multiple", 0));
+ }
+}
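+
+/*
+ * Illustrative use, as a hypothetical sketch: an application might fill a
+ * bulk buffer with the DB_MULTIPLE_KEY write macros and sort it before a
+ * bulk put, e.g.
+ *
+ *	DBT key;
+ *	void *p;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = buf;
+ *	key.ulen = sizeof(buf);
+ *	key.flags = DB_DBT_USERMEM;
+ *	DB_MULTIPLE_WRITE_INIT(p, &key);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &key, k, klen, d, dlen);
+ *	...
+ *	ret = __db_sort_multiple(dbp, &key, NULL, DB_MULTIPLE_KEY);
+ *
+ * (buf, k, klen, d and dlen are placeholders.)
+ */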
diff --git a/src/db/db_stati.c b/src/db/db_stati.c
new file mode 100644
index 00000000..61744e81
--- /dev/null
+++ b/src/db/db_stati.c
@@ -0,0 +1,502 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+static int __db_print_all __P((DB *, u_int32_t));
+static int __db_print_citem __P((DBC *));
+static int __db_print_cursor __P((DB *));
+static int __db_print_stats __P((DB *, DB_THREAD_INFO *, u_int32_t));
+static int __db_stat __P((DB *, DB_THREAD_INFO *, DB_TXN *, void *, u_int32_t));
+static int __db_stat_arg __P((DB *, u_int32_t));
+
+/*
+ * __db_stat_pp --
+ * DB->stat pre/post processing.
+ *
+ * PUBLIC: int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t));
+ */
+int
+__db_stat_pp(dbp, txn, spp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ void *spp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat");
+
+ if ((ret = __db_stat_arg(dbp, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+ IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_stat(dbp, ip, txn, spp, flags);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
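+/*
+ * Illustrative call of the public API this backs, as a sketch:
+ *
+ *	DB_BTREE_STAT *sp;
+ *
+ *	if ((ret = dbp->stat(dbp, NULL, &sp, DB_FAST_STAT)) == 0) {
+ *		printf("%lu keys\n", (u_long)sp->bt_nkeys);
+ *		free(sp);
+ *	}
+ *
+ * The statistics structure is allocated on the caller's behalf and must
+ * be freed by the caller.
+ */
+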
+/*
+ * __db_stat --
+ * DB->stat.
+ *
+ */
+static int
+__db_stat(dbp, ip, txn, spp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ void *spp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn,
+ &dbc, LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, NULL, "DB->stat", NULL, NULL, flags);
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __partition_stat(dbc, spp, flags);
+ else
+#endif
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_stat(dbc, spp, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_stat(dbc, spp, flags);
+ break;
+ case DB_HEAP:
+ ret = __heap_stat(dbc, spp, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_stat(dbc, spp, flags);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = (__db_unknown_type(env, "DB->stat", dbp->type));
+ break;
+ }
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_stat_arg --
+ * Check DB->stat arguments.
+ */
+static int
+__db_stat_arg(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+
+ /* Check for invalid function flags. */
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+ switch (flags) {
+ case 0:
+ case DB_FAST_STAT:
+ break;
+ default:
+ return (__db_ferr(env, "DB->stat", 0));
+ }
+
+ return (0);
+}
+
+/*
+ * __db_stat_print_pp --
+ * DB->stat_print pre/post processing.
+ *
+ * PUBLIC: int __db_stat_print_pp __P((DB *, u_int32_t));
+ */
+int
+__db_stat_print_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat_print");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline.
+ */
+ if ((ret = __db_fchk(env,
+ "DB->stat_print", flags, DB_FAST_STAT | DB_STAT_ALL)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_stat_print(dbp, ip, flags);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_stat_print --
+ * DB->stat_print.
+ *
+ * PUBLIC: int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t));
+ */
+int
+__db_stat_print(dbp, ip, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ u_int32_t flags;
+{
+ time_t now;
+ int ret;
+ char time_buf[CTIME_BUFLEN];
+
+ (void)time(&now);
+ __db_msg(dbp->env, "%.24s\tLocal time", __os_ctime(&now, time_buf));
+
+ if (LF_ISSET(DB_STAT_ALL) && (ret = __db_print_all(dbp, flags)) != 0)
+ return (ret);
+
+ if ((ret = __db_print_stats(dbp, ip, flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __db_print_stats --
+ * Display default DB handle statistics.
+ */
+static int
+__db_print_stats(dbp, ip, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, NULL, "DB->stat_print", NULL, NULL, 0);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_stat_print(dbc, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_stat_print(dbc, flags);
+ break;
+ case DB_HEAP:
+ ret = __heap_stat_print(dbc, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_stat_print(dbc, flags);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = (__db_unknown_type(env, "DB->stat_print", dbp->type));
+ break;
+ }
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_print_all --
+ * Display debugging DB handle statistics.
+ */
+static int
+__db_print_all(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_AM_CHKSUM, "DB_AM_CHKSUM" },
+ { DB_AM_COMPENSATE, "DB_AM_COMPENSATE" },
+ { DB_AM_CREATED, "DB_AM_CREATED" },
+ { DB_AM_CREATED_MSTR, "DB_AM_CREATED_MSTR" },
+ { DB_AM_DBM_ERROR, "DB_AM_DBM_ERROR" },
+ { DB_AM_DELIMITER, "DB_AM_DELIMITER" },
+ { DB_AM_DISCARD, "DB_AM_DISCARD" },
+ { DB_AM_DUP, "DB_AM_DUP" },
+ { DB_AM_DUPSORT, "DB_AM_DUPSORT" },
+ { DB_AM_ENCRYPT, "DB_AM_ENCRYPT" },
+ { DB_AM_FIXEDLEN, "DB_AM_FIXEDLEN" },
+ { DB_AM_INMEM, "DB_AM_INMEM" },
+ { DB_AM_IN_RENAME, "DB_AM_IN_RENAME" },
+ { DB_AM_NOT_DURABLE, "DB_AM_NOT_DURABLE" },
+ { DB_AM_OPEN_CALLED, "DB_AM_OPEN_CALLED" },
+ { DB_AM_PAD, "DB_AM_PAD" },
+ { DB_AM_PGDEF, "DB_AM_PGDEF" },
+ { DB_AM_RDONLY, "DB_AM_RDONLY" },
+ { DB_AM_READ_UNCOMMITTED, "DB_AM_READ_UNCOMMITTED" },
+ { DB_AM_RECNUM, "DB_AM_RECNUM" },
+ { DB_AM_RECOVER, "DB_AM_RECOVER" },
+ { DB_AM_RENUMBER, "DB_AM_RENUMBER" },
+ { DB_AM_REVSPLITOFF, "DB_AM_REVSPLITOFF" },
+ { DB_AM_SECONDARY, "DB_AM_SECONDARY" },
+ { DB_AM_SNAPSHOT, "DB_AM_SNAPSHOT" },
+ { DB_AM_SUBDB, "DB_AM_SUBDB" },
+ { DB_AM_SWAP, "DB_AM_SWAP" },
+ { DB_AM_TXN, "DB_AM_TXN" },
+ { DB_AM_VERIFYING, "DB_AM_VERIFYING" },
+ { 0, NULL }
+ };
+ ENV *env;
+ char time_buf[CTIME_BUFLEN];
+
+ env = dbp->env;
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB handle information:");
+ STAT_ULONG("Page size", dbp->pgsize);
+ STAT_ISSET("Append recno", dbp->db_append_recno);
+ STAT_ISSET("Feedback", dbp->db_feedback);
+ STAT_ISSET("Dup compare", dbp->dup_compare);
+ STAT_ISSET("App private", dbp->app_private);
+ STAT_ISSET("DbEnv", dbp->env);
+ STAT_STRING("Type", __db_dbtype_to_string(dbp->type));
+
+ __mutex_print_debug_single(env, "Thread mutex", dbp->mutex, flags);
+
+ STAT_STRING("File", dbp->fname);
+ STAT_STRING("Database", dbp->dname);
+ STAT_HEX("Open flags", dbp->open_flags);
+
+ __db_print_fileid(env, dbp->fileid, "\tFile ID");
+
+ STAT_ULONG("Cursor adjust ID", dbp->adj_fileid);
+ STAT_ULONG("Meta pgno", dbp->meta_pgno);
+ if (dbp->locker != NULL)
+ STAT_ULONG("Locker ID", dbp->locker->id);
+ if (dbp->cur_locker != NULL)
+ STAT_ULONG("Handle lock", dbp->cur_locker->id);
+ if (dbp->associate_locker != NULL)
+ STAT_ULONG("Associate lock", dbp->associate_locker->id);
+
+ __db_msg(env,
+ "%.24s\tReplication handle timestamp",
+ dbp->timestamp == 0 ? "0" : __os_ctime(&dbp->timestamp, time_buf));
+
+ STAT_ISSET("Secondary callback", dbp->s_callback);
+ STAT_ISSET("Primary handle", dbp->s_primary);
+
+ STAT_ISSET("api internal", dbp->api_internal);
+ STAT_ISSET("Btree/Recno internal", dbp->bt_internal);
+ STAT_ISSET("Hash internal", dbp->h_internal);
+ STAT_ISSET("Queue internal", dbp->q_internal);
+
+ __db_prflags(env, NULL, dbp->flags, fn, NULL, "\tFlags");
+
+ if (dbp->log_filename == NULL)
+ STAT_ISSET("File naming information", dbp->log_filename);
+ else
+ __dbreg_print_fname(env, dbp->log_filename);
+
+ (void)__db_print_cursor(dbp);
+
+ return (0);
+}
+
+/*
+ * __db_print_cursor --
+ * Display the cursor active and free queues.
+ */
+static int
+__db_print_cursor(dbp)
+ DB *dbp;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB handle cursors:");
+
+ ret = 0;
+ MUTEX_LOCK(dbp->env, dbp->mutex);
+ __db_msg(env, "Active queue:");
+ TAILQ_FOREACH(dbc, &dbp->active_queue, links)
+ if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ __db_msg(env, "Join queue:");
+ TAILQ_FOREACH(dbc, &dbp->join_queue, links)
+ if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ __db_msg(env, "Free queue:");
+ TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+ if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+
+ return (ret);
+}
+
+static int
+__db_print_citem(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { DBC_ACTIVE, "DBC_ACTIVE" },
+ { DBC_DONTLOCK, "DBC_DONTLOCK" },
+ { DBC_MULTIPLE, "DBC_MULTIPLE" },
+ { DBC_MULTIPLE_KEY, "DBC_MULTIPLE_KEY" },
+ { DBC_OPD, "DBC_OPD" },
+ { DBC_OWN_LID, "DBC_OWN_LID" },
+ { DBC_READ_COMMITTED, "DBC_READ_COMMITTED" },
+ { DBC_READ_UNCOMMITTED, "DBC_READ_UNCOMMITTED" },
+ { DBC_RECOVER, "DBC_RECOVER" },
+ { DBC_RMW, "DBC_RMW" },
+ { DBC_TRANSIENT, "DBC_TRANSIENT" },
+ { DBC_WAS_READ_COMMITTED,"DBC_WAS_READ_COMMITTED" },
+ { DBC_WRITECURSOR, "DBC_WRITECURSOR" },
+ { DBC_WRITER, "DBC_WRITER" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ DBC_INTERNAL *cp;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = dbc->internal;
+
+ STAT_POINTER("DBC", dbc);
+ STAT_POINTER("Associated dbp", dbc->dbp);
+ STAT_POINTER("Associated txn", dbc->txn);
+ STAT_POINTER("Internal", cp);
+ STAT_HEX("Default locker ID", dbc->lref == NULL ? 0 : dbc->lref->id);
+ STAT_HEX("Locker", dbc->locker == NULL ? 0 : dbc->locker->id);
+ STAT_STRING("Type", __db_dbtype_to_string(dbc->dbtype));
+
+ STAT_POINTER("Off-page duplicate cursor", cp->opd);
+ STAT_POINTER("Referenced page", cp->page);
+ STAT_ULONG("Root", cp->root);
+ STAT_ULONG("Page number", cp->pgno);
+ STAT_ULONG("Page index", cp->indx);
+ STAT_STRING("Lock mode", __db_lockmode_to_string(cp->lock_mode));
+ __db_prflags(env, NULL, dbc->flags, fn, NULL, "\tFlags");
+
+ switch (dbc->dbtype) {
+ case DB_BTREE:
+ case DB_RECNO:
+ __bam_print_cursor(dbc);
+ break;
+ case DB_HASH:
+ __ham_print_cursor(dbc);
+ break;
+ case DB_HEAP:
+ __heap_print_cursor(dbc);
+ break;
+ case DB_UNKNOWN:
+ DB_ASSERT(env, dbp->type != DB_UNKNOWN);
+ /* FALLTHROUGH */
+ case DB_QUEUE:
+ default:
+ break;
+ }
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__db_stat_pp(dbp, txn, spp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbp->env));
+}
+
+int
+__db_stat_print_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbp->env));
+}
+#endif
diff --git a/src/db/db_truncate.c b/src/db/db_truncate.c
new file mode 100644
index 00000000..0eeb0c64
--- /dev/null
+++ b/src/db/db_truncate.c
@@ -0,0 +1,233 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_cursor_check_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __db_cursor_check __P((DB *));
+
+/*
+ * __db_truncate_pp
+ * DB->truncate pre/post processing.
+ *
+ * PUBLIC: int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ */
+int
+__db_truncate_pp(dbp, txn, countp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t *countp, flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ handle_check = txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+
+ /* Check for invalid flags. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0685",
+ "DB->truncate forbidden on secondary indices"));
+ return (EINVAL);
+ }
+ if ((ret = __db_fchk(env, "DB->truncate", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /*
+ * Make sure there are no active cursors on this db. Since we drop
+ * pages we cannot really adjust cursors.
+ */
+ if ((ret = __db_cursor_check(dbp)) != 0) {
+ __db_errx(env, DB_STR("0686",
+ "DB->truncate not permitted with active cursors"));
+ goto err;
+ }
+
+#ifdef CONFIG_TEST
+ if (IS_REP_MASTER(env))
+ DB_TEST_WAIT(env, env->test_check);
+#endif
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Check for changes to a read-only database. This must be after the
+ * replication block so that we cannot race master/client state changes.
+ */
+ if (DB_IS_READONLY(dbp)) {
+ ret = __db_rdonly(env, "DB->truncate");
+ goto err;
+ }
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ ret = __db_truncate(dbp, ip, txn, countp);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
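+/*
+ * Illustrative call of the public API this backs, as a sketch:
+ *
+ *	u_int32_t count;
+ *
+ *	if ((ret = dbp->truncate(dbp, NULL, &count, 0)) == 0)
+ *		printf("discarded %lu records\n", (u_long)count);
+ */
+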
+/*
+ * __db_truncate
+ * DB->truncate.
+ *
+ * PUBLIC: int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t *));
+ */
+int
+__db_truncate(dbp, ip, txn, countp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t *countp;
+{
+ DB *sdbp;
+ DBC *dbc;
+ ENV *env;
+ u_int32_t scount;
+ int ret, t_ret;
+
+ env = dbp->env;
+ dbc = NULL;
+ ret = 0;
+
+ /*
+ * Run through all secondaries and truncate them first. The count
+ * returned is the count of the primary only. QUEUE uses normal
+ * processing to truncate so it will update the secondaries normally.
+ */
+ if (dbp->type != DB_QUEUE && DB_IS_PRIMARY(dbp)) {
+ if ((ret = __db_s_first(dbp, &sdbp)) != 0)
+ return (ret);
+ for (; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, txn))
+ if ((ret = __db_truncate(sdbp, ip, txn, &scount)) != 0)
+ break;
+ if (sdbp != NULL)
+ (void)__db_s_done(sdbp, txn);
+ if (ret != 0)
+ return (ret);
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL);
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "DB->truncate", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __part_truncate(dbc, countp);
+ else
+#endif
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_truncate(dbc, countp);
+ break;
+ case DB_HASH:
+ ret = __ham_truncate(dbc, countp);
+ break;
+ case DB_HEAP:
+ ret = __heap_truncate(dbc, countp);
+ break;
+ case DB_QUEUE:
+ ret = __qam_truncate(dbc, countp);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->truncate", dbp->type);
+ break;
+ }
+
+ /* Discard the cursor. */
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL);
+
+DB_TEST_RECOVERY_LABEL
+
+ return (ret);
+}
+
+static int
+__db_cursor_check_func(dbc, my_dbc, foundp, pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(args, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(indx, 0);
+ if (IS_INITIALIZED(dbc)) {
+ *foundp = 1;
+ return (EEXIST);
+ }
+ return (0);
+}
+/*
+ * __db_cursor_check --
+ * See if there are any active cursors on this db.
+ */
+static int
+__db_cursor_check(dbp)
+ DB *dbp;
+{
+ int ret;
+ u_int32_t found;
+
+ ret = __db_walk_cursors(dbp, NULL,
+ __db_cursor_check_func, &found, 0, 0, NULL);
+ return (ret == EEXIST ? EINVAL : ret);
+}
diff --git a/src/db/db_upg.c b/src/db/db_upg.c
new file mode 100644
index 00000000..de5d0dc7
--- /dev/null
+++ b/src/db/db_upg.c
@@ -0,0 +1,527 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_upgrade_pp --
+ * DB->upgrade pre/post processing.
+ *
+ * PUBLIC: int __db_upgrade_pp __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade_pp(dbp, fname, flags)
+ DB *dbp;
+ const char *fname;
+ u_int32_t flags;
+{
+#ifdef HAVE_UPGRADE_SUPPORT
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline.
+ */
+ if ((ret = __db_fchk(env, "DB->upgrade", flags, DB_DUPSORT)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __db_upgrade(dbp, fname, flags);
+ ENV_LEAVE(env, ip);
+ return (ret);
+#else
+	/* dbp is dereferenced below, so it must not be COMPQUIET'd away. */
+	COMPQUIET(fname, NULL);
+	COMPQUIET(flags, 0);
+
+	__db_errx(dbp->env, DB_STR("0665", "upgrade not supported"));
+ return (EINVAL);
+#endif
+}
+
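+/*
+ * Illustrative call of the public API this backs, as a sketch:
+ *
+ *	if ((ret = dbp->upgrade(dbp, "file.db", 0)) != 0)
+ *		dbp->err(dbp, ret, "DB->upgrade: file.db");
+ *
+ * ("file.db" is a hypothetical file name; upgrades are done in place, so
+ * the file should be backed up first.)
+ */
+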
+#ifdef HAVE_UPGRADE_SUPPORT
+static int (* const func_31_list[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+ NULL, /* P_INVALID */
+ NULL, /* __P_DUPLICATE */
+ __ham_31_hash, /* P_HASH_UNSORTED */
+ NULL, /* P_IBTREE */
+ NULL, /* P_IRECNO */
+ __bam_31_lbtree, /* P_LBTREE */
+ NULL, /* P_LRECNO */
+ NULL, /* P_OVERFLOW */
+ __ham_31_hashmeta, /* P_HASHMETA */
+ __bam_31_btreemeta, /* P_BTREEMETA */
+ NULL, /* P_QAMMETA */
+ NULL, /* P_QAMDATA */
+ NULL, /* P_LDUP */
+ NULL, /* P_HASH */
+ NULL, /* P_HEAPMETA */
+ NULL, /* P_HEAP */
+ NULL, /* P_IHEAP */
+};
+
+static int (* const func_46_list[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+ NULL, /* P_INVALID */
+ NULL, /* __P_DUPLICATE */
+ __ham_46_hash, /* P_HASH_UNSORTED */
+ NULL, /* P_IBTREE */
+ NULL, /* P_IRECNO */
+ NULL, /* P_LBTREE */
+ NULL, /* P_LRECNO */
+ NULL, /* P_OVERFLOW */
+ __ham_46_hashmeta, /* P_HASHMETA */
+ NULL, /* P_BTREEMETA */
+ NULL, /* P_QAMMETA */
+ NULL, /* P_QAMDATA */
+ NULL, /* P_LDUP */
+ NULL, /* P_HASH */
+ NULL, /* P_HEAPMETA */
+ NULL, /* P_HEAP */
+ NULL, /* P_IHEAP */
+};
+
+static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const [])
+ (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *));
+static int __db_set_lastpgno __P((DB *, char *, DB_FH *));
+
+/*
+ * __db_upgrade --
+ * Upgrade an existing database.
+ *
+ * PUBLIC: int __db_upgrade __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade(dbp, fname, flags)
+ DB *dbp;
+ const char *fname;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ DB_FH *fhp;
+ ENV *env;
+ size_t n;
+ int ret, t_ret, use_mp_open;
+ u_int8_t mbuf[256], tmpflags;
+ char *real_name;
+
+ use_mp_open = 0;
+ env = dbp->env;
+ fhp = NULL;
+
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, fname, NULL, &real_name)) != 0)
+ return (ret);
+
+ /* Open the file. */
+ if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+ __db_err(env, ret, "%s", real_name);
+ return (ret);
+ }
+
+ /* Initialize the feedback. */
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_UPGRADE, 0);
+
+ /*
+	 * Read the metadata page.  We read 256 bytes, which is larger than
+	 * any access method's metadata structure and smaller than any disk
+	 * sector.
+ */
+ if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+ goto err;
+
+ switch (((DBMETA *)mbuf)->magic) {
+ case DB_BTREEMAGIC:
+ switch (((DBMETA *)mbuf)->version) {
+ case 6:
+ /*
+ * Before V7 not all pages had page types, so we do the
+ * single meta-data page by hand.
+ */
+ if ((ret =
+ __bam_30_btreemeta(dbp, real_name, mbuf)) != 0)
+ goto err;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 7:
+ /*
+ * We need the page size to do more. Rip it out of
+ * the meta-data page.
+ */
+ memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+ if ((ret = __db_page_pass(
+ dbp, real_name, flags, func_31_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 8:
+ if ((ret =
+ __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0666",
+ "%s: unsupported btree version: %lu", "%s %lu"),
+ real_name, (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
+ break;
+ case DB_HASHMAGIC:
+ switch (((DBMETA *)mbuf)->version) {
+ case 4:
+ case 5:
+ /*
+ * Before V6 not all pages had page types, so we do the
+ * single meta-data page by hand.
+ */
+ if ((ret =
+ __ham_30_hashmeta(dbp, real_name, mbuf)) != 0)
+ goto err;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+ goto err;
+
+ /*
+ * Before V6, we created hash pages one by one as they
+ * were needed, using hashhdr.ovfl_point to reserve
+ * a block of page numbers for them. A consequence
+ * of this was that, if no overflow pages had been
+ * created, the current doubling might extend past
+ * the end of the database file.
+ *
+ * In DB 3.X, we now create all the hash pages
+ * belonging to a doubling atomically; it's not
+ * safe to just save them for later, because when
+ * we create an overflow page we'll just create
+ * a new last page (whatever that may be). Grow
+ * the database to the end of the current doubling.
+ */
+ if ((ret =
+ __ham_30_sizefix(dbp, fhp, real_name, mbuf)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 6:
+ /*
+ * We need the page size to do more. Rip it out of
+ * the meta-data page.
+ */
+ memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+ if ((ret = __db_page_pass(
+ dbp, real_name, flags, func_31_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 7:
+ if ((ret =
+ __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 8:
+ /*
+ * Any upgrade that has proceeded this far has metadata
+ * pages compatible with hash version 8 metadata pages,
+ * so casting mbuf to a dbmeta is safe.
+ * If a newer revision moves the pagesize, checksum or
+ * encrypt_alg flags in the metadata, then the
+ * extraction of the fields will need to use hard coded
+ * offsets.
+ */
+ meta = (DBMETA*)mbuf;
+ /*
+ * We need the page size to do more. Extract it from
+ * the meta-data page.
+ */
+ memcpy(&dbp->pgsize, &meta->pagesize,
+ sizeof(u_int32_t));
+ /*
+			 * Rip out the checksum (metaflags) and encrypt_alg
+			 * fields from the metadata page, so the upgrade can
+			 * know how big the per-page metadata pre-amble is.
+			 * Any upgrade that has proceeded this far has
+			 * metadata pages compatible with hash version 8
+			 * metadata pages, so extracting the fields is safe.
+ */
+ memcpy(&tmpflags, &meta->metaflags, sizeof(u_int8_t));
+ if (FLD_ISSET(tmpflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ memcpy(&tmpflags, &meta->encrypt_alg, sizeof(u_int8_t));
+ if (tmpflags != 0) {
+ if (!CRYPTO_ON(dbp->env)) {
+ __db_errx(env, DB_STR("0667",
+"Attempt to upgrade an encrypted database without providing a password."));
+ ret = EINVAL;
+ goto err;
+ }
+ F_SET(dbp, DB_AM_ENCRYPT);
+ }
+
+ /*
+ * This is ugly. It is necessary to have a usable
+ * mpool in the dbp to upgrade from an unsorted
+ * to a sorted hash database. The mpool file is used
+ * to resolve offpage key items, which are needed to
+ * determine sort order. Having mpool open and access
+ * the file does not affect the page pass, since the
+ * page pass only updates DB_HASH_UNSORTED pages
+ * in-place, and the mpool file is only used to read
+ * OFFPAGE items.
+ */
+ use_mp_open = 1;
+ if ((ret = __os_closehandle(env, fhp)) != 0)
+ return (ret);
+ dbp->type = DB_HASH;
+ if ((ret = __env_mpool(dbp, fname,
+ DB_AM_NOT_DURABLE | DB_AM_VERIFYING)) != 0)
+ return (ret);
+ fhp = dbp->mpf->fhp;
+
+ /* Do the actual conversion pass. */
+ if ((ret = __db_page_pass(
+ dbp, real_name, flags, func_46_list, fhp)) != 0)
+ goto err;
+
+ /* FALLTHROUGH */
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0668",
+ "%s: unsupported hash version: %lu", "%s %lu"),
+ real_name, (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
+ break;
+ case DB_HEAPMAGIC:
+ /*
+ * There's no upgrade needed for Heap yet.
+ */
+ break;
+ case DB_QAMMAGIC:
+ switch (((DBMETA *)mbuf)->version) {
+ case 1:
+ /*
+ * If we're in a Queue database, the only page that
+ * needs upgrading is the meta-database page, don't
+ * bother with a full pass.
+ */
+ if ((ret = __qam_31_qammeta(dbp, real_name, mbuf)) != 0)
+ return (ret);
+ /* FALLTHROUGH */
+ case 2:
+ if ((ret = __qam_32_qammeta(dbp, real_name, mbuf)) != 0)
+ return (ret);
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 3:
+ case 4:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0669",
+ "%s: unsupported queue version: %lu",
+ "%s %lu"), real_name,
+ (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
+ break;
+ default:
+ M_32_SWAP(((DBMETA *)mbuf)->magic);
+ switch (((DBMETA *)mbuf)->magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ __db_errx(env, DB_STR_A("0670",
+ "%s: DB->upgrade only supported on native byte-order systems",
+ "%s"), real_name);
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0671",
+ "%s: unrecognized file type", "%s"), real_name);
+ break;
+ }
+ ret = EINVAL;
+ goto err;
+ }
+
+ ret = __os_fsync(env, fhp);
+
+ /*
+ * If mp_open was used, then rely on the database close to clean up
+ * any file handles.
+ */
+err: if (use_mp_open == 0 && fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, real_name);
+
+ /* We're done. */
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_UPGRADE, 100);
+
+ return (ret);
+}
+
+/*
+ * __db_page_pass --
+ * Walk the pages of the database, upgrading whatever needs it.
+ */
+static int
+__db_page_pass(dbp, real_name, flags, fl, fhp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ int (* const fl[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ DB_FH *fhp;
+{
+ ENV *env;
+ PAGE *page;
+ db_pgno_t i, pgno_last;
+ size_t n;
+ int dirty, ret;
+
+ env = dbp->env;
+
+ /* Determine the last page of the file. */
+ if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+ return (ret);
+
+ /* Allocate memory for a single page. */
+ if ((ret = __os_malloc(env, dbp->pgsize, &page)) != 0)
+ return (ret);
+
+ /* Walk the file, calling the underlying conversion functions. */
+ for (i = 0; i < pgno_last; ++i) {
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(
+ dbp, DB_UPGRADE, (int)((i * 100)/pgno_last));
+ if ((ret = __os_seek(env, fhp, i, dbp->pgsize, 0)) != 0)
+ break;
+ if ((ret = __os_read(env, fhp, page, dbp->pgsize, &n)) != 0)
+ break;
+ dirty = 0;
+ /* Always decrypt the page. */
+ if ((ret = __db_decrypt_pg(env, dbp, page)) != 0)
+ break;
+ if (fl[TYPE(page)] != NULL && (ret = fl[TYPE(page)]
+ (dbp, real_name, flags, fhp, page, &dirty)) != 0)
+ break;
+ if (dirty) {
+ if ((ret = __db_encrypt_and_checksum_pg(
+ env, dbp, page)) != 0)
+ break;
+ if ((ret =
+ __os_seek(env, fhp, i, dbp->pgsize, 0)) != 0)
+ break;
+ if ((ret = __os_write(env,
+ fhp, page, dbp->pgsize, &n)) != 0)
+ break;
+ }
+ }
+
+ __os_free(dbp->env, page);
+ return (ret);
+}
+
+/*
+ * __db_lastpgno --
+ * Return the current last page number of the file.
+ *
+ * PUBLIC: int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *));
+ */
+int
+__db_lastpgno(dbp, real_name, fhp, pgno_lastp)
+ DB *dbp;
+ char *real_name;
+ DB_FH *fhp;
+ db_pgno_t *pgno_lastp;
+{
+ ENV *env;
+ db_pgno_t pgno_last;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ env = dbp->env;
+
+ if ((ret = __os_ioinfo(env,
+ real_name, fhp, &mbytes, &bytes, NULL)) != 0) {
+ __db_err(env, ret, "%s", real_name);
+ return (ret);
+ }
+
+	/* The file size must be a multiple of the page size. */
+ if (bytes % dbp->pgsize != 0) {
+ __db_errx(env, DB_STR_A("0672",
+ "%s: file size not a multiple of the pagesize", "%s"),
+ real_name);
+ return (EINVAL);
+ }
+ pgno_last = mbytes * (MEGABYTE / dbp->pgsize);
+ pgno_last += bytes / dbp->pgsize;
+
+ *pgno_lastp = pgno_last;
+ return (0);
+}
+
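+/*
+ * Worked example of the arithmetic above: __os_ioinfo reports the file
+ * size as whole megabytes plus leftover bytes.  With a 4096-byte page
+ * size, a file of 3 megabytes plus 8192 bytes gives
+ *
+ *	pgno_last = 3 * (MEGABYTE / 4096) + 8192 / 4096
+ *		  = 3 * 256 + 2 = 770
+ *
+ * pages, numbered 0 through 769.
+ */
+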
+/*
+ * __db_set_lastpgno --
+ * Update the meta->last_pgno field.
+ *
+ * Code assumes that we do not have checksums/crypto on the page.
+ */
+static int
+__db_set_lastpgno(dbp, real_name, fhp)
+ DB *dbp;
+ char *real_name;
+ DB_FH *fhp;
+{
+ DBMETA meta;
+ ENV *env;
+ int ret;
+ size_t n;
+
+ env = dbp->env;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ return (ret);
+ if ((ret = __os_read(env, fhp, &meta, sizeof(meta), &n)) != 0)
+ return (ret);
+ dbp->pgsize = meta.pagesize;
+ if ((ret = __db_lastpgno(dbp, real_name, fhp, &meta.last_pgno)) != 0)
+ return (ret);
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ return (ret);
+ if ((ret = __os_write(env, fhp, &meta, sizeof(meta), &n)) != 0)
+ return (ret);
+
+ return (0);
+}
+#endif /* HAVE_UPGRADE_SUPPORT */
diff --git a/src/db/db_upg_opd.c b/src/db/db_upg_opd.c
new file mode 100644
index 00000000..992115ad
--- /dev/null
+++ b/src/db/db_upg_opd.c
@@ -0,0 +1,343 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t));
+
+#define GET_PAGE(dbp, fhp, pgno, page) { \
+ if ((ret = __os_seek( \
+ dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0) \
+ goto err; \
+ if ((ret = __os_read(dbp->env, \
+ fhp, page, (dbp)->pgsize, &n)) != 0) \
+ goto err; \
+}
+#define PUT_PAGE(dbp, fhp, pgno, page) { \
+ if ((ret = __os_seek( \
+ dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0) \
+ goto err; \
+ if ((ret = __os_write(dbp->env, \
+ fhp, page, (dbp)->pgsize, &n)) != 0) \
+ goto err; \
+}
+
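+/*
+ * NB: GET_PAGE and PUT_PAGE assume the caller declares "int ret" and
+ * "size_t n" and provides an "err" label; they jump to it on any seek,
+ * read or write failure.
+ */
+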
+/*
+ * __db_31_offdup --
+ * Convert 3.0 off-page duplicates to 3.1 off-page duplicates.
+ *
+ * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
+ */
+int
+__db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
+ DB *dbp;
+ char *real_name;
+ DB_FH *fhp;
+ int sorted;
+ db_pgno_t *pgnop;
+{
+ PAGE *ipage, *page;
+ db_indx_t indx;
+ db_pgno_t cur_cnt, i, next_cnt, pgno, *pgno_cur, pgno_last;
+ db_pgno_t *pgno_next, pgno_max, *tmp;
+ db_recno_t nrecs;
+ size_t n;
+ int level, nomem, ret;
+
+ ipage = page = NULL;
+ pgno_cur = pgno_next = NULL;
+
+ /* Allocate room to hold a page. */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0)
+ goto err;
+
+ /*
+ * Walk the chain of 3.0 off-page duplicates. Each one is converted
+ * in place to a 3.1 off-page duplicate page. If the duplicates are
+ * sorted, they are converted to a Btree leaf page, otherwise to a
+ * Recno leaf page.
+ */
+ for (nrecs = 0, cur_cnt = pgno_max = 0,
+ pgno = *pgnop; pgno != PGNO_INVALID;) {
+ if (pgno_max == cur_cnt) {
+ pgno_max += 20;
+ if ((ret = __os_realloc(dbp->env, pgno_max *
+ sizeof(db_pgno_t), &pgno_cur)) != 0)
+ goto err;
+ }
+ pgno_cur[cur_cnt++] = pgno;
+
+ GET_PAGE(dbp, fhp, pgno, page);
+ nrecs += NUM_ENT(page);
+ LEVEL(page) = LEAFLEVEL;
+ TYPE(page) = sorted ? P_LDUP : P_LRECNO;
+ /*
+ * !!!
+ * DB didn't zero the LSNs on off-page duplicates pages.
+ */
+ ZERO_LSN(LSN(page));
+ PUT_PAGE(dbp, fhp, pgno, page);
+
+ pgno = NEXT_PGNO(page);
+ }
+
+ /* If we only have a single page, it's easy. */
+ if (cur_cnt <= 1)
+ goto done;
+
+ /*
+ * pgno_cur is the list of pages we just converted. We're
+ * going to walk that list, but we'll need to create a new
+ * list while we do so.
+ */
+ if ((ret = __os_malloc(dbp->env,
+ cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0)
+ goto err;
+
+ /* Figure out where we can start allocating new pages. */
+ if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+ goto err;
+
+ /* Allocate room for an internal page. */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &ipage)) != 0)
+ goto err;
+ PGNO(ipage) = PGNO_INVALID;
+
+ /*
+ * Repeatedly walk the list of pages, building internal pages, until
+ * there's only one page at a level.
+ */
+ for (level = LEAFLEVEL + 1; cur_cnt > 1; ++level) {
+ for (indx = 0, i = next_cnt = 0; i < cur_cnt;) {
+ if (indx == 0) {
+ P_INIT(ipage, dbp->pgsize, pgno_last,
+ PGNO_INVALID, PGNO_INVALID,
+ level, sorted ? P_IBTREE : P_IRECNO);
+ ZERO_LSN(LSN(ipage));
+
+ pgno_next[next_cnt++] = pgno_last++;
+ }
+
+ GET_PAGE(dbp, fhp, pgno_cur[i], page);
+
+ /*
+ * If the duplicates are sorted, put the first item on
+ * the lower-level page onto a Btree internal page. If
+ * the duplicates are not sorted, create an internal
+ * Recno structure on the page. If either case doesn't
+ * fit, push out the current page and start a new one.
+ */
+ nomem = 0;
+ if (sorted) {
+ if ((ret = __db_build_bi(
+ dbp, fhp, ipage, page, indx, &nomem)) != 0)
+ goto err;
+ } else
+ if ((ret = __db_build_ri(
+ dbp, fhp, ipage, page, indx, &nomem)) != 0)
+ goto err;
+ if (nomem) {
+ indx = 0;
+ PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+ } else {
+ ++indx;
+ ++NUM_ENT(ipage);
+ ++i;
+ }
+ }
+
+ /*
+ * Push out the last internal page. Set the top-level record
+ * count if we've reached the top.
+ */
+ if (next_cnt == 1)
+ RE_NREC_SET(ipage, nrecs);
+ PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+
+ /* Swap the current and next page number arrays. */
+ cur_cnt = next_cnt;
+ tmp = pgno_cur;
+ pgno_cur = pgno_next;
+ pgno_next = tmp;
+ }
+
+done: *pgnop = pgno_cur[0];
+
+err: if (pgno_cur != NULL)
+ __os_free(dbp->env, pgno_cur);
+ if (pgno_next != NULL)
+ __os_free(dbp->env, pgno_next);
+ if (ipage != NULL)
+ __os_free(dbp->env, ipage);
+ if (page != NULL)
+ __os_free(dbp->env, page);
+
+ return (ret);
+}
+
+/*
+ * __db_build_bi --
+ * Build a BINTERNAL entry for a parent page.
+ */
+static int
+__db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
+ DB *dbp;
+ DB_FH *fhp;
+ PAGE *ipage, *page;
+ u_int32_t indx;
+ int *nomemp;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk;
+ u_int8_t *p;
+ int ret;
+ db_indx_t *inp;
+
+ inp = P_INP(dbp, ipage);
+ switch (TYPE(page)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(dbp, page, 0);
+ if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) {
+ *nomemp = 1;
+ return (0);
+ }
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len);
+ p = P_ENTRY(dbp, ipage, indx);
+
+ bi.len = child_bi->len;
+ B_TSET(bi.type, child_bi->type);
+ bi.pgno = PGNO(page);
+ bi.nrecs = __bam_total(dbp, page);
+ memcpy(p, &bi, SSZA(BINTERNAL, data));
+ p += SSZA(BINTERNAL, data);
+ memcpy(p, child_bi->data, child_bi->len);
+
+ /* Increment the overflow ref count. */
+ if (B_TYPE(child_bi->type) == B_OVERFLOW)
+ if ((ret = __db_up_ovref(dbp, fhp,
+ ((BOVERFLOW *)(child_bi->data))->pgno)) != 0)
+ return (ret);
+ break;
+ case P_LDUP:
+ child_bk = GET_BKEYDATA(dbp, page, 0);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ if (P_FREESPACE(dbp, ipage) <
+ BINTERNAL_PSIZE(child_bk->len)) {
+ *nomemp = 1;
+ return (0);
+ }
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len);
+ p = P_ENTRY(dbp, ipage, indx);
+
+ bi.len = child_bk->len;
+ B_TSET(bi.type, child_bk->type);
+ bi.pgno = PGNO(page);
+ bi.nrecs = __bam_total(dbp, page);
+ memcpy(p, &bi, SSZA(BINTERNAL, data));
+ p += SSZA(BINTERNAL, data);
+ memcpy(p, child_bk->data, child_bk->len);
+ break;
+ case B_OVERFLOW:
+ if (P_FREESPACE(dbp, ipage) <
+ BINTERNAL_PSIZE(BOVERFLOW_SIZE)) {
+ *nomemp = 1;
+ return (0);
+ }
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ p = P_ENTRY(dbp, ipage, indx);
+
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, child_bk->type);
+ bi.pgno = PGNO(page);
+ bi.nrecs = __bam_total(dbp, page);
+ memcpy(p, &bi, SSZA(BINTERNAL, data));
+ p += SSZA(BINTERNAL, data);
+ memcpy(p, child_bk, BOVERFLOW_SIZE);
+
+ /* Increment the overflow ref count. */
+ if ((ret = __db_up_ovref(dbp, fhp,
+ ((BOVERFLOW *)child_bk)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(page)));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(page)));
+ }
+
+ return (0);
+}
+
+/*
+ * __db_build_ri --
+ * Build a RINTERNAL entry for an internal parent page.
+ */
+static int
+__db_build_ri(dbp, fhp, ipage, page, indx, nomemp)
+ DB *dbp;
+ DB_FH *fhp;
+ PAGE *ipage, *page;
+ u_int32_t indx;
+ int *nomemp;
+{
+ RINTERNAL ri;
+ db_indx_t *inp;
+
+ COMPQUIET(fhp, NULL);
+ inp = P_INP(dbp, ipage);
+ if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) {
+ *nomemp = 1;
+ return (0);
+ }
+
+ ri.pgno = PGNO(page);
+ ri.nrecs = __bam_total(dbp, page);
+ inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE;
+ memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE);
+
+ return (0);
+}
+
+/*
+ * __db_up_ovref --
+ *	Increment the reference count on an overflow page.
+ */
+static int
+__db_up_ovref(dbp, fhp, pgno)
+ DB *dbp;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+{
+ PAGE *page;
+ size_t n;
+ int ret;
+
+ /* Allocate room to hold a page. */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0)
+ return (ret);
+
+ GET_PAGE(dbp, fhp, pgno, page);
+ ++OV_REF(page);
+ PUT_PAGE(dbp, fhp, pgno, page);
+
+err: __os_free(dbp->env, page);
+
+ return (ret);
+}
diff --git a/src/db/db_vrfy.c b/src/db/db_vrfy.c
new file mode 100644
index 00000000..9cb94ad2
--- /dev/null
+++ b/src/db/db_vrfy.c
@@ -0,0 +1,3055 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * This is the code for DB->verify, the DB database consistency checker.
+ * For now, it checks all subdatabases in a database, and verifies
+ * everything it knows how to (i.e. it's all-or-nothing, and one can't
+ * check only for a subset of possible problems).
+ */
+
+static u_int __db_guesspgsize __P((ENV *, DB_FH *));
+static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *));
+static int __db_meta2pgset
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *));
+static int __db_salvage __P((DB *, VRFY_DBINFO *,
+ db_pgno_t, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *,
+ PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_all __P((DB *, VRFY_DBINFO *, void *,
+ int(*)(void *, const void *), u_int32_t, int *));
+static int __db_salvage_unknowns __P((DB *, VRFY_DBINFO *, void *,
+ int (*)(void *, const void *), u_int32_t));
+static int __db_verify_arg __P((DB *, const char *, void *, u_int32_t));
+static int __db_vrfy_freelist
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+static int __db_vrfy_getpagezero
+ __P((DB *, DB_FH *, const char *, u_int8_t *, u_int32_t));
+static int __db_vrfy_invalid
+ __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+static int __db_vrfy_orderchkonly __P((DB *,
+ VRFY_DBINFO *, const char *, const char *, u_int32_t));
+static int __db_vrfy_pagezero __P((DB *,
+ VRFY_DBINFO *, DB_FH *, const char *, u_int32_t));
+static int __db_vrfy_subdbs
+ __P((DB *, VRFY_DBINFO *, const char *, u_int32_t));
+static int __db_vrfy_structure __P((DB *, VRFY_DBINFO *,
+ const char *, db_pgno_t, void *, void *, u_int32_t));
+static int __db_vrfy_walkpages __P((DB *, VRFY_DBINFO *,
+ void *, int (*)(void *, const void *), u_int32_t));
+
+#define VERIFY_FLAGS \
+ (DB_AGGRESSIVE | \
+ DB_NOORDERCHK | DB_ORDERCHKONLY | DB_PRINTABLE | DB_SALVAGE | DB_UNREF)
+
+/*
+ * __db_verify_pp --
+ * DB->verify public interface.
+ *
+ * PUBLIC: int __db_verify_pp
+ * PUBLIC: __P((DB *, const char *, const char *, FILE *, u_int32_t));
+ */
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+ DB *dbp;
+ const char *file, *database;
+ FILE *outfile;
+ u_int32_t flags;
+{
+ /*
+ * __db_verify_pp is a wrapper to __db_verify_internal, which lets
+ * us pass appropriate equivalents to FILE * in from the non-C APIs.
+ * That's why the usual ENV_ENTER macros are in __db_verify_internal,
+ * not here.
+ */
+ return (__db_verify_internal(dbp,
+ file, database, outfile, __db_pr_callback, flags));
+}
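+
+/*
+ * A minimal usage sketch of the public interface, assuming <db.h> and
+ * <stdio.h> and a hypothetical database file name. Because DB->verify is
+ * a handle destructor (see __db_verify_internal below), the handle must
+ * not be reused afterward, even on error.
+ *
+ *	int
+ *	verify_example(path)
+ *		const char *path;
+ *	{
+ *		DB *dbp;
+ *		int ret;
+ *
+ *		if ((ret = db_create(&dbp, NULL, 0)) != 0)
+ *			return (ret);
+ *		return (dbp->verify(dbp, path, NULL, stdout, 0));
+ *	}
+ */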
+
+/*
+ * __db_verify_internal --
+ *
+ * PUBLIC: int __db_verify_internal __P((DB *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_verify_internal(dbp, fname, dname, handle, callback, flags)
+ DB *dbp;
+ const char *fname, *dname;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->verify");
+
+ if (!LF_ISSET(DB_SALVAGE))
+ LF_SET(DB_UNREF);
+
+ ENV_ENTER(env, ip);
+
+ if ((ret = __db_verify_arg(dbp, dname, handle, flags)) == 0)
+ ret = __db_verify(dbp, ip,
+ fname, dname, handle, callback, NULL, NULL, flags);
+
+ /* Db.verify is a DB handle destructor. */
+ if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_verify_arg --
+ * Check DB->verify arguments.
+ */
+static int
+__db_verify_arg(dbp, dname, handle, flags)
+ DB *dbp;
+ const char *dname;
+ void *handle;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if ((ret = __db_fchk(env, "DB->verify", flags, VERIFY_FLAGS)) != 0)
+ return (ret);
+
+ /*
+ * DB_SALVAGE is mutually exclusive with the other flags except
+ * DB_AGGRESSIVE, DB_PRINTABLE.
+ *
+ * DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging.
+ *
+ * DB_SALVAGE requires an output stream.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if (LF_ISSET(~(DB_AGGRESSIVE | DB_PRINTABLE | DB_SALVAGE)))
+ return (__db_ferr(env, "DB->verify", 1));
+ if (handle == NULL) {
+ __db_errx(env, DB_STR("0518",
+ "DB_SALVAGE requires a an output handle"));
+ return (EINVAL);
+ }
+ } else
+ if (LF_ISSET(DB_AGGRESSIVE | DB_PRINTABLE))
+ return (__db_ferr(env, "DB->verify", 1));
+
+ /*
+ * DB_ORDERCHKONLY is mutually exclusive with DB_SALVAGE and
+ * DB_NOORDERCHK, and requires a database name.
+ */
+ if ((ret = __db_fcchk(env, "DB->verify", flags,
+ DB_ORDERCHKONLY, DB_SALVAGE | DB_NOORDERCHK)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_ORDERCHKONLY) && dname == NULL) {
+ __db_errx(env, DB_STR("0519",
+ "DB_ORDERCHKONLY requires a database name"));
+ return (EINVAL);
+ }
+ return (0);
+}
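+
+/*
+ * The rules above, by example (not an exhaustive list):
+ *
+ *	DB_SALVAGE | DB_AGGRESSIVE		legal, if handle != NULL
+ *	DB_SALVAGE | DB_NOORDERCHK		rejected by __db_ferr
+ *	DB_PRINTABLE without DB_SALVAGE		rejected by __db_ferr
+ *	DB_ORDERCHKONLY with dname == NULL	rejected with EINVAL
+ */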
+
+/*
+ * __db_verify --
+ * Walk the entire file page-by-page, either verifying with or without
+ * dumping in db_dump -d format, or DB_SALVAGE-ing whatever key/data
+ * pairs can be found and dumping them in standard (db_load-ready)
+ * dump format.
+ *
+ * (Salvaging isn't really a verification operation, but we put it
+ * here anyway because it requires essentially identical top-level
+ * code.)
+ *
+ * flags may be 0, DB_NOORDERCHK, DB_ORDERCHKONLY, or DB_SALVAGE
+ * (and optionally DB_AGGRESSIVE).
+ * PUBLIC: int __db_verify __P((DB *, DB_THREAD_INFO *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *),
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__db_verify(dbp, ip, name, subdb, handle, callback, lp, rp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ const char *name, *subdb;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ u_int32_t sflags;
+ int has_subdbs, isbad, ret, t_ret;
+ char *real_name;
+
+ env = dbp->env;
+ fhp = NULL;
+ vdp = NULL;
+ real_name = NULL;
+ has_subdbs = isbad = ret = t_ret = 0;
+
+ F_SET(dbp, DB_AM_VERIFYING);
+
+ /* Initialize any feedback function. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 0);
+
+ /*
+ * We don't know how large the cache is, and if the database
+ * in question uses a small page size--which we don't know
+ * yet!--it may be uncomfortably small for the default page
+ * size [#2143]. However, the things we need temporary
+ * databases for in dbinfo are largely tiny, so using a
+ * 1024-byte pagesize is probably not going to be a big hit,
+ * and will make us fit better into small spaces.
+ */
+ if ((ret = __db_vrfy_dbinfo_create(env, ip, 1024, &vdp)) != 0)
+ goto err;
+
+ /*
+ * Note whether the user has requested that we use printable
+ * chars where possible. We won't get here with this flag if
+ * we're not salvaging.
+ */
+ if (LF_ISSET(DB_PRINTABLE))
+ F_SET(vdp, SALVAGE_PRINTABLE);
+
+ if (name != NULL) {
+ /* Find the real name of the file. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /*
+ * Our first order of business is to verify page 0, which is the
+ * metadata page for the master database of subdatabases or of
+ * the only database in the file. We want to do this by hand
+ * rather than just calling __db_open in case it's
+ * corrupt--various things in __db_open might act funny.
+ *
+ * Once we know the metadata page is healthy, I believe that
+ * it's safe to open the database normally and then use the page
+ * swapping code, which makes life easier.
+ */
+ if ((ret = __os_open(env,
+ real_name, 0, DB_OSO_RDONLY, 0, &fhp)) != 0)
+ goto err;
+ } else {
+ MAKE_INMEM(dbp);
+ }
+
+ /* Verify the metadata page 0; set pagesize and type. */
+ if ((ret = __db_vrfy_pagezero(dbp, vdp, fhp, subdb, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * We can assume at this point that dbp->pagesize and dbp->type are
+ * set correctly, or at least as well as they can be, and that
+	 * locking, logging, and txns are not in use. Thus we can trust
+	 * the memp code not to look at the page, and therefore that it
+	 * is safe enough to use.
+ *
+ * The dbp is not open, but the file is open in the fhp, and we
+ * cannot assume that __db_open is safe. Call __env_setup,
+ * the [safe] part of __db_open that initializes the environment--
+ * and the mpool--manually.
+ */
+ if ((ret = __env_setup(dbp, NULL,
+ name, subdb, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0)
+ goto err;
+
+ /*
+ * Set our name in the Queue subsystem; we may need it later
+ * to deal with extents. In-memory databases are not allowed to have
+ * extents.
+ */
+ if (dbp->type == DB_QUEUE && name != NULL &&
+ (ret = __qam_set_ext_data(dbp, name)) != 0)
+ goto err;
+
+ /* Mark the dbp as opened, so that we correctly handle its close. */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+
+ /*
+ * Find out the page number of the last page in the database. We'll
+ * use this later to verify the metadata page. We don't verify now
+ * because the data from __db_vrfy_pagezero could be stale.
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &vdp->last_pgno)) != 0)
+ goto err;
+ /*
+ * DB_ORDERCHKONLY is a special case; our file consists of
+ * several subdatabases, which use different hash, bt_compare,
+ * and/or dup_compare functions. Consequently, we couldn't verify
+ * sorting and hashing simply by calling DB->verify() on the file.
+ * DB_ORDERCHKONLY allows us to come back and check those things; it
+ * requires a subdatabase, and assumes that everything but that
+ * database's sorting/hashing is correct.
+ */
+ if (LF_ISSET(DB_ORDERCHKONLY)) {
+ ret = __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags);
+ goto done;
+ }
+
+ sflags = flags;
+ if (dbp->p_internal != NULL)
+ LF_CLR(DB_SALVAGE);
+
+ /*
+ * When salvaging, we use a db to keep track of whether we've seen a
+ * given overflow or dup page in the course of traversing normal data.
+ * If in the end we have not, we assume its key got lost and print it
+ * with key "UNKNOWN".
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_init(vdp)) != 0)
+ goto err;
+
+ /*
+ * If we're not being aggressive, salvage by walking the tree
+ * and only printing the leaves we find. "has_subdbs" will
+ * indicate whether we found subdatabases.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) && __db_salvage_all(
+ dbp, vdp, handle, callback, flags, &has_subdbs) != 0)
+ isbad = 1;
+
+ /*
+ * If we have subdatabases, flag if any keys are found that
+ * don't belong to a subdatabase -- they'll need to have an
+ * "__OTHER__" subdatabase header printed first.
+ */
+ if (has_subdbs) {
+ F_SET(vdp, SALVAGE_PRINTHEADER);
+ F_SET(vdp, SALVAGE_HASSUBDBS);
+ }
+ }
+
+	/* Walk all the pages; even if one cannot be read, verify structure. */
+ if ((ret =
+ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != DB_PAGE_NOTFOUND)
+ goto err;
+ }
+
+ /* If we're verifying, verify inter-page structure. */
+ if (!LF_ISSET(DB_SALVAGE) && isbad == 0)
+ if ((t_ret = __db_vrfy_structure(dbp,
+ vdp, name, 0, lp, rp, flags)) != 0) {
+ if (t_ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * If we're salvaging, output with key UNKNOWN any overflow or dup pages
+ * we haven't been able to put in context. Then destroy the salvager's
+ * state-saving database.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_unknowns(dbp,
+ vdp, handle, callback, flags)) != 0)
+ isbad = 1;
+ }
+
+ flags = sflags;
+
+#ifdef HAVE_PARTITION
+ if (t_ret == 0 && dbp->p_internal != NULL)
+ t_ret = __part_verify(dbp, vdp, name, handle, callback, flags);
+#endif
+
+ if (ret == 0)
+ ret = t_ret;
+
+ /* Don't display a footer for a database holding other databases. */
+ if (LF_ISSET(DB_SALVAGE | DB_VERIFY_PARTITION) == DB_SALVAGE &&
+ (!has_subdbs || F_ISSET(vdp, SALVAGE_PRINTFOOTER)))
+ (void)__db_prfooter(handle, callback);
+
+done: err:
+ /* Send feedback that we're done. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 100);
+
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_destroy(vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (vdp != NULL &&
+ (t_ret = __db_vrfy_dbinfo_destroy(env, vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (real_name != NULL)
+ __os_free(env, real_name);
+
+ /*
+ * DB_VERIFY_FATAL is a private error, translate to a public one.
+ *
+	 * If we didn't find a page, it's probably because a page number was
+	 * corrupted. Return the standard corruption error.
+ *
+ * Otherwise, if we found corruption along the way, set the return.
+ */
+ if (ret == DB_VERIFY_FATAL ||
+ ret == DB_PAGE_NOTFOUND || (ret == 0 && isbad == 1))
+ ret = DB_VERIFY_BAD;
+
+ /* Make sure there's a public complaint if we found corruption. */
+ if (ret != 0)
+ __db_err(env, ret, "%s", name);
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_getpagezero --
+ *	Store the master metadata page in a local buffer. For safety, skip
+ *	the regular DB paging code and read the page directly from disk via
+ *	seek and read; for in-memory databases, which have no file, read it
+ *	from the mpool instead.
+ */
+static int
+__db_vrfy_getpagezero(dbp, fhp, name, mbuf, flags)
+ DB *dbp;
+ DB_FH *fhp;
+ const char *name;
+ u_int8_t *mbuf;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ size_t nr;
+
+ env = dbp->env;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /*
+ * Now get the metadata page from the cache, if possible. If
+ * we're verifying an in-memory db, this is the only metadata
+ * page we have.
+		 *
+ * Open the in-memory db file and get the metadata page.
+ */
+ if ((ret = __memp_fcreate_pp(env->dbenv, &mpf, DB_VERIFY)) != 0)
+ return (ret);
+ if ((ret = __memp_set_flags(mpf, DB_MPOOL_NOFILE, 1)) != 0)
+ goto mpf_err;
+ if ((ret = __memp_fopen_pp(mpf,
+ name, DB_ODDFILESIZE | DB_RDONLY, 0, 0)) != 0)
+ goto mpf_err;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget_pp(mpf, &pgno, NULL, 0, &h)) != 0) {
+ __db_err(env, ret, DB_STR_A("0747",
+ "Metadata page %lu cannot be read from mpool",
+ "%lu"), (u_long)pgno);
+ goto mpf_err;
+ }
+ memcpy(mbuf, (u_int8_t *)h, DBMETASIZE);
+ ret = __memp_fput_pp(mpf, h, DB_PRIORITY_UNCHANGED, 0);
+mpf_err: if ((t_ret = __memp_fclose_pp(mpf, 0)) != 0 || ret != 0) {
+ return (ret == 0 ? t_ret : ret);
+ }
+ } else {
+ /*
+ * Seek to the metadata page.
+ *
+ * Note that if we're just starting a verification, dbp->pgsize
+ * may be zero; this is okay, as we want page zero anyway and
+ * 0*0 == 0.
+ */
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+ (ret = __os_read(env, fhp, mbuf, DBMETASIZE, &nr)) != 0) {
+ __db_err(env, ret, DB_STR_A("0520",
+ "Metadata page %lu cannot be read", "%lu"),
+ (u_long)PGNO_BASE_MD);
+ return (ret);
+ }
+
+ if (nr != DBMETASIZE) {
+ EPRINT((env, DB_STR_A("0521",
+ "Page %lu: Incomplete metadata page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ return (DB_VERIFY_FATAL);
+ }
+ }
+
+ return (ret);
+}
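+
+/*
+ * A standalone sketch of the same read-page-zero-by-hand idea, using raw
+ * POSIX calls instead of the __os_* wrappers. It assumes <errno.h>,
+ * <fcntl.h>, <string.h> and <unistd.h>; "path" is a hypothetical file
+ * name, 512 is DBMETASIZE, and the magic number occupies bytes 12-15 of
+ * the metadata page (see __db_vrfy_pagezero below).
+ *
+ *	int
+ *	read_meta_magic(path, magicp)
+ *		const char *path;
+ *		u_int32_t *magicp;
+ *	{
+ *		u_int8_t mbuf[512];
+ *		int fd;
+ *
+ *		if ((fd = open(path, O_RDONLY)) == -1)
+ *			return (errno);
+ *		if (pread(fd, mbuf, sizeof(mbuf), 0) !=
+ *		    (ssize_t)sizeof(mbuf)) {
+ *			(void)close(fd);
+ *			return (EIO);
+ *		}
+ *		(void)close(fd);
+ *		memcpy(magicp, mbuf + 12, sizeof(*magicp));
+ *		return (0);
+ *	}
+ *
+ * The result, byte-swapped if necessary, can then be compared against
+ * DB_BTREEMAGIC, DB_HASHMAGIC, DB_HEAPMAGIC and DB_QAMMAGIC.
+ */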
+
+/*
+ * __db_vrfy_pagezero --
+ * Verify the master metadata page. Use seek, read, and a local buffer
+ * rather than the DB paging code, for safety.
+ *
+ * Must correctly (or best-guess) set dbp->type and dbp->pagesize.
+ */
+static int
+__db_vrfy_pagezero(dbp, vdp, fhp, name, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB_FH *fhp;
+ const char *name;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t freelist;
+ int isbad, ret, swapped;
+ u_int8_t mbuf[DBMETASIZE];
+
+ isbad = ret = swapped = 0;
+ freelist = 0;
+ env = dbp->env;
+ meta = (DBMETA *)mbuf;
+ dbp->type = DB_UNKNOWN;
+
+ if ((ret = __db_vrfy_getpagezero(dbp, fhp, name, mbuf, flags)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+ return (ret);
+
+ if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) {
+ EPRINT((env, DB_STR_A("0522",
+ "Page %lu: metadata page corrupted", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ if (ret != DB_CHKSUM_FAIL) {
+ EPRINT((env, DB_STR_A("0523",
+ "Page %lu: could not check metadata page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ return (DB_VERIFY_FATAL);
+ }
+ }
+
+ /*
+ * Check all of the fields that we can.
+ *
+ * 08-11: Current page number. Must == pgno.
+ * Note that endianness doesn't matter--it's zero.
+ */
+ if (meta->pgno != PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0524",
+ "Page %lu: pgno incorrectly set to %lu", "%lu %lu"),
+ (u_long)PGNO_BASE_MD, (u_long)meta->pgno));
+ }
+
+ /* 12-15: Magic number. Must be one of valid set. */
+ if (__db_is_valid_magicno(meta->magic, &dbp->type))
+ swapped = 0;
+ else {
+ M_32_SWAP(meta->magic);
+ if (__db_is_valid_magicno(meta->magic,
+ &dbp->type))
+ swapped = 1;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0525",
+ "Page %lu: bad magic number %lu", "%lu %lu"),
+ (u_long)PGNO_BASE_MD, (u_long)meta->magic));
+ }
+ }
+
+ /*
+ * 16-19: Version. Must be current; for now, we
+ * don't support verification of old versions.
+ */
+ if (swapped)
+ M_32_SWAP(meta->version);
+ if ((dbp->type == DB_BTREE &&
+ (meta->version > DB_BTREEVERSION ||
+ meta->version < DB_BTREEOLDVER)) ||
+ (dbp->type == DB_HASH &&
+ (meta->version > DB_HASHVERSION ||
+ meta->version < DB_HASHOLDVER)) ||
+ (dbp->type == DB_HEAP &&
+ (meta->version > DB_HEAPVERSION ||
+ meta->version < DB_HEAPOLDVER)) ||
+ (dbp->type == DB_QUEUE &&
+ (meta->version > DB_QAMVERSION ||
+ meta->version < DB_QAMOLDVER))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0526",
+ "Page %lu: unsupported DB version %lu; extraneous errors may result",
+ "%lu %lu"), (u_long)PGNO_BASE_MD, (u_long)meta->version));
+ }
+
+ /*
+	 * 20-23: Pagesize. Must be a power of two,
+	 * at least 512, and at most 64K.
+ */
+ if (swapped)
+ M_32_SWAP(meta->pagesize);
+ if (IS_VALID_PAGESIZE(meta->pagesize))
+ dbp->pgsize = meta->pagesize;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0527", "Page %lu: bad page size %lu",
+ "%lu %lu"), (u_long)PGNO_BASE_MD, (u_long)meta->pagesize));
+
+ /*
+ * Now try to settle on a pagesize to use.
+ * If the user-supplied one is reasonable,
+ * use it; else, guess.
+ */
+ if (!IS_VALID_PAGESIZE(dbp->pgsize))
+ dbp->pgsize = __db_guesspgsize(env, fhp);
+ }
+
+ /*
+ * 25: Page type. Must be correct for dbp->type,
+ * which is by now set as well as it can be.
+ */
+ /* Needs no swapping--only one byte! */
+ if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) ||
+ (dbp->type == DB_HASH && meta->type != P_HASHMETA) ||
+ (dbp->type == DB_HEAP && meta->type != P_HEAPMETA) ||
+ (dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0528", "Page %lu: bad page type %lu",
+ "%lu %lu"), (u_long)PGNO_BASE_MD, (u_long)meta->type));
+ }
+
+ /*
+ * 26: Meta-flags.
+ */
+ if (meta->metaflags != 0) {
+ if (FLD_ISSET(meta->metaflags,
+ ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0529",
+ "Page %lu: bad meta-data flags value %#lx",
+ "%lu %#lx"), (u_long)PGNO_BASE_MD,
+ (u_long)meta->metaflags));
+ }
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(pip, VRFY_HAS_CHKSUM);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+ F_SET(pip, VRFY_HAS_PART_RANGE);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+ F_SET(pip, VRFY_HAS_PART_CALLBACK);
+
+ if (FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) &&
+ (ret = __partition_init(dbp, meta->metaflags)) != 0)
+ return (ret);
+ }
+
+ /*
+ * 28-31: Free list page number.
+ * 32-35: Last page in database file.
+ * We'll verify last_pgno once we open the db in the mpool;
+ * for now, just store it.
+ */
+ if (swapped)
+ M_32_SWAP(meta->free);
+ freelist = meta->free;
+ if (swapped)
+ M_32_SWAP(meta->last_pgno);
+ vdp->meta_last_pgno = meta->last_pgno;
+
+ /*
+ * Initialize vdp->pages to fit a single pageinfo structure for
+ * this one page. We'll realloc later when we know how many
+ * pages there are.
+ */
+ pip->pgno = PGNO_BASE_MD;
+ pip->type = meta->type;
+
+ /*
+ * Signal that we still have to check the info specific to
+ * a given type of meta page.
+ */
+ F_SET(pip, VRFY_INCOMPLETE);
+
+ pip->free = freelist;
+
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ return (ret);
+
+ /* Set up the dbp's fileid. We don't use the regular open path. */
+ memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN);
+ dbp->preserve_fid = 1;
+
+ if (swapped == 1)
+ F_SET(dbp, DB_AM_SWAP);
+
+ return (isbad ? DB_VERIFY_BAD : 0);
+}
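+
+/*
+ * Swap detection above, by example: DB_BTREEMAGIC is 0x00053162. If the
+ * file was written on a machine of the opposite byte order, the raw read
+ * yields 0x62310500, which matches no known magic; after M_32_SWAP it is
+ * 0x00053162 again, so swapped is set to 1 and DB_AM_SWAP is set on the
+ * handle at the end of the function.
+ */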
+
+/*
+ * __db_vrfy_walkpages --
+ * Main loop of the verifier/salvager. Walks through,
+ * page by page, and verifies all pages and/or prints all data pages.
+ */
+static int
+__db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i;
+ int ret, t_ret, isbad;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = isbad = t_ret = 0;
+
+ for (i = 0; i <= vdp->last_pgno; i++) {
+ /*
+ * If DB_SALVAGE is set, we inspect our database of completed
+ * pages, and skip any we've already printed in the subdb pass.
+ */
+ if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
+ continue;
+
+ /*
+ * An individual page get can fail if:
+		 * * This is a hash database: empty buckets are expected
+		 *    and don't have allocated pages. Create a dummy page
+		 *    so the verification can proceed.
+		 * * We are salvaging: flag the error and continue.
+ */
+ if ((t_ret = __memp_fget(mpf, &i,
+ vdp->thread_info, NULL, 0, &h)) != 0) {
+ if (dbp->type == DB_HASH ||
+ (dbp->type == DB_QUEUE &&
+ F_ISSET(dbp, DB_AM_INMEM))) {
+ if ((t_ret =
+ __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ goto err1;
+ pip->type = P_INVALID;
+ pip->pgno = i;
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+ F_SET(pip, VRFY_NONEXISTENT);
+ if ((t_ret = __db_vrfy_putpageinfo(
+ env, vdp, pip)) != 0)
+ goto err1;
+ continue;
+ }
+ if (t_ret == DB_PAGE_NOTFOUND) {
+ EPRINT((env, DB_STR_A("0530",
+ "Page %lu: beyond the end of the file, metadata page has last page as %lu",
+ "%lu %lu"), (u_long)i,
+ (u_long)vdp->last_pgno));
+ if (ret == 0)
+ return (t_ret);
+ }
+
+err1: if (ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE))
+ continue;
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_SALVAGE)) {
+ /*
+ * We pretty much don't want to quit unless a
+ * bomb hits. May as well return that something
+ * was screwy, however.
+ */
+ if ((t_ret = __db_salvage_pg(dbp,
+ vdp, i, h, handle, callback, flags)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ isbad = 1;
+ }
+ } else {
+ /*
+ * If we are not salvaging, and we get any error
+ * other than DB_VERIFY_BAD, return immediately;
+ * it may not be safe to proceed. If we get
+ * DB_VERIFY_BAD, keep going; listing more errors
+ * may make it easier to diagnose problems and
+ * determine the magnitude of the corruption.
+ *
+ * Verify info common to all page types.
+ */
+ if (i != PGNO_BASE_MD) {
+ ret = __db_vrfy_common(dbp, vdp, h, i, flags);
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_INVALID:
+ ret = __db_vrfy_invalid(dbp, vdp, h, i, flags);
+ break;
+ case __P_DUPLICATE:
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0531",
+ "Page %lu: old-style duplicate page",
+ "%lu"), (u_long)i));
+ break;
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ ret = __ham_vrfy(dbp, vdp, h, i, flags);
+ break;
+ case P_HEAP:
+ case P_IHEAP:
+ ret = __heap_vrfy(dbp, vdp, h, i, flags);
+ break;
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ ret = __bam_vrfy(dbp, vdp, h, i, flags);
+ break;
+ case P_LRECNO:
+ ret = __ram_vrfy_leaf(dbp, vdp, h, i, flags);
+ break;
+ case P_OVERFLOW:
+ ret = __db_vrfy_overflow(dbp, vdp, h, i, flags);
+ break;
+ case P_HASHMETA:
+ ret = __ham_vrfy_meta(dbp,
+ vdp, (HMETA *)h, i, flags);
+ break;
+ case P_HEAPMETA:
+ ret = __heap_vrfy_meta(dbp,
+ vdp, (HEAPMETA *)h, i, flags);
+ break;
+ case P_BTREEMETA:
+ ret = __bam_vrfy_meta(dbp,
+ vdp, (BTMETA *)h, i, flags);
+ break;
+ case P_QAMMETA:
+ ret = __qam_vrfy_meta(dbp,
+ vdp, (QMETA *)h, i, flags);
+ break;
+ case P_QAMDATA:
+ ret = __qam_vrfy_data(dbp,
+ vdp, (QPAGE *)h, i, flags);
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0532",
+ "Page %lu: unknown page type %lu",
+ "%lu %lu"), (u_long)i, (u_long)TYPE(h)));
+ isbad = 1;
+ break;
+ }
+
+ /*
+ * Set up error return.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+
+ /*
+ * Provide feedback to the application about our
+ * progress. The range 0-50% comes from the fact
+ * that this is the first of two passes through the
+ * database (front-to-back, then top-to-bottom).
+ */
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY,
+ (int)((i + 1) * 50 / (vdp->last_pgno + 1)));
+ }
+
+ /*
+ * Just as with the page get, bail if and only if we're
+ * not salvaging.
+ */
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ if (!LF_ISSET(DB_SALVAGE))
+ return (ret);
+ }
+ }
+
+ /*
+ * If we've seen a Queue metadata page, we may need to walk Queue
+ * extent pages that won't show up between 0 and vdp->last_pgno.
+ */
+ if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret =
+ __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ if (t_ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (!LF_ISSET(DB_SALVAGE))
+ return (ret);
+ }
+
+ if (0) {
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0)
+ return (ret == 0 ? t_ret : ret);
+ }
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
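+
+/*
+ * Feedback arithmetic above, by example: with last_pgno == 199 (a
+ * 200-page file), finishing page i == 99 reports
+ * (99 + 1) * 50 / (199 + 1) == 25, i.e. 25%. The second, top-down pass
+ * reports the remaining 50-100% range via __db_vrfy_struct_feedback.
+ */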
+
+/*
+ * __db_vrfy_structure --
+ * After a beginning-to-end walk through the database has been
+ * completed, put together the information that has been collected
+ * to verify the overall database structure.
+ *
+ * Should only be called if we want to do a database verification,
+ * i.e. if DB_SALVAGE is not set.
+ */
+static int
+__db_vrfy_structure(dbp, vdp, dbname, meta_pgno, lp, rp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *dbname;
+ db_pgno_t meta_pgno;
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i;
+ int ret, isbad, hassubs, p;
+
+ isbad = 0;
+ pip = NULL;
+ env = dbp->env;
+ pgset = vdp->pgset;
+
+ /*
+ * Providing feedback here is tricky; in most situations,
+ * we fetch each page one more time, but we do so in a top-down
+ * order that depends on the access method. Worse, we do this
+ * recursively in btree, such that on any call where we're traversing
+ * a subtree we don't know where that subtree is in the whole database;
+ * worse still, any given database may be one of several subdbs.
+ *
+ * The solution is to decrement a counter vdp->pgs_remaining each time
+ * we verify (and call feedback on) a page. We may over- or
+ * under-count, but the structure feedback function will ensure that we
+ * never give a percentage under 50 or over 100. (The first pass
+ * covered the range 0-50%.)
+ */
+ if (dbp->db_feedback != NULL)
+ vdp->pgs_remaining = vdp->last_pgno + 1;
+
+ /*
+ * Call the appropriate function to downwards-traverse the db type.
+ */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret =
+ __bam_vrfy_structure(dbp, vdp, 0, lp, rp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * If we have subdatabases and we know that the database is,
+ * thus far, sound, it's safe to walk the tree of subdatabases.
+ * Do so, and verify the structure of the databases within.
+ */
+ if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0)
+ goto err;
+ hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 1 : 0;
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err;
+ pip = NULL;
+
+ if (isbad == 0 && hassubs)
+ if ((ret =
+ __db_vrfy_subdbs(dbp, vdp, dbname, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case DB_HASH:
+ if ((ret = __ham_vrfy_structure(dbp, vdp, 0, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case DB_HEAP:
+ if ((ret = __heap_vrfy_structure(dbp, vdp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ }
+ /* Skip the freelist check for heap, it doesn't apply. */
+ goto err;
+ case DB_QUEUE:
+ if ((ret = __qam_vrfy_structure(dbp, vdp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ }
+
+ /*
+ * Queue pages may be unreferenced and totally zeroed, if
+ * they're empty; queue doesn't have much structure, so
+ * this is unlikely to be wrong in any troublesome sense.
+ * Skip to "err".
+ */
+ goto err;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_path(env, "__db_vrfy_structure");
+ goto err;
+ }
+
+ /* Walk free list. */
+ if ((ret =
+ __db_vrfy_freelist(dbp, vdp, meta_pgno, flags)) == DB_VERIFY_BAD)
+ isbad = 1;
+
+ /*
+ * If structure checks up until now have failed, it's likely that
+ * checking what pages have been missed will result in oodles of
+ * extraneous error messages being EPRINTed. Skip to the end
+ * if this is the case; we're going to be printing at least one
+ * error anyway, and probably all the more salient ones.
+ */
+ if (ret != 0 || isbad == 1)
+ goto err;
+
+ /*
+ * Make sure no page has been missed and that no page is still marked
+ * "all zeroes" unless we are looking at unused hash bucket pages or
+	 * pages off the end of the database.
+ */
+ for (i = 0; i < vdp->last_pgno + 1; i++) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, i, &p)) != 0)
+ goto err;
+ if (pip->type == P_OVERFLOW) {
+ if ((u_int32_t)p != pip->refcount) {
+ EPRINT((env, DB_STR_A("0533",
+ "Page %lu: overflow refcount %lu, referenced %lu times",
+ "%lu %lu %lu"), (u_long)i,
+ (u_long)pip->refcount, (u_long)p));
+ isbad = 1;
+ }
+ } else if (p == 0 &&
+#ifndef HAVE_FTRUNCATE
+ !(i > vdp->meta_last_pgno &&
+ (F_ISSET(pip, VRFY_IS_ALLZEROES) || pip->type == P_HASH)) &&
+#endif
+ !(dbp->type == DB_HASH &&
+ (pip->type == P_HASH || pip->type == P_INVALID))) {
+ /*
+ * It is OK for unreferenced hash buckets to be
+ * marked invalid and unreferenced.
+ */
+ EPRINT((env, DB_STR_A("0534",
+ "Page %lu: unreferenced page", "%lu"), (u_long)i));
+ isbad = 1;
+ }
+
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES)
+#ifndef HAVE_FTRUNCATE
+ && i <= vdp->meta_last_pgno
+#endif
+ ) {
+ EPRINT((env, DB_STR_A("0535",
+ "Page %lu: totally zeroed page", "%lu"),
+ (u_long)i));
+ isbad = 1;
+ }
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err;
+ pip = NULL;
+ }
+
+err: if (pip != NULL)
+ (void)__db_vrfy_putpageinfo(env, vdp, pip);
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_is_valid_magicno --
+ *	Return 1 and set *typep if magic is a recognized database magic
+ *	number; otherwise return 0 and set *typep to DB_UNKNOWN.
+ */
+static int
+__db_is_valid_magicno(magic, typep)
+ u_int32_t magic;
+ DBTYPE *typep;
+{
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ *typep = DB_BTREE;
+ return (1);
+ case DB_HASHMAGIC:
+ *typep = DB_HASH;
+ return (1);
+ case DB_HEAPMAGIC:
+ *typep = DB_HEAP;
+ return (1);
+ case DB_QAMMAGIC:
+ *typep = DB_QUEUE;
+ return (1);
+ default:
+ break;
+ }
+ *typep = DB_UNKNOWN;
+ return (0);
+}
+
+/*
+ * __db_vrfy_common --
+ * Verify info common to all page types.
+ *
+ * PUBLIC: int __db_vrfy_common
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_common(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret;
+ u_int8_t *p;
+
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ pip->pgno = pgno;
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+
+ /*
+ * Hash expands the table by leaving some pages between the
+ * old last and the new last totally zeroed. These pages may
+ * not be all zero if they were used, freed and then reallocated.
+ *
+ * Queue will create sparse files if sparse record numbers are used.
+ */
+ if (pgno != 0 && PGNO(h) == 0) {
+ F_SET(pip, VRFY_IS_ALLZEROES);
+ for (p = (u_int8_t *)h; p < (u_int8_t *)h + dbp->pgsize; p++)
+ if (*p != 0) {
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+ break;
+ }
+ /*
+ * Mark it as a hash, and we'll
+ * check that that makes sense structurally later.
+ * (The queue verification doesn't care, since queues
+ * don't really have much in the way of structure.)
+ */
+ if (dbp->type != DB_HEAP)
+ pip->type = P_HASH;
+ ret = 0;
+ goto err; /* well, not really an err. */
+ }
+
+ if (PGNO(h) != pgno) {
+ EPRINT((env, DB_STR_A("0536", "Page %lu: bad page number %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)h->pgno));
+ ret = DB_VERIFY_BAD;
+ }
+
+ switch (h->type) {
+ case P_INVALID: /* Order matches ordinal value. */
+ case P_HASH_UNSORTED:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ case P_HASHMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ case P_QAMDATA:
+ case P_LDUP:
+ case P_HASH:
+ case P_HEAP:
+ case P_IHEAP:
+ case P_HEAPMETA:
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0537", "Page %lu: bad page type %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)h->type));
+ ret = DB_VERIFY_BAD;
+ }
+ pip->type = h->type;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_invalid --
+ * Verify P_INVALID page.
+ * (Yes, there's not much to do here.)
+ */
+static int
+__db_vrfy_invalid(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ pip->next_pgno = pip->prev_pgno = 0;
+
+ if (!IS_VALID_PGNO(NEXT_PGNO(h))) {
+ EPRINT((env, DB_STR_A("0538", "Page %lu: invalid next_pgno %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)NEXT_PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ } else
+ pip->next_pgno = NEXT_PGNO(h);
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_vrfy_datapage --
+ * Verify elements common to data pages (P_HASH, P_LBTREE,
+ * P_IBTREE, P_IRECNO, P_LRECNO, P_OVERFLOW, P_DUPLICATE)--i.e.,
+ * those defined in the PAGE structure.
+ *
+ * Called from each of the per-page routines, after the
+ * all-page-type-common elements of pip have been verified and filled
+ * in.
+ *
+ * PUBLIC: int __db_vrfy_datapage
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_datapage(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t smallest_entry;
+ int isbad, ret, t_ret;
+
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ isbad = 0;
+
+ /*
+ * prev_pgno and next_pgno: store for inter-page checks,
+ * verify that they point to actual pages and not to self.
+ *
+ * !!!
+ * Internal btree pages, as well as heap pages, do not maintain these
+ * fields (indeed, they overload them). Skip.
+ */
+ if (TYPE(h) != P_IBTREE &&
+ TYPE(h) != P_IRECNO && TYPE(h) != P_HEAP && TYPE(h) != P_IHEAP) {
+ if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0539",
+ "Page %lu: invalid prev_pgno %lu", "%lu %lu"),
+ (u_long)pip->pgno, (u_long)PREV_PGNO(h)));
+ }
+ if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0540",
+ "Page %lu: invalid next_pgno %lu", "%lu %lu"),
+ (u_long)pip->pgno, (u_long)NEXT_PGNO(h)));
+ }
+ pip->prev_pgno = PREV_PGNO(h);
+ pip->next_pgno = NEXT_PGNO(h);
+ }
+
+ /*
+ * Verify the number of entries on the page: there's no good way to
+ * determine if this is accurate. The best we can do is verify that
+ * it's not more than can, in theory, fit on the page. Then, we make
+ * sure there are at least this many valid elements in inp[], and
+ * hope the test catches most cases.
+ */
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ smallest_entry = HKEYDATA_PSIZE(0);
+ break;
+ case P_HEAP:
+ smallest_entry = sizeof(HEAPHDR) + sizeof(db_indx_t);
+ break;
+ case P_IHEAP:
+ /* Really high_pgno. */
+ pip->prev_pgno = PREV_PGNO(h);
+ smallest_entry = 0;
+ break;
+ case P_IBTREE:
+ smallest_entry = BINTERNAL_PSIZE(0);
+ break;
+ case P_IRECNO:
+ smallest_entry = RINTERNAL_PSIZE;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ smallest_entry = BKEYDATA_PSIZE(0);
+ break;
+ default:
+ smallest_entry = 0;
+ break;
+ }
+ if (smallest_entry * NUM_ENT(h) / 2 > dbp->pgsize) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0541",
+ "Page %lu: too many entries: %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)NUM_ENT(h)));
+ }
+
+ if (TYPE(h) != P_OVERFLOW)
+ pip->entries = NUM_ENT(h);
+
+ /*
+ * btree level. Should be zero unless we're a btree;
+ * if we are a btree, should be between LEAFLEVEL and MAXBTREELEVEL,
+ * and we need to save it off.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if (LEVEL(h) < LEAFLEVEL + 1) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0542",
+ "Page %lu: bad btree level %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)LEVEL(h)));
+ }
+ pip->bt_level = LEVEL(h);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ if (LEVEL(h) != LEAFLEVEL) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0543",
+ "Page %lu: btree leaf page has incorrect level %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)LEVEL(h)));
+ }
+ break;
+ default:
+ if (LEVEL(h) != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0544",
+ "Page %lu: nonzero level %lu in non-btree database",
+ "%lu %lu"), (u_long)pgno, (u_long)LEVEL(h)));
+ }
+ break;
+ }
+
+ /*
+ * Even though inp[] occurs in all PAGEs, we look at it in the
+ * access-method-specific code, since btree and hash treat
+ * item lengths very differently, and one of the most important
+ * things we want to verify is that the data--as specified
+ * by offset and length--cover the right part of the page
+ * without overlaps, gaps, or violations of the page boundary.
+ */
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
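+
+/*
+ * The entry-count plausibility check above, by example: if smallest_entry
+ * were 8 bytes (a made-up value; the real minimum depends on the page
+ * type), then on a 4096-byte page the test 8 * NUM_ENT / 2 > 4096 flags
+ * any NUM_ENT above 1024 -- a factor-of-two slack over what could fit
+ * even if every item were empty.
+ */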
+
+/*
+ * __db_vrfy_meta --
+ * Verify the access-method common parts of a meta page, using
+ * normal mpool routines.
+ *
+ * PUBLIC: int __db_vrfy_meta
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DBMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ DBTYPE dbtype, magtype;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int isbad, ret, t_ret;
+
+ isbad = 0;
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /* type plausible for a meta page */
+ switch (meta->type) {
+ case P_BTREEMETA:
+ dbtype = DB_BTREE;
+ break;
+ case P_HASHMETA:
+ dbtype = DB_HASH;
+ break;
+ case P_HEAPMETA:
+ dbtype = DB_HEAP;
+ break;
+ case P_QAMMETA:
+ dbtype = DB_QUEUE;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__db_vrfy_meta");
+ goto err;
+ }
+
+ /* magic number valid */
+ if (!__db_is_valid_magicno(meta->magic, &magtype)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0545", "Page %lu: invalid magic number",
+ "%lu"), (u_long)pgno));
+ }
+ if (magtype != dbtype) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0546",
+ "Page %lu: magic number does not match database type",
+ "%lu"), (u_long)pgno));
+ }
+
+ /* version */
+ if ((dbtype == DB_BTREE &&
+ (meta->version > DB_BTREEVERSION ||
+ meta->version < DB_BTREEOLDVER)) ||
+ (dbtype == DB_HASH &&
+ (meta->version > DB_HASHVERSION ||
+ meta->version < DB_HASHOLDVER)) ||
+ (dbtype == DB_HEAP &&
+ (meta->version > DB_HEAPVERSION ||
+ meta->version < DB_HEAPOLDVER)) ||
+ (dbtype == DB_QUEUE &&
+ (meta->version > DB_QAMVERSION ||
+ meta->version < DB_QAMOLDVER))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0547",
+ "Page %lu: unsupported database version %lu; extraneous errors may result",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->version));
+ }
+
+ /* pagesize */
+ if (meta->pagesize != dbp->pgsize) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0548", "Page %lu: invalid pagesize %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->pagesize));
+ }
+
+ /* Flags */
+ if (meta->metaflags != 0) {
+ if (FLD_ISSET(meta->metaflags,
+ ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0549",
+ "Page %lu: bad meta-data flags value %#lx",
+ "%lu %#lx"), (u_long)PGNO_BASE_MD,
+ (u_long)meta->metaflags));
+ }
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(pip, VRFY_HAS_CHKSUM);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+ F_SET(pip, VRFY_HAS_PART_RANGE);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+ F_SET(pip, VRFY_HAS_PART_CALLBACK);
+ }
+
+ /*
+ * Free list.
+ *
+ * If this is not the main, master-database meta page, it
+ * should not have a free list.
+ */
+ if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0550",
+ "Page %lu: nonempty free list on subdatabase metadata page",
+ "%lu"), (u_long)pgno));
+ }
+
+ /* Can correctly be PGNO_INVALID--that's just the end of the list. */
+ if (IS_VALID_PGNO(meta->free))
+ pip->free = meta->free;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0551",
+ "Page %lu: nonsensical free list pgno %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)meta->free));
+ }
+
+ /*
+ * Check that the meta page agrees with what we got from mpool.
+ * If we don't have FTRUNCATE then mpool could include some
+	 * zeroed pages at the end of the file, so we assume the meta page
+ * is correct. Queue does not update the meta page's last_pgno.
+ */
+ if (pgno == PGNO_BASE_MD &&
+ dbtype != DB_QUEUE && meta->last_pgno != vdp->last_pgno) {
+#ifdef HAVE_FTRUNCATE
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0552",
+ "Page %lu: last_pgno is not correct: %lu != %lu",
+ "%lu %lu %lu"), (u_long)pgno,
+ (u_long)meta->last_pgno, (u_long)vdp->last_pgno));
+#endif
+ vdp->meta_last_pgno = meta->last_pgno;
+ }
+
+ /*
+ * We have now verified the common fields of the metadata page.
+ * Clear the flag that told us they had been incompletely checked.
+ */
+ F_CLR(pip, VRFY_INCOMPLETE);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_freelist --
+ * Walk free list, checking off pages and verifying absence of
+ * loops.
+ */
+static int
+__db_vrfy_freelist(dbp, vdp, meta, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t cur_pgno, next_pgno;
+ int p, ret, t_ret;
+
+ env = dbp->env;
+ pgset = vdp->pgset;
+ DB_ASSERT(env, pgset != NULL);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0)
+ return (ret);
+ for (next_pgno = pip->free;
+ next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) {
+ cur_pgno = pip->pgno;
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ return (t_ret);
+
+ /* This shouldn't happen, but just in case. */
+ if (!IS_VALID_PGNO(next_pgno)) {
+ EPRINT((env, DB_STR_A("0553",
+ "Page %lu: invalid next_pgno %lu on free list page",
+ "%lu %lu"), (u_long)cur_pgno, (u_long)next_pgno));
+ return (DB_VERIFY_BAD);
+ }
+
+ if (next_pgno > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("0713",
+ "Page %lu: page %lu on free list beyond last_pgno %lu",
+ "%lu %lu %lu"), (u_long)cur_pgno,
+ (u_long)next_pgno, (u_long)vdp->last_pgno));
+ ret = DB_VERIFY_BAD;
+ }
+ /* Detect cycles. */
+ if ((t_ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, next_pgno, &p)) != 0)
+ return (t_ret);
+ if (p != 0) {
+ EPRINT((env, DB_STR_A("0554",
+ "Page %lu: page %lu encountered a second time on free list",
+ "%lu %lu"), (u_long)cur_pgno, (u_long)next_pgno));
+ return (DB_VERIFY_BAD);
+ }
+ if ((t_ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, next_pgno)) != 0)
+ return (t_ret);
+
+ if ((t_ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
+ return (t_ret);
+
+ if (pip->type != P_INVALID) {
+ EPRINT((env, DB_STR_A("0555",
+ "Page %lu: non-invalid page %lu on free list",
+ "%lu %lu"), (u_long)cur_pgno, (u_long)next_pgno));
+ ret = DB_VERIFY_BAD; /* unsafe to continue */
+ break;
+ }
+ }
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
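+
+/*
+ * Why the pgset counting above detects cycles, by example: walking a
+ * corrupt free list 7 -> 12 -> 7 (page numbers made up), the first visit
+ * to page 7 increments its pgset count to 1; when page 12 points back to
+ * 7, __db_vrfy_pgset_get returns p == 1 and we fail with DB_VERIFY_BAD
+ * instead of looping forever.
+ */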
+
+/*
+ * __db_vrfy_subdbs --
+ * Walk the known-safe master database of subdbs with a cursor,
+ * verifying the structure of each subdatabase we encounter.
+ */
+static int
+__db_vrfy_subdbs(dbp, vdp, dbname, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *dbname;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ DBC *dbc;
+ DBT key, data;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t meta_pgno;
+ int ret, t_ret, isbad;
+ u_int8_t type;
+
+ isbad = 0;
+ dbc = NULL;
+ env = dbp->env;
+
+ if ((ret = __db_master_open(dbp,
+ vdp->thread_info, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0)
+ return (ret);
+
+ if ((ret = __db_cursor_int(mdbp, NULL,
+ vdp->txn, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ goto err;
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ while ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) == 0) {
+ if (data.size != sizeof(db_pgno_t)) {
+ EPRINT((env, DB_STR("0556",
+ "Subdatabase entry not page-number size")));
+ isbad = 1;
+ goto err;
+ }
+ memcpy(&meta_pgno, data.data, data.size);
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL_SWAP(env, &meta_pgno);
+ if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("0557",
+ "Subdatabase entry references invalid page %lu",
+ "%lu"), (u_long)meta_pgno));
+ isbad = 1;
+ goto err;
+ }
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+ goto err;
+ type = pip->type;
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err;
+ switch (type) {
+ case P_BTREEMETA:
+ if ((ret = __bam_vrfy_structure(
+ dbp, vdp, meta_pgno, NULL, NULL, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case P_HASHMETA:
+ if ((ret = __ham_vrfy_structure(
+ dbp, vdp, meta_pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case P_QAMMETA:
+ default:
+ EPRINT((env, DB_STR_A("0558",
+ "Subdatabase entry references page %lu of invalid type %lu",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)type));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err: if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
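+
+/*
+ * Byte-order note, by example: a subdatabase whose meta page is pgno 3 is
+ * stored in the master database's data DBT as the big-endian bytes
+ * 00 00 00 03 on every platform; DB_NTOHL_SWAP above turns those back
+ * into a native db_pgno_t on little-endian hosts and is a no-op on
+ * big-endian ones.
+ */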
+
+/*
+ * __db_vrfy_struct_feedback --
+ * Provide feedback during top-down database structure traversal.
+ * (See comment at the beginning of __db_vrfy_structure.)
+ *
+ * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+ */
+void
+__db_vrfy_struct_feedback(dbp, vdp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+{
+ int progress;
+
+ if (dbp->db_feedback == NULL)
+ return;
+
+ if (vdp->pgs_remaining > 0)
+ vdp->pgs_remaining--;
+
+ /* Don't allow a feedback call of 100 until we're really done. */
+ progress = 100 - (int)(vdp->pgs_remaining * 50 / (vdp->last_pgno + 1));
+ dbp->db_feedback(dbp, DB_VERIFY, progress == 100 ? 99 : progress);
+}
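+
+/*
+ * By example: with last_pgno == 199 and pgs_remaining == 100, progress is
+ * 100 - (100 * 50 / 200) == 75. When pgs_remaining reaches zero the
+ * formula yields 100, which the clamp above reports as 99 until the
+ * verification really is complete.
+ */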
+
+/*
+ * __db_vrfy_orderchkonly --
+ *	Do a sort-order/hashing check on a known-otherwise-good subdb.
+ */
+static int
+__db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ BTMETA *btmeta;
+ DB *mdbp, *pgset;
+ DBC *pgsc;
+ DBT key, data;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH *h_internal;
+ HMETA *hmeta;
+ PAGE *h, *currpg;
+ db_pgno_t meta_pgno, p, pgno;
+ u_int32_t bucket;
+ int t_ret, ret;
+
+ pgset = NULL;
+ pgsc = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ currpg = h = NULL;
+
+ LF_CLR(DB_NOORDERCHK);
+
+ /* Open the master database and get the meta_pgno for the subdb. */
+ if ((ret = __db_master_open(dbp,
+ vdp->thread_info, NULL, name, DB_RDONLY, 0, &mdbp)) != 0)
+ goto err;
+
+ DB_INIT_DBT(key, subdb, strlen(subdb));
+ memset(&data, 0, sizeof(data));
+ if ((ret = __db_get(mdbp,
+ vdp->thread_info, NULL, &key, &data, 0)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = ENOENT;
+ goto err;
+ }
+
+ if (data.size != sizeof(db_pgno_t)) {
+ EPRINT((env, DB_STR("0559",
+ "Subdatabase entry of invalid size")));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ memcpy(&meta_pgno, data.data, data.size);
+
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL_SWAP(env, &meta_pgno);
+
+ if ((ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ if ((ret = __db_vrfy_pgset(env,
+ vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+ goto err;
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ btmeta = (BTMETA *)h;
+ if (F_ISSET(&btmeta->dbmeta, BTM_RECNO)) {
+ /* Recnos have no order to check. */
+ ret = 0;
+ goto err;
+ }
+ if ((ret =
+ __db_meta2pgset(dbp, vdp, meta_pgno, flags, pgset)) != 0)
+ goto err;
+ if ((ret = __db_cursor_int(pgset, NULL, vdp->txn, dbp->type,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+ goto err;
+ while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+ if ((ret = __memp_fget(mpf, &p,
+ vdp->thread_info, NULL, 0, &currpg)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp, NULL,
+ vdp->thread_info, currpg, p, NUM_ENT(currpg), 1,
+ F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, currpg, dbp->priority)) != 0)
+ goto err;
+ currpg = NULL;
+ }
+
+ /*
+ * The normal exit condition for the loop above is DB_NOTFOUND.
+ * If we see that, zero it and continue on to cleanup.
+ * Otherwise, it's a real error and will be returned.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ break;
+ case P_HASHMETA:
+ hmeta = (HMETA *)h;
+ h_internal = (HASH *)dbp->h_internal;
+ /*
+ * Make sure h_charkey is right.
+ */
+ if (h_internal == NULL) {
+ EPRINT((env, DB_STR_A("0560",
+ "Page %lu: DB->h_internal field is NULL", "%lu"),
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if (h_internal->h_hash == NULL)
+ h_internal->h_hash = hmeta->dbmeta.version < 5
+ ? __ham_func4 : __ham_func5;
+ if (hmeta->h_charkey !=
+ h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) {
+ EPRINT((env, DB_STR_A("0561",
+ "Page %lu: incorrect hash function for database",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+		 * For each bucket, verify hashing on each page in the
+ * corresponding chain of pages.
+ */
+ if ((ret = __db_cursor_int(dbp, NULL, vdp->txn, dbp->type,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+ goto err;
+ for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
+ pgno = BS_TO_PAGE(bucket, hmeta->spares);
+ while (pgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &currpg)) != 0)
+ goto err;
+ if ((ret = __ham_vrfy_hashing(pgsc,
+ NUM_ENT(currpg), hmeta, bucket, pgno,
+ flags, h_internal->h_hash)) != 0)
+ goto err;
+ pgno = NEXT_PGNO(currpg);
+ if ((ret = __memp_fput(mpf, vdp->thread_info,
+ currpg, dbp->priority)) != 0)
+ goto err;
+ currpg = NULL;
+ }
+ }
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0562",
+ "Page %lu: database metapage of bad type %lu",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)TYPE(h)));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pgset != NULL &&
+ (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0)
+ ret = t_ret;
+ if (currpg != NULL &&
+ (t_ret = __memp_fput(mpf,
+ vdp->thread_info, currpg, dbp->priority)) != 0)
+ ret = t_ret;
+ if ((t_ret = __db_close(mdbp, NULL, 0)) != 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_pg --
+ * Walk through a page, salvaging all likely or plausible (w/
+ * DB_AGGRESSIVE) key/data pairs and marking seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_pg(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int keyflag, ret, t_ret;
+
+ env = dbp->env;
+ DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+ /*
+ * !!!
+ * We dump record numbers when salvaging Queue databases, but not for
+ * immutable Recno databases. The problem is we can't figure out the
+ * record number from the database page in the Recno case, while the
+ * offset in the file is sufficient for Queue.
+ */
+ keyflag = 0;
+
+ /* If we got this page in the subdb pass, we can safely skip it. */
+ if (__db_salvage_isdone(vdp, pgno))
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)h, pgno, flags);
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ case P_HEAP:
+ case P_LBTREE:
+ case P_QAMDATA:
+ return (__db_salvage_leaf(dbp,
+ vdp, pgno, h, handle, callback, flags));
+ case P_HASHMETA:
+ ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)h, pgno, flags);
+ break;
+ case P_HEAPMETA:
+ ret = __heap_vrfy_meta(dbp, vdp, (HEAPMETA *)h, pgno, flags);
+ break;
+ case P_IBTREE:
+ /*
+ * We need to mark any overflow keys on internal pages as seen,
+ * so we don't print them out in __db_salvage_unknowns. But if
+ * we're an upgraded database, a P_LBTREE page may very well
+ * have a reference to the same overflow pages (this practice
+ * stopped somewhere around db4.5). To give P_LBTREEs a chance
+ * to print out any keys on shared pages, mark the page now and
+ * deal with it at the end.
+ */
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_IBTREE));
+ case P_IHEAP:
+ /*
+ * There's nothing to salvage from heap region pages. Just mark
+ * that we've seen the page.
+ */
+ return (__db_salvage_markdone(vdp, pgno));
+ case P_LDUP:
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LDUP));
+ case P_LRECNO:
+ /*
+ * Recno leaves are tough, because the leaf could be (1) a dup
+ * page, or it could be (2) a regular database leaf page.
+ * Fortunately, RECNO databases are not allowed to have
+ * duplicates.
+ *
+ * If there are no subdatabases, dump the page immediately if
+ * it's a leaf in a RECNO database, otherwise wait and hopefully
+ * it will be dumped by the leaf page that refers to it,
+ * otherwise we'll get it with the unknowns.
+ *
+ * If there are subdatabases, there might be mixed types and
+ * dbp->type can't be trusted. We'll only get here after
+ * salvaging each database, though, so salvaging this page
+ * immediately isn't important. If this page is a dup, it might
+ * get salvaged later on, otherwise the unknowns pass will pick
+ * it up. Note that SALVAGE_HASSUBDBS won't get set if we're
+ * salvaging aggressively.
+ *
+ * If we're salvaging aggressively, we don't know whether or not
+ * there's subdatabases, so we wait on all recno pages.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) &&
+ !F_ISSET(vdp, SALVAGE_HASSUBDBS) && dbp->type == DB_RECNO)
+ return (__db_salvage_leaf(dbp,
+ vdp, pgno, h, handle, callback, flags));
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LRECNODUP));
+ case P_OVERFLOW:
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_OVERFLOW));
+ case P_QAMMETA:
+ keyflag = 1;
+ ret = __qam_vrfy_meta(dbp, vdp, (QMETA *)h, pgno, flags);
+ break;
+ case P_INVALID:
+ case P_IRECNO:
+ case __P_DUPLICATE:
+ default:
+ /*
+ * There's no need to display an error, the page type was
+ * already checked and reported on.
+ */
+ return (0);
+ }
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * We have to display the dump header if it's a metadata page. It's
+ * our last chance as the page was marked "seen" in the vrfy routine,
+ * and we won't see the page again. We don't display headers for
+ * the first database in a multi-database file, that database simply
+ * contains a list of subdatabases.
+ */
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ if (!F_ISSET(pip, VRFY_HAS_SUBDBS) && !LF_ISSET(DB_VERIFY_PARTITION))
+ ret = __db_prheader(
+ dbp, NULL, 0, keyflag, handle, callback, vdp, pgno);
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_leaf --
+ * Walk through a leaf, salvaging all likely key/data pairs and marking
+ * seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_leaf(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+ DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+ /* If we got this page in the subdb pass, we can safely skip it. */
+ if (__db_salvage_isdone(vdp, pgno))
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ return (__ham_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags));
+ case P_HEAP:
+ return (__heap_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags));
+ case P_LBTREE:
+ case P_LRECNO:
+ return (__bam_salvage(dbp, vdp,
+ pgno, TYPE(h), h, handle, callback, NULL, flags));
+ case P_QAMDATA:
+ return (__qam_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags));
+ default:
+ /*
+ * There's no need to display an error, the page type was
+ * already checked and reported on.
+ */
+ return (0);
+ }
+}
+
+/*
+ * __db_salvage_unknowns --
+ * Walk through the salvager database, printing with key "UNKNOWN"
+ * any pages we haven't dealt with.
+ */
+static int
+__db_salvage_unknowns(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DBT unkdbt, key, *dbt;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t pgtype, ovfl_bufsz, tmp_flags;
+ int ret, t_ret;
+ void *ovflbuf;
+
+ dbc = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+
+ DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
+
+ if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+ return (ret);
+ ovfl_bufsz = dbp->pgsize;
+
+ /*
+ * We make two passes -- in the first pass, skip SALVAGE_OVERFLOW
+ * pages, because they may be referenced by the standard database
+ * pages that we're resolving.
+ */
+ while ((t_ret =
+ __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 1)) == 0) {
+ if ((t_ret = __memp_fget(mpf,
+ &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+
+ dbt = NULL;
+ tmp_flags = 0;
+ switch (pgtype) {
+ case SALVAGE_LDUP:
+ case SALVAGE_LRECNODUP:
+ dbt = &unkdbt;
+ tmp_flags = DB_SA_UNKNOWNKEY;
+ /* FALLTHROUGH */
+ case SALVAGE_IBTREE:
+ case SALVAGE_LBTREE:
+ case SALVAGE_LRECNO:
+ if ((t_ret = __bam_salvage(
+ dbp, vdp, pgno, pgtype, h, handle,
+ callback, dbt, tmp_flags | flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case SALVAGE_OVERFLOW:
+ DB_ASSERT(env, 0); /* Shouldn't ever happen. */
+ break;
+ case SALVAGE_HASH:
+ if ((t_ret = __ham_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case SALVAGE_INVALID:
+ case SALVAGE_IGNORE:
+ default:
+ /*
+ * Shouldn't happen, but if it does, just do what the
+ * nice man says.
+ */
+ DB_ASSERT(env, 0);
+ break;
+ }
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* We should have reached the end of the database. */
+ if (t_ret == DB_NOTFOUND)
+ t_ret = 0;
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Re-open the cursor so we traverse the database again. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ dbc = NULL;
+
+ /* Now, deal with any remaining overflow pages. */
+ while ((t_ret =
+ __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 0)) == 0) {
+ if ((t_ret = __memp_fget(mpf,
+ &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+
+ switch (pgtype) {
+ case SALVAGE_OVERFLOW:
+ /*
+ * XXX:
+ * This may generate multiple "UNKNOWN" keys in
+ * a database with no dups. What to do?
+ */
+ if ((t_ret = __db_safe_goff(dbp, vdp,
+ pgno, &key, &ovflbuf, &ovfl_bufsz, flags)) != 0 ||
+ ((vdp->type == DB_BTREE || vdp->type == DB_HASH) &&
+ (t_ret = __db_vrfy_prdbt(&unkdbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) ||
+ (t_ret = __db_vrfy_prdbt(
+ &key, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ default:
+ DB_ASSERT(env, 0); /* Shouldn't ever happen. */
+ break;
+ }
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* We should have reached the end of the database. */
+ if (t_ret == DB_NOTFOUND)
+ t_ret = 0;
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, ovflbuf);
+
+ return (ret);
+}
+
+/*
+ * Offset of the ith inp array entry, which we can compare to the offset
+ * the entry stores.
+ */
+#define INP_OFFSET(dbp, h, i) \
+ ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h)))
+
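+/*
+ * For illustration (assuming the usual 26-byte page header, SIZEOF_PAGE):
+ * entry i's inp slot lives at byte offset SIZEOF_PAGE + i * sizeof(db_indx_t)
+ * from the start of the page, which is the value INP_OFFSET computes. A
+ * sane entry must point past its own slot, which is exactly the
+ * "offset <= INP_OFFSET(dbp, h, i)" check in __db_vrfy_inpitem below.
+ */
+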
+/*
+ * __db_vrfy_inpitem --
+ * Verify that a single entry in the inp array is sane, and update
+ * the high water mark and current item offset. (The former of these is
+ * used for state information between calls, and is required; it must
+ * be initialized to the pagesize before the first call.)
+ *
+ * Returns DB_VERIFY_FATAL if inp has collided with the data,
+ * since verification can't continue from there; returns DB_VERIFY_BAD
+ * if anything else is wrong.
+ *
+ * PUBLIC: int __db_vrfy_inpitem __P((DB *, PAGE *,
+ * PUBLIC: db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
+ */
+int
+__db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
+ DB *dbp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t i;
+ int is_btree;
+ u_int32_t flags, *himarkp, *offsetp;
+{
+ BKEYDATA *bk;
+ ENV *env;
+ db_indx_t *inp, offset, len;
+
+ env = dbp->env;
+
+ DB_ASSERT(env, himarkp != NULL);
+ inp = P_INP(dbp, h);
+
+ /*
+ * Check that the inp array, which grows from the beginning of the
+ * page forward, has not collided with the data, which grows from the
+ * end of the page backward.
+ */
+ if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
+ /* We've collided with the data. We need to bail. */
+ EPRINT((env, DB_STR_A("0563",
+ "Page %lu: entries listing %lu overlaps data",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_FATAL);
+ }
+
+ offset = inp[i];
+
+ /*
+ * Check that the item offset is reasonable: it points somewhere
+ * after the inp array and before the end of the page.
+ */
+ if (offset <= INP_OFFSET(dbp, h, i) || offset >= dbp->pgsize) {
+ EPRINT((env, DB_STR_A("0564",
+ "Page %lu: bad offset %lu at page index %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)offset, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < *himarkp)
+ *himarkp = offset;
+
+ if (is_btree) {
+ /*
+ * Check alignment; if it's unaligned, it's unsafe to
+ * manipulate this item.
+ */
+ if (offset != DB_ALIGN(offset, sizeof(u_int32_t))) {
+ EPRINT((env, DB_STR_A("0565",
+ "Page %lu: unaligned offset %lu at page index %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)offset,
+ (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ /*
+ * Check that the item length remains on-page.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+
+ /*
+ * We need to verify the type of the item here;
+ * we can't simply assume that it will be one of the
+ * expected three. If it's not a recognizable type,
+ * it can't be considered to have a verifiable
+ * length, so it's not possible to certify it as safe.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ len = bk->len;
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ len = BOVERFLOW_SIZE;
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0566",
+ "Page %lu: item %lu of unrecognizable type",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ if ((size_t)(offset + len) > dbp->pgsize) {
+ EPRINT((env, DB_STR_A("0567",
+ "Page %lu: item %lu extends past page boundary",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+ }
+
+ if (offsetp != NULL)
+ *offsetp = offset;
+ return (0);
+}
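+
+/*
+ * A minimal caller sketch (hypothetical, for illustration only): himark
+ * must start at the page size and is ratcheted downward as entries are
+ * checked, so a final comparison against HOFFSET(h) can catch
+ * unaccounted-for space:
+ *
+ *	himark = dbp->pgsize;
+ *	for (i = 0; i < NUM_ENT(h); i++)
+ *		if ((ret = __db_vrfy_inpitem(dbp,
+ *		    h, pgno, i, 1, flags, &himark, NULL)) != 0)
+ *			return (ret);
+ */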
+
+/*
+ * __db_vrfy_duptype --
+ * Given a page number and a set of flags to __bam_vrfy_subtree,
+ * verify that the dup tree type is correct--i.e., it's a recno
+ * if DUPSORT is not set and a btree if it is.
+ *
+ * PUBLIC: int __db_vrfy_duptype
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_duptype(dbp, vdp, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, isbad;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (pip->type) {
+ case P_IBTREE:
+ case P_LDUP:
+ if (!LF_ISSET(DB_ST_DUPSORT)) {
+ EPRINT((env, DB_STR_A("0568",
+ "Page %lu: sorted duplicate set in unsorted-dup database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ if (LF_ISSET(DB_ST_DUPSORT)) {
+ EPRINT((env, DB_STR_A("0569",
+ "Page %lu: unsorted duplicate set in sorted-dup database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ break;
+ default:
+ /*
+ * If the page is entirely zeroed, its pip->type will be a lie
+ * (we assumed it was a hash page, as they're allowed to be
+ * zeroed); handle this case specially.
+ */
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ ZEROPG_ERR_PRINT(env, pgno, DB_STR_P("duplicate page"));
+ else
+ EPRINT((env, DB_STR_A("0570",
+ "Page %lu: duplicate page of inappropriate type %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)pip->type));
+ isbad = 1;
+ break;
+ }
+
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ return (ret);
+ return (isbad == 1 ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __db_salvage_duptree --
+ * Attempt to salvage a given duplicate tree, given its alleged root.
+ *
+ * The key that corresponds to this dup set has been passed to us
+ * in DBT *key. Because data items follow keys, though, it has been
+ * printed once already.
+ *
+ * The basic idea here is that pgno ought to be a P_LDUP, a P_LRECNO, a
+ * P_IBTREE, or a P_IRECNO. If it's an internal page, use the verifier
+ * functions to make sure it's safe; if it's not, we simply bail and the
+ * data will have to be printed with no key later on. If it is safe,
+ * recurse on each of its children.
+ *
+ * Whether or not it's safe, if it's a leaf page, __bam_salvage it.
+ *
+ * At all times, use the DB hanging off vdp to mark and check what we've
+ * done, so each page gets printed exactly once and we don't get caught
+ * in any cycles.
+ *
+ * PUBLIC: int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+
+ mpf = dbp->mpf;
+
+ if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno))
+ return (DB_VERIFY_BAD);
+
+ /* We have a plausible page. Try it. */
+ if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __db_vrfy_common(dbp, vdp, h, pgno, flags)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, pgno, flags | DB_NOORDERCHK)) != 0 ||
+ (ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ goto err;
+ /*
+ * We have a known-healthy internal page. Walk it.
+ */
+ if ((ret = __bam_salvage_walkdupint(dbp, vdp, h, key,
+ handle, callback, flags)) != 0)
+ goto err;
+ break;
+ case P_LRECNO:
+ case P_LDUP:
+ if ((ret = __bam_salvage(dbp,
+ vdp, pgno, TYPE(h), h, handle, callback, key, flags)) != 0)
+ goto err;
+ break;
+ default:
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+err: if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_all --
+ * Salvage only the leaves we find by walking the tree. If we have subdbs,
+ * salvage each of them individually.
+ */
+static int
+__db_salvage_all(dbp, vdp, handle, callback, flags, hassubsp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+ int *hassubsp;
+{
+ DB *pgset;
+ DBC *pgsc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t p, meta_pgno;
+ int ret, t_ret;
+
+ *hassubsp = 0;
+
+ env = dbp->env;
+ pgset = NULL;
+ pgsc = NULL;
+ mpf = dbp->mpf;
+ h = NULL;
+ pip = NULL;
+ ret = 0;
+
+ /*
+ * Check to make sure the page is OK and find out if it contains
+ * subdatabases.
+ */
+ meta_pgno = PGNO_BASE_MD;
+ if ((t_ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &h)) == 0 &&
+ (t_ret = __db_vrfy_common(dbp, vdp, h, PGNO_BASE_MD, flags)) == 0 &&
+ (t_ret = __db_salvage_pg(
+ dbp, vdp, PGNO_BASE_MD, h, handle, callback, flags)) == 0 &&
+ (t_ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) == 0)
+ if (F_ISSET(pip, VRFY_HAS_SUBDBS))
+ *hassubsp = 1;
+ if (pip != NULL &&
+ (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL) {
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ }
+ if (ret != 0)
+ return (ret);
+
+ /* Without subdatabases, we can just dump from the meta pgno. */
+ if (*hassubsp == 0)
+ return (__db_salvage(dbp,
+ vdp, PGNO_BASE_MD, handle, callback, flags));
+
+ /*
+ * We have subdbs. Try to crack them.
+ *
+ * To do so, get a set of leaf pages in the master database, and then
+ * walk each of the valid ones, salvaging subdbs as we go. If any
+ * prove invalid, just drop them; we'll pick them up on a later pass.
+ */
+ if ((ret = __db_vrfy_pgset(env,
+ vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+ goto err;
+ if ((ret = __db_meta2pgset(dbp, vdp, PGNO_BASE_MD, flags, pgset)) != 0)
+ goto err;
+ if ((ret = __db_cursor(pgset, vdp->thread_info, NULL, &pgsc, 0)) != 0)
+ goto err;
+ while ((t_ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+ if ((t_ret = __memp_fget(mpf,
+ &p, vdp->thread_info, NULL, 0, &h)) == 0 &&
+ (t_ret = __db_vrfy_common(dbp, vdp, h, p, flags)) == 0 &&
+ (t_ret =
+ __bam_vrfy(dbp, vdp, h, p, flags | DB_NOORDERCHK)) == 0)
+ t_ret = __db_salvage_subdbpg(
+ dbp, vdp, h, handle, callback, flags);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL) {
+ if ((t_ret = __memp_fput(mpf, vdp->thread_info,
+ h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ }
+ }
+
+ if (t_ret != DB_NOTFOUND && ret == 0)
+ ret = t_ret;
+
+err: if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pgset != NULL &&
+ (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL &&
+ (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_subdbpg --
+ * Given a known-good leaf page in the master database, salvage all
+ * leaf pages corresponding to each subdb.
+ */
+static int
+__db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *master;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BKEYDATA *bkkey, *bkdata;
+ BOVERFLOW *bo;
+ DB *pgset;
+ DBC *pgsc;
+ DBT key;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *subpg;
+ db_indx_t i;
+ db_pgno_t meta_pgno;
+ int ret, err_ret, t_ret;
+ char *subdbname;
+ u_int32_t ovfl_bufsz;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ ret = err_ret = 0;
+ subdbname = NULL;
+ pgsc = NULL;
+ pgset = NULL;
+ ovfl_bufsz = 0;
+
+ /*
+ * For each entry, get and salvage the set of pages
+ * corresponding to that entry.
+ */
+ for (i = 0; i < NUM_ENT(master); i += P_INDX) {
+ bkkey = GET_BKEYDATA(dbp, master, i);
+ bkdata = GET_BKEYDATA(dbp, master, i + O_INDX);
+
+ /* Get the subdatabase name. */
+ if (B_TYPE(bkkey->type) == B_OVERFLOW) {
+ /*
+ * We can, in principle anyway, have a subdb
+ * name so long it overflows. Ick.
+ */
+ bo = (BOVERFLOW *)bkkey;
+ if ((ret = __db_safe_goff(dbp, vdp, bo->pgno,
+ &key, &subdbname, &ovfl_bufsz, flags)) != 0) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+
+ /* Nul-terminate it. */
+ if (ovfl_bufsz < key.size + 1) {
+ if ((ret = __os_realloc(env,
+ key.size + 1, &subdbname)) != 0)
+ goto err;
+ ovfl_bufsz = key.size + 1;
+ }
+ subdbname[key.size] = '\0';
+ } else if (B_TYPE(bkkey->type) == B_KEYDATA) {
+ if (ovfl_bufsz < (u_int32_t)bkkey->len + 1) {
+ if ((ret = __os_realloc(env,
+ bkkey->len + 1, &subdbname)) != 0)
+ goto err;
+ ovfl_bufsz = bkkey->len + 1;
+ }
+ DB_ASSERT(env, subdbname != NULL);
+ memcpy(subdbname, bkkey->data, bkkey->len);
+ subdbname[bkkey->len] = '\0';
+ }
+
+ /* Get the corresponding pgno. */
+ if (bkdata->len != sizeof(db_pgno_t)) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ memcpy(&meta_pgno,
+ (db_pgno_t *)bkdata->data, sizeof(db_pgno_t));
+
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL_SWAP(env, &meta_pgno);
+
+ /* If we can't get the subdb meta page, just skip the subdb. */
+ if (!IS_VALID_PGNO(meta_pgno) || (ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &subpg)) != 0) {
+ err_ret = ret;
+ continue;
+ }
+
+ /*
+ * Verify the subdatabase meta page. This has two functions.
+ * First, if it's bad, we have no choice but to skip the subdb
+ * and let the pages just get printed on a later pass. Second,
+ * the access-method-specific meta verification routines record
+ * the various state info (such as the presence of dups)
+ * that we need for __db_prheader().
+ */
+ if ((ret =
+ __db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) {
+ err_ret = ret;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+ switch (TYPE(subpg)) {
+ case P_BTREEMETA:
+ if ((ret = __bam_vrfy_meta(dbp,
+ vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) {
+ err_ret = ret;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+ break;
+ case P_HASHMETA:
+ if ((ret = __ham_vrfy_meta(dbp,
+ vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) {
+ err_ret = ret;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+ break;
+ default:
+ /* This isn't an appropriate page; skip this subdb. */
+ err_ret = DB_VERIFY_BAD;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority)) != 0) {
+ err_ret = ret;
+ continue;
+ }
+
+ /* Print a subdatabase header. */
+ if ((ret = __db_prheader(dbp,
+ subdbname, 0, 0, handle, callback, vdp, meta_pgno)) != 0)
+ goto err;
+
+ /* Salvage meta_pgno's tree. */
+ if ((ret = __db_salvage(dbp,
+ vdp, meta_pgno, handle, callback, flags)) != 0)
+ err_ret = ret;
+
+ /* Print a subdatabase footer. */
+ if ((ret = __db_prfooter(handle, callback)) != 0)
+ goto err;
+ }
+
+err: if (subdbname != NULL)
+ __os_free(env, subdbname);
+
+ if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
+ ret = t_ret;
+
+ if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_salvage_markdone(vdp, PGNO(master))) != 0)
+ return (t_ret);
+
+ return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_salvage --
+ * Given a meta page number, salvage all data from leaf pages found by
+ * walking the meta page's tree.
+ */
+static int
+__db_salvage(dbp, vdp, meta_pgno, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+
+{
+ DB *pgset;
+ DBC *dbc, *pgsc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *subpg;
+ db_pgno_t p;
+ int err_ret, ret, t_ret;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ err_ret = ret = t_ret = 0;
+ pgsc = NULL;
+ pgset = NULL;
+ dbc = NULL;
+
+ if ((ret = __db_vrfy_pgset(env,
+ vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+ goto err;
+
+ /* Get all page numbers referenced from this meta page. */
+ if ((ret = __db_meta2pgset(dbp, vdp, meta_pgno,
+ flags, pgset)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ if ((ret = __db_cursor(pgset,
+ vdp->thread_info, NULL, &pgsc, 0)) != 0)
+ goto err;
+
+ if (dbp->type == DB_QUEUE &&
+ (ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+ goto err;
+
+ /* Salvage every page in pgset. */
+ while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+ if (dbp->type == DB_QUEUE) {
+#ifdef HAVE_QUEUE
+ ret = __qam_fget(dbc, &p, 0, &subpg);
+#else
+ ret = __db_no_queue_am(env);
+#endif
+ /*
+ * Don't report an error for pages not found in a queue.
+ * The pgset is a best guess; it doesn't know about
+ * deleted extents, which is what leads to this error.
+ */
+ if (ret == ENOENT || ret == DB_PAGE_NOTFOUND)
+ continue;
+ } else
+ ret = __memp_fget(mpf,
+ &p, vdp->thread_info, NULL, 0, &subpg);
+ if (ret != 0) {
+ err_ret = ret;
+ continue;
+ }
+
+ if ((ret = __db_salvage_pg(dbp, vdp, p, subpg,
+ handle, callback, flags)) != 0)
+ err_ret = ret;
+
+ if (dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+ ret = __qam_fput(dbc, p, subpg, dbp->priority);
+#else
+ ret = __db_no_queue_am(env);
+#endif
+ else
+ ret = __memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ if (ret != 0)
+ err_ret = ret;
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err:
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0)
+ ret = t_ret;
+ if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
+ ret = t_ret;
+ if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
+ ret = t_ret;
+
+ return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_meta2pgset --
+ * Given a known-safe meta page number, return the set of pages
+ * corresponding to the database it represents. Return DB_VERIFY_BAD if
+ * it's not a suitable meta page or is invalid.
+ */
+static int
+__db_meta2pgset(dbp, vdp, pgno, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ DB *pgset;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ ret = __bam_meta2pgset(dbp, vdp, (BTMETA *)h, flags, pgset);
+ break;
+ case P_HASHMETA:
+ ret = __ham_meta2pgset(dbp, vdp, (HMETA *)h, flags, pgset);
+ break;
+ case P_HEAPMETA:
+ ret = __heap_meta2pgset(dbp, vdp, (HEAPMETA *)h, pgset);
+ break;
+ case P_QAMMETA:
+#ifdef HAVE_QUEUE
+ ret = __qam_meta2pgset(dbp, vdp, pgset);
+ break;
+#endif
+ default:
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+ if ((t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
+ return (t_ret);
+ return (ret);
+}
+
+/*
+ * __db_guesspgsize --
+ * Try to guess what the pagesize is if the one on the meta page
+ * and the one in the db are invalid.
+ */
+static u_int
+__db_guesspgsize(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ db_pgno_t i;
+ size_t nr;
+ u_int32_t guess;
+ u_int8_t type;
+
+ for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) {
+ /*
+ * We try to read three pages ahead after the first one
+ * and make sure we have plausible types for all of them.
+ * If the seeks fail, continue with a smaller size;
+ * we're probably just looking past the end of the database.
+ * If they succeed and the types are reasonable, also continue
+ * with a smaller size; we may be looking at pages N,
+ * 2N, and 3N for some N > 1.
+ *
+ * As soon as we hit an invalid type, we stop and return
+ * our previous guess; that last one was probably the page size.
+ */
+ for (i = 1; i <= 3; i++) {
+ if (__os_seek(
+ env, fhp, i, guess, SSZ(DBMETA, type)) != 0)
+ break;
+ if (__os_read(env,
+ fhp, &type, 1, &nr) != 0 || nr == 0)
+ break;
+ if (type == P_INVALID || type >= P_PAGETYPE_MAX)
+ return (guess << 1);
+ }
+ }
+
+ /*
+ * If we're just totally confused--the corruption takes up most of the
+ * beginning pages of the database--go with the default size.
+ */
+ return (DB_DEF_IOSIZE);
+}
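+
+/*
+ * Worked example (hypothetical numbers): with a true page size of 8KB,
+ * probes at guesses of 64KB, 32KB, and 16KB either seek past the end of
+ * a small file or happen to land on real page boundaries (pages 2, 4,
+ * 8, ...), so the loop keeps shrinking the guess. At a guess of 8KB all
+ * three probes also land on valid page-type bytes, but at 4KB the probe
+ * at offset 4KB lands mid-page, where the type byte is unlikely to be
+ * plausible, so the function returns guess << 1 = 8KB.
+ */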
diff --git a/src/db/db_vrfy_stub.c b/src/db/db_vrfy_stub.c
new file mode 100644
index 00000000..5037f33e
--- /dev/null
+++ b/src/db/db_vrfy_stub.c
@@ -0,0 +1,120 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_VERIFY
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+
+/*
+ * If the library wasn't compiled with the verification support, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+static int __db_novrfy __P((ENV *));
+
+/*
+ * __db_novrfy --
+ * Error when a Berkeley DB build doesn't include verification support.
+ */
+static int
+__db_novrfy(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0571",
+ "library build did not include support for database verification"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+ DB *dbp;
+ const char *file, *database;
+ FILE *outfile;
+ u_int32_t flags;
+{
+ int ret;
+
+ COMPQUIET(file, NULL);
+ COMPQUIET(database, NULL);
+ COMPQUIET(outfile, NULL);
+ COMPQUIET(flags, 0);
+
+ ret = __db_novrfy(dbp->env);
+
+ /* The verify method is a destructor. */
+ (void)__db_close(dbp, NULL, 0);
+
+ return (ret);
+}
+
+int
+__db_verify_internal(dbp, name, subdb, handle, callback, flags)
+ DB *dbp;
+ const char *name, *subdb;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(subdb, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ VRFY_PAGEINFO **pipp;
+{
+ COMPQUIET(pgno, 0);
+ COMPQUIET(pipp, NULL);
+ return (__db_novrfy(vdp->pgdbp->env));
+}
+
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ VRFY_PAGEINFO *pip;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pip, NULL);
+ return (__db_novrfy(env));
+}
+
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix,
+ handle, callback, is_recno, is_heap, vdp)
+ DBT *dbtp;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ int is_recno;
+ int is_heap;
+ VRFY_DBINFO *vdp;
+{
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(checkprint, 0);
+ COMPQUIET(prefix, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(is_recno, 0);
+ COMPQUIET(is_heap, 0);
+ return (__db_novrfy(vdp->pgdbp->env));
+}
+#endif /* !HAVE_VERIFY */
diff --git a/src/db/db_vrfyutil.c b/src/db/db_vrfyutil.c
new file mode 100644
index 00000000..d72e1188
--- /dev/null
+++ b/src/db/db_vrfyutil.c
@@ -0,0 +1,932 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+
+static int __db_vrfy_childinc __P((DBC *, VRFY_CHILDINFO *));
+static int __db_vrfy_pageinfo_create __P((ENV *, VRFY_PAGEINFO **));
+
+/*
+ * __db_vrfy_dbinfo_create --
+ * Allocate and initialize a VRFY_DBINFO structure.
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_create
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **));
+ */
+int
+__db_vrfy_dbinfo_create(env, ip, pgsize, vdpp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t pgsize;
+ VRFY_DBINFO **vdpp;
+{
+ DB *cdbp, *pgdbp, *pgset;
+ VRFY_DBINFO *vdp;
+ int ret;
+
+ vdp = NULL;
+ cdbp = pgdbp = pgset = NULL;
+
+ if ((ret = __os_calloc(NULL, 1, sizeof(VRFY_DBINFO), &vdp)) != 0)
+ goto err;
+
+ if ((ret = __db_create_internal(&cdbp, env, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0)
+ goto err;
+
+ if ((ret = __db_set_pagesize(cdbp, pgsize)) != 0)
+ goto err;
+
+ /* If transactional, make sure we don't log. */
+ if (TXN_ON(env) &&
+ (ret = __db_set_flags(cdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ if ((ret = __db_open(cdbp, ip,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0)
+ goto err;
+
+ /* If transactional, make sure we don't log. */
+ if (TXN_ON(env) &&
+ (ret = __db_set_flags(pgdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+
+ if ((ret = __db_open(pgdbp, ip,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ if ((ret = __db_vrfy_pgset(env, ip, pgsize, &pgset)) != 0)
+ goto err;
+
+ if (CDB_LOCKING(env) &&
+ (ret = __cdsgroup_begin(env, &vdp->txn)) != 0)
+ goto err;
+
+ LIST_INIT(&vdp->subdbs);
+ LIST_INIT(&vdp->activepips);
+
+ vdp->cdbp = cdbp;
+ vdp->pgdbp = pgdbp;
+ vdp->pgset = pgset;
+ vdp->thread_info = ip;
+ *vdpp = vdp;
+ return (0);
+
+err: if (cdbp != NULL)
+ (void)__db_close(cdbp, NULL, 0);
+ if (pgdbp != NULL)
+ (void)__db_close(pgdbp, NULL, 0);
+ if (pgset != NULL)
+ (void)__db_close(pgset, NULL, 0);
+ if (vdp != NULL) {
+ if (vdp->txn != NULL)
+ (void)vdp->txn->commit(vdp->txn, 0);
+ __os_free(env, vdp);
+ }
+ return (ret);
+}
+
+/*
+ * __db_vrfy_dbinfo_destroy --
+ * Destructor for VRFY_DBINFO. Destroys VRFY_PAGEINFOs and deallocates
+ * structure.
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_dbinfo_destroy(env, vdp)
+ ENV *env;
+ VRFY_DBINFO *vdp;
+{
+ VRFY_CHILDINFO *c;
+ int t_ret, ret;
+
+ ret = 0;
+
+ /*
+ * Discard active page structures. Ideally there wouldn't be any,
+ * but in some error cases we may not have cleared them all out.
+ */
+ while (LIST_FIRST(&vdp->activepips) != NULL)
+ if ((t_ret = __db_vrfy_putpageinfo(
+ env, vdp, LIST_FIRST(&vdp->activepips))) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /* Discard subdatabase list structures. */
+ while ((c = LIST_FIRST(&vdp->subdbs)) != NULL) {
+ LIST_REMOVE(c, links);
+ __os_free(NULL, c);
+ }
+
+ if ((t_ret = __db_close(vdp->pgdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_close(vdp->cdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_close(vdp->pgset, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (vdp->txn != NULL &&
+ (t_ret = vdp->txn->commit(vdp->txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (vdp->extents != NULL)
+ __os_free(env, vdp->extents);
+ __os_free(env, vdp);
+ return (ret);
+}
+
+/*
+ * __db_vrfy_getpageinfo --
+ * Get a PAGEINFO structure for a given page, creating it if necessary.
+ *
+ * PUBLIC: int __db_vrfy_getpageinfo
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **));
+ */
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ VRFY_PAGEINFO **pipp;
+{
+ DB *pgdbp;
+ DBT key, data;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret;
+
+ /*
+ * We want a page info struct. There are three places to get it from,
+ * in decreasing order of preference:
+ *
+ * 1. vdp->activepips. If it's already "checked out", we're
+ * already using it, so we return the same structure with a
+ * bumped refcount. This is necessary because this code is
+ * replacing array accesses, and it's common for f() to make some
+ * changes to a pip, and then call g() and h() which each make
+ * changes to the same pip. vdps are never shared between threads
+ * (they're never returned to the application), so this is safe.
+ * 2. The pgdbp. It's not in memory, but it's in the database, so
+ * get it, give it a refcount of 1, and stick it on activepips.
+ * 3. malloc. It doesn't exist yet; create it, then stick it on
+ * activepips. We'll put it in the database when we putpageinfo
+ * later.
+ */
+
+ /* Case 1. */
+ LIST_FOREACH(pip, &vdp->activepips, links)
+ if (pip->pgno == pgno)
+ goto found;
+
+ /* Case 2. */
+ pgdbp = vdp->pgdbp;
+ env = pgdbp->env;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ F_SET(&data, DB_DBT_MALLOC);
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ if ((ret = __db_get(pgdbp,
+ vdp->thread_info, vdp->txn, &key, &data, 0)) == 0) {
+ /* Found it. */
+ DB_ASSERT(env, data.size == sizeof(VRFY_PAGEINFO));
+ pip = data.data;
+ LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+ goto found;
+ } else if (ret != DB_NOTFOUND) /* Something nasty happened. */
+ return (ret);
+
+ /* Case 3 */
+ if ((ret = __db_vrfy_pageinfo_create(env, &pip)) != 0)
+ return (ret);
+
+ LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+found: pip->pi_refcount++;
+
+ *pipp = pip;
+ return (0);
+}
+
+/*
+ * __db_vrfy_putpageinfo --
+ * Put back a VRFY_PAGEINFO that we're done with.
+ *
+ * PUBLIC: int __db_vrfy_putpageinfo __P((ENV *,
+ * PUBLIC: VRFY_DBINFO *, VRFY_PAGEINFO *));
+ */
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ VRFY_PAGEINFO *pip;
+{
+ DB *pgdbp;
+ DBT key, data;
+ VRFY_PAGEINFO *p;
+ int ret;
+
+ if (--pip->pi_refcount > 0)
+ return (0);
+
+ pgdbp = vdp->pgdbp;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pip->pgno;
+ key.size = sizeof(db_pgno_t);
+ data.data = pip;
+ data.size = sizeof(VRFY_PAGEINFO);
+
+ if ((ret = __db_put(pgdbp,
+ vdp->thread_info, vdp->txn, &key, &data, 0)) != 0)
+ return (ret);
+
+ LIST_FOREACH(p, &vdp->activepips, links)
+ if (p == pip)
+ break;
+ if (p != NULL)
+ LIST_REMOVE(p, links);
+
+ __os_ufree(env, pip);
+ return (0);
+}
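+
+/*
+ * Typical get/put pairing (illustrative only): getpageinfo bumps a
+ * refcount and putpageinfo only writes the struct back to pgdbp once
+ * the count drops to zero, so nested users of the same pip are cheap:
+ *
+ *	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ *		return (ret);
+ *	...inspect or update pip fields...
+ *	ret = __db_vrfy_putpageinfo(env, vdp, pip);
+ */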
+
+/*
+ * __db_vrfy_pgset --
+ * Create a temporary database for the storing of sets of page numbers.
+ * (A mapping from page number to int, used by the *_meta2pgset functions,
+ * as well as for keeping track of which pages the verifier has seen.)
+ *
+ * PUBLIC: int __db_vrfy_pgset __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, u_int32_t, DB **));
+ */
+int
+__db_vrfy_pgset(env, ip, pgsize, dbpp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t pgsize;
+ DB **dbpp;
+{
+ DB *dbp;
+ int ret;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ if ((ret = __db_set_pagesize(dbp, pgsize)) != 0)
+ goto err;
+
+ /* If transactional, make sure we don't log. */
+ if (TXN_ON(env) &&
+ (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ if ((ret = __db_open(dbp, ip,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) == 0)
+ *dbpp = dbp;
+ else
+err: (void)__db_close(dbp, NULL, 0);
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_pgset_get --
+ * Get the value associated in a page set with a given pgno. Return
+ * a 0 value (and succeed) if we've never heard of this page.
+ *
+ * PUBLIC: int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: db_pgno_t, int *));
+ */
+int
+__db_vrfy_pgset_get(dbp, ip, txn, pgno, valp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t pgno;
+ int *valp;
+{
+ DBT key, data;
+ int ret, val;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+ data.data = &val;
+ data.ulen = sizeof(int);
+ F_SET(&data, DB_DBT_USERMEM);
+
+ if ((ret = __db_get(dbp, ip, txn, &key, &data, 0)) == 0) {
+ DB_ASSERT(dbp->env, data.size == sizeof(int));
+ } else if (ret == DB_NOTFOUND)
+ val = 0;
+ else
+ return (ret);
+
+ *valp = val;
+ return (0);
+}
+
+/*
+ * __db_vrfy_pgset_inc --
+ * Increment the value associated with a pgno by 1.
+ *
+ * PUBLIC: int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: db_pgno_t));
+ */
+int
+__db_vrfy_pgset_inc(dbp, ip, txn, pgno)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t pgno;
+{
+ DBT key, data;
+ int ret;
+ int val;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ val = 0;
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+ data.data = &val;
+ data.ulen = sizeof(int);
+ F_SET(&data, DB_DBT_USERMEM);
+
+ if ((ret = __db_get(dbp, ip, txn, &key, &data, 0)) == 0) {
+ DB_ASSERT(dbp->env, data.size == sizeof(int));
+ } else if (ret != DB_NOTFOUND)
+ return (ret);
+
+ data.size = sizeof(int);
+ ++val;
+
+ return (__db_put(dbp, ip, txn, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_pgset_next --
+ * Given a cursor open in a pgset database, get the next page in the
+ * set.
+ *
+ * PUBLIC: int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *));
+ */
+int
+__db_vrfy_pgset_next(dbc, pgnop)
+ DBC *dbc;
+ db_pgno_t *pgnop;
+{
+ DBT key, data;
+ db_pgno_t pgno;
+ int ret;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ /* We don't care about the data, just the keys. */
+ F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+ F_SET(&key, DB_DBT_USERMEM);
+ key.data = &pgno;
+ key.ulen = sizeof(db_pgno_t);
+
+ if ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbc->env, key.size == sizeof(db_pgno_t));
+ *pgnop = pgno;
+
+ return (0);
+}
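+
+/*
+ * Illustrative iteration pattern (a sketch; the salvage routines use
+ * exactly this shape): open a cursor on the pgset database and walk the
+ * keys until DB_NOTFOUND:
+ *
+ *	if ((ret = __db_cursor(pgset, ip, NULL, &dbc, 0)) == 0)
+ *		while ((ret = __db_vrfy_pgset_next(dbc, &p)) == 0)
+ *			...process page p...
+ *	if (ret == DB_NOTFOUND)
+ *		ret = 0;
+ */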
+
+/*
+ * __db_vrfy_childcursor --
+ * Create a cursor to walk the child list with. Returns with a nonzero
+ * final argument if the specified page has no children.
+ *
+ * PUBLIC: int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **));
+ */
+int
+__db_vrfy_childcursor(vdp, dbcp)
+ VRFY_DBINFO *vdp;
+ DBC **dbcp;
+{
+ DB *cdbp;
+ DBC *dbc;
+ int ret;
+
+ cdbp = vdp->cdbp;
+
+ if ((ret = __db_cursor(cdbp, vdp->thread_info, vdp->txn, &dbc, 0)) == 0)
+ *dbcp = dbc;
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_childput --
+ * Add a child structure to the set for a given page.
+ *
+ * PUBLIC: int __db_vrfy_childput
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *));
+ */
+int
+__db_vrfy_childput(vdp, pgno, cip)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ VRFY_CHILDINFO *cip;
+{
+ DB *cdbp;
+ DBC *cc;
+ DBT key, data;
+ VRFY_CHILDINFO *oldcip;
+ int ret;
+
+ cdbp = vdp->cdbp;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ /*
+ * We want to avoid adding multiple entries for a single child page;
+ * we only need to verify each child once, even if a child (such
+ * as an overflow key) is multiply referenced.
+ *
+ * However, we also need to make sure that when walking the list
+ * of children, we encounter them in the order they're referenced
+ * on a page. (This permits us, for example, to verify the
+ * prev_pgno/next_pgno chain of Btree leaf pages.)
+ *
+ * Check the child database to make sure that this page isn't
+ * already a child of the specified page number. If it's not,
+ * put it at the end of the duplicate set.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ return (ret);
+ for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &oldcip))
+ if (oldcip->pgno == cip->pgno) {
+ /*
+ * Found a matching child. Increment its reference
+ * count--we've run into it again--but don't put it
+ * again.
+ */
+ if ((ret = __db_vrfy_childinc(cc, oldcip)) != 0 ||
+ (ret = __db_vrfy_ccclose(cc)) != 0)
+ return (ret);
+ return (0);
+ }
+ if (ret != DB_NOTFOUND) {
+ (void)__db_vrfy_ccclose(cc);
+ return (ret);
+ }
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ return (ret);
+
+ cip->refcnt = 1;
+ data.data = cip;
+ data.size = sizeof(VRFY_CHILDINFO);
+
+ return (__db_put(cdbp, vdp->thread_info, vdp->txn, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_childinc --
+ * Increment the refcount of the VRFY_CHILDINFO struct that the child
+ * cursor is pointing to. (The caller has just retrieved this struct, and
+ * passes it in as cip to save us a get.)
+ */
+static int
+__db_vrfy_childinc(dbc, cip)
+ DBC *dbc;
+ VRFY_CHILDINFO *cip;
+{
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ cip->refcnt++;
+ data.data = cip;
+ data.size = sizeof(VRFY_CHILDINFO);
+
+ return (__dbc_put(dbc, &key, &data, DB_CURRENT));
+}
+
+/*
+ * __db_vrfy_ccset --
+ * Sets a cursor created with __db_vrfy_childcursor to the first
+ * child of the given pgno, and returns it in the third arg.
+ *
+ * PUBLIC: int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccset(dbc, pgno, cipp)
+ DBC *dbc;
+ db_pgno_t pgno;
+ VRFY_CHILDINFO **cipp;
+{
+ DBT key, data;
+ int ret;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ if ((ret = __dbc_get(dbc, &key, &data, DB_SET)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+ *cipp = (VRFY_CHILDINFO *)data.data;
+
+ return (0);
+}
+
+/*
+ * __db_vrfy_ccnext --
+ * Gets the next child of the given cursor created with
+ * __db_vrfy_childcursor, and returns it in the memory provided in the
+ * second arg.
+ *
+ * PUBLIC: int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccnext(dbc, cipp)
+ DBC *dbc;
+ VRFY_CHILDINFO **cipp;
+{
+ DBT key, data;
+ int ret;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ if ((ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+ *cipp = (VRFY_CHILDINFO *)data.data;
+
+ return (0);
+}
+
+/*
+ * __db_vrfy_ccclose --
+ * Closes the cursor created with __db_vrfy_childcursor.
+ *
+ * This doesn't actually do anything interesting now, but it's
+ * not inconceivable that we might change the internal database usage
+ * and keep the interfaces the same, and a function call here or there
+ * seldom hurts anyone.
+ *
+ * PUBLIC: int __db_vrfy_ccclose __P((DBC *));
+ */
+int
+__db_vrfy_ccclose(dbc)
+ DBC *dbc;
+{
+
+ return (__dbc_close(dbc));
+}
+
+/*
+ * __db_vrfy_pageinfo_create --
+ * Constructor for VRFY_PAGEINFO; allocates and initializes.
+ */
+static int
+__db_vrfy_pageinfo_create(env, pipp)
+ ENV *env;
+ VRFY_PAGEINFO **pipp;
+{
+ VRFY_PAGEINFO *pip;
+ int ret;
+
+ /*
+ * pageinfo structs are sometimes allocated here and sometimes
+ * allocated by fetching them from a database with DB_DBT_MALLOC.
+ * There's no easy way for the destructor to tell which was
+ * used, and so we always allocate with __os_umalloc so we can free
+ * with __os_ufree.
+ */
+ if ((ret = __os_umalloc(env, sizeof(VRFY_PAGEINFO), &pip)) != 0)
+ return (ret);
+ memset(pip, 0, sizeof(VRFY_PAGEINFO));
+
+ *pipp = pip;
+ return (0);
+}
+
+/*
+ * __db_salvage_init --
+ * Set up salvager database.
+ *
+ * PUBLIC: int __db_salvage_init __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_init(vdp)
+ VRFY_DBINFO *vdp;
+{
+ DB *dbp;
+ int ret;
+
+ if ((ret = __db_create_internal(&dbp, NULL, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_set_pagesize(dbp, 1024)) != 0)
+ goto err;
+
+ if ((ret = __db_open(dbp, vdp->thread_info,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ vdp->salvage_pages = dbp;
+ return (0);
+
+err: (void)__db_close(dbp, NULL, 0);
+ return (ret);
+}
+
+/*
+ * __db_salvage_destroy --
+ * Close salvager database.
+ * PUBLIC: int __db_salvage_destroy __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_destroy(vdp)
+ VRFY_DBINFO *vdp;
+{
+ return (vdp->salvage_pages == NULL ? 0 :
+ __db_close(vdp->salvage_pages, NULL, 0));
+}
+
+/*
+ * __db_salvage_getnext --
+ * Get the next (first) unprinted page in the database of pages we still
+ * need to print. Delete entries for any already-printed pages we encounter
+ * in this search, as well as for the page we return.
+ *
+ * PUBLIC: int __db_salvage_getnext
+ * PUBLIC: __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int));
+ */
+int
+__db_salvage_getnext(vdp, dbcp, pgnop, pgtypep, skip_overflow)
+ VRFY_DBINFO *vdp;
+ DBC **dbcp;
+ db_pgno_t *pgnop;
+ u_int32_t *pgtypep;
+ int skip_overflow;
+{
+ DB *dbp;
+ DBT key, data;
+ int ret;
+ u_int32_t pgtype;
+
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ if (*dbcp == NULL &&
+ (ret = __db_cursor(dbp, vdp->thread_info, vdp->txn, dbcp, 0)) != 0)
+ return (ret);
+
+ while ((ret = __dbc_get(*dbcp, &key, &data, DB_NEXT)) == 0) {
+ DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+ memcpy(&pgtype, data.data, sizeof(pgtype));
+
+ if (skip_overflow && pgtype == SALVAGE_OVERFLOW)
+ continue;
+
+ if ((ret = __dbc_del(*dbcp, 0)) != 0)
+ return (ret);
+ if (pgtype != SALVAGE_IGNORE) {
+ DB_ASSERT(dbp->env, key.size == sizeof(db_pgno_t));
+ DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+
+ *pgnop = *(db_pgno_t *)key.data;
+ *pgtypep = *(u_int32_t *)data.data;
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_salvage_isdone --
+ * Return whether or not the given pgno is already marked
+ * SALVAGE_IGNORE (meaning that we don't need to print it again).
+ *
+ * Returns DB_KEYEXIST if it is marked, 0 if not, or another error
+ * code on failure.
+ *
+ * PUBLIC: int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_isdone(vdp, pgno)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT key, data;
+ int ret;
+ u_int32_t currtype;
+
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ currtype = SALVAGE_INVALID;
+ data.data = &currtype;
+ data.ulen = sizeof(u_int32_t);
+ data.flags = DB_DBT_USERMEM;
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ /*
+ * Look up this page in the salvage database. If it has an entry
+ * and is marked SALVAGE_IGNORE, it has already been printed;
+ * report that with DB_KEYEXIST. Any other mark, or no entry at
+ * all, means the page still needs to be dealt with.
+ */
+ if ((ret = __db_get(dbp,
+ vdp->thread_info, vdp->txn, &key, &data, 0)) == 0) {
+ /*
+ * The key's already here. Check and see if it's already
+ * marked done. If it is, return DB_KEYEXIST. If it's not,
+ * return 0.
+ */
+ if (currtype == SALVAGE_IGNORE)
+ return (DB_KEYEXIST);
+ else
+ return (0);
+ } else if (ret != DB_NOTFOUND)
+ return (ret);
+
+ /* The pgno is not yet marked anything; return 0. */
+ return (0);
+}
+
+/*
+ * __db_salvage_markdone --
+ * Mark a given page as done.
+ *
+ * PUBLIC: int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_markdone(vdp, pgno)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT key, data;
+ int pgtype, ret;
+ u_int32_t currtype;
+
+ pgtype = SALVAGE_IGNORE;
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ currtype = SALVAGE_INVALID;
+ data.data = &currtype;
+ data.ulen = sizeof(u_int32_t);
+ data.flags = DB_DBT_USERMEM;
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ /*
+ * Put an entry for this page, with pgno as key and type as data,
+ * unless it's already there and is marked done.
+ * If it's there and is marked anything else, that's fine--we
+ * want to mark it done, but __db_salvage_isdone only lets
+ * us know if it's marked IGNORE.
+ *
+ * We don't want to return DB_KEYEXIST, though; this will
+ * likely get passed up all the way and make no sense to the
+ * application. Instead, use DB_VERIFY_BAD to indicate that
+ * we've seen this page already--it probably indicates a
+ * multiply-linked page.
+ */
+ if ((ret = __db_salvage_isdone(vdp, pgno)) != 0)
+ return (ret == DB_KEYEXIST ? DB_VERIFY_BAD : ret);
+
+ data.size = sizeof(u_int32_t);
+ data.data = &pgtype;
+
+ return (__db_put(dbp, vdp->thread_info, vdp->txn, &key, &data, 0));
+}
+
+/*
+ * __db_salvage_markneeded --
+ * If it has not yet been printed, make note of the fact that a page
+ * must be dealt with later.
+ *
+ * PUBLIC: int __db_salvage_markneeded
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_salvage_markneeded(vdp, pgno, pgtype)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+{
+ DB *dbp;
+ DBT key, data;
+ int ret;
+
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ data.data = &pgtype;
+ data.size = sizeof(u_int32_t);
+
+ /*
+ * Put an entry for this page, with pgno as key and type as data,
+ * unless it's already there, in which case its existing mark
+ * (still needed, or already done) stands.
+ */
+ ret = __db_put(dbp,
+ vdp->thread_info, vdp->txn, &key, &data, DB_NOOVERWRITE);
+ return (ret == DB_KEYEXIST ? 0 : ret);
+}
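+
+/*
+ * Taken together, the salvage_pages database acts as a small state
+ * machine over page numbers (sketch, for illustration): markneeded(pgno,
+ * type) records a page we expect to deal with later; printing it and
+ * calling markdone(pgno) overwrites the type with SALVAGE_IGNORE; and
+ * getnext() deletes entries as it hands them out, skipping IGNORE ones,
+ * so each page is printed at most once even if multiply referenced.
+ */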
+
+/*
+ * __db_vrfy_prdbt --
+ * Print out a DBT data element from a verification routine.
+ *
+ * PUBLIC: int __db_vrfy_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC: int (*)(void *, const void *), int, int, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix,
+ handle, callback, is_recno, is_heap, vdp)
+ DBT *dbtp;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ int is_recno;
+ int is_heap;
+ VRFY_DBINFO *vdp;
+{
+ if (vdp != NULL) {
+ /*
+ * If vdp is non-NULL, we might be the first key in the
+ * "fake" subdatabase used for key/data pairs we can't
+ * associate with a known subdb.
+ *
+ * Check and clear the SALVAGE_PRINTHEADER flag; if
+ * it was set, print a subdatabase header.
+ */
+ if (F_ISSET(vdp, SALVAGE_PRINTHEADER)) {
+ (void)__db_prheader(
+ NULL, "__OTHER__", 0, 0, handle, callback, vdp, 0);
+ F_CLR(vdp, SALVAGE_PRINTHEADER);
+ F_SET(vdp, SALVAGE_PRINTFOOTER);
+ }
+
+ /*
+ * Even if the printable flag wasn't set by our immediate
+ * caller, it may be set on a salvage-wide basis.
+ */
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ checkprint = 1;
+ }
+ return (
+ __db_prdbt(dbtp, checkprint,
+ prefix, handle, callback, is_recno, is_heap));
+}
diff --git a/src/db/partition.c b/src/db/partition.c
new file mode 100644
index 00000000..f8beaf16
--- /dev/null
+++ b/src/db/partition.c
@@ -0,0 +1,2059 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#ifdef HAVE_HASH
+#include "dbinc/hash.h"
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+#ifdef HAVE_PARTITION
+
+static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ const char *, const char *, const char *, u_int32_t));
+static int __partc_close __P((DBC *, db_pgno_t, int *));
+static int __partc_del __P((DBC*, u_int32_t));
+static int __partc_destroy __P((DBC*));
+static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t));
+static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __partc_writelock __P((DBC*));
+static int __partition_chk_meta __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+static int __partition_setup_keys __P((DBC *,
+ DB_PARTITION *, DBMETA *, u_int32_t));
+static int __part_key_cmp __P((const void *, const void *));
+static inline void __part_search __P((DB *,
+ DB_PARTITION *, DBT *, u_int32_t *));
+
+static char *Alloc_err = DB_STR_A("0644",
+ "Partition open failed to allocate %d bytes", "%d");
+
+/*
+ * Allocate a partition cursor and copy flags to the partition cursor.
+ * Not passed:
+ * DBC_PARTITIONED -- the subcursors are not.
+ * DBC_OWN_LID -- the arg dbc owns the lock id.
+ * DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on
+ * the whole DB, not the partition.
+ */
+#define GET_PART_CURSOR(dbc, new_dbc, part_id) do { \
+ DB *__part_dbp; \
+ __part_dbp = part->handles[part_id]; \
+ if ((ret = __db_cursor_int(__part_dbp, \
+ (dbc)->thread_info, (dbc)->txn, __part_dbp->type, \
+ PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0) \
+ goto err; \
+ (new_dbc)->flags = (dbc)->flags & \
+ ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER); \
+} while (0)
+
+/*
+ * Search for the correct partition.
+ */
+static inline void __part_search(dbp, part, key, part_idp)
+ DB *dbp;
+ DB_PARTITION *part;
+ DBT *key;
+ u_int32_t *part_idp;
+{
+ db_indx_t base, indx, limit;
+ int cmp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ DB_ASSERT(dbp->env, part->nparts != 0);
+ COMPQUIET(cmp, 0);
+ COMPQUIET(indx, 0);
+
+ func = ((BTREE *)dbp->bt_internal)->bt_compare;
+ DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
+ DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
+ cmp = func(dbp, key, &part->keys[indx]);
+ if (cmp == 0)
+ break;
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
+ }
+ if (cmp == 0)
+ *part_idp = indx;
+ else if ((*part_idp = base) != 0)
+ (*part_idp)--;
+}
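+
+/*
+ * Worked example (hypothetical range keys): with nparts == 3 and keys
+ * { "", "f", "p" }, a search key of "m" compares greater than "f" and
+ * less than "p", so the binary search leaves base just past "f" and we
+ * return partition 1; keys below "f" map to partition 0, and keys at or
+ * above "p" map to partition 2.
+ */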
+
+/*
+ * __partition_init --
+ * Initialize the partition structure.
+ * Called when the meta data page is read in during database open or
+ * when partition keys or a callback are set.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_PARTITION *part;
+ int ret;
+
+ if ((part = dbp->p_internal) != NULL) {
+ if ((LF_ISSET(DBMETA_PART_RANGE) &&
+ F_ISSET(part, PART_CALLBACK)) ||
+ (LF_ISSET(DBMETA_PART_CALLBACK) &&
+ F_ISSET(part, PART_RANGE))) {
+ __db_errx(dbp->env, DB_STR("0645",
+ "Cannot specify callback and range keys."));
+ return (EINVAL);
+ }
+ } else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DBMETA_PART_RANGE))
+ F_SET(part, PART_RANGE);
+ if (LF_ISSET(DBMETA_PART_CALLBACK))
+ F_SET(part, PART_CALLBACK);
+ dbp->p_internal = part;
+ /* Set up AM-specific methods that do not require an open. */
+ dbp->db_am_rename = __part_rename;
+ dbp->db_am_remove = __part_remove;
+ return (0);
+}
+
+/*
+ * __partition_set --
+ * Set the partitioning keys or callback function.
+ * This routine must be called prior to creating the database.
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC: u_int32_t (*callback)(DB *, DBT *key)));
+ */
+int
+__partition_set(dbp, parts, keys, callback)
+ DB *dbp;
+ u_int32_t parts;
+ DBT *keys;
+ u_int32_t (*callback)(DB *, DBT *key);
+{
+ DB_PARTITION *part;
+ ENV *env;
+ int ret;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition");
+ env = dbp->dbenv->env;
+
+ if (parts < 2) {
+ __db_errx(env, DB_STR("0646",
+ "Must specify at least 2 partitions."));
+ return (EINVAL);
+ }
+
+ if (keys == NULL && callback == NULL) {
+ __db_errx(env, DB_STR("0647",
+ "Must specify either keys or a callback."));
+ return (EINVAL);
+ }
+ if (keys != NULL && callback != NULL) {
+bad: __db_errx(env, DB_STR("0648",
+ "May not specify both keys and a callback."));
+ return (EINVAL);
+ }
+
+ if ((ret = __partition_init(dbp,
+ keys != NULL ?
+ DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0)
+ return (ret);
+ part = dbp->p_internal;
+
+ if ((part->keys != NULL && callback != NULL) ||
+ (part->callback != NULL && keys != NULL))
+ goto bad;
+
+ part->nparts = parts;
+ part->keys = keys;
+ part->callback = callback;
+
+ return (0);
+}
+
+/*
+ * __partition_set_dirs --
+ * Set the directories for creating the partition databases.
+ * They must be in the environment.
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+ DB *dbp;
+ const char **dirp;
+{
+ DB_ENV *dbenv;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t ndirs, slen;
+ int i, ret;
+ const char **dir;
+ char *cp, **part_dirs, **pd;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs");
+ dbenv = dbp->dbenv;
+ env = dbp->env;
+
+ ndirs = 1;
+ slen = 0;
+ for (dir = dirp; *dir != NULL; dir++) {
+ if (F_ISSET(env, ENV_DBLOCAL))
+ slen += (u_int32_t)strlen(*dir) + 1;
+ ndirs++;
+ }
+
+ slen += sizeof(char *) * ndirs;
+ if ((ret = __os_malloc(env, slen, &part_dirs)) != 0)
+ return (ret);
+ memset(part_dirs, 0, slen);
+
+ cp = (char *) part_dirs + (sizeof(char *) * ndirs);
+ pd = part_dirs;
+ for (dir = dirp; *dir != NULL; dir++, pd++) {
+ if (F_ISSET(env, ENV_DBLOCAL)) {
+ (void)strcpy(cp, *dir);
+ *pd = cp;
+ cp += strlen(*dir) + 1;
+ continue;
+ }
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(*dir, dbenv->db_data_dir[i]) == 0)
+ break;
+ if (i == dbenv->data_next) {
+ __db_errx(dbp->env, DB_STR_A("0649",
+ "Directory not in environment list %s",
+ "%s"), *dir);
+ __os_free(env, part_dirs);
+ return (EINVAL);
+ }
+ *pd = dbenv->db_data_dir[i];
+ }
+
+ if ((part = dbp->p_internal) == NULL) {
+ if ((ret = __partition_init(dbp, 0)) != 0)
+ return (ret);
+ part = dbp->p_internal;
+ }
+
+ part->dirs = (const char **)part_dirs;
+
+ return (0);
+}
+
+/*
+ * __partition_open --
+ * Open/create a partitioned database.
+ * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
+ */
+int
+__partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *fname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode, do_open;
+{
+ DB *part_db;
+ DB_PARTITION *part;
+ DBC *dbc;
+ ENV *env;
+ u_int32_t part_id;
+ int ret;
+ char *name, *sp;
+ const char **dirp, *np;
+
+ part = dbp->p_internal;
+ env = dbp->dbenv->env;
+ name = NULL;
+
+ if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open)
+ goto err;
+
+ if ((ret = __os_calloc(env,
+ part->nparts, sizeof(*part->handles), &part->handles)) != 0) {
+ __db_errx(env,
+ Alloc_err, part->nparts * sizeof(*part->handles));
+ goto err;
+ }
+
+ DB_ASSERT(env, fname != NULL);
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ goto err;
+ }
+
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ goto done;
+ dirp = part->dirs;
+ for (part_id = 0; part_id < part->nparts; part_id++) {
+ if ((ret = __db_create_internal(
+ &part->handles[part_id], dbp->env, 0)) != 0)
+ goto err;
+
+ part_db = part->handles[part_id];
+ part_db->flags = F_ISSET(dbp,
+ ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED));
+ F_SET(part_db, DB_AM_PARTDB);
+ part_db->adj_fileid = dbp->adj_fileid;
+ part_db->pgsize = dbp->pgsize;
+ part_db->priority = dbp->priority;
+ part_db->db_append_recno = dbp->db_append_recno;
+ part_db->db_feedback = dbp->db_feedback;
+ part_db->dup_compare = dbp->dup_compare;
+ part_db->app_private = dbp->app_private;
+ part_db->api_internal = dbp->api_internal;
+
+ if (dbp->type == DB_BTREE)
+ __bam_copy_config(dbp, part_db, part->nparts);
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH)
+ __ham_copy_config(dbp, part_db, part->nparts);
+#endif
+
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ if (do_open) {
+ /*
+ * Cycle through the directory names passed in,
+ * if any.
+ */
+ if (dirp != NULL &&
+ (part_db->dirname = *dirp++) == NULL) {
+ part_db->dirname = *(dirp = part->dirs);
+ dirp++;
+ }
+ if ((ret = __db_open(part_db, ip, txn,
+ name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+ } else if ((ret = __os_strdup(env, name, &part_db->fname)) != 0)
+ goto err;
+ }
+
+	/* Discard the cursor used to open the database; it's the wrong type. */
+done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((ret = __dbc_destroy(dbc)) != 0)
+ break;
+
+ if (0) {
+err: (void)__partition_close(dbp, txn, 0);
+ }
+ if (name != NULL)
+ __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __partition_chk_meta --
+ * Check for a consistent meta data page and parameters when opening a
+ * partitioned database.
+ */
+static int
+__partition_chk_meta(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ DB_PARTITION *part;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t base_pgno;
+ int ret, t_ret;
+
+ dbc = NULL;
+ meta = NULL;
+ LOCK_INIT(metalock);
+ part = dbp->p_internal;
+ mpf = dbp->mpf;
+ env = dbp->env;
+ ret = 0;
+
+ /* Get a cursor on the main db. */
+ dbp->p_internal = NULL;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ /* Get the metadata page. */
+ base_pgno = PGNO_BASE_MD;
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (meta->magic != DB_HASHMAGIC &&
+ (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) {
+ __db_errx(env, DB_STR("0650",
+ "Partitioning may only specified on BTREE and HASH databases."));
+ ret = EINVAL;
+ goto err;
+ }
+ if (!FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) {
+ __db_errx(env, DB_STR("0651",
+ "Partitioning specified on a non-partitioned database."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if ((F_ISSET(part, PART_RANGE) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) ||
+ (F_ISSET(part, PART_CALLBACK) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) {
+ __db_errx(env, DB_STR("0652",
+ "Incompatible partitioning specified."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) &&
+ part->callback == NULL && !IS_RECOVERING(env) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, DB_STR("0653",
+ "Partition callback not specified."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env, DB_STR("0654",
+ "Record numbers are not supported in partitioned databases."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (part->nparts == 0) {
+ if (LF_ISSET(DB_CREATE) && meta->nparts == 0) {
+ __db_errx(env, DB_STR("0655",
+ "Zero paritions specified."));
+ ret = EINVAL;
+ goto err;
+ } else
+ part->nparts = meta->nparts;
+ } else if (meta->nparts != 0 && part->nparts != meta->nparts) {
+ __db_errx(env, DB_STR("0656",
+ "Number of partitions does not match."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (meta->magic == DB_HASHMAGIC) {
+ if (!F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env, DB_STR("0657",
+ "Hash database must specify a partition callback."));
+ ret = EINVAL;
+ }
+ } else if (meta->magic != DB_BTREEMAGIC) {
+ __db_errx(env, DB_STR("0658",
+ "Partitioning only supported on BTREE nad HASH."));
+ ret = EINVAL;
+ } else
+ ret = __partition_setup_keys(dbc, part, meta, flags);
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbp->p_internal = part;
+ return (ret);
+}
+
+/*
+ * Support for sorting keys. Keys must be sorted using the btree
+ * compare function so if we call qsort in __partition_setup_keys
+ * we use this structure to pass the DBP and compare function.
+ */
+struct key_sort {
+ DB *dbp;
+ DBT *key;
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+};
+
+static int __part_key_cmp(a, b)
+ const void *a, *b;
+{
+ const struct key_sort *ka, *kb;
+
+ ka = a;
+ kb = b;
+ return (ka->compare(ka->dbp, ka->key, kb->key));
+}
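+
+/*
+ * A condensed sketch of how the trampoline is used (see
+ * __partition_setup_keys below for the real call):
+ *
+ *	ks[j].dbp = dbp;
+ *	ks[j].compare = t->bt_compare;
+ *	ks[j].key = &keys[j];
+ *	...
+ *	qsort(ks, nkeys, sizeof(struct key_sort), __part_key_cmp);
+ */
+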
+/*
+ * __partition_setup_keys --
+ * Get the partition keys into memory, or put them to disk if we
+ * are creating a partitioned database.
+ */
+static int
+__partition_setup_keys(dbc, part, meta, flags)
+ DBC *dbc;
+ DB_PARTITION *part;
+ DBMETA *meta;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, key, *keys, *kp;
+ ENV *env;
+ u_int32_t ds, i, j;
+ u_int8_t *dd;
+ struct key_sort *ks;
+ int have_keys, ret;
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+ void *dp;
+
+ COMPQUIET(dd, NULL);
+ COMPQUIET(ds, 0);
+ memset(&data, 0, sizeof(data));
+ memset(&key, 0, sizeof(key));
+ ks = NULL;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Need to just read the main database. */
+ dbp->p_internal = NULL;
+ have_keys = 0;
+
+	/* First verify that things are what we expect. */
+ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ if (F_ISSET(part, PART_CALLBACK)) {
+ ret = 0;
+ goto done;
+ }
+ if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+ !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, DB_STR("0659", "No range keys found."));
+ ret = EINVAL;
+ goto err;
+ }
+ } else {
+ if (F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env, DB_STR("0660",
+ "Keys found and callback set."));
+ ret = EINVAL;
+ goto err;
+ }
+ if (key.size != 0) {
+ __db_errx(env, DB_STR("0661",
+ "Partition key 0 is not empty."));
+ ret = EINVAL;
+ goto err;
+ }
+ have_keys = 1;
+ }
+
+ if (LF_ISSET(DB_CREATE) && have_keys == 0) {
+ /* Insert the keys into the master database. */
+ for (i = 0; i < part->nparts - 1; i++) {
+ if ((ret = __db_put(dbp, dbc->thread_info,
+ dbc->txn, &part->keys[i], &data, 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * Insert the "0" pointer. All records less than the first
+ * given key go into this partition. We must use the default
+ * compare to insert this key, otherwise it might not be first.
+ */
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ t->bt_compare = __bam_defcmp;
+ memset(&key, 0, sizeof(key));
+ ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0);
+ t->bt_compare = compare;
+ if (ret != 0)
+ goto err;
+ }
+done: if (F_ISSET(part, PART_RANGE)) {
+ /*
+ * Allocate one page to hold the keys plus space at the
+ * end of the buffer to put an array of DBTs. If there
+ * is not enough space __dbc_get will return how much
+ * is needed and we realloc.
+ */
+ if ((ret = __os_malloc(env,
+ meta->pagesize + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0) {
+ __db_errx(env, Alloc_err, meta->pagesize);
+ goto err;
+ }
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ data.data = part->data;
+ data.ulen = meta->pagesize;
+ data.flags = DB_DBT_USERMEM;
+again: if ((ret = __dbc_get(dbc, &key, &data,
+ DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) {
+ if ((ret = __os_realloc(env,
+ data.size + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0)
+ goto err;
+ data.data = part->data;
+ data.ulen = data.size;
+ goto again;
+ }
+ if (ret == 0) {
+			/*
+			 * If the caller passed in keys, they must match
+			 * the keys already stored in the database.
+			 */
+ keys = NULL;
+ compare = NULL;
+ if (have_keys == 1 && (keys = part->keys) != NULL) {
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ if ((ret = __os_malloc(env, (part->nparts - 1)
+ * sizeof(struct key_sort), &ks)) != 0)
+ goto err;
+ for (j = 0; j < part->nparts - 1; j++) {
+ ks[j].dbp = dbc->dbp;
+ ks[j].compare = compare;
+ ks[j].key = &keys[j];
+ }
+
+ qsort(ks, (size_t)part->nparts - 1,
+ sizeof(struct key_sort), __part_key_cmp);
+ }
+ DB_MULTIPLE_INIT(dp, &data);
+ part->keys = (DBT *)
+ ((u_int8_t *)part->data + data.size);
+ j = 0;
+ for (kp = part->keys;
+ kp < &part->keys[part->nparts]; kp++, j++) {
+ DB_MULTIPLE_KEY_NEXT(dp,
+ &data, kp->data, kp->size, dd, ds);
+ if (dp == NULL) {
+ ret = DB_NOTFOUND;
+ break;
+ }
+ if (keys != NULL && j != 0 &&
+ compare(dbc->dbp, ks[j - 1].key, kp) != 0) {
+ if (kp->data == NULL &&
+ F_ISSET(dbp, DB_AM_RECOVER))
+ goto err;
+ __db_errx(env, DB_STR_A("0662",
+ "Partition key %d does not match",
+ "%d"), j);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ }
+ }
+ if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER))
+ ret = 0;
+
+err: dbp->p_internal = part;
+ if (ks != NULL)
+ __os_free(env, ks);
+ return (ret);
+}
+
+/*
+ * __partition_get_callback --
+ * Get the partition callback function.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+ DB *dbp;
+ u_int32_t *parts;
+ u_int32_t (**callback)(DB *, DBT *key);
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using callbacks. */
+ if (part != NULL && !F_ISSET(part, PART_CALLBACK))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ if (callback != NULL)
+ *callback = (part != NULL ? part->callback : NULL);
+
+ return (0);
+}
+
+/*
+ * __partition_get_keys --
+ * Get partition keys.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+ DB *dbp;
+ u_int32_t *parts;
+ DBT **keys;
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using ranges. */
+ if (part != NULL && !F_ISSET(part, PART_RANGE))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ if (keys != NULL)
+ *keys = (part != NULL ? &part->keys[1] : NULL);
+
+ return (0);
+}
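+
+/*
+ * An application-level sketch (illustrative only).  As the code above
+ * shows, the returned array skips the implicit empty key of partition 0,
+ * so keys[0] is the lower bound of partition 1:
+ *
+ *	u_int32_t nparts;
+ *	DBT *keys;
+ *	if ((ret = dbp->get_partition_keys(dbp, &nparts, &keys)) != 0)
+ *		goto err;
+ */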
+
+/*
+ * __partition_get_dirs --
+ * Get partition dirs.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+ DB *dbp;
+ const char ***dirpp;
+{
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ env = dbp->env;
+ if ((part = dbp->p_internal) == NULL) {
+ *dirpp = NULL;
+ return (0);
+ }
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+ *dirpp = part->dirs;
+ return (0);
+ }
+
+ /*
+ * We build a list once when asked. The original directory list,
+ * if any, was discarded at open time.
+ */
+ if ((*dirpp = part->dirs) != NULL)
+ return (0);
+
+ if ((ret = __os_calloc(env,
+ sizeof(char *), part->nparts + 1, (void *) &part->dirs)) != 0)
+ return (ret);
+
+ for (i = 0; i < part->nparts; i++)
+ part->dirs[i] = part->handles[i]->dirname;
+
+ *dirpp = part->dirs;
+ return (0);
+}
+
+/*
+ * __partc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __partc_init __P((DBC *));
+ */
+int
+__partc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL && (ret =
+ __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __partc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = NULL;
+ dbc->am_close = __partc_close;
+ dbc->am_del = __partc_del;
+ dbc->am_destroy = __partc_destroy;
+ dbc->am_get = NULL;
+ dbc->am_put = __partc_put;
+ dbc->am_writelock = __partc_writelock;
+
+	/* Avoid swapping partition cursors; we swap the sub-cursors instead. */
+ F_SET(dbc, DBC_PARTITIONED);
+
+ return (0);
+}
+
+/*
+ * __partc_get_pp --
+ *	Cursor get operation on a partitioned database.
+ */
+static int
+__partc_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+ ret = __partc_get(dbc, key, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __partc_get --
+ *	Cursor get operation on a partitioned database.
+ *
+ * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
+ */
+int
+__partc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *orig_dbc, *new_dbc;
+ DB_PARTITION *part;
+ PART_CURSOR *cp;
+ u_int32_t multi, part_id;
+ int ret, retry, search;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ orig_dbc = cp->sub_cursor;
+ part = dbp->p_internal;
+
+ new_dbc = NULL;
+ retry = search = 0;
+ part_id = cp->part_id;
+ multi = flags & ~DB_OPFLAGS_MASK;
+
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ break;
+ case DB_FIRST:
+ part_id = 0;
+ retry = 1;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ search = 1;
+ break;
+ case DB_SET_RANGE:
+ search = 1;
+ retry = 1;
+ break;
+ case DB_LAST:
+ part_id = part->nparts - 1;
+ retry = 1;
+ break;
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (orig_dbc == NULL)
+ part_id = 0;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_NEXT_DUP:
+ break;
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (orig_dbc == NULL)
+ part_id = part->nparts - 1;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_PREV_DUP:
+ break;
+ case DB_SET:
+ search = 1;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__partc_get", flags));
+ }
+
+	/*
+	 * If we need to find the partition to start on, do a binary
+	 * search of the in-memory partition table.
+	 */
+ if (search == 1 && F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbp, key) % part->nparts;
+ else if (search == 1)
+ __part_search(dbp, part, key, &part_id);
+
+ /* Get a new cursor if necessary */
+ if (orig_dbc == NULL || cp->part_id != part_id) {
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ } else
+ new_dbc = orig_dbc;
+
+ while ((ret = __dbc_get(new_dbc,
+ key, data, flags)) == DB_NOTFOUND && retry == 1) {
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET_RANGE:
+ if (++part_id < part->nparts) {
+ flags = DB_FIRST | multi;
+ break;
+ }
+ goto err;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (part_id-- > 0) {
+ flags = DB_LAST | multi;
+ break;
+ }
+ goto err;
+ default:
+ goto err;
+ }
+
+ if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /* Success: swap original and new cursors. */
+ if (new_dbc != orig_dbc) {
+ if (orig_dbc != NULL) {
+ cp->sub_cursor = NULL;
+ if ((ret = __dbc_close(orig_dbc)) != 0)
+ goto err;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && new_dbc != orig_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_put --
+ *	Cursor put operation on a partitioned cursor.
+ *
+ */
+static int
+__partc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ PART_CURSOR *cp;
+ u_int32_t part_id;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ part_id = cp->part_id;
+ part = dbp->p_internal;
+ *pgnop = PGNO_INVALID;
+
+ switch (flags) {
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ if (F_ISSET(part, PART_CALLBACK)) {
+ part_id = part->callback(dbp, key) % part->nparts;
+ break;
+ }
+ __part_search(dbp, part, key, &part_id);
+ break;
+ default:
+ break;
+ }
+
+ if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) {
+ if ((ret = __db_cursor_int(part->handles[part_id],
+ dbc->thread_info, dbc->txn, part->handles[part_id]->type,
+ PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ }
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(new_dbc, DBC_WRITER);
+ if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0)
+ goto err;
+
+ if (new_dbc != cp->sub_cursor) {
+ if (cp->sub_cursor != NULL) {
+ if ((ret = __dbc_close(cp->sub_cursor)) != 0)
+ goto err;
+ cp->sub_cursor = NULL;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && cp->sub_cursor != new_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_del
+ * Delete interface to partitioned cursors.
+ *
+ */
+static int
+__partc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(cp->sub_cursor, DBC_WRITER);
+ return (__dbc_del(cp->sub_cursor, flags));
+}
+
+/*
+ * __partc_writelock
+ * Writelock interface to partitioned cursors.
+ *
+ */
+static int
+__partc_writelock(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ return (cp->sub_cursor->am_writelock(cp->sub_cursor));
+}
+
+/*
+ * __partc_close
+ * Close interface to partitioned cursors.
+ *
+ */
+static int
+__partc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ PART_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, NULL);
+
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (cp->sub_cursor == NULL)
+ return (0);
+ ret = __dbc_close(cp->sub_cursor);
+ cp->sub_cursor = NULL;
+ return (ret);
+}
+
+/*
+ * __partc_destroy --
+ * Destroy a single cursor.
+ */
+static int
+__partc_destroy(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ ENV *env;
+
+ cp = (PART_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structure. Don't recurse. */
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __partition_close
+ * Close a partitioned database.
+ *
+ * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__partition_close(dbp, txn, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+
+ if ((part = dbp->p_internal) == NULL)
+ return (0);
+
+ env = dbp->env;
+ ret = 0;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL && (t_ret =
+ __db_close(*pdbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, part->handles);
+ }
+ if (part->dirs != NULL)
+ __os_free(env, (char **)part->dirs);
+ if (part->data != NULL)
+ __os_free(env, (char **)part->data);
+ __os_free(env, part);
+ dbp->p_internal = NULL;
+
+ return (ret);
+}
+
+/*
+ * __partition_sync
+ * Sync a partitioned database.
+ *
+ * PUBLIC: int __partition_sync __P((DB *));
+ */
+int
+__partition_sync(dbp)
+ DB *dbp;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret, t_ret;
+
+ ret = 0;
+ part = dbp->p_internal;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL &&
+ F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret =
+ __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __partition_stat
+ * Stat a partitioned database.
+ *
+ * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__partition_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp, **pdbp;
+ DB_BTREE_STAT *fsp, *bsp;
+#ifdef HAVE_HASH
+ DB_HASH_STAT *hfsp, *hsp;
+#endif
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ env = dbp->env;
+ fsp = NULL;
+#ifdef HAVE_HASH
+ hfsp = NULL;
+#endif
+
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ switch (new_dbc->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0)
+ goto err;
+ if (fsp == NULL) {
+ fsp = bsp;
+ *(DB_BTREE_STAT **)spp = fsp;
+ } else {
+ fsp->bt_nkeys += bsp->bt_nkeys;
+ fsp->bt_ndata += bsp->bt_ndata;
+ fsp->bt_pagecnt += bsp->bt_pagecnt;
+ if (fsp->bt_levels < bsp->bt_levels)
+ fsp->bt_levels = bsp->bt_levels;
+ fsp->bt_int_pg += bsp->bt_int_pg;
+ fsp->bt_leaf_pg += bsp->bt_leaf_pg;
+ fsp->bt_dup_pg += bsp->bt_dup_pg;
+ fsp->bt_over_pg += bsp->bt_over_pg;
+ fsp->bt_free += bsp->bt_free;
+ fsp->bt_int_pgfree += bsp->bt_int_pgfree;
+ fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree;
+ fsp->bt_dup_pgfree += bsp->bt_dup_pgfree;
+ fsp->bt_over_pgfree += bsp->bt_over_pgfree;
+ __os_ufree(env, bsp);
+ }
+ break;
+#ifdef HAVE_HASH
+ case DB_HASH:
+ if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0)
+ goto err;
+ if (hfsp == NULL) {
+ hfsp = hsp;
+ *(DB_HASH_STAT **)spp = hfsp;
+ } else {
+ hfsp->hash_nkeys += hsp->hash_nkeys;
+ hfsp->hash_ndata += hsp->hash_ndata;
+ hfsp->hash_pagecnt += hsp->hash_pagecnt;
+ hfsp->hash_ffactor += hsp->hash_ffactor;
+ hfsp->hash_buckets += hsp->hash_buckets;
+ hfsp->hash_free += hsp->hash_free;
+ hfsp->hash_bfree += hsp->hash_bfree;
+ hfsp->hash_bigpages += hsp->hash_bigpages;
+ hfsp->hash_big_bfree += hsp->hash_big_bfree;
+ hfsp->hash_overflows += hsp->hash_overflows;
+ hfsp->hash_ovfl_free += hsp->hash_ovfl_free;
+ hfsp->hash_dup += hsp->hash_dup;
+ hfsp->hash_dup_free += hsp->hash_dup_free;
+ __os_ufree(env, hsp);
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ }
+ return (0);
+
+err:
+	if (fsp != NULL)
+		__os_ufree(env, fsp);
+#ifdef HAVE_HASH
+	if (hfsp != NULL)
+		__os_ufree(env, hfsp);
+#endif
+	*(DB_BTREE_STAT **)spp = NULL;
+ return (ret);
+}
+
+/*
+ * __part_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__part_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp, **pdbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ u_int32_t count, i;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ if (countp != NULL)
+ *countp = 0;
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ break;
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_truncate(new_dbc, &count);
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ ret = __ham_truncate(new_dbc, &count);
+ break;
+#endif
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(dbp->env,
+ "DB->truncate", dbp->type);
+ count = 0;
+ break;
+ }
+ if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (countp != NULL)
+ *countp += count;
+ }
+
+ return (ret);
+}
+
+/*
+ * __part_compact --
+ *	Compact a partitioned database.
+ *
+ * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__part_compact(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ switch (dbp->type) {
+ case DB_HASH:
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __db_compact_int(*pdbp,
+ ip, txn, start, stop, c_data, flags, end);
+ break;
+
+ default:
+ ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __part_lsn_reset --
+ * reset the lsns on each partition.
+ *
+ * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
+ */
+int
+__part_lsn_reset(dbp, ip)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++)
+ ret = __db_lsn_reset((*pdbp)->mpf, ip);
+
+ return (ret);
+}
+
+/*
+ * __part_fileid_reset --
+ * reset the fileid on each partition.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *fname;
+ u_int32_t nparts;
+ int encrypted;
+{
+ int ret;
+ u_int32_t part_id;
+ char *name, *sp;
+ const char *np;
+
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ return (ret);
+ }
+
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ for (part_id = 0; ret == 0 && part_id < nparts; part_id++) {
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ ret = __env_fileid_reset(env, ip, sp, encrypted);
+ }
+
+ __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __part_key_range --
+ * Return proportion of keys relative to given key.
+ *
+ * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__part_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *new_dbc;
+ DB_PARTITION *part;
+ PAGE *h;
+ u_int32_t id, part_id;
+ u_int32_t elems, empty, less_elems, my_elems, greater_elems;
+ u_int32_t levels, max_levels, my_levels;
+ db_pgno_t root_pgno;
+ int ret;
+ double total_elems;
+
+ COMPQUIET(flags, 0);
+
+ part = dbc->dbp->p_internal;
+
+ /*
+ * First we find the key range for the partition that contains the
+ * key. Then we scale based on estimates of the other partitions.
+ */
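+	/*
+	 * A worked example, with illustrative numbers only: if the key
+	 * falls in a subtree holding 100 of an estimated 400 top-level
+	 * records, with 120 records estimated to its left and 180 to its
+	 * right, the less/equal/greater values computed for that subtree
+	 * are each scaled by 100/400, then 120/400 is added to "less" and
+	 * 180/400 to "greater".
+	 */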
+ if (F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbc->dbp, dbt) % part->nparts;
+ else
+ __part_search(dbc->dbp, part, dbt, &part_id);
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+
+ if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0)
+ goto err;
+
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+
+ root_pgno = BAM_ROOT_PGNO(new_dbc);
+ if ((ret = __memp_fget(new_dbc->dbp->mpf, &root_pgno,
+ new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ my_elems = NUM_ENT(h);
+ my_levels = LEVEL(h);
+ max_levels = my_levels;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ /*
+ * We have the range within one subtree. Now estimate
+ * what part of the whole range that subtree is. Figure
+ * out how many levels each part has and how many entries
+ * in the level below the root.
+ */
+ empty = less_elems = greater_elems = 0;
+ for (id = 0; id < part->nparts; id++) {
+ if (id == part_id) {
+ empty = 0;
+ continue;
+ }
+ GET_PART_CURSOR(dbc, new_dbc, id);
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+ if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root,
+ new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ elems = NUM_ENT(h);
+ levels = LEVEL(h);
+ if (levels == 1)
+ elems /= 2;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+
+ /* If the tree is empty, ignore it. */
+ if (elems == 0) {
+ empty++;
+ continue;
+ }
+
+ /*
+ * If a tree has fewer levels than the max just count
+ * it as a single element in the higher level.
+ */
+ if (id < part_id) {
+ if (levels > max_levels) {
+ max_levels = levels;
+ less_elems = id + elems - empty;
+ } else if (levels < max_levels)
+ less_elems++;
+ else
+ less_elems += elems;
+ } else {
+ if (levels > max_levels) {
+ max_levels = levels;
+ greater_elems = (id - part_id) + elems - empty;
+ } else if (levels < max_levels)
+ greater_elems++;
+ else
+ greater_elems += elems;
+ }
+
+ }
+
+ if (my_levels < max_levels) {
+ /*
+ * The subtree containing the key is not the tallest one.
+ * Reduce its share by the number of records at the highest
+ * level. Scale the greater and lesser components up
+ * by the number of records on either side of this
+ * subtree.
+ */
+ total_elems = 1 + greater_elems + less_elems;
+ kp->equal /= total_elems;
+ kp->less /= total_elems;
+ kp->less += less_elems/total_elems;
+ kp->greater /= total_elems;
+ kp->greater += greater_elems/total_elems;
+ } else if (my_levels == max_levels) {
+ /*
+ * The key is in one of the tallest subtrees. We will
+ * scale the values by the ratio of the records at the
+		 * top of this subtree to the number of records at the
+ * highest level.
+ */
+ total_elems = greater_elems + less_elems;
+ if (total_elems != 0) {
+ /*
+ * First scale down by the fraction of elements
+ * in this subtree.
+ */
+ total_elems += my_elems;
+ kp->equal *= my_elems;
+ kp->equal /= total_elems;
+ kp->less *= my_elems;
+ kp->less /= total_elems;
+ kp->greater *= my_elems;
+ kp->greater /= total_elems;
+ /*
+ * Proportionally add weight from the subtrees to the
+ * left and right of this one.
+ */
+ kp->less += less_elems / total_elems;
+ kp->greater += greater_elems / total_elems;
+ }
+ }
+
+ if (0) {
+c_err: (void)__dbc_close(new_dbc);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __part_remove --
+ * Remove method for a partitioned database.
+ *
+ * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__part_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags));
+}
+
+/*
+ * __part_rename --
+ * Rename method for a partitioned database.
+ *
+ * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__part_rename(dbp, ip, txn, name, subdb, newname)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, newname, 0));
+}
+
+/*
+ * __part_rr --
+ * Remove/Rename method for a partitioned database.
+ */
+static int
+__part_rr(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB **pdbp, *ptmpdbp, *tmpdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+ char *np;
+
+ env = dbp->env;
+ ret = 0;
+
+ if (subdb != NULL && name != NULL) {
+ __db_errx(env, DB_STR("0663",
+ "A partitioned database can not be in a multiple databases file"));
+ return (EINVAL);
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+ * Since rename no longer opens the database, we have
+ * to do it here.
+ */
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to make sure we don't self-deadlock, so give
+ * this dbp the same locker as the incoming one.
+ */
+ tmpdbp->locker = dbp->locker;
+ if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type,
+ DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ part = tmpdbp->p_internal;
+ pdbp = part->handles;
+ COMPQUIET(np, NULL);
+ if (newname != NULL && (ret = __os_malloc(env,
+ strlen(newname) + PART_LEN + 1, &np)) != 0) {
+ __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1);
+ goto err;
+ }
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0)
+ break;
+ ptmpdbp->locker = (*pdbp)->locker;
+ if (newname == NULL)
+ ret = __db_remove_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, flags);
+ else {
+ DB_ASSERT(env, np != NULL);
+ (void)sprintf(np, PART_NAME, newname, i);
+ ret = __db_rename_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, np, flags);
+ }
+ ptmpdbp->locker = NULL;
+ (void)__db_close(ptmpdbp, NULL, DB_NOSYNC);
+ if (ret != 0)
+ break;
+ }
+
+ if (newname != NULL)
+ __os_free(env, np);
+
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+err: /*
+ * Since we copied the locker ID from the dbp, we'd better not
+ * free it here.
+ */
+ tmpdbp->locker = NULL;
+
+ /* We need to remove the lock event we associated with this. */
+ if (txn != NULL)
+ __txn_remlock(env,
+ txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ if ((t_ret = __db_close(tmpdbp,
+ txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+#ifdef HAVE_VERIFY
+/*
+ * __part_verify --
+ * Verify a partitioned database.
+ *
+ * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *,
+ * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__part_verify(dbp, vdp, fname, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *fname;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BINTERNAL *lp, *rp;
+ DB **pdbp;
+ DB_PARTITION *part;
+ DBC *dbc;
+ DBT *key;
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t i;
+ int ret, t_ret;
+
+ env = dbp->env;
+ lp = rp = NULL;
+ dbc = NULL;
+ ip = vdp->thread_info;
+
+ if (dbp->type == DB_BTREE) {
+ if ((ret = __bam_open(dbp, ip,
+ NULL, fname, PGNO_BASE_MD, flags)) != 0)
+ goto err;
+ }
+#ifdef HAVE_HASH
+ else if ((ret = __ham_open(dbp, ip,
+ NULL, fname, PGNO_BASE_MD, flags)) != 0)
+ goto err;
+#endif
+
+ /*
+	 * Initialize partition db handles and get the names. Set DB_RDWRMASTER
+ * because we may not have the partition callback, but we can still
+ * look at the structure of the tree.
+ */
+ if ((ret = __partition_open(dbp,
+ ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0)
+ goto err;
+ part = dbp->p_internal;
+
+ if (LF_ISSET(DB_SALVAGE)) {
+ /* If we are being aggressive we don't want to dump the keys. */
+ if (LF_ISSET(DB_AGGRESSIVE))
+ dbp->p_internal = NULL;
+ ret = __db_prheader(dbp,
+ NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD);
+ dbp->p_internal = part;
+ if (ret != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ goto err;
+
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if (!F_ISSET(part, PART_RANGE) || part->keys == NULL)
+ goto vrfy;
+ if (lp != NULL)
+ __os_free(env, lp);
+ lp = rp;
+ rp = NULL;
+ if (i + 1 < part->nparts) {
+ key = &part->keys[i + 1];
+ if ((ret = __os_malloc(env,
+ BINTERNAL_SIZE(key->size), &rp)) != 0)
+ goto err;
+ rp->len = key->size;
+ memcpy(rp->data, key->data, key->size);
+ B_TSET(rp->type, B_KEYDATA);
+ }
+vrfy: if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname,
+ NULL, handle, callback,
+ lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+err: if (lp != NULL)
+ __os_free(env, lp);
+ if (rp != NULL)
+ __os_free(env, rp);
+ return (ret);
+}
+#endif
+
+#ifdef CONFIG_TEST
+/*
+ * __part_testdocopy -- copy all partitions for testing purposes.
+ *
+ * PUBLIC: int __part_testdocopy __P((DB *, const char *));
+ */
+int
+__part_testdocopy(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+ return (ret);
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if ((ret = __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0)
+ return (ret);
+
+ return (0);
+}
+#endif
+#else
+/*
+ * __db_no_partition --
+ * Error when a Berkeley DB build doesn't include partitioning.
+ *
+ * PUBLIC: int __db_no_partition __P((ENV *));
+ */
+int
+__db_no_partition(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0664",
+ "library build did not include support for the database partitioning"));
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * __partition_set --
+ * Set the partitioning keys or callback function.
+ * This routine must be called prior to creating the database.
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC: u_int32_t (*callback)(DB *, DBT *key)));
+ */
+
+int
+__partition_set(dbp, parts, keys, callback)
+ DB *dbp;
+ u_int32_t parts;
+ DBT *keys;
+ u_int32_t (*callback)(DB *, DBT *key);
+{
+ COMPQUIET(parts, 0);
+ COMPQUIET(keys, NULL);
+ COMPQUIET(callback, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_callback --
+ *	Get the partition callback function.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+ DB *dbp;
+ u_int32_t *parts;
+ u_int32_t (**callback)(DB *, DBT *key);
+{
+ COMPQUIET(parts, NULL);
+ COMPQUIET(callback, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_dirs --
+ * Get partition dirs.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+ DB *dbp;
+ const char ***dirpp;
+{
+ COMPQUIET(dirpp, NULL);
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_keys --
+ * Get partition keys.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+ DB *dbp;
+ u_int32_t *parts;
+ DBT **keys;
+{
+ COMPQUIET(parts, NULL);
+ COMPQUIET(keys, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_init --
+ * Initialize the partition structure.
+ * Called when the meta data page is read in during database open or
+ * when partition keys or a callback are set.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __part_fileid_reset --
+ * reset the fileid on each partition.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *fname;
+ u_int32_t nparts;
+ int encrypted;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(fname, NULL);
+ COMPQUIET(nparts, 0);
+ COMPQUIET(encrypted, 0);
+
+ return (__db_no_partition(env));
+}
+/*
+ * __partition_set_dirs --
+ * Set the directories for creating the partition databases.
+ * They must be in the environment.
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+ DB *dbp;
+ const char **dirp;
+{
+ COMPQUIET(dirp, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+#endif
diff --git a/src/dbinc/atomic.h b/src/dbinc/atomic.h
new file mode 100644
index 00000000..096176a5
--- /dev/null
+++ b/src/dbinc/atomic.h
@@ -0,0 +1,220 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_ATOMIC_H_
+#define _DB_ATOMIC_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Atomic operation support for Oracle Berkeley DB
+ *
+ * HAVE_ATOMIC_SUPPORT configures whether to use the assembly language
+ * or system calls to perform:
+ *
+ * atomic_inc(env, valueptr)
+ * Adds 1 to the db_atomic_t value, returning the new value.
+ *
+ * atomic_dec(env, valueptr)
+ * Subtracts 1 from the db_atomic_t value, returning the new value.
+ *
+ * atomic_compare_exchange(env, valueptr, oldval, newval)
+ * If the db_atomic_t's value is still oldval, set it to newval.
+ * It returns 1 for success or 0 for failure.
+ *
+ * The ENV * parameter is used only when HAVE_ATOMIC_SUPPORT is undefined.
+ *
+ * If the platform does not natively support any one of these operations,
+ * then atomic operations will be emulated with this sequence:
+ * MUTEX_LOCK()
+ * <op>
+ * MUTEX_UNLOCK();
+ * Uses where mutexes are not available (e.g. the environment has not yet
+ * attached to the mutex region) must be avoided.
+ */
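+
+/*
+ * A usage sketch, illustrative only (the cache's buffer reference
+ * counts are one style of consumer):
+ *
+ *	db_atomic_t ref;
+ *	atomic_init(&ref, 0);
+ *	(void)atomic_inc(env, &ref);
+ *	if (atomic_compare_exchange(env, &ref, 1, 0)) {
+ *		...the count was 1 and has atomically become 0...
+ *	}
+ */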
+#if defined(DB_WIN32)
+typedef DWORD atomic_value_t;
+#else
+typedef int32_t atomic_value_t;
+#endif
+
+/*
+ * Windows CE has strange issues using the Interlocked APIs with variables
+ * stored in shared memory. It seems like the page needs to have been written
+ * prior to the API working as expected. Work around this by allocating an
+ * additional 32-bit value that can be harmlessly written for each value
+ * used in Interlocked instructions.
+ */
+#if defined(DB_WINCE)
+typedef struct {
+ volatile atomic_value_t value;
+ volatile atomic_value_t dummy;
+} db_atomic_t;
+#else
+typedef struct {
+ volatile atomic_value_t value;
+} db_atomic_t;
+#endif
+
+/*
+ * These macros hide the db_atomic_t structure layout and help detect
+ * non-atomic_t actual arguments to the atomic_xxx() calls. DB requires
+ * aligned 32-bit reads to be atomic even outside of explicit 'atomic' calls.
+ * These have no memory barriers; the caller must include them when necessary.
+ */
+#define atomic_read(p) ((p)->value)
+#define atomic_init(p, val) ((p)->value = (val))
+
+#ifdef HAVE_ATOMIC_SUPPORT
+
+#if defined(DB_WIN32)
+#if defined(DB_WINCE)
+#define WINCE_ATOMIC_MAGIC(p) \
+ /* \
+ * Memory mapped regions on Windows CE cause problems with \
+ * InterlockedXXX calls. Each page in a mapped region needs to \
+ * have been written to prior to an InterlockedXXX call, or the \
+ * InterlockedXXX call hangs. This does not seem to be \
+ * documented anywhere. For now, read/write a non-critical \
+ * piece of memory from the shared region prior to attempting \
+ * shared region prior to attempting an InterlockedExchange \
+ * InterlockedXXX operation. \
+ */ \
+ (p)->dummy = 0
+#else
+#define WINCE_ATOMIC_MAGIC(p) 0
+#endif
+
+#if defined(DB_WINCE) || (defined(_MSC_VER) && _MSC_VER < 1300)
+/*
+ * The Interlocked instructions on Windows CE have different parameter
+ * definitions. The parameters lost their 'volatile' qualifier;
+ * cast it away to avoid compiler warnings.
+ * These definitions should match those in dbinc/mutex_int.h for tsl_t, except
+ * that the WINCE version drops the volatile qualifier.
+ */
+typedef PLONG interlocked_val;
+#define atomic_inc(env, p) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ InterlockedIncrement((interlocked_val)(&(p)->value)))
+
+#else
+typedef LONG volatile *interlocked_val;
+#define atomic_inc(env, p) \
+ InterlockedIncrement((interlocked_val)(&(p)->value))
+#endif
+
+#define atomic_dec(env, p) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ InterlockedDecrement((interlocked_val)(&(p)->value)))
+#if defined(_MSC_VER) && _MSC_VER < 1300
+#define atomic_compare_exchange(env, p, oldval, newval) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ (InterlockedCompareExchange((PVOID *)(&(p)->value), \
+ (PVOID)(newval), (PVOID)(oldval)) == (PVOID)(oldval)))
+#else
+#define atomic_compare_exchange(env, p, oldval, newval) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ (InterlockedCompareExchange((interlocked_val)(&(p)->value), \
+ (newval), (oldval)) == (oldval)))
+#endif
+#endif
+
+#if defined(HAVE_ATOMIC_SOLARIS)
+/* Solaris sparc & x86/64 */
+#include <atomic.h>
+#define atomic_inc(env, p) \
+ atomic_inc_uint_nv((volatile unsigned int *) &(p)->value)
+#define atomic_dec(env, p) \
+ atomic_dec_uint_nv((volatile unsigned int *) &(p)->value)
+#define atomic_compare_exchange(env, p, oval, nval) \
+ (atomic_cas_32((volatile unsigned int *) &(p)->value, \
+ (oval), (nval)) == (oval))
+#endif
+
+#if defined(HAVE_ATOMIC_X86_GCC_ASSEMBLY)
+/* x86/x86_64 gcc */
+#define atomic_inc(env, p) __atomic_inc(p)
+#define atomic_dec(env, p) __atomic_dec(p)
+#define atomic_compare_exchange(env, p, o, n) \
+ __atomic_compare_exchange((p), (o), (n))
+static inline int __atomic_inc(db_atomic_t *p)
+{
+ int temp;
+
+ temp = 1;
+ __asm__ __volatile__("lock; xadd %0, (%1)"
+ : "+r"(temp)
+ : "r"(p));
+ return (temp + 1);
+}
+
+static inline int __atomic_dec(db_atomic_t *p)
+{
+ int temp;
+
+ temp = -1;
+ __asm__ __volatile__("lock; xadd %0, (%1)"
+ : "+r"(temp)
+ : "r"(p));
+ return (temp - 1);
+}
+
+/*
+ * x86/gcc Compare exchange for shared latches. i486+
+ * Returns 1 for success, 0 for failure
+ *
+ * GCC 4.1+ has an equivalent __sync_bool_compare_and_swap(), as well as
+ * __sync_val_compare_and_swap(), which returns the value read from *dest:
+ * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ * Configure could be changed to use either builtin.
+ */
+static inline int __atomic_compare_exchange(
+ db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval)
+{
+ atomic_value_t was;
+
+ if (p->value != oldval) /* check without expensive cache line locking */
+ return 0;
+ __asm__ __volatile__("lock; cmpxchgl %1, (%2);"
+ :"=a"(was)
+ :"r"(newval), "r"(p), "a"(oldval)
+ :"memory", "cc");
+ return (was == oldval);
+}
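+
+/*
+ * For comparison, a sketch of the same operation written with the
+ * gcc 4.1+ builtin mentioned above (not what configure selects today):
+ *
+ *	static inline int __atomic_compare_exchange(
+ *	    db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval)
+ *	{
+ *		return (__sync_bool_compare_and_swap(
+ *		    &p->value, oldval, newval));
+ *	}
+ */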
+#endif
+
+#else
+/*
+ * No native hardware support for atomic increment, decrement, and
+ * compare-exchange. Emulate them when mutexes are supported;
+ * do them without concern for atomicity when no mutexes.
+ */
+#ifndef HAVE_MUTEX_SUPPORT
+/*
+ * These minimal versions are correct to use only for single-threaded,
+ * single-process environments.
+ */
+#define atomic_inc(env, p) (++(p)->value)
+#define atomic_dec(env, p) (--(p)->value)
+#define atomic_compare_exchange(env, p, oldval, newval) \
+ (DB_ASSERT(env, atomic_read(p) == (oldval)), \
+ atomic_init(p, (newval)), 1)
+#else
+#define atomic_inc(env, p) __atomic_inc(env, p)
+#define atomic_dec(env, p) __atomic_dec(env, p)
+#endif
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_ATOMIC_H_ */
diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h
new file mode 100644
index 00000000..86bbec14
--- /dev/null
+++ b/src/dbinc/btree.h
@@ -0,0 +1,553 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+#ifndef _DB_BTREE_H_
+#define _DB_BTREE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Forward structure declarations. */
+struct __btree; typedef struct __btree BTREE;
+struct __cursor; typedef struct __cursor BTREE_CURSOR;
+struct __epg; typedef struct __epg EPG;
+
+#define DEFMINKEYPAGE (2)
+
+/*
+ * A recno order of 0 indicates that we don't have an order, not that we've
+ * an order less than 1.
+ */
+#define INVALID_ORDER 0
+
+#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
+#define ISLEAF(p) (TYPE(p) == P_LBTREE || \
+ TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP)
+
+/* Flags for __bam_cadjust_log(). */
+#define CAD_UPDATEROOT 0x01 /* Root page count was updated. */
+
+/* Flags for __bam_split_log(). */
+#define SPL_NRECS 0x01 /* Split tree has record count. */
+#define SPL_RECNO 0x02 /* This is a Recno cursor. */
+
+/* Flags for __bam_iitem(). */
+#define BI_DELETED 0x01 /* Key/data pair only placeholder. */
+
+/* Flags for __bam_stkrel(). */
+#define STK_CLRDBC 0x01 /* Clear dbc->page reference. */
+#define STK_NOLOCK 0x02 /* Don't retain locks. */
+#define STK_PGONLY 0x04
+
+/* Flags for __ram_ca(). These get logged, so make the values explicit. */
+typedef enum {
+ CA_DELETE = 0, /* Delete the current record. */
+	CA_IAFTER = 1,		/* Insert after the current record. */
+	CA_IBEFORE = 2,		/* Insert before the current record. */
+ CA_ICURRENT = 3 /* Overwrite the current record. */
+} ca_recno_arg;
+
+/*
+ * Flags for __bam_search() and __bam_rsearch().
+ *
+ * Note, internal page searches must find the largest record less than key in
+ * the tree so that descents work. Leaf page searches must find the smallest
+ * record greater than key so that the returned index is the record's correct
+ * position for insertion.
+ *
+ * The flags parameter to the search routines describes three aspects of the
+ * search: the type of locking required (including if we're locking a pair of
+ * pages), the item to return in the presence of duplicates and whether or not
+ * to return deleted entries. To simplify both the mnemonic representation
+ * and the code that checks for various cases, we construct a set of bitmasks.
+ */
+#define SR_READ 0x00001 /* Read locks. */
+#define SR_WRITE 0x00002 /* Write locks. */
+
+#define SR_APPEND 0x00040 /* Append to the tree. */
+#define SR_DELNO 0x00080 /* Don't return deleted items. */
+#define SR_DUPFIRST 0x00100 /* Return first duplicate. */
+#define SR_DUPLAST 0x00200 /* Return last duplicate. */
+#define SR_EXACT 0x00400 /* Exact items only. */
+#define SR_PARENT 0x00800 /* Lock page pair. */
+#define SR_STACK 0x01000 /* Need a complete stack. */
+#define SR_PAST_EOF 0x02000 /* If doing insert search (or keyfirst
+ * or keylast operations), or a split
+ * on behalf of an insert, it's okay to
+ * return an entry one past end-of-page.
+ */
+#define SR_STK_ONLY 0x04000 /* Just return info in the stack */
+#define SR_MAX 0x08000 /* Get the right most key */
+#define SR_MIN 0x10000 /* Get the left most key */
+#define SR_NEXT 0x20000 /* Get the page after this key */
+#define SR_DEL 0x40000 /* Get the tree to delete this key. */
+#define SR_START 0x80000 /* Level to start stack. */
+#define SR_BOTH 0x100000 /* Get this and the NEXT page */
+
+#define SR_DELETE \
+ (SR_WRITE | SR_DUPFIRST | SR_DELNO | SR_EXACT | SR_STACK)
+#define SR_FIND (SR_READ | SR_DUPFIRST | SR_DELNO)
+#define SR_FIND_WR (SR_WRITE | SR_DUPFIRST | SR_DELNO)
+#define SR_INSERT (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK)
+#define SR_KEYFIRST (SR_WRITE | SR_DUPFIRST | SR_PAST_EOF | SR_STACK)
+#define SR_KEYLAST (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK)
+#define SR_WRPAIR (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_PARENT)
+
+/*
+ * Various routines pass around page references. A page reference is
+ * a pointer to the page, and the indx indicates an item on the page.
+ * Each page reference may include a lock.
+ */
+struct __epg {
+ PAGE *page; /* The page. */
+ db_indx_t indx; /* The index on the page. */
+ db_indx_t entries; /* The number of entries on page */
+ DB_LOCK lock; /* The page's lock. */
+ db_lockmode_t lock_mode; /* The lock mode. */
+};
+
+/*
+ * We maintain a stack of the pages that we're locking in the tree. Grow
+ * the stack as necessary.
+ *
+ * XXX
+ * Temporary fix for #3243 -- clear the page and lock from the stack entry.
+ * The correct fix is to never release a stack that doesn't hold items.
+ */
+#define BT_STK_CLR(c) do { \
+ (c)->csp = (c)->sp; \
+ (c)->csp->page = NULL; \
+ LOCK_INIT((c)->csp->lock); \
+} while (0)
+
+#define BT_STK_ENTER(env, c, pagep, page_indx, l, mode, ret) do { \
+ if ((ret = ((c)->csp == (c)->esp ? \
+ __bam_stkgrow(env, c) : 0)) == 0) { \
+ (c)->csp->page = pagep; \
+ (c)->csp->indx = (page_indx); \
+ (c)->csp->entries = NUM_ENT(pagep); \
+ (c)->csp->lock = l; \
+ (c)->csp->lock_mode = mode; \
+ } \
+} while (0)
+
+#define BT_STK_PUSH(env, c, pagep, page_indx, lock, mode, ret) do { \
+ BT_STK_ENTER(env, c, pagep, page_indx, lock, mode, ret); \
+ ++(c)->csp; \
+} while (0)
+
+#define BT_STK_NUM(env, c, pagep, page_indx, ret) do { \
+ if ((ret = ((c)->csp == \
+ (c)->esp ? __bam_stkgrow(env, c) : 0)) == 0) { \
+ (c)->csp->page = NULL; \
+ (c)->csp->indx = (page_indx); \
+ (c)->csp->entries = NUM_ENT(pagep); \
+ LOCK_INIT((c)->csp->lock); \
+ (c)->csp->lock_mode = DB_LOCK_NG; \
+ } \
+} while (0)
+
+#define BT_STK_NUMPUSH(env, c, pagep, page_indx, ret) do { \
+	BT_STK_NUM(env, c, pagep, page_indx, ret);			\
+ ++(c)->csp; \
+} while (0)
+
+#define BT_STK_POP(c) \
+ ((c)->csp == (c)->sp ? NULL : --(c)->csp)
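+
+/*
+ * A condensed sketch of the push/pop pattern (error handling elided,
+ * names illustrative); in the source, __bam_stkrel() walks the stack
+ * to release the pages and locks:
+ *
+ *	BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ *	...descend to the next level...
+ *	while ((epg = BT_STK_POP(cp)) != NULL)
+ *		(void)__memp_fput(mpf, ip, epg->page, dbc->priority);
+ */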
+
+/*
+ * Flags for __bam_dpages.
+ */
+#define BTD_UPDATE 0x0001 /* Update parents. */
+#define BTD_RELINK 0x0002 /* Relink leaf pages. */
+
+/*
+ * TRY_LOCK
+ * When holding a stack we have pages latched but not locked so
+ * we must avoid an undetectable deadlock by not then blocking on a
+ * lock.
+ */
+#define TRY_LOCK(dbc, pgno, saved_pgno, saved_lock, lock_mode, label) \
+ TRY_LOCK2(dbc, NULL, pgno, saved_pgno, saved_lock, lock_mode, label)
+/*
+ * TRY_LOCK2
+ * This is a special call for __bam_compact_int which uses 2
+ * overlapping stacks.
+ */
+
+#ifdef BTREE_DEBUG
+#define TRY_LOCK2(dbc, ndbc, pgno, \
+ saved_pgno, saved_lock, lock_mode, label) do { \
+ static int BTcount = 0; \
+ if ((pgno) != (saved_pgno) && \
+ ((BTcount++ % 5) == 0 || \
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, DB_LOCK_NOWAIT, &(saved_lock))) != 0)) { \
+ if (ret != 0 && ret != DB_LOCK_NOTGRANTED && \
+ ret != DB_LOCK_DEADLOCK) \
+ break; \
+ if ((ndbc) != NULL) { \
+ BTREE_CURSOR *__cp; \
+ __cp = (BTREE_CURSOR *) (dbc)->internal; \
+ __cp->sp->page = NULL; \
+ LOCK_INIT(__cp->sp->lock); \
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0) \
+ break; \
+ } \
+ if ((ret = __bam_stkrel(dbc, 0)) != 0) \
+ break; \
+ if ((ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, 0, &(saved_lock))) != 0) \
+ break; \
+ saved_pgno = pgno; \
+ goto label; \
+ } \
+ saved_pgno = pgno; \
+} while (0)
+#else
+#define TRY_LOCK2(dbc, ndbc, pgno, \
+ saved_pgno, saved_lock, lock_mode, label) do { \
+ if ((pgno) != (saved_pgno) && \
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, DB_LOCK_NOWAIT, &(saved_lock))) != 0) { \
+ if (ret != DB_LOCK_NOTGRANTED && \
+ ret != DB_LOCK_DEADLOCK) \
+ break; \
+ if ((ndbc) != NULL) { \
+ BTREE_CURSOR *__cp; \
+ __cp = (BTREE_CURSOR *) (dbc)->internal; \
+ __cp->sp->page = NULL; \
+ LOCK_INIT(__cp->sp->lock); \
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0) \
+ break; \
+ } \
+ if ((ret = __bam_stkrel(dbc, 0)) != 0) \
+ break; \
+ if ((ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, 0, &(saved_lock))) != 0) \
+ break; \
+ saved_pgno = pgno; \
+ goto label; \
+ } \
+ saved_pgno = pgno; \
+} while (0)
+#endif
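+
+/*
+ * A condensed caller sketch, as in the compaction code (names are
+ * illustrative):
+ *
+ *	retry:	...
+ *		TRY_LOCK(dbc, npgno, saved_pgno, saved_lock,
+ *		    DB_LOCK_WRITE, retry);
+ *		if (ret != 0)
+ *			goto err;
+ */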
+
+/* Btree/Recno cursor. */
+struct __cursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* btree private part */
+ EPG *sp; /* Stack pointer. */
+ EPG *csp; /* Current stack entry. */
+ EPG *esp; /* End stack pointer. */
+ EPG stack[5];
+
+ db_indx_t ovflsize; /* Maximum key/data on-page size. */
+
+ db_recno_t recno; /* Current record number. */
+ u_int32_t order; /* Relative order among deleted curs. */
+
+#ifdef HAVE_COMPRESSION
+ /*
+ * Compression:
+ *
+ * We need to hold the current compressed chunk, as well as the previous
+ * key/data, in order to decompress the next key/data. We do that by
+ * swapping whether prevKey/Data and currentKey/Data point to
+ * key1/data1, or key2/data2.
+ *
+ * We store prevcursor in order to be able to perform one level of
+ * DB_PREV by returning prevKey/prevData. We need prev2cursor to more
+ * efficiently do a subsequent DB_PREV with a linear search from the
+ * beginning of the compressed chunk.
+ *
+ * When we delete entries, we set the cursor to point to the next entry
+ * after the last deleted key, and set C_COMPRESS_DELETED. The del_key
+ * DBT holds the key of the deleted entry supposedly pointed to by a
+ * compressed cursor, and is used to implement DB_PREV_DUP,
+ * DB_PREV_NODUP, DB_NEXT_DUP, and DB_NEXT_NODUP on a deleted entry.
+ */
+ DBT compressed; /* Current compressed chunk */
+ DBT key1; /* Holds prevKey or currentKey */
+ DBT key2; /* Holds prevKey or currentKey */
+ DBT data1; /* Holds prevData or currentData */
+ DBT data2; /* Holds prevData or currentData */
+ DBT del_key; /* Holds key from the deleted entry */
+ DBT del_data; /* Holds data from the deleted entry */
+ DBT *prevKey; /* Previous key decompressed */
+ DBT *prevData; /* Previous data decompressed */
+ DBT *currentKey; /* Current key decompressed */
+ DBT *currentData; /* Current data decompressed */
+ u_int8_t *compcursor; /* Current position in compressed */
+ u_int8_t *compend; /* End of compressed */
+ u_int8_t *prevcursor; /* Previous current position */
+ u_int8_t *prev2cursor; /* Previous previous current position */
+#endif
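+
+/*
+ * A minimal sketch of the buffer swap described above (illustrative
+ * only): after decompressing an entry, the two key buffers exchange
+ * roles instead of copying data, and likewise for the data buffers.
+ *
+ *	DBT *tmp;
+ *	tmp = cp->prevKey;
+ *	cp->prevKey = cp->currentKey;	(old current becomes previous)
+ *	cp->currentKey = tmp;		(next entry decompresses here)
+ */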
+
+ /*
+ * Btree:
+ * We set a flag in the cursor structure if the underlying object has
+ * been deleted. It's not strictly necessary, we could get the same
+ * information by looking at the page itself, but this method doesn't
+ * require us to retrieve the page on cursor delete.
+ *
+ * Recno:
+ * When renumbering recno databases during deletes, cursors referencing
+ * "deleted" records end up positioned between two records, and so must
+ * be specially adjusted on the next operation.
+ */
+#define C_DELETED 0x0001 /* Record was deleted. */
+ /*
+ * There are three tree types that require maintaining record numbers.
+ * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set,
+ * and Btree off-page duplicate trees.
+ */
+#define C_RECNUM 0x0002 /* Tree requires record counts. */
+ /*
+ * Recno trees have immutable record numbers by default, but optionally
+ * support mutable record numbers. Off-page duplicate Recno trees have
+ * mutable record numbers. All Btrees with record numbers (including
+ * off-page duplicate trees) are mutable by design, no flag is needed.
+ */
+#define C_RENUMBER 0x0004 /* Tree records are mutable. */
+ /*
+ * The current compressed key/data could be deleted, as well as the
+ * key/data that the underlying BTree cursor points to.
+ */
+#define C_COMPRESS_DELETED 0x0008 /* Compressed record was deleted. */
+ /*
+ * The current compressed chunk has been modified by another DBC. A
+ * compressed cursor will have to seek its position again if necessary
+ * when it is next accessed.
+ */
+#define C_COMPRESS_MODIFIED 0x0010 /* Compressed record was modified. */
+ u_int32_t flags;
+};
+
+/*
+ * Threshold value, as a function of bt_minkey, of the number of
+ * bytes a key/data pair can use before being placed on an overflow
+ * page. Assume every item requires the maximum alignment for
+ * padding, out of sheer paranoia.
+ */
+#define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \
+ ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\
+ (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t)))))
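+
+/*
+ * Worked example (approximate; the overhead constants are not spelled
+ * out here): with a 4096-byte page and bt_minkey == 2, minkey * P_INDX
+ * is 4 index entries per page, so each key or data item may occupy
+ * roughly (4096 - page overhead) / 4 less the per-item overhead, i.e.,
+ * a bit under 1KB, before it is pushed to an overflow page.
+ */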
+
+/*
+ * The maximum space that a single item can ever take up on one page.
+ * Used by __bam_split to determine whether a split is still necessary.
+ */
+#define B_MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define B_MAXSIZEONPAGE(ovflsize) \
+ (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize)))
+
+/*
+ * BAM_GET_ROOT --
+ * This macro is used to isolate the fact that the root page of
+ * a subdatabase may move if DB->compact is called on it.
+ * The dbp->mpf->mfp->revision will be incremented every time
+ * a subdatabase root or meta page moves. If this is the case then
+ * we must call __db_reopen to read the master database to find it.
+ * We leave the loop only by breaking out, either because we do not
+ * have a subdb or because we are sure we have the right revision.
+ *
+ * It must be guaranteed that we cannot read an old root pgno and a
+ * current revision number. We note that the global revision number
+ * and DB handle information are only updated while holding the latches
+ * and locks of the master database pages.
+ * If another thread is synchronizing the DB handle with the master
+ * database it will exclusively latch both the old and new pages so we will
+ * synchronize on that.
+ */
+#define BAM_GET_ROOT(dbc, root_pgno, \
+ page, get_mode, lock_mode, lock, ret) do { \
+ BTREE *__t = (dbc)->dbp->bt_internal; \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ db_pgno_t __root; \
+ u_int32_t __rev = 0; \
+ if ((root_pgno) == PGNO_INVALID) { \
+ if (__cp->root == PGNO_INVALID) { \
+ __root = __t->bt_root; \
+ __rev = __t->revision; \
+ } else \
+ __root = root_pgno = __cp->root; \
+ } else \
+ __root = root_pgno; \
+ if (STD_LOCKING(dbc) && \
+ ((lock_mode) == DB_LOCK_WRITE || F_ISSET(dbc, DBC_DOWNREV) \
+ || dbc->dbtype == DB_RECNO || F_ISSET(__cp, C_RECNUM)) && \
+ (ret = \
+ __db_lget(dbc, 0, __root, lock_mode, 0, &(lock))) != 0) \
+ break; \
+ if ((ret = __memp_fget((dbc)->dbp->mpf, &__root, \
+ (dbc)->thread_info, dbc->txn, get_mode, &page)) == 0) { \
+ if (__root == root_pgno) \
+ break; \
+ if (F_ISSET(dbc, DBC_OPD) || \
+ !F_ISSET((dbc)->dbp, DB_AM_SUBDB) || \
+ (__t->bt_root == __root && \
+ (LEVEL(page) == LEAFLEVEL || TYPE(page) == \
+ (dbc->dbtype == DB_BTREE ? P_IBTREE : P_IRECNO)) &&\
+ __rev == (dbc)->dbp->mpf->mfp->revision)) { \
+ root_pgno = __root; \
+ break; \
+ } \
+ if ((ret = __memp_fput((dbc)->dbp->mpf, \
+ (dbc)->thread_info, page, (dbc)->priority)) != 0) \
+ break; \
+ } else if (ret != DB_PAGE_NOTFOUND) \
+ break; \
+ if ((ret = __LPUT(dbc, lock)) != 0) \
+ break; \
+ if ((ret = __db_reopen(dbc)) != 0) \
+ break; \
+} while (1)
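+
+/*
+ * Typical use (a sketch; the get_mode argument of 0 is illustrative):
+ * the macro is a statement that loops internally until the root page
+ * is pinned or an error is set, so the caller only tests ret after.
+ *
+ *	BAM_GET_ROOT(dbc, root_pgno, h, 0, DB_LOCK_READ, lock, ret);
+ *	if (ret != 0)
+ *		goto err;
+ */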
+
+/*
+ * Return the root of this tree. If this is an off-page duplicate tree
+ * then the root is in the cursor; otherwise we must look in the db handle.
+ */
+#define BAM_ROOT_PGNO(dbc) \
+ (((BTREE_CURSOR *)(dbc)->internal)->root == PGNO_INVALID ? \
+ ((BTREE*)(dbc)->dbp->bt_internal)->bt_root : \
+ ((BTREE_CURSOR *)(dbc)->internal)->root)
+
+/*
+ * The in-memory, per-tree btree/recno data structure.
+ */
+struct __btree { /* Btree access method. */
+ /*
+ * These fields may change if this is a subdatabase and
+ * it gets compacted.
+ */
+ db_pgno_t bt_meta; /* Database meta-data page. */
+ db_pgno_t bt_root; /* Database root page. */
+ u_int32_t revision; /* Revision of root/meta. */
+
+ u_int32_t bt_minkey; /* Minimum keys per page. */
+
+ /* Btree comparison function. */
+ int (*bt_compare) __P((DB *, const DBT *, const DBT *));
+ /* Btree prefix function. */
+ size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
+ /* Btree compress function. */
+#ifdef HAVE_COMPRESSION
+ int (*bt_compress) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ /* Btree decompress function. */
+ int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+ /* dup_compare for compression */
+ int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *));
+#endif
+
+ /* Recno access method. */
+ int re_pad; /* Fixed-length padding byte. */
+ int re_delim; /* Variable-length delimiting byte. */
+ u_int32_t re_len; /* Length for fixed-length records. */
+ char *re_source; /* Source file name. */
+
+ /*
+ * !!!
+ * The bt_lpgno field is NOT protected by any mutex, and for this
+ * reason must be advisory only, so, while it is read/written by
+ * multiple threads, DB is completely indifferent to the quality
+ * of its information.
+ */
+ db_pgno_t bt_lpgno; /* Last insert location. */
+ DB_LSN bt_llsn; /* Last insert LSN. */
+
+ /*
+ * !!!
+ * The re_modified field is NOT protected by any mutex, and for this
+ * reason cannot be anything more complicated than a zero/non-zero
+ * value. The actual writing of the backing source file cannot be
+ * threaded, so clearing the flag isn't a problem.
+ */
+ int re_modified; /* If the tree was modified. */
+
+ /*
+ * !!!
+ * These fields are ignored as far as multi-threading is concerned.
+ * There are no transaction semantics associated with backing files,
+ * nor is there any thread protection.
+ */
+ FILE *re_fp; /* Source file handle. */
+ int re_eof; /* Backing source file EOF reached. */
+ db_recno_t re_last; /* Last record number read. */
+
+};
+
+/*
+ * Modes for the __bam_curadj recovery records (btree_curadj).
+ * These appear in log records, so we wire the values and
+ * do not leave it up to the compiler.
+ */
+typedef enum {
+ DB_CA_DI = 1,
+ DB_CA_DUP = 2,
+ DB_CA_RSPLIT = 3,
+ DB_CA_SPLIT = 4
+} db_ca_mode;
+
+/*
+ * Flags for __bam_pinsert.
+ */
+#define BPI_SPACEONLY 0x01 /* Only check for space to update. */
+#define BPI_NORECNUM 0x02 /* Don't update the recnum on the left. */
+#define BPI_NOLOGGING 0x04 /* Don't log the update. */
+#define BPI_REPLACE 0x08 /* Replace the record. */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/btree_auto.h"
+#include "dbinc_auto/btree_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_BTREE_H_ */
diff --git a/src/dbinc/clock.h b/src/dbinc/clock.h
new file mode 100644
index 00000000..caeaee70
--- /dev/null
+++ b/src/dbinc/clock.h
@@ -0,0 +1,131 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)time.h 8.5 (Berkeley) 5/4/95
+ * FreeBSD: src/sys/sys/time.h,v 1.65 2004/04/07 04:19:49 imp Exp
+ */
+
+#ifndef _DB_CLOCK_H_
+#define _DB_CLOCK_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This declaration is POSIX-compatible. Because there are lots of different
+ * time.h include file patterns out there, it's easier to declare our own name
+ * in all cases than to try to discover whether a system has a struct timespec.
+ * For the same reason, and because we'd have to #include <sys/time.h> in db.h,
+ * we don't export any timespec structures in the DB API, even in places where
+ * it would make sense, like the replication statistics information.
+ */
+typedef struct {
+ time_t tv_sec; /* seconds */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+ int32_t tv_nsec;
+#else
+ long tv_nsec; /* nanoseconds */
+#endif
+} db_timespec;
+
+/* Operations on timespecs */
+#undef timespecclear
+#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0)
+#undef timespecisset
+#define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec)
+#undef timespeccmp
+#define timespeccmp(tvp, uvp, cmp) \
+ (((tvp)->tv_sec == (uvp)->tv_sec) ? \
+ ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \
+ ((tvp)->tv_sec cmp (uvp)->tv_sec))
+#undef timespecadd
+/*
+ * Note that timespecadd must support adding a timespec to itself
+ * (i.e., doubling).
+ */
+#define timespecadd(vvp, uvp) \
+ do { \
+ (vvp)->tv_sec += (uvp)->tv_sec; \
+ (vvp)->tv_nsec += (uvp)->tv_nsec; \
+ if ((vvp)->tv_nsec >= 1000000000) { \
+ (vvp)->tv_sec++; \
+ (vvp)->tv_nsec -= 1000000000; \
+ } \
+ } while (0)
+#undef timespecsub
+#define timespecsub(vvp, uvp) \
+ do { \
+ (vvp)->tv_sec -= (uvp)->tv_sec; \
+ (vvp)->tv_nsec -= (uvp)->tv_nsec; \
+ if ((vvp)->tv_nsec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_nsec += 1000000000; \
+ } \
+ } while (0)
+
+#undef timespecset
+#define timespecset(vvp, sec, nsec) \
+ do { \
+ (vvp)->tv_sec = (time_t)(sec); \
+ (vvp)->tv_nsec = (long)(nsec); \
+ } while (0)
+
+#define DB_TIMEOUT_TO_TIMESPEC(t, vvp) \
+ do { \
+ (vvp)->tv_sec = (time_t)((t) / 1000000); \
+ (vvp)->tv_nsec = (long)(((t) % 1000000) * 1000); \
+ } while (0)
+
+#define DB_TIMESPEC_TO_TIMEOUT(t, vvp, prec) \
+ do { \
+ t = (u_long)((vvp)->tv_sec * 1000000); \
+ t += (u_long)((vvp)->tv_nsec / 1000); \
+ /* Add in 1 usec for lost nsec precision if wanted. */ \
+ if (prec) \
+ t++; \
+ } while (0)
+
+#define TIMESPEC_ADD_DB_TIMEOUT(vvp, t) \
+ do { \
+ db_timespec __tmp; \
+ DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \
+ timespecadd((vvp), &__tmp); \
+ } while (0)
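+
+/*
+ * An illustrative use of the macros above (variable names are
+ * hypothetical).  DB_TIMEOUT_TO_TIMESPEC splits t = 2500000 usec into
+ * tv_sec = 2 and tv_nsec = 500000000:
+ *
+ *	db_timespec start, end, limit;
+ *	... fill in start and end from the platform clock ...
+ *	timespecsub(&end, &start);	(end now holds the elapsed time)
+ *	DB_TIMEOUT_TO_TIMESPEC(2500000, &limit);
+ *	if (timespeccmp(&end, &limit, >))
+ *		... the operation took longer than 2.5 seconds ...
+ */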
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_CLOCK_H_ */
diff --git a/src/dbinc/crypto.h b/src/dbinc/crypto.h
new file mode 100644
index 00000000..ea7a9cf0
--- /dev/null
+++ b/src/dbinc/crypto.h
@@ -0,0 +1,93 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CRYPTO_H_
+#define _DB_CRYPTO_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifdef HAVE_CRYPTO_IPP
+#include <ippcp.h>
+#endif
+
+/*
+ * !!!
+ * These are the internal representations of the algorithm flags.
+ * They are used in both the DB_CIPHER structure and the CIPHER
+ * structure so we can tell if users specified both passwd and alg
+ * correctly.
+ *
+ * CIPHER_ANY is used when an app joins an existing env but doesn't
+ * know the algorithm originally used. This is only valid in the
+ * DB_CIPHER structure until we open and can set the alg.
+ */
+/*
+ * We store the algorithm in an 8-bit field on the meta-page, so we
+ * use a numeric value, not bit fields: bits would allow only 8
+ * algorithms, while numeric values allow 256, which should be plenty.
+ * It is okay for the CIPHER_ANY flag to lie outside that range since
+ * it is never stored on disk.
+ */
+
+/*
+ * This structure is per-process, not in shared memory.
+ */
+struct __db_cipher {
+ u_int (*adj_size) __P((size_t));
+ int (*close) __P((ENV *, void *));
+ int (*decrypt) __P((ENV *, void *, void *, u_int8_t *, size_t));
+ int (*encrypt) __P((ENV *, void *, void *, u_int8_t *, size_t));
+ int (*init) __P((ENV *, DB_CIPHER *));
+
+ u_int8_t mac_key[DB_MAC_KEY]; /* MAC key. */
+ void *data; /* Algorithm-specific information */
+
+#define CIPHER_AES 1 /* AES algorithm */
+ u_int8_t alg; /* Algorithm used - See above */
+ u_int8_t spare[3]; /* Spares */
+
+#define CIPHER_ANY 0x00000001 /* Only for DB_CIPHER */
+ u_int32_t flags; /* Other flags */
+};
+
+#ifdef HAVE_CRYPTO
+
+#include "crypto/rijndael/rijndael-api-fst.h"
+
+/*
+ * Shared ciphering structure
+ * No mutex needed because all information is read-only after creation.
+ */
+typedef struct __cipher {
+ roff_t passwd; /* Offset to shared passwd */
+ size_t passwd_len; /* Length of passwd */
+ u_int32_t flags; /* Algorithm used - see above */
+} CIPHER;
+
+#define DB_AES_KEYLEN 128 /* AES key length */
+#define DB_AES_CHUNK 16 /* AES byte unit size */
+
+typedef struct __aes_cipher {
+#ifdef HAVE_CRYPTO_IPP
+ void *ipp_ctx; /* IPP key instance */
+#else
+ keyInstance decrypt_ki; /* Decryption key instance */
+ keyInstance encrypt_ki; /* Encryption key instance */
+#endif
+ u_int32_t flags; /* AES-specific flags */
+} AES_CIPHER;
+
+#include "dbinc_auto/crypto_ext.h"
+#endif /* HAVE_CRYPTO */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_CRYPTO_H_ */
diff --git a/src/dbinc/cxx_int.h b/src/dbinc/cxx_int.h
new file mode 100644
index 00000000..5492ead7
--- /dev/null
+++ b/src/dbinc/cxx_int.h
@@ -0,0 +1,77 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CXX_INT_H_
+#define _DB_CXX_INT_H_
+
+// private data structures known to the implementation only
+
+//
+// Using FooImp classes will allow the implementation to change in the
+// future without any modification to user code or even to header files
+// that the user includes. FooImp * is just like void * except that it
+// provides a little extra protection, since you cannot randomly assign
+// any old pointer to a FooImp* as you can with void *. Currently, a
+// pointer to such an opaque class is always just a pointer to the
+// appropriate underlying implementation struct. These are converted
+// back and forth using the various overloaded wrap()/unwrap() methods.
+// This is essentially a use of the "Bridge" Design Pattern.
+//
+// WRAPPED_CLASS implements the appropriate wrap() and unwrap() methods
+// for a wrapper class that has an underlying pointer representation.
+//
+#define WRAPPED_CLASS(_WRAPPER_CLASS, _IMP_CLASS, _WRAPPED_TYPE) \
+ class _IMP_CLASS {}; \
+ \
+ inline _WRAPPED_TYPE *unwrap(_WRAPPER_CLASS *val) \
+ { \
+ if (!val) return (0); \
+ return (val->get_##_WRAPPED_TYPE()); \
+ } \
+ \
+ inline const _WRAPPED_TYPE *unwrapConst(const _WRAPPER_CLASS *val) \
+ { \
+ if (!val) return (0); \
+ return (val->get_const_##_WRAPPED_TYPE()); \
+ }
+
+WRAPPED_CLASS(Db, DbImp, DB)
+WRAPPED_CLASS(DbChannel, DbChannelImp, DB_CHANNEL)
+WRAPPED_CLASS(DbEnv, DbEnvImp, DB_ENV)
+WRAPPED_CLASS(DbMpoolFile, DbMpoolFileImp, DB_MPOOLFILE)
+WRAPPED_CLASS(DbSequence, DbSequenceImp, DB_SEQUENCE)
+WRAPPED_CLASS(DbSite, DbSiteImp, DB_SITE)
+WRAPPED_CLASS(DbTxn, DbTxnImp, DB_TXN)
+
+// A tristate integer value used by the DB_ERROR macro below.
+// We chose not to make this an enumerated type so it can
+// be kept private, even though methods that return the
+// tristate int can be declared in db_cxx.h.
+//
+#define ON_ERROR_THROW 1
+#define ON_ERROR_RETURN 0
+#define ON_ERROR_UNKNOWN (-1)
+
+// Macros that handle detected errors, in case we want to
+// change the default behavior. The 'policy' is one of
+// the tristate values given above. If UNKNOWN is specified,
+// the behavior is taken from the last initialized DbEnv.
+//
+#define DB_ERROR(dbenv, caller, ecode, policy) \
+ DbEnv::runtime_error(dbenv, caller, ecode, policy)
+
+#define DB_ERROR_DBT(dbenv, caller, dbt, policy) \
+ DbEnv::runtime_error_dbt(dbenv, caller, dbt, policy)
+
+#define DB_OVERFLOWED_DBT(dbt) \
+ (F_ISSET(dbt, DB_DBT_USERMEM) && dbt->size > dbt->ulen)
+
+/* values for Db::flags_ */
+#define DB_CXX_PRIVATE_ENV 0x00000001
+
+#endif /* !_DB_CXX_INT_H_ */
diff --git a/src/dbinc/db.in b/src/dbinc/db.in
new file mode 100644
index 00000000..a948910e
--- /dev/null
+++ b/src/dbinc/db.in
@@ -0,0 +1,2810 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ *
+ * db.h include file layout:
+ * General.
+ * Database Environment.
+ * Locking subsystem.
+ * Logging subsystem.
+ * Shared buffer cache (mpool) subsystem.
+ * Transaction subsystem.
+ * Access methods.
+ * Access method cursors.
+ * Dbm/Ndbm, Hsearch historic interfaces.
+ */
+
+#ifndef _DB_H_
+#define _DB_H_
+
+#ifndef __NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+@inttypes_h_decl@
+@stdint_h_decl@
+@stddef_h_decl@
+#include <stdio.h>
+@unistd_h_decl@
+@thread_h_decl@
+#endif
+
+@platform_header@
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+@DB_CONST@
+@DB_PROTO1@
+@DB_PROTO2@
+
+/*
+ * Berkeley DB version information.
+ */
+#define DB_VERSION_FAMILY @DB_VERSION_FAMILY@
+#define DB_VERSION_RELEASE @DB_VERSION_RELEASE@
+#define DB_VERSION_MAJOR @DB_VERSION_MAJOR@
+#define DB_VERSION_MINOR @DB_VERSION_MINOR@
+#define DB_VERSION_PATCH @DB_VERSION_PATCH@
+#define DB_VERSION_STRING @DB_VERSION_STRING@
+#define DB_VERSION_FULL_STRING @DB_VERSION_FULL_STRING@
+
+/*
+ * !!!
+ * Berkeley DB uses specifically sized types. If they're not provided by
+ * the system, typedef them here.
+ *
+ * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__,
+ * as do BIND and Kerberos, since we don't know for sure what #include
+ * files the user is using.
+ *
+ * !!!
+ * We also provide the standard u_int, u_long etc., if they're not provided
+ * by the system.
+ */
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+@u_int8_decl@
+@int16_decl@
+@u_int16_decl@
+@int32_decl@
+@u_int32_decl@
+@int64_decl@
+@u_int64_decl@
+#endif
+
+@u_char_decl@
+@u_int_decl@
+@u_long_decl@
+@u_short_decl@
+
+/*
+ * Missing ANSI types.
+ *
+ * uintmax_t --
+ * Largest unsigned type, used to align structures in memory. We don't store
+ * floating point types in structures, so integral types should be sufficient
+ * (and we don't have to worry about systems that store floats in other than
+ * power-of-2 numbers of bytes). Additionally this fixes compilers that rewrite
+ * structure assignments and ANSI C memcpy calls to be in-line instructions
+ * that happen to require alignment.
+ *
+ * uintptr_t --
+ * Unsigned type that's the same size as a pointer. There are places where
+ * DB modifies pointers by discarding the bottom bits to guarantee alignment.
+ * We can't use uintmax_t; it may be larger than the pointer, and compilers
+ * get upset about that. So far we haven't run on any machine where there's
+ * no unsigned type the same size as a pointer -- here's hoping.
+ */
+@uintmax_t_decl@
+@uintptr_t_decl@
+
+@FILE_t_decl@
+@off_t_decl@
+@pid_t_decl@
+@size_t_decl@
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+typedef u_int32_t db_size_t;
+#else
+typedef size_t db_size_t;
+#endif
+@ssize_t_decl@
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+typedef int32_t db_ssize_t;
+#else
+typedef ssize_t db_ssize_t;
+#endif
+@time_t_decl@
+
+/*
+ * Sequences are only available on machines with 64-bit integral types.
+ */
+@db_seq_decl@
+
+/* Thread and process identification. */
+@db_threadid_t_decl@
+
+/* Basic types that are exported or quasi-exported. */
+typedef u_int32_t db_pgno_t; /* Page number type. */
+typedef u_int16_t db_indx_t; /* Page offset type. */
+#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */
+
+typedef u_int32_t db_recno_t; /* Record number type. */
+#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */
+
+typedef u_int32_t db_timeout_t; /* Type of a timeout. */
+
+/*
+ * Region offsets are the difference between a pointer in a region and the
+ * region's base address. With private environments, both addresses are the
+ * result of calling malloc, and we can't assume anything about what malloc
+ * will return, so region offsets have to be able to hold differences between
+ * arbitrary pointers.
+ */
+typedef db_size_t roff_t;
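+
+/*
+ * Illustrative conversions between a region offset and a pointer,
+ * given the region's base address (the names here are hypothetical):
+ *
+ *	ptr = (void *)((u_int8_t *)base + off);
+ *	off = (roff_t)((u_int8_t *)ptr - (u_int8_t *)base);
+ */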
+
+/*
+ * Forward structure declarations, so we can declare pointers and
+ * applications can get type checking.
+ */
+struct __channel; typedef struct __channel CHANNEL;
+struct __db; typedef struct __db DB;
+struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT;
+struct __db_channel; typedef struct __db_channel DB_CHANNEL;
+struct __db_cipher; typedef struct __db_cipher DB_CIPHER;
+struct __db_compact; typedef struct __db_compact DB_COMPACT;
+struct __db_dbt; typedef struct __db_dbt DBT;
+struct __db_distab; typedef struct __db_distab DB_DISTAB;
+struct __db_env; typedef struct __db_env DB_ENV;
+struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT;
+struct __db_heap_rid; typedef struct __db_heap_rid DB_HEAP_RID;
+struct __db_heap_stat; typedef struct __db_heap_stat DB_HEAP_STAT;
+struct __db_ilock; typedef struct __db_ilock DB_LOCK_ILOCK;
+struct __db_lock_hstat; typedef struct __db_lock_hstat DB_LOCK_HSTAT;
+struct __db_lock_pstat; typedef struct __db_lock_pstat DB_LOCK_PSTAT;
+struct __db_lock_stat; typedef struct __db_lock_stat DB_LOCK_STAT;
+struct __db_lock_u; typedef struct __db_lock_u DB_LOCK;
+struct __db_locker; typedef struct __db_locker DB_LOCKER;
+struct __db_lockreq; typedef struct __db_lockreq DB_LOCKREQ;
+struct __db_locktab; typedef struct __db_locktab DB_LOCKTAB;
+struct __db_log; typedef struct __db_log DB_LOG;
+struct __db_log_cursor; typedef struct __db_log_cursor DB_LOGC;
+struct __db_log_stat; typedef struct __db_log_stat DB_LOG_STAT;
+struct __db_lsn; typedef struct __db_lsn DB_LSN;
+struct __db_mpool; typedef struct __db_mpool DB_MPOOL;
+struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT;
+struct __db_mpool_stat; typedef struct __db_mpool_stat DB_MPOOL_STAT;
+struct __db_mpoolfile; typedef struct __db_mpoolfile DB_MPOOLFILE;
+struct __db_mutex_stat; typedef struct __db_mutex_stat DB_MUTEX_STAT;
+struct __db_mutex_t; typedef struct __db_mutex_t DB_MUTEX;
+struct __db_mutexmgr; typedef struct __db_mutexmgr DB_MUTEXMGR;
+struct __db_preplist; typedef struct __db_preplist DB_PREPLIST;
+struct __db_qam_stat; typedef struct __db_qam_stat DB_QUEUE_STAT;
+struct __db_rep; typedef struct __db_rep DB_REP;
+struct __db_rep_stat; typedef struct __db_rep_stat DB_REP_STAT;
+struct __db_repmgr_conn_err;
+ typedef struct __db_repmgr_conn_err DB_REPMGR_CONN_ERR;
+struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE;
+struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT;
+struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD;
+struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT;
+struct __db_site; typedef struct __db_site DB_SITE;
+struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE;
+struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO;
+struct __db_txn; typedef struct __db_txn DB_TXN;
+struct __db_txn_active; typedef struct __db_txn_active DB_TXN_ACTIVE;
+struct __db_txn_stat; typedef struct __db_txn_stat DB_TXN_STAT;
+struct __db_txn_token; typedef struct __db_txn_token DB_TXN_TOKEN;
+struct __db_txnmgr; typedef struct __db_txnmgr DB_TXNMGR;
+struct __dbc; typedef struct __dbc DBC;
+struct __dbc_internal; typedef struct __dbc_internal DBC_INTERNAL;
+struct __env; typedef struct __env ENV;
+struct __fh_t; typedef struct __fh_t DB_FH;
+struct __fname; typedef struct __fname FNAME;
+struct __key_range; typedef struct __key_range DB_KEY_RANGE;
+struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE;
+struct __db_logvrfy_config;
+typedef struct __db_logvrfy_config DB_LOG_VERIFY_CONFIG;
+
+/*
+ * The Berkeley DB API flags are automatically generated -- the following flag
+ * names are no longer used, but remain for compatibility reasons.
+ */
+#define DB_DEGREE_2 DB_READ_COMMITTED
+#define DB_DIRTY_READ DB_READ_UNCOMMITTED
+#define DB_JOINENV 0x0
+
+/* Key/data structure -- a Data-Base Thang. */
+struct __db_dbt {
+ void *data; /* Key/data */
+ u_int32_t size; /* key/data length */
+
+ u_int32_t ulen; /* RO: length of user buffer. */
+ u_int32_t dlen; /* RO: get/put record length. */
+ u_int32_t doff; /* RO: get/put record offset. */
+
+ void *app_data;
+
+#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */
+#define DB_DBT_BULK 0x002 /* Internal: Bulk operation. */
+#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */
+#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */
+#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */
+#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */
+#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */
+#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */
+#define DB_DBT_READONLY 0x100 /* Readonly, don't update. */
+#define DB_DBT_STREAMING 0x200 /* Internal: DBT is being streamed. */
+#define DB_DBT_USERCOPY 0x400 /* Use the user-supplied callback. */
+#define DB_DBT_USERMEM 0x800 /* Return in user's memory. */
+ u_int32_t flags;
+};
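+
+/*
+ * Canonical initialization of a DBT before use (the key value is
+ * illustrative).  Unused fields must be zeroed, which is why DBTs are
+ * conventionally cleared with memset before being filled in:
+ *
+ *	DBT key, data;
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "fruit";
+ *	key.size = (u_int32_t)strlen("fruit") + 1;
+ *	data.flags = DB_DBT_MALLOC;	(DB allocates the return buffer)
+ */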
+
+/*******************************************************
+ * Mutexes.
+ *******************************************************/
+/*
+ * When mixed-size addressing is supported, mutexes need to be the same
+ * size regardless of the process's address size.
+ */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+typedef db_size_t db_mutex_t;
+#else
+typedef uintptr_t db_mutex_t;
+#endif
+
+struct __db_mutex_stat { /* SHARED */
+ /* The following fields are maintained in the region's copy. */
+ u_int32_t st_mutex_align; /* Mutex alignment */
+ u_int32_t st_mutex_tas_spins; /* Mutex test-and-set spins */
+ u_int32_t st_mutex_init; /* Initial mutex count */
+ u_int32_t st_mutex_cnt; /* Mutex count */
+ u_int32_t st_mutex_max; /* Mutex max */
+ u_int32_t st_mutex_free; /* Available mutexes */
+ u_int32_t st_mutex_inuse; /* Mutexes in use */
+ u_int32_t st_mutex_inuse_max; /* Maximum mutexes ever in use */
+
+ /* The following fields are filled-in from other places. */
+#ifndef __TEST_DB_NO_STATISTICS
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ roff_t st_regsize; /* Region size. */
+ roff_t st_regmax; /* Region max. */
+#endif
+};
+
+/* This is the length of the buffer passed to DB_ENV->thread_id_string() */
+#define DB_THREADID_STRLEN 128
+
+/*******************************************************
+ * Locking.
+ *******************************************************/
+#define DB_LOCKVERSION 1
+
+#define DB_FILE_ID_LEN 20 /* Unique file ID length. */
+
+/*
+ * Deadlock detector modes; used in the DB_ENV structure to configure the
+ * locking subsystem.
+ */
+#define DB_LOCK_NORUN 0
+#define DB_LOCK_DEFAULT 1 /* Default policy. */
+#define DB_LOCK_EXPIRE 2 /* Only expire locks, no detection. */
+#define DB_LOCK_MAXLOCKS 3 /* Select locker with max locks. */
+#define DB_LOCK_MAXWRITE 4 /* Select locker with max writelocks. */
+#define DB_LOCK_MINLOCKS 5 /* Select locker with min locks. */
+#define DB_LOCK_MINWRITE 6 /* Select locker with min writelocks. */
+#define DB_LOCK_OLDEST 7 /* Select oldest locker. */
+#define DB_LOCK_RANDOM 8 /* Select random locker. */
+#define DB_LOCK_YOUNGEST 9 /* Select youngest locker. */
+
+/*
+ * Simple R/W lock modes, plus modes for multi-granularity intention locking.
+ *
+ * !!!
+ * These values are NOT random, as they are used as an index into the lock
+ * conflicts arrays, i.e., DB_LOCK_IWRITE must be == 4, and DB_LOCK_IREAD
+ * must be == 5, matching the enum below.
+ */
+typedef enum {
+ DB_LOCK_NG=0, /* Not granted. */
+ DB_LOCK_READ=1, /* Shared/read. */
+ DB_LOCK_WRITE=2, /* Exclusive/write. */
+ DB_LOCK_WAIT=3, /* Wait for event */
+ DB_LOCK_IWRITE=4, /* Intent exclusive/write. */
+ DB_LOCK_IREAD=5, /* Intent to share/read. */
+ DB_LOCK_IWR=6, /* Intent to read and write. */
+ DB_LOCK_READ_UNCOMMITTED=7, /* Degree 1 isolation. */
+ DB_LOCK_WWRITE=8 /* Was Written. */
+} db_lockmode_t;
+
+/*
+ * Request types.
+ */
+typedef enum {
+ DB_LOCK_DUMP=0, /* Display held locks. */
+ DB_LOCK_GET=1, /* Get the lock. */
+ DB_LOCK_GET_TIMEOUT=2, /* Get lock with a timeout. */
+ DB_LOCK_INHERIT=3, /* Pass locks to parent. */
+ DB_LOCK_PUT=4, /* Release the lock. */
+ DB_LOCK_PUT_ALL=5, /* Release locker's locks. */
+ DB_LOCK_PUT_OBJ=6, /* Release locker's locks on obj. */
+ DB_LOCK_PUT_READ=7, /* Release locker's read locks. */
+ DB_LOCK_TIMEOUT=8, /* Force a txn to timeout. */
+ DB_LOCK_TRADE=9, /* Trade locker ids on a lock. */
+ DB_LOCK_UPGRADE_WRITE=10 /* Upgrade writes for dirty reads. */
+} db_lockop_t;
+
+/*
+ * Status of a lock.
+ */
+typedef enum {
+ DB_LSTAT_ABORTED=1, /* Lock belongs to an aborted txn. */
+ DB_LSTAT_EXPIRED=2, /* Lock has expired. */
+ DB_LSTAT_FREE=3, /* Lock is unallocated. */
+ DB_LSTAT_HELD=4, /* Lock is currently held. */
+ DB_LSTAT_PENDING=5, /* Lock was waiting and has been
+ * promoted; waiting for the owner
+ * to run and upgrade it to held. */
+ DB_LSTAT_WAITING=6 /* Lock is on the wait queue. */
+}db_status_t;
+
+/* Lock statistics structure. */
+struct __db_lock_stat { /* SHARED */
+ u_int32_t st_id; /* Last allocated locker ID. */
+ u_int32_t st_cur_maxid; /* Current maximum unused ID. */
+ u_int32_t st_initlocks; /* Initial number of locks in table. */
+ u_int32_t st_initlockers; /* Initial num of lockers in table. */
+ u_int32_t st_initobjects; /* Initial num of objects in table. */
+ u_int32_t st_locks; /* Current number of locks in table. */
+ u_int32_t st_lockers; /* Current num of lockers in table. */
+ u_int32_t st_objects; /* Current num of objects in table. */
+ u_int32_t st_maxlocks; /* Maximum number of locks in table. */
+ u_int32_t st_maxlockers; /* Maximum num of lockers in table. */
+ u_int32_t st_maxobjects; /* Maximum num of objects in table. */
+ u_int32_t st_partitions; /* number of partitions. */
+ u_int32_t st_tablesize; /* Size of object hash table. */
+ int32_t st_nmodes; /* Number of lock modes. */
+ u_int32_t st_nlockers; /* Current number of lockers. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_nlocks; /* Current number of locks. */
+ u_int32_t st_maxnlocks; /* Maximum number of locks so far. */
+ u_int32_t st_maxhlocks; /* Maximum number of locks in any bucket. */
+ uintmax_t st_locksteals; /* Number of lock steals so far. */
+	uintmax_t st_maxlsteals;	/* Maximum number of steals in any partition. */
+ u_int32_t st_maxnlockers; /* Maximum number of lockers so far. */
+ u_int32_t st_nobjects; /* Current number of objects. */
+ u_int32_t st_maxnobjects; /* Maximum number of objects so far. */
+	u_int32_t st_maxhobjects;	/* Maximum number of objects in any bucket. */
+ uintmax_t st_objectsteals; /* Number of objects steals so far. */
+ uintmax_t st_maxosteals; /* Maximum number of steals in any partition. */
+ uintmax_t st_nrequests; /* Number of lock gets. */
+ uintmax_t st_nreleases; /* Number of lock puts. */
+ uintmax_t st_nupgrade; /* Number of lock upgrades. */
+ uintmax_t st_ndowngrade; /* Number of lock downgrades. */
+ uintmax_t st_lock_wait; /* Lock conflicts w/ subsequent wait */
+ uintmax_t st_lock_nowait; /* Lock conflicts w/o subsequent wait */
+ uintmax_t st_ndeadlocks; /* Number of lock deadlocks. */
+ db_timeout_t st_locktimeout; /* Lock timeout. */
+ uintmax_t st_nlocktimeouts; /* Number of lock timeouts. */
+ db_timeout_t st_txntimeout; /* Transaction timeout. */
+ uintmax_t st_ntxntimeouts; /* Number of transaction timeouts. */
+ uintmax_t st_part_wait; /* Partition lock granted after wait. */
+ uintmax_t st_part_nowait; /* Partition lock granted without wait. */
+ uintmax_t st_part_max_wait; /* Max partition lock granted after wait. */
+ uintmax_t st_part_max_nowait; /* Max partition lock granted without wait. */
+ uintmax_t st_objs_wait; /* Object lock granted after wait. */
+ uintmax_t st_objs_nowait; /* Object lock granted without wait. */
+ uintmax_t st_lockers_wait; /* Locker lock granted after wait. */
+ uintmax_t st_lockers_nowait; /* Locker lock granted without wait. */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ u_int32_t st_hash_len; /* Max length of bucket. */
+ roff_t st_regsize; /* Region size. */
+#endif
+};
+
+struct __db_lock_hstat { /* SHARED */
+ uintmax_t st_nrequests; /* Number of lock gets. */
+ uintmax_t st_nreleases; /* Number of lock puts. */
+ uintmax_t st_nupgrade; /* Number of lock upgrades. */
+ uintmax_t st_ndowngrade; /* Number of lock downgrades. */
+ u_int32_t st_nlocks; /* Current number of locks. */
+ u_int32_t st_maxnlocks; /* Maximum number of locks so far. */
+ u_int32_t st_nobjects; /* Current number of objects. */
+ u_int32_t st_maxnobjects; /* Maximum number of objects so far. */
+ uintmax_t st_lock_wait; /* Lock conflicts w/ subsequent wait */
+ uintmax_t st_lock_nowait; /* Lock conflicts w/o subsequent wait */
+ uintmax_t st_nlocktimeouts; /* Number of lock timeouts. */
+ uintmax_t st_ntxntimeouts; /* Number of transaction timeouts. */
+ u_int32_t st_hash_len; /* Max length of bucket. */
+};
+
+struct __db_lock_pstat { /* SHARED */
+ u_int32_t st_nlocks; /* Current number of locks. */
+ u_int32_t st_maxnlocks; /* Maximum number of locks so far. */
+ u_int32_t st_nobjects; /* Current number of objects. */
+ u_int32_t st_maxnobjects; /* Maximum number of objects so far. */
+ uintmax_t st_locksteals; /* Number of lock steals so far. */
+ uintmax_t st_objectsteals; /* Number of objects steals so far. */
+};
+
+/*
+ * DB_LOCK_ILOCK --
+ * Internal DB access method lock.
+ */
+struct __db_ilock { /* SHARED */
+ db_pgno_t pgno; /* Page being locked. */
+ u_int8_t fileid[DB_FILE_ID_LEN];/* File id. */
+#define DB_HANDLE_LOCK 1
+#define DB_RECORD_LOCK 2
+#define DB_PAGE_LOCK 3
+#define DB_DATABASE_LOCK 4
+ u_int32_t type; /* Type of lock. */
+};
+
+/*
+ * DB_LOCK --
+ * The structure is allocated by the caller and filled in during a
+ * lock_get request (or a lock_vec/DB_LOCK_GET).
+ */
+struct __db_lock_u { /* SHARED */
+ roff_t off; /* Offset of the lock in the region */
+ u_int32_t ndx; /* Index of the object referenced by
+ * this lock; used for locking. */
+ u_int32_t gen; /* Generation number of this lock. */
+ db_lockmode_t mode; /* mode of this lock. */
+};
+
+/* Lock request structure. */
+struct __db_lockreq {
+ db_lockop_t op; /* Operation. */
+ db_lockmode_t mode; /* Requested mode. */
+ db_timeout_t timeout; /* Time to expire lock. */
+ DBT *obj; /* Object being locked. */
+ DB_LOCK lock; /* Lock returned. */
+};
+
+/*******************************************************
+ * Logging.
+ *******************************************************/
+#define DB_LOGVERSION 19 /* Current log version. */
+#define DB_LOGVERSION_LATCHING 15 /* Log version using latching: db-4.8 */
+#define DB_LOGCHKSUM 12 /* Check sum headers: db-4.5 */
+#define DB_LOGOLDVER 8 /* Oldest version supported: db-4.2 */
+#define DB_LOGMAGIC 0x040988
+
+/*
+ * A DB_LSN has two parts, a fileid which identifies a specific file, and an
+ * offset within that file. The fileid is an unsigned 4-byte quantity that
+ * uniquely identifies a file within the log directory -- currently a simple
+ * counter inside the log. The offset is also an unsigned 4-byte value. The
+ * log manager guarantees the offset never overflows 4 bytes by switching
+ * to a new log file before the maximum length imposed by an unsigned 4-byte
+ * offset is reached.
+ */
+struct __db_lsn { /* SHARED */
+ u_int32_t file; /* File ID. */
+ u_int32_t offset; /* File offset. */
+};
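+
+/*
+ * LSNs order first by file, then by offset, so comparison is two
+ * integer compares; a sketch equivalent to the exported log_compare
+ * function:
+ *
+ *	if (lsn0->file != lsn1->file)
+ *		return (lsn0->file < lsn1->file ? -1 : 1);
+ *	if (lsn0->offset != lsn1->offset)
+ *		return (lsn0->offset < lsn1->offset ? -1 : 1);
+ *	return (0);
+ */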
+
+/*
+ * Application-specified log record types start at DB_user_BEGIN, and must not
+ * equal or exceed DB_debug_FLAG.
+ *
+ * DB_debug_FLAG is the high-bit of the u_int32_t that specifies a log record
+ * type. If the flag is set, it's a log record that was logged for debugging
+ * purposes only, even if it reflects a database change -- the change was part
+ * of a non-durable transaction.
+ */
+#define DB_user_BEGIN 10000
+#define DB_debug_FLAG 0x80000000
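+
+/*
+ * A sketch of how a log dispatcher treats debug-only records (the
+ * variable name is illustrative): the flag is tested and masked off
+ * before the record type is interpreted.
+ *
+ *	if (rectype & DB_debug_FLAG) {
+ *		rectype &= ~DB_debug_FLAG;
+ *		... process only when printing/debugging the log ...
+ *	}
+ */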
+
+/*
+ * DB_LOGC --
+ * Log cursor.
+ */
+struct __db_log_cursor {
+ ENV *env; /* Environment */
+
+ DB_FH *fhp; /* File handle. */
+ DB_LSN lsn; /* Cursor: LSN */
+ u_int32_t len; /* Cursor: record length */
+ u_int32_t prev; /* Cursor: previous record's offset */
+
+ DBT dbt; /* Return DBT. */
+ DB_LSN p_lsn; /* Persist LSN. */
+ u_int32_t p_version; /* Persist version. */
+
+ u_int8_t *bp; /* Allocated read buffer. */
+ u_int32_t bp_size; /* Read buffer length in bytes. */
+ u_int32_t bp_rlen; /* Read buffer valid data length. */
+ DB_LSN bp_lsn; /* Read buffer first byte LSN. */
+
+ u_int32_t bp_maxrec; /* Max record length in the log file. */
+
+ /* DB_LOGC PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_LOGC *, u_int32_t));
+ int (*get) __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+ int (*version) __P((DB_LOGC *, u_int32_t *, u_int32_t));
+ /* DB_LOGC PUBLIC HANDLE LIST END */
+
+#define DB_LOG_DISK 0x01 /* Log record came from disk. */
+#define DB_LOG_LOCKED 0x02 /* Log region already locked */
+#define DB_LOG_SILENT_ERR 0x04 /* Turn off error messages. */
+ u_int32_t flags;
+};
+
+/* Log statistics structure. */
+struct __db_log_stat { /* SHARED */
+ u_int32_t st_magic; /* Log file magic number. */
+ u_int32_t st_version; /* Log file version number. */
+ int32_t st_mode; /* Log file permissions mode. */
+ u_int32_t st_lg_bsize; /* Log buffer size. */
+ u_int32_t st_lg_size; /* Log file size. */
+ u_int32_t st_wc_bytes; /* Bytes to log since checkpoint. */
+ u_int32_t st_wc_mbytes; /* Megabytes to log since checkpoint. */
+ u_int32_t st_fileid_init; /* Initial allocation for fileids. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_nfileid; /* Current number of fileids. */
+ u_int32_t st_maxnfileid; /* Maximum number of fileids used. */
+ uintmax_t st_record; /* Records entered into the log. */
+ u_int32_t st_w_bytes; /* Bytes to log. */
+ u_int32_t st_w_mbytes; /* Megabytes to log. */
+ uintmax_t st_wcount; /* Total I/O writes to the log. */
+ uintmax_t st_wcount_fill; /* Overflow writes to the log. */
+ uintmax_t st_rcount; /* Total I/O reads from the log. */
+ uintmax_t st_scount; /* Total syncs to the log. */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ u_int32_t st_cur_file; /* Current log file number. */
+ u_int32_t st_cur_offset; /* Current log file offset. */
+ u_int32_t st_disk_file; /* Known on disk log file number. */
+ u_int32_t st_disk_offset; /* Known on disk log file offset. */
+ u_int32_t st_maxcommitperflush; /* Max number of commits in a flush. */
+ u_int32_t st_mincommitperflush; /* Min number of commits in a flush. */
+ roff_t st_regsize; /* Region size. */
+#endif
+};
+
+/*
+ * We need to record the first log record of a transaction.  For
+ * user-defined logging this macro returns, via blsnp, the place to
+ * put that information if it is needed, and otherwise leaves it
+ * unchanged.  We also need to track the last record of the
+ * transaction; llsnp returns the place to put that info.
+ */
+#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) \
+ ((txn)->set_txn_lsnp(txn, blsnp, llsnp))
+
+/*
+ * Definition of the structure which specifies marshalling of log records.
+ */
+typedef enum {
+ LOGREC_Done,
+ LOGREC_ARG,
+ LOGREC_HDR,
+ LOGREC_DATA,
+ LOGREC_DB,
+ LOGREC_DBOP,
+ LOGREC_DBT,
+ LOGREC_LOCKS,
+ LOGREC_OP,
+ LOGREC_PGDBT,
+ LOGREC_PGDDBT,
+ LOGREC_PGLIST,
+ LOGREC_POINTER,
+ LOGREC_TIME
+} log_rec_type_t;
+
+typedef const struct __log_rec_spec {
+ log_rec_type_t type;
+ u_int32_t offset;
+ const char *name;
+ const char fmt[4];
+} DB_LOG_RECSPEC;
+
+/*
+ * Size of a DBT in a log record.
+ */
+#define LOG_DBT_SIZE(dbt) \
+ (sizeof(u_int32_t) + ((dbt) == NULL ? 0 : (dbt)->size))
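+
+/*
+ * E.g., a 10-byte DBT marshals into 14 bytes here: a 4-byte length
+ * word followed by the data; a NULL DBT costs the 4-byte length alone.
+ */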
+
+/*******************************************************
+ * Shared buffer cache (mpool).
+ *******************************************************/
+/* Priority values for DB_MPOOLFILE->{put,set_priority}. */
+typedef enum {
+ DB_PRIORITY_UNCHANGED=0,
+ DB_PRIORITY_VERY_LOW=1,
+ DB_PRIORITY_LOW=2,
+ DB_PRIORITY_DEFAULT=3,
+ DB_PRIORITY_HIGH=4,
+ DB_PRIORITY_VERY_HIGH=5
+} DB_CACHE_PRIORITY;
+
+/* Per-process DB_MPOOLFILE information. */
+struct __db_mpoolfile {
+ DB_FH *fhp; /* Underlying file handle. */
+
+ /*
+ * !!!
+ * The ref, pinref and q fields are protected by the region lock.
+ */
+ u_int32_t ref; /* Reference count. */
+
+ u_int32_t pinref; /* Pinned block reference count. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db_mpoolfile) q;
+ */
+ struct {
+ struct __db_mpoolfile *tqe_next;
+ struct __db_mpoolfile **tqe_prev;
+ } q; /* Linked list of DB_MPOOLFILE's. */
+
+ /*
+ * !!!
+ * The rest of the fields (with the exception of the MP_FLUSH flag)
+ * are not thread-protected, even when they may be modified at any
+ * time by the application. The reason is the DB_MPOOLFILE handle
+ * is single-threaded from the viewpoint of the application, and so
+ * the only fields needing to be thread-protected are those accessed
+ * by checkpoint or sync threads when using DB_MPOOLFILE structures
+ * to flush buffers from the cache.
+ */
+ ENV *env; /* Environment */
+ MPOOLFILE *mfp; /* Underlying MPOOLFILE. */
+
+ u_int32_t clear_len; /* Cleared length on created pages. */
+ u_int8_t /* Unique file ID. */
+ fileid[DB_FILE_ID_LEN];
+ int ftype; /* File type. */
+ int32_t lsn_offset; /* LSN offset in page. */
+ u_int32_t gbytes, bytes; /* Maximum file size. */
+ DBT *pgcookie; /* Byte-string passed to pgin/pgout. */
+ int32_t priority; /* Cache priority. */
+
+ void *addr; /* Address of mmap'd region. */
+ size_t len; /* Length of mmap'd region. */
+
+ u_int32_t config_flags; /* Flags to DB_MPOOLFILE->set_flags. */
+
+ /* DB_MPOOLFILE PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_MPOOLFILE *, u_int32_t));
+ int (*get)
+ __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
+ int (*get_clear_len) __P((DB_MPOOLFILE *, u_int32_t *));
+ int (*get_fileid) __P((DB_MPOOLFILE *, u_int8_t *));
+ int (*get_flags) __P((DB_MPOOLFILE *, u_int32_t *));
+ int (*get_ftype) __P((DB_MPOOLFILE *, int *));
+ int (*get_last_pgno) __P((DB_MPOOLFILE *, db_pgno_t *));
+ int (*get_lsn_offset) __P((DB_MPOOLFILE *, int32_t *));
+ int (*get_maxsize) __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *));
+ int (*get_pgcookie) __P((DB_MPOOLFILE *, DBT *));
+ int (*get_priority) __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
+ int (*open) __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
+ int (*put) __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
+ int (*set_clear_len) __P((DB_MPOOLFILE *, u_int32_t));
+ int (*set_fileid) __P((DB_MPOOLFILE *, u_int8_t *));
+ int (*set_flags) __P((DB_MPOOLFILE *, u_int32_t, int));
+ int (*set_ftype) __P((DB_MPOOLFILE *, int));
+ int (*set_lsn_offset) __P((DB_MPOOLFILE *, int32_t));
+ int (*set_maxsize) __P((DB_MPOOLFILE *, u_int32_t, u_int32_t));
+ int (*set_pgcookie) __P((DB_MPOOLFILE *, DBT *));
+ int (*set_priority) __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY));
+ int (*sync) __P((DB_MPOOLFILE *));
+ /* DB_MPOOLFILE PUBLIC HANDLE LIST END */
+
+ /*
+ * MP_FILEID_SET, MP_OPEN_CALLED and MP_READONLY do not need to be
+ * thread protected because they are initialized before the file is
+ * linked onto the per-process lists, and never modified.
+ *
+ * MP_FLUSH is thread protected because it is potentially read/set by
+ * multiple threads of control.
+ */
+#define MP_FILEID_SET 0x001 /* Application supplied a file ID. */
+#define MP_FLUSH 0x002 /* Was used to flush a buffer. */
+#define MP_FOR_FLUSH 0x004 /* Was opened to flush a buffer. */
+#define MP_MULTIVERSION 0x008 /* Opened for multiversion access. */
+#define MP_OPEN_CALLED 0x010 /* File opened. */
+#define MP_READONLY 0x020 /* File is readonly. */
+#define MP_DUMMY 0x040 /* File is dummy for __memp_fput. */
+ u_int32_t flags;
+};
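+
+/*
+ * A sketch of the page get/put cycle through the method table above
+ * (error handling elided; the variable names are illustrative):
+ *
+ *	db_pgno_t pgno = 1;
+ *	void *addr;
+ *	(void)mpf->get(mpf, &pgno, NULL, 0, &addr);
+ *	... read or modify the page through addr ...
+ *	(void)mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0);
+ */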
+
+/* Mpool statistics structure. */
+struct __db_mpool_stat { /* SHARED */
+ u_int32_t st_gbytes; /* Total cache size: GB. */
+ u_int32_t st_bytes; /* Total cache size: B. */
+ u_int32_t st_ncache; /* Number of cache regions. */
+ u_int32_t st_max_ncache; /* Maximum number of regions. */
+ db_size_t st_mmapsize; /* Maximum file size for mmap. */
+ int32_t st_maxopenfd; /* Maximum number of open fd's. */
+ int32_t st_maxwrite; /* Maximum buffers to write. */
+ db_timeout_t st_maxwrite_sleep; /* Sleep after writing max buffers. */
+ u_int32_t st_pages; /* Total number of pages. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_map; /* Pages from mapped files. */
+ uintmax_t st_cache_hit; /* Pages found in the cache. */
+ uintmax_t st_cache_miss; /* Pages not found in the cache. */
+ uintmax_t st_page_create; /* Pages created in the cache. */
+ uintmax_t st_page_in; /* Pages read in. */
+ uintmax_t st_page_out; /* Pages written out. */
+ uintmax_t st_ro_evict; /* Clean pages forced from the cache. */
+ uintmax_t st_rw_evict; /* Dirty pages forced from the cache. */
+ uintmax_t st_page_trickle; /* Pages written by memp_trickle. */
+ u_int32_t st_page_clean; /* Clean pages. */
+ u_int32_t st_page_dirty; /* Dirty pages. */
+ u_int32_t st_hash_buckets; /* Number of hash buckets. */
+ u_int32_t st_hash_mutexes; /* Number of hash bucket mutexes. */
+ u_int32_t st_pagesize; /* Assumed page size. */
+ u_int32_t st_hash_searches; /* Total hash chain searches. */
+ u_int32_t st_hash_longest; /* Longest hash chain searched. */
+ uintmax_t st_hash_examined; /* Total hash entries searched. */
+ uintmax_t st_hash_nowait; /* Hash lock granted with nowait. */
+ uintmax_t st_hash_wait; /* Hash lock granted after wait. */
+ uintmax_t st_hash_max_nowait; /* Max hash lock granted with nowait. */
+ uintmax_t st_hash_max_wait; /* Max hash lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted with nowait. */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_mvcc_frozen; /* Buffers frozen. */
+ uintmax_t st_mvcc_thawed; /* Buffers thawed. */
+ uintmax_t st_mvcc_freed; /* Frozen buffers freed. */
+ uintmax_t st_alloc; /* Number of page allocations. */
+ uintmax_t st_alloc_buckets; /* Buckets checked during allocation. */
+ uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */
+ uintmax_t st_alloc_pages; /* Pages checked during allocation. */
+ uintmax_t st_alloc_max_pages; /* Max checked during allocation. */
+ uintmax_t st_io_wait; /* Thread waited on buffer I/O. */
+ uintmax_t st_sync_interrupted; /* Number of times sync interrupted. */
+ roff_t st_regsize; /* Region size. */
+ roff_t st_regmax; /* Region max. */
+#endif
+};
+
+/*
+ * Mpool file statistics structure.
+ * The first fields in this structure must mirror the __db_mpool_fstat_int
+ * structure, since content is copied between the two with memcpy.
+ */
+struct __db_mpool_fstat {
+ u_int32_t st_pagesize; /* Page size. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_map; /* Pages from mapped files. */
+ uintmax_t st_cache_hit; /* Pages found in the cache. */
+ uintmax_t st_cache_miss; /* Pages not found in the cache. */
+ uintmax_t st_page_create; /* Pages created in the cache. */
+ uintmax_t st_page_in; /* Pages read in. */
+ uintmax_t st_page_out; /* Pages written out. */
+ uintmax_t st_backup_spins; /* Number of spins during a copy. */
+#endif
+ char *file_name; /* File name. */
+};
+
+/*******************************************************
+ * Transactions and recovery.
+ *******************************************************/
+#define DB_TXNVERSION 1
+
+typedef enum {
+ DB_TXN_ABORT=0, /* Public. */
+ DB_TXN_APPLY=1, /* Public. */
+ DB_TXN_BACKWARD_ROLL=3, /* Public. */
+ DB_TXN_FORWARD_ROLL=4, /* Public. */
+ DB_TXN_OPENFILES=5, /* Internal. */
+ DB_TXN_POPENFILES=6, /* Internal. */
+ DB_TXN_PRINT=7, /* Public. */
+ DB_TXN_LOG_VERIFY=8 /* Internal. */
+} db_recops;
+
+/*
+ * BACKWARD_ALLOC is used during the forward pass to pick up any aborted
+ * allocations for files that were created during the forward pass.
+ * The main difference between _ALLOC and _ROLL is that the entry for
+ * the file may not exist during the rollforward pass.
+ */
+#define DB_UNDO(op) ((op) == DB_TXN_ABORT || (op) == DB_TXN_BACKWARD_ROLL)
+#define DB_REDO(op) ((op) == DB_TXN_FORWARD_ROLL || (op) == DB_TXN_APPLY)
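+
+/*
+ * A recovery-function sketch using these predicates (the LSN tests
+ * are schematic, not lifted from any particular *_rec.c file):
+ *
+ *	if (DB_REDO(op) && the page LSN predates this record)
+ *		... reapply the change and advance the page LSN ...
+ *	else if (DB_UNDO(op) && the page LSN shows the change applied)
+ *		... roll the change back and restore the prior LSN ...
+ */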
+
+struct __db_txn {
+ DB_TXNMGR *mgrp; /* Pointer to transaction manager. */
+ DB_TXN *parent; /* Pointer to transaction's parent. */
+ DB_THREAD_INFO *thread_info; /* Pointer to thread information. */
+
+ u_int32_t txnid; /* Unique transaction id. */
+ char *name; /* Transaction name. */
+ DB_LOCKER *locker; /* Locker for this txn. */
+
+ void *td; /* Detail structure within region. */
+ db_timeout_t lock_timeout; /* Timeout for locks for this txn. */
+ void *txn_list; /* Undo information for parent. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db_txn) links;
+ */
+ struct {
+ struct __db_txn *tqe_next;
+ struct __db_txn **tqe_prev;
+ } links; /* Links transactions off manager. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from shqueue.h.
+ * SH_TAILQ_ENTRY xa_links;
+ * These links link together transactions that are active in
+ * the same thread of control.
+ */
+ struct {
+ db_ssize_t stqe_next;
+ db_ssize_t stqe_prev;
+ } xa_links; /* Links XA transactions. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__kids, __db_txn) kids;
+ */
+ struct __kids {
+ struct __db_txn *tqh_first;
+ struct __db_txn **tqh_last;
+ } kids;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__events, __txn_event) events;
+ */
+ struct {
+ struct __txn_event *tqh_first;
+ struct __txn_event **tqh_last;
+ } events; /* Links deferred events. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * STAILQ_HEAD(__logrec, __txn_logrec) logs;
+ */
+ struct {
+ struct __txn_logrec *stqh_first;
+ struct __txn_logrec **stqh_last;
+ } logs; /* Links in memory log records. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db_txn) klinks;
+ */
+ struct {
+ struct __db_txn *tqe_next;
+ struct __db_txn **tqe_prev;
+ } klinks; /* Links of children in parent. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__my_cursors, __dbc) my_cursors;
+ */
+ struct __my_cursors {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } my_cursors;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__femfs, MPOOLFILE) femfs;
+ *
+ * These are DBs involved in file extension in this transaction.
+ */
+ struct __femfs {
+ DB *tqh_first;
+ DB **tqh_last;
+ } femfs;
+
+ DB_TXN_TOKEN *token_buffer; /* User's commit token buffer. */
+ void *api_internal; /* C++ API private. */
+ void *xml_internal; /* XML API private. */
+
+ u_int32_t cursors; /* Number of cursors open for txn */
+
+ /* DB_TXN PUBLIC HANDLE LIST BEGIN */
+ int (*abort) __P((DB_TXN *));
+ int (*commit) __P((DB_TXN *, u_int32_t));
+ int (*discard) __P((DB_TXN *, u_int32_t));
+ int (*get_name) __P((DB_TXN *, const char **));
+ int (*get_priority) __P((DB_TXN *, u_int32_t *));
+ u_int32_t (*id) __P((DB_TXN *));
+ int (*prepare) __P((DB_TXN *, u_int8_t *));
+ int (*set_commit_token) __P((DB_TXN *, DB_TXN_TOKEN *));
+ int (*set_name) __P((DB_TXN *, const char *));
+ int (*set_priority) __P((DB_TXN *, u_int32_t));
+ int (*set_timeout) __P((DB_TXN *, db_timeout_t, u_int32_t));
+ /* DB_TXN PUBLIC HANDLE LIST END */
+
+ /* DB_TXN PRIVATE HANDLE LIST BEGIN */
+ void (*set_txn_lsnp) __P((DB_TXN *txn, DB_LSN **, DB_LSN **));
+ /* DB_TXN PRIVATE HANDLE LIST END */
+
+#define TXN_XA_THREAD_NOTA 0
+#define TXN_XA_THREAD_ASSOCIATED 1
+#define TXN_XA_THREAD_SUSPENDED 2
+#define TXN_XA_THREAD_UNASSOCIATED 3
+ u_int32_t xa_thr_status;
+
+#define TXN_CHILDCOMMIT 0x00001 /* Txn has committed. */
+#define TXN_COMPENSATE 0x00002 /* Compensating transaction. */
+#define TXN_DEADLOCK 0x00004 /* Txn has deadlocked. */
+#define TXN_FAMILY 0x00008 /* Cursors/children are independent. */
+#define TXN_IGNORE_LEASE 0x00010 /* Skip lease check at commit time. */
+#define TXN_INFAMILY 0x00020 /* Part of a transaction family. */
+#define TXN_LOCKTIMEOUT 0x00040 /* Txn has a lock timeout. */
+#define TXN_MALLOC 0x00080 /* Structure allocated by TXN system. */
+#define TXN_NOSYNC 0x00100 /* Do not sync on prepare and commit. */
+#define TXN_NOWAIT 0x00200 /* Do not wait on locks. */
+#define TXN_PRIVATE 0x00400 /* Txn owned by cursor. */
+#define TXN_READONLY 0x00800 /* CDS group handle. */
+#define TXN_READ_COMMITTED 0x01000 /* Txn has degree 2 isolation. */
+#define TXN_READ_UNCOMMITTED 0x02000 /* Txn has degree 1 isolation. */
+#define TXN_RESTORED 0x04000 /* Txn has been restored. */
+#define TXN_SNAPSHOT 0x08000 /* Snapshot Isolation. */
+#define TXN_SYNC 0x10000 /* Write and sync on prepare/commit. */
+#define TXN_WRITE_NOSYNC 0x20000 /* Write only on prepare/commit. */
+#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */
+ u_int32_t flags;
+};
+
+#define TXN_SYNC_FLAGS (TXN_SYNC | TXN_NOSYNC | TXN_WRITE_NOSYNC)
+
+/*
+ * Structure used for two phase commit interface.
+ * We set the size of our global transaction id (gid) to be 128 in order
+ * to match that defined by the XA X/Open standard.
+ */
+#define DB_GID_SIZE 128
+struct __db_preplist {
+ DB_TXN *txn;
+ u_int8_t gid[DB_GID_SIZE];
+};
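+
+/*
+ * After recovery, prepared-but-unresolved transactions are retrieved
+ * with DB_ENV->txn_recover and resolved one by one.  A sketch (the
+ * array size is illustrative):
+ *
+ *	DB_PREPLIST prep[32];
+ *	long i, nprep;
+ *	(void)dbenv->txn_recover(dbenv, prep, 32, &nprep, DB_FIRST);
+ *	for (i = 0; i < nprep; i++)
+ *		(void)prep[i].txn->abort(prep[i].txn);
+ */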
+
+/* Transaction statistics structure. */
+struct __db_txn_active {
+ u_int32_t txnid; /* Transaction ID */
+ u_int32_t parentid; /* Transaction ID of parent */
+ pid_t pid; /* Process owning txn ID */
+ db_threadid_t tid; /* Thread owning txn ID */
+
+ DB_LSN lsn; /* LSN when transaction began */
+
+ DB_LSN read_lsn; /* Read LSN for MVCC */
+ u_int32_t mvcc_ref; /* MVCC reference count */
+
+ u_int32_t priority; /* Deadlock resolution priority */
+
+#define TXN_ABORTED 1
+#define TXN_COMMITTED 2
+#define TXN_NEED_ABORT 3
+#define TXN_PREPARED 4
+#define TXN_RUNNING 5
+ u_int32_t status; /* Status of the transaction */
+
+#define TXN_XA_ACTIVE 1
+#define TXN_XA_DEADLOCKED 2
+#define TXN_XA_IDLE 3
+#define TXN_XA_PREPARED 4
+#define TXN_XA_ROLLEDBACK 5
+ u_int32_t xa_status; /* XA status */
+
+ u_int8_t gid[DB_GID_SIZE]; /* Global transaction ID */
+ char name[51]; /* 50 bytes of name, nul termination */
+};
+
+struct __db_txn_stat {
+ u_int32_t st_nrestores; /* number of restored transactions
+ after recovery. */
+#ifndef __TEST_DB_NO_STATISTICS
+ DB_LSN st_last_ckp; /* lsn of the last checkpoint */
+ time_t st_time_ckp; /* time of last checkpoint */
+ u_int32_t st_last_txnid; /* last transaction id given out */
+ u_int32_t st_inittxns; /* initial txns allocated */
+ u_int32_t st_maxtxns; /* maximum txns possible */
+ uintmax_t st_naborts; /* number of aborted transactions */
+ uintmax_t st_nbegins; /* number of begun transactions */
+ uintmax_t st_ncommits; /* number of committed transactions */
+ u_int32_t st_nactive; /* number of active transactions */
+ u_int32_t st_nsnapshot; /* number of snapshot transactions */
+ u_int32_t st_maxnactive; /* maximum active transactions */
+ u_int32_t st_maxnsnapshot; /* maximum snapshot transactions */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ roff_t st_regsize; /* Region size. */
+ DB_TXN_ACTIVE *st_txnarray; /* array of active transactions */
+#endif
+};
+
+#define DB_TXN_TOKEN_SIZE 20
+struct __db_txn_token {
+ u_int8_t buf[DB_TXN_TOKEN_SIZE];
+};
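+
+/*
+ * Editorial sketch (assumed usage, not in the original header): a token is
+ * filled in at commit time when registered with DB_TXN->set_commit_token(),
+ * and can later be handed to DB_ENV->txn_applied() on a replica to ask
+ * whether that transaction has been applied there:
+ *
+ *	DB_TXN_TOKEN token;
+ *
+ *	txn->set_commit_token(txn, &token);
+ *	txn->commit(txn, 0);
+ *	...ship the token to the replica...
+ *	ret = dbenv->txn_applied(dbenv, &token, 0, 0);
+ */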
+
+/*******************************************************
+ * Replication.
+ *******************************************************/
+/* Special, out-of-band environment IDs. */
+#define DB_EID_BROADCAST -1
+#define DB_EID_INVALID -2
+#define DB_EID_MASTER -3
+
+#define DB_REP_DEFAULT_PRIORITY 100
+
+/* Acknowledgement policies; 0 reserved as OOB. */
+#define DB_REPMGR_ACKS_ALL 1
+#define DB_REPMGR_ACKS_ALL_AVAILABLE 2
+#define DB_REPMGR_ACKS_ALL_PEERS 3
+#define DB_REPMGR_ACKS_NONE 4
+#define DB_REPMGR_ACKS_ONE 5
+#define DB_REPMGR_ACKS_ONE_PEER 6
+#define DB_REPMGR_ACKS_QUORUM 7
+
+/* Replication timeout configuration values. */
+#define DB_REP_ACK_TIMEOUT 1 /* RepMgr acknowledgements. */
+#define DB_REP_CHECKPOINT_DELAY 2 /* Master checkpoint delay. */
+#define DB_REP_CONNECTION_RETRY 3 /* RepMgr connections. */
+#define DB_REP_ELECTION_RETRY 4 /* RepMgr elect retries. */
+#define DB_REP_ELECTION_TIMEOUT 5 /* Rep normal elections. */
+#define DB_REP_FULL_ELECTION_TIMEOUT 6 /* Rep full elections. */
+#define DB_REP_HEARTBEAT_MONITOR 7 /* RepMgr client HB monitor. */
+#define DB_REP_HEARTBEAT_SEND 8 /* RepMgr master send freq. */
+#define DB_REP_LEASE_TIMEOUT 9 /* Master leases. */
+
+/*
+ * Event notification types. (Tcl testing interface currently assumes there are
+ * no more than 32 of these.)
+ */
+#define DB_EVENT_PANIC 0
+#define DB_EVENT_REG_ALIVE 1
+#define DB_EVENT_REG_PANIC 2
+#define DB_EVENT_REP_CLIENT 3
+#define DB_EVENT_REP_CONNECT_BROKEN 4
+#define DB_EVENT_REP_CONNECT_ESTD 5
+#define DB_EVENT_REP_CONNECT_TRY_FAILED 6
+#define DB_EVENT_REP_DUPMASTER 7
+#define DB_EVENT_REP_ELECTED 8
+#define DB_EVENT_REP_ELECTION_FAILED 9
+#define DB_EVENT_REP_INIT_DONE 10
+#define DB_EVENT_REP_JOIN_FAILURE 11
+#define DB_EVENT_REP_LOCAL_SITE_REMOVED 12
+#define DB_EVENT_REP_MASTER 13
+#define DB_EVENT_REP_MASTER_FAILURE 14
+#define DB_EVENT_REP_NEWMASTER 15
+#define DB_EVENT_REP_PERM_FAILED 16
+#define DB_EVENT_REP_SITE_ADDED 17
+#define DB_EVENT_REP_SITE_REMOVED 18
+#define DB_EVENT_REP_STARTUPDONE 19
+#define DB_EVENT_REP_WOULD_ROLLBACK 20 /* Undocumented; C API only. */
+#define DB_EVENT_WRITE_FAILED 21
+#define DB_EVENT_NO_SUCH_EVENT 0xffffffff /* OOB sentinel value */
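+
+/*
+ * Editorial sketch (not part of the original header): events are delivered
+ * asynchronously to the callback registered with DB_ENV->set_event_notify():
+ *
+ *	void event_cb(DB_ENV *dbenv, u_int32_t event, void *info)
+ *	{
+ *		switch (event) {
+ *		case DB_EVENT_REP_MASTER:
+ *			...this site is now master...
+ *			break;
+ *		case DB_EVENT_PANIC:
+ *			...run recovery...
+ *			break;
+ *		}
+ *	}
+ *
+ *	(void)dbenv->set_event_notify(dbenv, event_cb);
+ */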
+
+/* Replication Manager site status. */
+struct __db_repmgr_site {
+ int eid;
+ char *host;
+ u_int port;
+
+#define DB_REPMGR_CONNECTED 1
+#define DB_REPMGR_DISCONNECTED 2
+ u_int32_t status;
+
+#define DB_REPMGR_ISPEER 0x01
+ u_int32_t flags;
+};
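+
+/*
+ * Editorial sketch (assumed usage, not in the original header): a snapshot
+ * of the group's sites is returned as an allocated array of these
+ * structures by DB_ENV->repmgr_site_list():
+ *
+ *	DB_REPMGR_SITE *list;
+ *	u_int count, i;
+ *
+ *	if (dbenv->repmgr_site_list(dbenv, &count, &list) == 0) {
+ *		for (i = 0; i < count; i++)
+ *			...list[i].host, list[i].port, list[i].status...
+ *		free(list);
+ *	}
+ */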
+
+/* Replication statistics. */
+struct __db_rep_stat { /* SHARED */
+ /* !!!
+ * Many replication statistics fields cannot be protected by a mutex
+ * without an unacceptable performance penalty, since most message
+ * processing is done without the need to hold a region-wide lock.
+ * Fields whose comments end with a '+' may be updated without holding
+ * the replication or log mutexes (as appropriate), and thus may be
+ * off somewhat (or, on unreasonable architectures under unlucky
+ * circumstances, garbaged).
+ */
+ u_int32_t st_startup_complete; /* Site completed client sync-up. */
+#ifndef __TEST_DB_NO_STATISTICS
+ uintmax_t st_log_queued; /* Log records currently queued.+ */
+ u_int32_t st_status; /* Current replication status. */
+ DB_LSN st_next_lsn; /* Next LSN to use or expect. */
+ DB_LSN st_waiting_lsn; /* LSN we're awaiting, if any. */
+ DB_LSN st_max_perm_lsn; /* Maximum permanent LSN. */
+ db_pgno_t st_next_pg; /* Next pg we expect. */
+ db_pgno_t st_waiting_pg; /* pg we're awaiting, if any. */
+
+ u_int32_t st_dupmasters; /* # of times a duplicate master
+ condition was detected.+ */
+ db_ssize_t st_env_id; /* Current environment ID. */
+ u_int32_t st_env_priority; /* Current environment priority. */
+ uintmax_t st_bulk_fills; /* Bulk buffer fills. */
+ uintmax_t st_bulk_overflows; /* Bulk buffer overflows. */
+ uintmax_t st_bulk_records; /* Bulk records stored. */
+ uintmax_t st_bulk_transfers; /* Transfers of bulk buffers. */
+ uintmax_t st_client_rerequests;/* Number of forced rerequests. */
+ uintmax_t st_client_svc_req; /* Number of client service requests
+ received by this client. */
+ uintmax_t st_client_svc_miss; /* Number of client service requests
+ missing on this client. */
+ u_int32_t st_gen; /* Current generation number. */
+ u_int32_t st_egen; /* Current election gen number. */
+ uintmax_t st_lease_chk; /* Lease validity checks. */
+ uintmax_t st_lease_chk_misses; /* Lease checks invalid. */
+ uintmax_t st_lease_chk_refresh; /* Lease refresh attempts. */
+ uintmax_t st_lease_sends; /* Lease messages sent live. */
+
+ uintmax_t st_log_duplicated; /* Log records received multiply.+ */
+ uintmax_t st_log_queued_max; /* Max. log records queued at once.+ */
+ uintmax_t st_log_queued_total; /* Total # of log recs. ever queued.+ */
+ uintmax_t st_log_records; /* Log records received and put.+ */
+ uintmax_t st_log_requested; /* Log recs. missed and requested.+ */
+ db_ssize_t st_master; /* Env. ID of the current master. */
+ uintmax_t st_master_changes; /* # of times we've switched masters. */
+ uintmax_t st_msgs_badgen; /* Messages with a bad generation #.+ */
+ uintmax_t st_msgs_processed; /* Messages received and processed.+ */
+ uintmax_t st_msgs_recover; /* Messages ignored because this site
+ was a client in recovery.+ */
+ uintmax_t st_msgs_send_failures;/* # of failed message sends.+ */
+ uintmax_t st_msgs_sent; /* # of successful message sends.+ */
+ uintmax_t st_newsites; /* # of NEWSITE msgs. received.+ */
+ u_int32_t st_nsites; /* Current number of sites we will
+ assume during elections. */
+ uintmax_t st_nthrottles; /* # of times we were throttled. */
+ uintmax_t st_outdated; /* # of times we detected and returned
+ an OUTDATED condition.+ */
+ uintmax_t st_pg_duplicated; /* Pages received multiply.+ */
+ uintmax_t st_pg_records; /* Pages received and stored.+ */
+ uintmax_t st_pg_requested; /* Pages missed and requested.+ */
+ uintmax_t st_txns_applied; /* # of transactions applied.+ */
+ uintmax_t st_startsync_delayed;/* # of STARTSYNC msgs delayed.+ */
+
+ /* Elections generally. */
+ uintmax_t st_elections; /* # of elections held.+ */
+ uintmax_t st_elections_won; /* # of elections won by this site.+ */
+
+ /* Statistics about an in-progress election. */
+ db_ssize_t st_election_cur_winner; /* Current front-runner. */
+ u_int32_t st_election_gen; /* Election generation number. */
+ u_int32_t st_election_datagen; /* Election data generation number. */
+ DB_LSN st_election_lsn; /* Max. LSN of current winner. */
+ u_int32_t st_election_nsites; /* # of "registered voters". */
+ u_int32_t st_election_nvotes; /* # of "registered voters" needed. */
+ u_int32_t st_election_priority; /* Current election priority. */
+ int32_t st_election_status; /* Current election status. */
+ u_int32_t st_election_tiebreaker;/* Election tiebreaker value. */
+ u_int32_t st_election_votes; /* Votes received in this round. */
+ u_int32_t st_election_sec; /* Last election time seconds. */
+ u_int32_t st_election_usec; /* Last election time useconds. */
+ u_int32_t st_max_lease_sec; /* Maximum lease timestamp seconds. */
+ u_int32_t st_max_lease_usec; /* Maximum lease timestamp useconds. */
+
+ /* Undocumented statistics only used by the test system. */
+#ifdef CONFIG_TEST
+ u_int32_t st_filefail_cleanups; /* # of FILE_FAIL cleanups done. */
+#endif
+#endif
+};
+
+/* Replication Manager statistics. */
+struct __db_repmgr_stat { /* SHARED */
+ uintmax_t st_perm_failed; /* # of insufficiently ack'ed msgs. */
+ uintmax_t st_msgs_queued; /* # msgs queued for network delay. */
+ uintmax_t st_msgs_dropped; /* # msgs discarded due to excessive
+ queue length. */
+ uintmax_t st_connection_drop; /* Existing connections dropped. */
+ uintmax_t st_connect_fail; /* Failed new connection attempts. */
+ uintmax_t st_elect_threads; /* # of active election threads. */
+ uintmax_t st_max_elect_threads; /* Max concurrent e-threads ever. */
+};
+
+/* Replication Manager connection error. */
+struct __db_repmgr_conn_err {
+ int eid; /* Replication Environment ID. */
+ int error; /* System networking error code. */
+};
+
+/*******************************************************
+ * Sequences.
+ *******************************************************/
+/*
+ * The storage record for a sequence.
+ */
+struct __db_seq_record {
+ u_int32_t seq_version; /* Version size/number. */
+ u_int32_t flags; /* DB_SEQ_XXX Flags. */
+ db_seq_t seq_value; /* Current value. */
+ db_seq_t seq_max; /* Max permitted. */
+ db_seq_t seq_min; /* Min permitted. */
+};
+
+/*
+ * Handle for a sequence object.
+ */
+struct __db_sequence {
+ DB *seq_dbp; /* DB handle for this sequence. */
+ db_mutex_t mtx_seq; /* Mutex if sequence is threaded. */
+ DB_SEQ_RECORD *seq_rp; /* Pointer to current data. */
+ DB_SEQ_RECORD seq_record; /* Data from DB_SEQUENCE. */
+ int32_t seq_cache_size; /* Number of values cached. */
+ db_seq_t seq_last_value; /* Last value cached. */
+ db_seq_t seq_prev_value; /* Last value returned. */
+ DBT seq_key; /* DBT pointing to sequence key. */
+ DBT seq_data; /* DBT pointing to seq_record. */
+
+ /* API-private structure: used by C++ and Java. */
+ void *api_internal;
+
+ /* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_SEQUENCE *, u_int32_t));
+ int (*get) __P((DB_SEQUENCE *,
+ DB_TXN *, int32_t, db_seq_t *, u_int32_t));
+ int (*get_cachesize) __P((DB_SEQUENCE *, int32_t *));
+ int (*get_db) __P((DB_SEQUENCE *, DB **));
+ int (*get_flags) __P((DB_SEQUENCE *, u_int32_t *));
+ int (*get_key) __P((DB_SEQUENCE *, DBT *));
+ int (*get_range) __P((DB_SEQUENCE *,
+ db_seq_t *, db_seq_t *));
+ int (*initial_value) __P((DB_SEQUENCE *, db_seq_t));
+ int (*open) __P((DB_SEQUENCE *,
+ DB_TXN *, DBT *, u_int32_t));
+ int (*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
+ int (*set_cachesize) __P((DB_SEQUENCE *, int32_t));
+ int (*set_flags) __P((DB_SEQUENCE *, u_int32_t));
+ int (*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
+ int (*stat) __P((DB_SEQUENCE *,
+ DB_SEQUENCE_STAT **, u_int32_t));
+ int (*stat_print) __P((DB_SEQUENCE *, u_int32_t));
+ /* DB_SEQUENCE PUBLIC HANDLE LIST END */
+};
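+
+/*
+ * Editorial sketch (not part of the original header): a sequence handle is
+ * created against an open DB and stored under an application-chosen key;
+ * the key DBT below is assumed to be initialized by the caller:
+ *
+ *	DB_SEQUENCE *seq;
+ *	db_seq_t value;
+ *
+ *	if (db_sequence_create(&seq, dbp, 0) == 0 &&
+ *	    seq->open(seq, NULL, &key, DB_CREATE) == 0)
+ *		(void)seq->get(seq, NULL, 1, &value, 0);
+ */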
+
+struct __db_seq_stat { /* SHARED */
+ uintmax_t st_wait; /* Sequence lock granted after wait. */
+ uintmax_t st_nowait; /* Sequence lock granted w/o wait. */
+ db_seq_t st_current; /* Current value in db. */
+ db_seq_t st_value; /* Current cached value. */
+ db_seq_t st_last_value; /* Last cached value. */
+ db_seq_t st_min; /* Minimum value. */
+ db_seq_t st_max; /* Maximum value. */
+ int32_t st_cache_size; /* Cache size. */
+ u_int32_t st_flags; /* Flag value. */
+};
+
+/*******************************************************
+ * Access methods.
+ *******************************************************/
+/*
+ * Any new access method type must retain the original numbering: the
+ * type is written into log records, so existing values must be maintained.
+ */
+typedef enum {
+ DB_BTREE=1,
+ DB_HASH=2,
+ DB_HEAP=6,
+ DB_RECNO=3,
+ DB_QUEUE=4,
+ DB_UNKNOWN=5 /* Figure it out on open. */
+} DBTYPE;
+
+#define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */
+
+#define DB_BTREEVERSION 9 /* Current btree version. */
+#define DB_BTREEOLDVER 8 /* Oldest btree version supported. */
+#define DB_BTREEMAGIC 0x053162
+
+#define DB_HASHVERSION 9 /* Current hash version. */
+#define DB_HASHOLDVER 7 /* Oldest hash version supported. */
+#define DB_HASHMAGIC 0x061561
+
+#define DB_HEAPVERSION 1 /* Current heap version. */
+#define DB_HEAPOLDVER 1 /* Oldest heap version supported. */
+#define DB_HEAPMAGIC 0x074582
+
+#define DB_QAMVERSION 4 /* Current queue version. */
+#define DB_QAMOLDVER 3 /* Oldest queue version supported. */
+#define DB_QAMMAGIC 0x042253
+
+#define DB_SEQUENCE_VERSION 2 /* Current sequence version. */
+#define DB_SEQUENCE_OLDVER 1 /* Oldest sequence version supported. */
+
+/*
+ * DB access method and cursor operation values. Each value is an operation
+ * code to which additional bit flags are added.
+ */
+#define DB_AFTER 1 /* Dbc.put */
+#define DB_APPEND 2 /* Db.put */
+#define DB_BEFORE 3 /* Dbc.put */
+#define DB_CONSUME 4 /* Db.get */
+#define DB_CONSUME_WAIT 5 /* Db.get */
+#define DB_CURRENT 6 /* Dbc.get, Dbc.put, DbLogc.get */
+#define DB_FIRST 7 /* Dbc.get, DbLogc->get */
+#define DB_GET_BOTH 8 /* Db.get, Dbc.get */
+#define DB_GET_BOTHC 9 /* Dbc.get (internal) */
+#define DB_GET_BOTH_RANGE 10 /* Db.get, Dbc.get */
+#define DB_GET_RECNO 11 /* Dbc.get */
+#define DB_JOIN_ITEM 12 /* Dbc.get; don't do primary lookup */
+#define DB_KEYFIRST 13 /* Dbc.put */
+#define DB_KEYLAST 14 /* Dbc.put */
+#define DB_LAST 15 /* Dbc.get, DbLogc->get */
+#define DB_NEXT 16 /* Dbc.get, DbLogc->get */
+#define DB_NEXT_DUP 17 /* Dbc.get */
+#define DB_NEXT_NODUP 18 /* Dbc.get */
+#define DB_NODUPDATA 19 /* Db.put, Dbc.put */
+#define DB_NOOVERWRITE 20 /* Db.put */
+#define DB_OVERWRITE_DUP 21 /* Dbc.put, Db.put; no DB_KEYEXIST */
+#define DB_POSITION 22 /* Dbc.dup */
+#define DB_PREV 23 /* Dbc.get, DbLogc->get */
+#define DB_PREV_DUP 24 /* Dbc.get */
+#define DB_PREV_NODUP 25 /* Dbc.get */
+#define DB_SET 26 /* Dbc.get, DbLogc->get */
+#define DB_SET_RANGE 27 /* Dbc.get */
+#define DB_SET_RECNO 28 /* Db.get, Dbc.get */
+#define DB_UPDATE_SECONDARY 29 /* Dbc.get, Dbc.del (internal) */
+#define DB_SET_LTE 30 /* Dbc.get (internal) */
+#define DB_GET_BOTH_LTE 31 /* Dbc.get (internal) */
+
+/* This has to change when the max opcode hits 255. */
+#define DB_OPFLAGS_MASK 0x000000ff /* Mask for operations flags. */
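+
+/*
+ * Editorial note (illustrative, not in the original header): the low byte
+ * carries the operation code and the remaining bits carry modifiers such
+ * as DB_RMW, so dispatch code extracts the opcode with the mask:
+ *
+ *	switch (flags & DB_OPFLAGS_MASK) {
+ *	case DB_SET:
+ *		...
+ *	}
+ */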
+
+/*
+ * DB (user visible) error return codes.
+ *
+ * !!!
+ * We don't want our error returns to conflict with other packages where
+ * possible, so pick a base error value that's hopefully not common. We
+ * document that we own the error name space from -30,800 to -30,999.
+ */
+/* DB (public) error return codes. */
+#define DB_BUFFER_SMALL (-30999)/* User memory too small for return. */
+#define DB_DONOTINDEX (-30998)/* "Null" return from 2ndary callbk. */
+#define DB_FOREIGN_CONFLICT (-30997)/* A foreign db constraint triggered. */
+#define DB_HEAP_FULL (-30996)/* No free space in a heap file. */
+#define DB_KEYEMPTY (-30995)/* Key/data deleted or never created. */
+#define DB_KEYEXIST (-30994)/* The key/data pair already exists. */
+#define DB_LOCK_DEADLOCK (-30993)/* Deadlock. */
+#define DB_LOCK_NOTGRANTED (-30992)/* Lock unavailable. */
+#define DB_LOG_BUFFER_FULL (-30991)/* In-memory log buffer full. */
+#define DB_LOG_VERIFY_BAD (-30990)/* Log verification failed. */
+#define DB_NOSERVER (-30989)/* Server panic return. */
+#define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */
+#define DB_OLD_VERSION (-30987)/* Out-of-date version. */
+#define DB_PAGE_NOTFOUND (-30986)/* Requested page not found. */
+#define DB_REP_DUPMASTER (-30985)/* There are two masters. */
+#define DB_REP_HANDLE_DEAD (-30984)/* Rolled back a commit. */
+#define DB_REP_HOLDELECTION (-30983)/* Time to hold an election. */
+#define DB_REP_IGNORE (-30982)/* This msg should be ignored.*/
+#define DB_REP_ISPERM (-30981)/* Cached perm record now written. */
+#define DB_REP_JOIN_FAILURE (-30980)/* Unable to join replication group. */
+#define DB_REP_LEASE_EXPIRED (-30979)/* Master lease has expired. */
+#define DB_REP_LOCKOUT (-30978)/* API/Replication lockout now. */
+#define DB_REP_NEWSITE (-30977)/* New site entered system. */
+#define DB_REP_NOTPERM (-30976)/* Permanent log record not written. */
+#define DB_REP_UNAVAIL (-30975)/* Site cannot currently be reached. */
+#define DB_REP_WOULDROLLBACK (-30974)/* UNDOC: rollback inhibited by app. */
+#define DB_RUNRECOVERY (-30973)/* Panic return. */
+#define DB_SECONDARY_BAD (-30972)/* Secondary index corrupt. */
+#define DB_TIMEOUT (-30971)/* Timed out on read consistency. */
+#define DB_VERIFY_BAD (-30970)/* Verify failed; bad format. */
+#define DB_VERSION_MISMATCH (-30969)/* Environment version mismatch. */
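+
+/*
+ * Editorial sketch (not part of the original header): these codes come back
+ * as ordinary int returns, distinct from errno values by their range, e.g.:
+ *
+ *	switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
+ *	case 0:
+ *		break;
+ *	case DB_NOTFOUND:
+ *		...key absent...
+ *		break;
+ *	default:
+ *		dbp->err(dbp, ret, "DB->get");
+ *	}
+ */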
+
+/* DB (private) error return codes. */
+#define DB_ALREADY_ABORTED (-30899)
+#define DB_CHKSUM_FAIL (-30898)/* Checksum failed. */
+#define DB_DELETED (-30897)/* Recovery file marked deleted. */
+#define DB_EVENT_NOT_HANDLED (-30896)/* Forward event to application. */
+#define DB_NEEDSPLIT (-30895)/* Page needs to be split. */
+#define DB_REP_BULKOVF (-30894)/* Rep bulk buffer overflow. */
+#define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */
+#define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */
+#define DB_REP_PAGEDONE (-30891)/* This page was already done. */
+#define DB_SURPRISE_KID (-30890)/* Child commit where parent
+ didn't know it was a parent. */
+#define DB_SWAPBYTES (-30889)/* Database needs byte swapping. */
+#define DB_TXN_CKP (-30888)/* Encountered ckp record in log. */
+#define DB_VERIFY_FATAL (-30887)/* DB->verify cannot proceed. */
+
+/* Database handle. */
+struct __db {
+ /*******************************************************
+ * Public: owned by the application.
+ *******************************************************/
+ u_int32_t pgsize; /* Database logical page size. */
+ DB_CACHE_PRIORITY priority; /* Database priority in cache. */
+
+ /* Callbacks. */
+ int (*db_append_recno) __P((DB *, DBT *, db_recno_t));
+ void (*db_feedback) __P((DB *, int, int));
+ int (*dup_compare) __P((DB *, const DBT *, const DBT *));
+
+ void *app_private; /* Application-private handle. */
+
+ /*******************************************************
+ * Private: owned by DB.
+ *******************************************************/
+ DB_ENV *dbenv; /* Backing public environment. */
+ ENV *env; /* Backing private environment. */
+
+ DBTYPE type; /* DB access method type. */
+
+ DB_MPOOLFILE *mpf; /* Backing buffer pool. */
+
+ db_mutex_t mutex; /* Synchronization for free threading */
+
+ char *fname, *dname; /* File/database passed to DB->open. */
+ const char *dirname; /* Directory of DB file. */
+ u_int32_t open_flags; /* Flags passed to DB->open. */
+
+ u_int8_t fileid[DB_FILE_ID_LEN];/* File's unique ID for locking. */
+
+ u_int32_t adj_fileid; /* File's unique ID for curs. adj. */
+
+#define DB_LOGFILEID_INVALID -1
+ FNAME *log_filename; /* File's naming info for logging. */
+
+ db_pgno_t meta_pgno; /* Meta page number */
+ DB_LOCKER *locker; /* Locker for handle locking. */
+ DB_LOCKER *cur_locker; /* Current handle lock holder. */
+ DB_TXN *cur_txn; /* Opening transaction. */
+ DB_LOCKER *associate_locker; /* Locker for DB->associate call. */
+ DB_LOCK handle_lock; /* Lock held on this handle. */
+
+ time_t timestamp; /* Handle timestamp for replication. */
+ u_int32_t fid_gen; /* Rep generation number for fids. */
+
+ /*
+ * Returned data memory for DB->get() and friends.
+ */
+ DBT my_rskey; /* Secondary key. */
+ DBT my_rkey; /* [Primary] key. */
+ DBT my_rdata; /* Data. */
+
+ /*
+ * !!!
+ * Some applications use DB but implement their own locking outside of
+ * DB. If they're using fcntl(2) locking on the underlying database
+ * file, and we open and close a file descriptor for that file, we will
+ * discard their locks. The DB_FCNTL_LOCKING flag to DB->open is an
+ * undocumented interface to support this usage which leaves any file
+ * descriptors we open until DB->close. This will only work with the
+ * DB->open interface and simple caches, e.g., creating a transaction
+ * thread may open/close file descriptors this flag doesn't protect.
+ * Locking with fcntl(2) on a file that you don't own is a very, very
+ * unsafe thing to do. 'Nuff said.
+ */
+ DB_FH *saved_open_fhp; /* Saved file handle. */
+
+ /*
+ * Linked list of DBP's, linked from the ENV, used to keep track
+ * of all open db handles for cursor adjustment.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db) dblistlinks;
+ */
+ struct {
+ struct __db *tqe_next;
+ struct __db **tqe_prev;
+ } dblistlinks;
+
+ /*
+ * Cursor queues.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__cq_fq, __dbc) free_queue;
+ * TAILQ_HEAD(__cq_aq, __dbc) active_queue;
+ * TAILQ_HEAD(__cq_jq, __dbc) join_queue;
+ */
+ struct __cq_fq {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } free_queue;
+ struct __cq_aq {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } active_queue;
+ struct __cq_jq {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } join_queue;
+
+ /*
+ * Secondary index support.
+ *
+ * Linked list of secondary indices -- set in the primary.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_HEAD(s_secondaries, __db);
+ */
+ struct {
+ struct __db *lh_first;
+ } s_secondaries;
+
+ /*
+ * List entries for secondaries, and reference count of how many
+ * threads are updating this secondary (see Dbc.put).
+ *
+ * !!!
+ * Note that these are synchronized by the primary's mutex, but
+ * filled in in the secondaries.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_ENTRY(__db) s_links;
+ */
+ struct {
+ struct __db *le_next;
+ struct __db **le_prev;
+ } s_links;
+ u_int32_t s_refcnt;
+
+ /* Secondary callback and free functions -- set in the secondary. */
+ int (*s_callback) __P((DB *, const DBT *, const DBT *, DBT *));
+
+ /* Reference to primary -- set in the secondary. */
+ DB *s_primary;
+
+#define DB_ASSOC_IMMUTABLE_KEY 0x00000001 /* Secondary key is immutable. */
+#define DB_ASSOC_CREATE 0x00000002 /* Secondary db populated on open. */
+
+ /* Flags passed to associate -- set in the secondary. */
+ u_int32_t s_assoc_flags;
+
+ /*
+ * Foreign key support.
+ *
+ * Linked list of primary dbs -- set in the foreign db
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_HEAD(f_primaries, __db);
+ */
+ struct {
+ struct __db_foreign_info *lh_first;
+ } f_primaries;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db) felink;
+ *
+ * Links in a list of DBs involved in file extension
+ * during a transaction. These are to be used only while the
+ * metadata is locked.
+ */
+ struct {
+ struct __db *tqe_next;
+ struct __db **tqe_prev;
+ } felink;
+
+ /* Reference to foreign -- set in the secondary. */
+ DB *s_foreign;
+
+ /* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */
+ void *api_internal;
+
+ /* Subsystem-private structure. */
+ void *bt_internal; /* Btree/Recno access method. */
+ void *h_internal; /* Hash access method. */
+ void *heap_internal; /* Heap access method. */
+ void *p_internal; /* Partition information. */
+ void *q_internal; /* Queue access method. */
+
+ /* DB PUBLIC HANDLE LIST BEGIN */
+ int (*associate) __P((DB *, DB_TXN *, DB *,
+ int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ int (*associate_foreign) __P((DB *, DB *,
+ int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ u_int32_t));
+ int (*close) __P((DB *, u_int32_t));
+ int (*compact) __P((DB *,
+ DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ int (*cursor) __P((DB *, DB_TXN *, DBC **, u_int32_t));
+ int (*del) __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ void (*err) __P((DB *, int, const char *, ...));
+ void (*errx) __P((DB *, const char *, ...));
+ int (*exists) __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ int (*fd) __P((DB *, int *));
+ int (*get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ int (*get_alloc) __P((DB *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+ int (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t)));
+ int (*get_assoc_flags) __P((DB *, u_int32_t *));
+ int (*get_bt_compare)
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ int (*get_bt_compress) __P((DB *,
+ int (**)(DB *,
+ const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ int (*get_bt_minkey) __P((DB *, u_int32_t *));
+ int (*get_bt_prefix)
+ __P((DB *, size_t (**)(DB *, const DBT *, const DBT *)));
+ int (*get_byteswapped) __P((DB *, int *));
+ int (*get_cachesize) __P((DB *, u_int32_t *, u_int32_t *, int *));
+ int (*get_create_dir) __P((DB *, const char **));
+ int (*get_dbname) __P((DB *, const char **, const char **));
+ int (*get_dup_compare)
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ int (*get_encrypt_flags) __P((DB *, u_int32_t *));
+ DB_ENV *(*get_env) __P((DB *));
+ void (*get_errcall) __P((DB *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+ void (*get_errfile) __P((DB *, FILE **));
+ void (*get_errpfx) __P((DB *, const char **));
+ int (*get_feedback) __P((DB *, void (**)(DB *, int, int)));
+ int (*get_flags) __P((DB *, u_int32_t *));
+ int (*get_h_compare)
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ int (*get_h_ffactor) __P((DB *, u_int32_t *));
+ int (*get_h_hash)
+ __P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t)));
+ int (*get_h_nelem) __P((DB *, u_int32_t *));
+ int (*get_heapsize) __P((DB *, u_int32_t *, u_int32_t *));
+ int (*get_heap_regionsize) __P((DB *, u_int32_t *));
+ int (*get_lk_exclusive) __P((DB *, int *, int *));
+ int (*get_lorder) __P((DB *, int *));
+ DB_MPOOLFILE *(*get_mpf) __P((DB *));
+ void (*get_msgcall) __P((DB *,
+ void (**)(const DB_ENV *, const char *)));
+ void (*get_msgfile) __P((DB *, FILE **));
+ int (*get_multiple) __P((DB *));
+ int (*get_open_flags) __P((DB *, u_int32_t *));
+ int (*get_pagesize) __P((DB *, u_int32_t *));
+ int (*get_partition_callback) __P((DB *,
+ u_int32_t *, u_int32_t (**)(DB *, DBT *key)));
+ int (*get_partition_dirs) __P((DB *, const char ***));
+ int (*get_partition_keys) __P((DB *, u_int32_t *, DBT **));
+ int (*get_priority) __P((DB *, DB_CACHE_PRIORITY *));
+ int (*get_q_extentsize) __P((DB *, u_int32_t *));
+ int (*get_re_delim) __P((DB *, int *));
+ int (*get_re_len) __P((DB *, u_int32_t *));
+ int (*get_re_pad) __P((DB *, int *));
+ int (*get_re_source) __P((DB *, const char **));
+ int (*get_transactional) __P((DB *));
+ int (*get_type) __P((DB *, DBTYPE *));
+ int (*join) __P((DB *, DBC **, DBC **, u_int32_t));
+ int (*key_range)
+ __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ int (*open) __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int));
+ int (*pget) __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ int (*remove) __P((DB *, const char *, const char *, u_int32_t));
+ int (*rename) __P((DB *,
+ const char *, const char *, const char *, u_int32_t));
+ int (*set_alloc) __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+ int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+ int (*set_bt_compare)
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ int (*set_bt_compress) __P((DB *,
+ int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ int (*set_bt_minkey) __P((DB *, u_int32_t));
+ int (*set_bt_prefix)
+ __P((DB *, size_t (*)(DB *, const DBT *, const DBT *)));
+ int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int));
+ int (*set_create_dir) __P((DB *, const char *));
+ int (*set_dup_compare)
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ int (*set_encrypt) __P((DB *, const char *, u_int32_t));
+ void (*set_errcall) __P((DB *,
+ void (*)(const DB_ENV *, const char *, const char *)));
+ void (*set_errfile) __P((DB *, FILE *));
+ void (*set_errpfx) __P((DB *, const char *));
+ int (*set_feedback) __P((DB *, void (*)(DB *, int, int)));
+ int (*set_flags) __P((DB *, u_int32_t));
+ int (*set_h_compare)
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ int (*set_h_ffactor) __P((DB *, u_int32_t));
+ int (*set_h_hash)
+ __P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t)));
+ int (*set_h_nelem) __P((DB *, u_int32_t));
+ int (*set_heapsize) __P((DB *, u_int32_t, u_int32_t, u_int32_t));
+ int (*set_heap_regionsize) __P((DB *, u_int32_t));
+ int (*set_lk_exclusive) __P((DB *, int));
+ int (*set_lorder) __P((DB *, int));
+ void (*set_msgcall) __P((DB *, void (*)(const DB_ENV *, const char *)));
+ void (*set_msgfile) __P((DB *, FILE *));
+ int (*set_pagesize) __P((DB *, u_int32_t));
+ int (*set_paniccall) __P((DB *, void (*)(DB_ENV *, int)));
+ int (*set_partition) __P((DB *,
+ u_int32_t, DBT *, u_int32_t (*)(DB *, DBT *key)));
+ int (*set_partition_dirs) __P((DB *, const char **));
+ int (*set_priority) __P((DB *, DB_CACHE_PRIORITY));
+ int (*set_q_extentsize) __P((DB *, u_int32_t));
+ int (*set_re_delim) __P((DB *, int));
+ int (*set_re_len) __P((DB *, u_int32_t));
+ int (*set_re_pad) __P((DB *, int));
+ int (*set_re_source) __P((DB *, const char *));
+ int (*sort_multiple) __P((DB *, DBT *, DBT *, u_int32_t));
+ int (*stat) __P((DB *, DB_TXN *, void *, u_int32_t));
+ int (*stat_print) __P((DB *, u_int32_t));
+ int (*sync) __P((DB *, u_int32_t));
+ int (*truncate) __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ int (*upgrade) __P((DB *, const char *, u_int32_t));
+ int (*verify)
+ __P((DB *, const char *, const char *, FILE *, u_int32_t));
+ /* DB PUBLIC HANDLE LIST END */
+
+ /* DB PRIVATE HANDLE LIST BEGIN */
+ int (*dump) __P((DB *, const char *,
+ int (*)(void *, const void *), void *, int, int));
+ int (*db_am_remove) __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, u_int32_t));
+ int (*db_am_rename) __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *));
+ /* DB PRIVATE HANDLE LIST END */
+
+ /*
+ * Never called; these are a place to save function pointers
+ * so that we can undo an associate.
+ */
+ int (*stored_get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ int (*stored_close) __P((DB *, u_int32_t));
+
+ /* Alternative handle close function, used by C++ API. */
+ int (*alt_close) __P((DB *, u_int32_t));
+
+#define DB_OK_BTREE 0x01
+#define DB_OK_HASH 0x02
+#define DB_OK_HEAP 0x04
+#define DB_OK_QUEUE 0x08
+#define DB_OK_RECNO 0x10
+ u_int32_t am_ok; /* Legal AM choices. */
+
+ /*
+ * This field really ought to be an AM_FLAG, but we have
+ * run out of bits. If/when we decide to split up
+ * the flags, we can incorporate it.
+ */
+ int preserve_fid; /* Do not free fileid on close. */
+
+#define DB_AM_CHKSUM 0x00000001 /* Checksumming */
+#define DB_AM_COMPENSATE 0x00000002 /* Created by compensating txn */
+#define DB_AM_COMPRESS 0x00000004 /* Compressed BTree */
+#define DB_AM_CREATED 0x00000008 /* Database was created upon open */
+#define DB_AM_CREATED_MSTR 0x00000010 /* Encompassing file was created */
+#define DB_AM_DBM_ERROR 0x00000020 /* Error in DBM/NDBM database */
+#define DB_AM_DELIMITER 0x00000040 /* Variable length delimiter set */
+#define DB_AM_DISCARD 0x00000080 /* Discard any cached pages */
+#define DB_AM_DUP 0x00000100 /* DB_DUP */
+#define DB_AM_DUPSORT 0x00000200 /* DB_DUPSORT */
+#define DB_AM_ENCRYPT 0x00000400 /* Encryption */
+#define DB_AM_FIXEDLEN 0x00000800 /* Fixed-length records */
+#define DB_AM_INMEM 0x00001000 /* In-memory; no sync on close */
+#define DB_AM_INORDER 0x00002000 /* DB_INORDER */
+#define DB_AM_IN_RENAME 0x00004000 /* File is being renamed */
+#define DB_AM_NOT_DURABLE 0x00008000 /* Do not log changes */
+#define DB_AM_OPEN_CALLED 0x00010000 /* DB->open called */
+#define DB_AM_PAD 0x00020000 /* Fixed-length record pad */
+#define DB_AM_PARTDB 0x00040000 /* Handle for a database partition */
+#define DB_AM_PGDEF 0x00080000 /* Page size was defaulted */
+#define DB_AM_RDONLY 0x00100000 /* Database is readonly */
+#define DB_AM_READ_UNCOMMITTED 0x00200000 /* Support degree 1 isolation */
+#define DB_AM_RECNUM 0x00400000 /* DB_RECNUM */
+#define DB_AM_RECOVER 0x00800000 /* DB opened by recovery routine */
+#define DB_AM_RENUMBER 0x01000000 /* DB_RENUMBER */
+#define DB_AM_REVSPLITOFF 0x02000000 /* DB_REVSPLITOFF */
+#define DB_AM_SECONDARY 0x04000000 /* Database is a secondary index */
+#define DB_AM_SNAPSHOT 0x08000000 /* DB_SNAPSHOT */
+#define DB_AM_SUBDB 0x10000000 /* Subdatabases supported */
+#define DB_AM_SWAP 0x20000000 /* Pages need to be byte-swapped */
+#define DB_AM_TXN 0x40000000 /* Opened in a transaction */
+#define DB_AM_VERIFYING 0x80000000 /* DB handle is in the verifier */
+ u_int32_t orig_flags; /* Flags at open, for refresh */
+ u_int32_t flags;
+
+#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */
+#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */
+#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */
+ u_int32_t orig_flags2; /* Second flags word; for refresh */
+ u_int32_t flags2; /* Second flags word */
+};
+
+/*
+ * Macros for bulk operations. These are only intended for the C API.
+ * For C++, use DbMultiple*Iterator or DbMultiple*Builder.
+ *
+ * Bulk operations store multiple entries into a single DBT structure. The
+ * following macros assist with creating and reading these Multiple DBTs.
+ *
+ * The basic layout for single data items is:
+ *
+ * -------------------------------------------------------------------------
+ * | data1 | ... | dataN | ..... |-1 | dNLen | dNOff | ... | d1Len | d1Off |
+ * -------------------------------------------------------------------------
+ *
+ * For the DB_MULTIPLE_KEY* macros, the items are in key/data pairs, so data1
+ * would be a key, and data2 its corresponding value (N is always even).
+ *
+ * For the DB_MULTIPLE_RECNO* macros, the record number is stored along with
+ * the len/off pair in the "header" section, and the list is zero terminated
+ * (since -1 is a valid record number):
+ *
+ * --------------------------------------------------------------------------
+ * | d1 |..| dN |..| 0 | dNLen | dNOff | recnoN |..| d1Len | d1Off | recno1 |
+ * --------------------------------------------------------------------------
+ */
+#define DB_MULTIPLE_INIT(pointer, dbt) \
+ (pointer = (u_int8_t *)(dbt)->data + \
+ (dbt)->ulen - sizeof(u_int32_t))
+
+#define DB_MULTIPLE_NEXT(pointer, dbt, retdata, retdlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ if (*__p == (u_int32_t)-1) { \
+ retdata = NULL; \
+ pointer = NULL; \
+ break; \
+ } \
+ retdata = (u_int8_t *)(dbt)->data + *__p--; \
+ retdlen = *__p--; \
+ pointer = __p; \
+ if (retdlen == 0 && retdata == (u_int8_t *)(dbt)->data) \
+ retdata = NULL; \
+ } while (0)
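+
+/*
+ * Editorial sketch (not part of the original header): iterating a bulk
+ * buffer returned by DB->get() with the DB_MULTIPLE flag, assuming "data"
+ * is a user-owned DBT (DB_DBT_USERMEM) sized for several items:
+ *
+ *	void *p, *retdata;
+ *	u_int32_t retdlen;
+ *
+ *	DB_MULTIPLE_INIT(p, &data);
+ *	while (p != NULL) {
+ *		DB_MULTIPLE_NEXT(p, &data, retdata, retdlen);
+ *		if (retdata != NULL)
+ *			...process retdlen bytes at retdata...
+ *	}
+ */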
+
+#define DB_MULTIPLE_KEY_NEXT(pointer, dbt, retkey, retklen, retdata, retdlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ if (*__p == (u_int32_t)-1) { \
+ retdata = NULL; \
+ retkey = NULL; \
+ pointer = NULL; \
+ break; \
+ } \
+ retkey = (u_int8_t *)(dbt)->data + *__p--; \
+ retklen = *__p--; \
+ retdata = (u_int8_t *)(dbt)->data + *__p--; \
+ retdlen = *__p--; \
+ pointer = __p; \
+ } while (0)
+
+#define DB_MULTIPLE_RECNO_NEXT(pointer, dbt, recno, retdata, retdlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ if (*__p == (u_int32_t)0) { \
+ recno = 0; \
+ retdata = NULL; \
+ pointer = NULL; \
+ break; \
+ } \
+ recno = *__p--; \
+ retdata = (u_int8_t *)(dbt)->data + *__p--; \
+ retdlen = *__p--; \
+ pointer = __p; \
+ } while (0)
+
+#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \
+ do { \
+ (dbt)->flags |= DB_DBT_BULK; \
+ pointer = (u_int8_t *)(dbt)->data + \
+ (dbt)->ulen - sizeof(u_int32_t); \
+ *(u_int32_t *)(pointer) = (u_int32_t)-1; \
+ } while (0)
+
+#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
+ (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2];\
+ if ((u_int8_t *)(dbt)->data + __off + (writedlen) > \
+ (u_int8_t *)(__p - 2)) \
+ writedata = NULL; \
+ else { \
+ writedata = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = __off; \
+ __p[-1] = (u_int32_t)(writedlen); \
+ __p[-2] = (u_int32_t)-1; \
+ pointer = __p - 2; \
+ } \
+ } while (0)
+
+#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \
+ do { \
+ void *__destd; \
+ DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \
+ __destd, (writedlen)); \
+ if (__destd == NULL) \
+ pointer = NULL; \
+ else \
+ memcpy(__destd, (writedata), (writedlen)); \
+ } while (0)
+
+#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
+ (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2];\
+ if ((u_int8_t *)(dbt)->data + __off + (writeklen) + \
+ (writedlen) > (u_int8_t *)(__p - 4)) { \
+ writekey = NULL; \
+ writedata = NULL; \
+ } else { \
+ writekey = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = __off; \
+ __p[-1] = (u_int32_t)(writeklen); \
+ __p -= 2; \
+ __off += (u_int32_t)(writeklen); \
+ writedata = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = __off; \
+ __p[-1] = (u_int32_t)(writedlen); \
+ __p[-2] = (u_int32_t)-1; \
+ pointer = __p - 2; \
+ } \
+ } while (0)
+
+#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+ do { \
+ void *__destk, *__destd; \
+ DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \
+ __destk, (writeklen), __destd, (writedlen)); \
+ if (__destk == NULL) \
+ pointer = NULL; \
+ else { \
+ memcpy(__destk, (writekey), (writeklen)); \
+ if (__destd != NULL) \
+ memcpy(__destd, (writedata), (writedlen));\
+ } \
+ } while (0)
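+
+/*
+ * Editorial sketch (assumed usage, not in the original header): building a
+ * key/data bulk buffer and storing it in a single call; under
+ * DB_MULTIPLE_KEY the data parameter of DB->put() is assumed unused:
+ *
+ *	void *p;
+ *
+ *	DB_MULTIPLE_WRITE_INIT(p, &multi);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &multi, k1, k1len, d1, d1len);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &multi, k2, k2len, d2, d2len);
+ *	if (p != NULL)
+ *		ret = dbp->put(dbp, NULL, &multi, NULL, DB_MULTIPLE_KEY);
+ */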
+
+#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \
+ do { \
+ (dbt)->flags |= DB_DBT_BULK; \
+ pointer = (u_int8_t *)(dbt)->data + \
+ (dbt)->ulen - sizeof(u_int32_t); \
+ *(u_int32_t *)(pointer) = 0; \
+ } while (0)
+
+#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
+ (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2]; \
+ if (((u_int8_t *)(dbt)->data + __off) + (writedlen) > \
+ (u_int8_t *)(__p - 3)) \
+ writedata = NULL; \
+ else { \
+ writedata = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = (u_int32_t)(recno); \
+ __p[-1] = __off; \
+ __p[-2] = (u_int32_t)(writedlen); \
+ __p[-3] = 0; \
+ pointer = __p - 3; \
+ } \
+ } while (0)
+
+#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\
+ do { \
+ void *__destd; \
+ DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \
+ (recno), __destd, (writedlen)); \
+ if (__destd == NULL) \
+ pointer = NULL; \
+ else if ((writedlen) != 0) \
+ memcpy(__destd, (writedata), (writedlen)); \
+ } while (0)
+
+struct __db_heap_rid {
+ db_pgno_t pgno; /* Page number. */
+ db_indx_t indx; /* Index in the offset table. */
+};
+#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t))
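+
+/*
+ * Editorial note (illustrative, not in the original header): heap records
+ * are addressed by RID, so a heap key DBT wraps this structure directly:
+ *
+ *	DB_HEAP_RID rid;
+ *	DBT key;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &rid;
+ *	key.size = key.ulen = DB_HEAP_RID_SZ;
+ *	key.flags = DB_DBT_USERMEM;
+ */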
+
+/*******************************************************
+ * Access method cursors.
+ *******************************************************/
+struct __dbc {
+ DB *dbp; /* Backing database */
+ DB_ENV *dbenv; /* Backing environment */
+ ENV *env; /* Backing environment */
+
+ DB_THREAD_INFO *thread_info; /* Thread that owns this cursor. */
+ DB_TXN *txn; /* Associated transaction. */
+ DB_CACHE_PRIORITY priority; /* Priority in cache. */
+
+ /*
+ * Active/free cursor queues.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__dbc) links;
+ */
+ struct {
+ DBC *tqe_next;
+ DBC **tqe_prev;
+ } links;
+
+ /*
+ * Cursor queue of the owning transaction.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__dbc) txn_cursors;
+ */
+ struct {
+ DBC *tqe_next; /* next element */
+ DBC **tqe_prev; /* address of previous next element */
+ } txn_cursors;
+
+ /*
+ * The DBT *'s below are used by the cursor routines to return
+ * data to the user when DBT flags indicate that DB should manage
+ * the returned memory. They point at a DBT containing the buffer
+ * and length that will be used, and "belonging" to the handle that
+ * should "own" this memory. This may be a "my_*" field of this
+ * cursor--the default--or it may be the corresponding field of
+ * another cursor, a DB handle, a join cursor, etc. In general, it
+ * will be whatever handle the user originally used for the current
+ * DB interface call.
+ */
+ DBT *rskey; /* Returned secondary key. */
+ DBT *rkey; /* Returned [primary] key. */
+ DBT *rdata; /* Returned data. */
+
+ DBT my_rskey; /* Space for returned secondary key. */
+ DBT my_rkey; /* Space for returned [primary] key. */
+ DBT my_rdata; /* Space for returned data. */
+
+ DB_LOCKER *lref; /* Reference to default locker. */
+ DB_LOCKER *locker; /* Locker for this operation. */
+ DBT lock_dbt; /* DBT referencing lock. */
+ DB_LOCK_ILOCK lock; /* Object to be locked. */
+ DB_LOCK mylock; /* CDB lock held on this cursor. */
+
+ DBTYPE dbtype; /* Cursor type. */
+
+ DBC_INTERNAL *internal; /* Access method private. */
+
+ /* DBC PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DBC *));
+ int (*cmp) __P((DBC *, DBC *, int *, u_int32_t));
+ int (*count) __P((DBC *, db_recno_t *, u_int32_t));
+ int (*del) __P((DBC *, u_int32_t));
+ int (*dup) __P((DBC *, DBC **, u_int32_t));
+ int (*get) __P((DBC *, DBT *, DBT *, u_int32_t));
+ int (*get_priority) __P((DBC *, DB_CACHE_PRIORITY *));
+ int (*pget) __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ int (*put) __P((DBC *, DBT *, DBT *, u_int32_t));
+ int (*set_priority) __P((DBC *, DB_CACHE_PRIORITY));
+ /* DBC PUBLIC HANDLE LIST END */
+
+ /* The following are the method names deprecated in the 4.6 release. */
+ int (*c_close) __P((DBC *));
+ int (*c_count) __P((DBC *, db_recno_t *, u_int32_t));
+ int (*c_del) __P((DBC *, u_int32_t));
+ int (*c_dup) __P((DBC *, DBC **, u_int32_t));
+ int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t));
+ int (*c_pget) __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t));
+
+ /* DBC PRIVATE HANDLE LIST BEGIN */
+ int (*am_bulk) __P((DBC *, DBT *, u_int32_t));
+ int (*am_close) __P((DBC *, db_pgno_t, int *));
+ int (*am_del) __P((DBC *, u_int32_t));
+ int (*am_destroy) __P((DBC *));
+ int (*am_get) __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ int (*am_put) __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ int (*am_writelock) __P((DBC *));
+ /* DBC PRIVATE HANDLE LIST END */
+
+/*
+ * DBC_DONTLOCK and DBC_RECOVER are used during recovery and transaction
+ * abort. If a transaction is being aborted or recovered then DBC_RECOVER
+ * will be set and locking and logging will be disabled on this cursor. If
+ * we are performing a compensating transaction (e.g. free page processing)
+ * then DBC_DONTLOCK will be set to inhibit locking, but logging will still
+ * be required. DBC_DONTLOCK is also used if the whole database is locked.
+ */
+#define DBC_ACTIVE 0x00001 /* Cursor in use. */
+#define DBC_BULK 0x00002 /* Bulk update cursor. */
+#define DBC_DONTLOCK 0x00004 /* Don't lock on this cursor. */
+#define DBC_DOWNREV 0x00008 /* Down rev replication master. */
+#define DBC_DUPLICATE 0x00010 /* Create a duplicate cursor. */
+#define DBC_ERROR 0x00020 /* Error in this request. */
+#define DBC_FAMILY 0x00040 /* Part of a locker family. */
+#define DBC_FROM_DB_GET 0x00080 /* Called from the DB->get() method. */
+#define DBC_MULTIPLE 0x00100 /* Return Multiple data. */
+#define DBC_MULTIPLE_KEY 0x00200 /* Return Multiple keys and data. */
+#define DBC_OPD 0x00400 /* Cursor references off-page dups. */
+#define DBC_OWN_LID 0x00800 /* Free lock id on destroy. */
+#define DBC_PARTITIONED 0x01000 /* Cursor for a partitioned db. */
+#define DBC_READ_COMMITTED 0x02000 /* Cursor has degree 2 isolation. */
+#define DBC_READ_UNCOMMITTED 0x04000 /* Cursor has degree 1 isolation. */
+#define DBC_RECOVER 0x08000 /* Recovery cursor; don't log/lock. */
+#define DBC_RMW 0x10000 /* Acquire write flag in read op. */
+#define DBC_TRANSIENT 0x20000 /* Cursor is transient. */
+#define DBC_WAS_READ_COMMITTED 0x40000 /* Cursor holds a read committed lock. */
+#define DBC_WRITECURSOR 0x80000 /* Cursor may be used to write (CDB). */
+#define DBC_WRITER 0x100000 /* Cursor immediately writing (CDB). */
+ u_int32_t flags;
+};
+
+/* Key range statistics structure */
+struct __key_range {
+ double less;
+ double equal;
+ double greater;
+};
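+
+/*
+ * Editorial sketch (not part of the original header): filled in by
+ * DB->key_range(); the three fractions are estimates and sum to 1:
+ *
+ *	DB_KEY_RANGE range;
+ *
+ *	if (dbp->key_range(dbp, NULL, &key, &range, 0) == 0)
+ *		...range.less + range.equal + range.greater == 1.0...
+ */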
+
+/* Btree/Recno statistics structure. */
+struct __db_bt_stat { /* SHARED */
+ u_int32_t bt_magic; /* Magic number. */
+ u_int32_t bt_version; /* Version number. */
+ u_int32_t bt_metaflags; /* Metadata flags. */
+ u_int32_t bt_nkeys; /* Number of unique keys. */
+ u_int32_t bt_ndata; /* Number of data items. */
+ u_int32_t bt_pagecnt; /* Page count. */
+ u_int32_t bt_pagesize; /* Page size. */
+ u_int32_t bt_minkey; /* Minkey value. */
+ u_int32_t bt_re_len; /* Fixed-length record length. */
+ u_int32_t bt_re_pad; /* Fixed-length record pad. */
+ u_int32_t bt_levels; /* Tree levels. */
+ u_int32_t bt_int_pg; /* Internal pages. */
+ u_int32_t bt_leaf_pg; /* Leaf pages. */
+ u_int32_t bt_dup_pg; /* Duplicate pages. */
+ u_int32_t bt_over_pg; /* Overflow pages. */
+ u_int32_t bt_empty_pg; /* Empty pages. */
+ u_int32_t bt_free; /* Pages on the free list. */
+ uintmax_t bt_int_pgfree; /* Bytes free in internal pages. */
+ uintmax_t bt_leaf_pgfree; /* Bytes free in leaf pages. */
+ uintmax_t bt_dup_pgfree; /* Bytes free in duplicate pages. */
+ uintmax_t bt_over_pgfree; /* Bytes free in overflow pages. */
+};
+
+struct __db_compact {
+ /* Input Parameters. */
+ u_int32_t compact_fillpercent; /* Desired fillfactor: 1-100 */
+ db_timeout_t compact_timeout; /* Lock timeout. */
+ u_int32_t compact_pages; /* Max pages to process. */
+ /* Output Stats. */
+ u_int32_t compact_empty_buckets; /* Empty hash buckets found. */
+ u_int32_t compact_pages_free; /* Number of pages freed. */
+ u_int32_t compact_pages_examine; /* Number of pages examined. */
+ u_int32_t compact_levels; /* Number of levels removed. */
+ u_int32_t compact_deadlock; /* Number of deadlocks. */
+ db_pgno_t compact_pages_truncated; /* Pages truncated to OS. */
+ /* Internal. */
+ db_pgno_t compact_truncate; /* Page number for truncation */
+};
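+
+/*
+ * Editorial sketch (not part of the original header): zero the structure,
+ * set any input limits, and pass it to DB->compact(); the DB_FREE_SPACE
+ * flag additionally returns freed pages to the filesystem:
+ *
+ *	DB_COMPACT c_data;
+ *
+ *	memset(&c_data, 0, sizeof(c_data));
+ *	c_data.compact_fillpercent = 80;
+ *	ret = dbp->compact(dbp, NULL, NULL, NULL, &c_data, DB_FREE_SPACE, NULL);
+ */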
+
+/* Hash statistics structure. */
+struct __db_h_stat { /* SHARED */
+ u_int32_t hash_magic; /* Magic number. */
+ u_int32_t hash_version; /* Version number. */
+ u_int32_t hash_metaflags; /* Metadata flags. */
+ u_int32_t hash_nkeys; /* Number of unique keys. */
+ u_int32_t hash_ndata; /* Number of data items. */
+ u_int32_t hash_pagecnt; /* Page count. */
+ u_int32_t hash_pagesize; /* Page size. */
+ u_int32_t hash_ffactor; /* Fill factor specified at create. */
+ u_int32_t hash_buckets; /* Number of hash buckets. */
+ u_int32_t hash_free; /* Pages on the free list. */
+ uintmax_t hash_bfree; /* Bytes free on bucket pages. */
+ u_int32_t hash_bigpages; /* Number of big key/data pages. */
+ uintmax_t hash_big_bfree; /* Bytes free on big item pages. */
+ u_int32_t hash_overflows; /* Number of overflow pages. */
+ uintmax_t hash_ovfl_free; /* Bytes free on ovfl pages. */
+ u_int32_t hash_dup; /* Number of dup pages. */
+ uintmax_t hash_dup_free; /* Bytes free on duplicate pages. */
+};
+
+/* Heap statistics structure. */
+struct __db_heap_stat { /* SHARED */
+ u_int32_t heap_magic; /* Magic number. */
+ u_int32_t heap_version; /* Version number. */
+ u_int32_t heap_metaflags; /* Metadata flags. */
+ u_int32_t heap_nrecs; /* Number of records. */
+ u_int32_t heap_pagecnt; /* Page count. */
+ u_int32_t heap_pagesize; /* Page size. */
+ u_int32_t heap_nregions; /* Number of regions. */
+ u_int32_t heap_regionsize; /* Number of pages in a region. */
+};
+
+/* Queue statistics structure. */
+struct __db_qam_stat { /* SHARED */
+ u_int32_t qs_magic; /* Magic number. */
+ u_int32_t qs_version; /* Version number. */
+ u_int32_t qs_metaflags; /* Metadata flags. */
+ u_int32_t qs_nkeys; /* Number of unique keys. */
+ u_int32_t qs_ndata; /* Number of data items. */
+ u_int32_t qs_pagesize; /* Page size. */
+ u_int32_t qs_extentsize; /* Pages per extent. */
+ u_int32_t qs_pages; /* Data pages. */
+ u_int32_t qs_re_len; /* Fixed-length record length. */
+ u_int32_t qs_re_pad; /* Fixed-length record pad. */
+ u_int32_t qs_pgfree; /* Bytes free in data pages. */
+ u_int32_t qs_first_recno; /* First not deleted record. */
+ u_int32_t qs_cur_recno; /* Next available record number. */
+};
+
+/*******************************************************
+ * Environment.
+ *******************************************************/
+#define DB_REGION_MAGIC 0x120897 /* Environment magic number. */
+
+/*
+ * Database environment structure.
+ *
+ * This is the public database environment handle. The private environment
+ * handle is the ENV structure. The user owns this structure, the library
+ * owns the ENV structure. The reason there are two structures is because
+ * the user's configuration outlives any particular DB_ENV->open call, and
+ * separate structures allows us to easily discard internal information without
+ * discarding the user's configuration.
+ *
+ * Fields in the DB_ENV structure should normally be set only by application
+ * DB_ENV handle methods.
+ */
+
+/*
+ * Memory configuration types.
+ */
+typedef enum {
+ DB_MEM_LOCK=1,
+ DB_MEM_LOCKOBJECT=2,
+ DB_MEM_LOCKER=3,
+ DB_MEM_LOGID=4,
+ DB_MEM_TRANSACTION=5,
+ DB_MEM_THREAD=6
+} DB_MEM_CONFIG;
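+
+/*
+ * Editorial sketch (assumed usage, not in the original header): these
+ * values select which resource DB_ENV->set_memory_init() sizes before the
+ * environment is created, e.g.:
+ *
+ *	ret = dbenv->set_memory_init(dbenv, DB_MEM_LOCK, 10000);
+ */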
+
+/*
+ * Backup configuration types.
+ */
+typedef enum {
+ DB_BACKUP_READ_COUNT = 1,
+ DB_BACKUP_READ_SLEEP = 2,
+ DB_BACKUP_SIZE = 3,
+ DB_BACKUP_WRITE_DIRECT = 4
+} DB_BACKUP_CONFIG;
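+
+/*
+ * Editorial sketch (assumed usage, not in the original header): these
+ * values select which hot-backup tunable DB_ENV->set_backup_config() and
+ * DB_ENV->get_backup_config() operate on, e.g.:
+ *
+ *	ret = dbenv->set_backup_config(dbenv, DB_BACKUP_SIZE, 1024 * 1024);
+ */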
+
+struct __db_env {
+ ENV *env; /* Linked ENV structure */
+
+ /*
+ * The DB_ENV structure can be used concurrently, so field access is
+ * protected.
+ */
+ db_mutex_t mtx_db_env; /* DB_ENV structure mutex */
+
+ /* Error message callback */
+ void (*db_errcall) __P((const DB_ENV *, const char *, const char *));
+ FILE *db_errfile; /* Error message file stream */
+ const char *db_errpfx; /* Error message prefix */
+
+ /* Other message callback */
+ void (*db_msgcall) __P((const DB_ENV *, const char *));
+ FILE *db_msgfile; /* Other message file stream */
+
+ /* Other application callback functions */
+ int (*app_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ void (*db_event_func) __P((DB_ENV *, u_int32_t, void *));
+ void (*db_feedback) __P((DB_ENV *, int, int));
+ void (*db_free) __P((void *));
+ void (*db_paniccall) __P((DB_ENV *, int));
+ void *(*db_malloc) __P((size_t));
+ void *(*db_realloc) __P((void *, size_t));
+ int (*is_alive) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+ void (*thread_id) __P((DB_ENV *, pid_t *, db_threadid_t *));
+ char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
+
+ /* Application specified paths */
+ char *db_log_dir; /* Database log file directory */
+ char *db_md_dir; /* Persistent metadata directory */
+ char *db_tmp_dir; /* Database tmp file directory */
+
+ char *db_create_dir; /* Create directory for data files */
+ char **db_data_dir; /* Database data file directories */
+ int data_cnt; /* Database data file slots */
+ int data_next; /* Next database data file slot */
+
+ char *intermediate_dir_mode; /* Intermediate directory perms */
+
+ long shm_key; /* shmget key */
+
+ char *passwd; /* Cryptography support */
+ size_t passwd_len;
+
+ /* Private handle references */
+ void *app_private; /* Application-private handle */
+ void *api1_internal; /* C++, Perl API private */
+ void *api2_internal; /* Java API private */
+
+ u_int32_t verbose; /* DB_VERB_XXX flags */
+
+ /* Mutex configuration */
+ u_int32_t mutex_align; /* Mutex alignment */
+ u_int32_t mutex_cnt; /* Number of mutexes to configure */
+ u_int32_t mutex_inc; /* Number of mutexes to add */
+ u_int32_t mutex_max; /* Max number of mutexes */
+ u_int32_t mutex_tas_spins;/* Test-and-set spin count */
+
+ /* Locking configuration */
+ u_int8_t *lk_conflicts; /* Two dimensional conflict matrix */
+ int lk_modes; /* Number of lock modes in table */
+ u_int32_t lk_detect; /* Deadlock detect on all conflicts */
+ u_int32_t lk_max; /* Maximum number of locks */
+ u_int32_t lk_max_lockers;/* Maximum number of lockers */
+ u_int32_t lk_max_objects;/* Maximum number of locked objects */
+ u_int32_t lk_init; /* Initial number of locks */
+ u_int32_t lk_init_lockers;/* Initial number of lockers */
+ u_int32_t lk_init_objects;/* Initial number of locked objects */
+ u_int32_t lk_partitions;/* Number of object partitions */
+ db_timeout_t lk_timeout; /* Lock timeout period */
+ /* Used during initialization */
+ u_int32_t locker_t_size; /* Locker hash table size. */
+ u_int32_t object_t_size; /* Object hash table size. */
+
+ /* Logging configuration */
+ u_int32_t lg_bsize; /* Buffer size */
+ u_int32_t lg_fileid_init; /* Initial allocation for fname structs */
+ int lg_filemode; /* Log file permission mode */
+ u_int32_t lg_regionmax; /* Region size */
+ u_int32_t lg_size; /* Log file size */
+ u_int32_t lg_flags; /* Log configuration */
+
+ /* Memory pool configuration */
+ u_int32_t mp_gbytes; /* Cache size: GB */
+ u_int32_t mp_bytes; /* Cache size: bytes */
+ u_int32_t mp_max_gbytes; /* Maximum cache size: GB */
+ u_int32_t mp_max_bytes; /* Maximum cache size: bytes */
+ size_t mp_mmapsize; /* Maximum file size for mmap */
+ int mp_maxopenfd; /* Maximum open file descriptors */
+ int mp_maxwrite; /* Maximum buffers to write */
+ u_int mp_ncache; /* Initial number of cache regions */
+ u_int32_t mp_pagesize; /* Average page size */
+ u_int32_t mp_tablesize; /* Approximate hash table size */
+ u_int32_t mp_mtxcount; /* Number of mutexes */
+ /* Sleep after writing max buffers */
+ db_timeout_t mp_maxwrite_sleep;
+
+ /* Transaction configuration */
+ u_int32_t tx_init; /* Initial number of transactions */
+ u_int32_t tx_max; /* Maximum number of transactions */
+ time_t tx_timestamp; /* Recover to specific timestamp */
+ db_timeout_t tx_timeout; /* Timeout for transactions */
+
+ /* Thread tracking configuration */
+ u_int32_t thr_init; /* Thread count */
+ u_int32_t thr_max; /* Thread max */
+ roff_t memory_max; /* Maximum region memory */
+
+ /*
+ * The following fields are not strictly user-owned, but they outlive
+ * the ENV structure, and so are stored here.
+ */
+ DB_FH *registry; /* DB_REGISTER file handle */
+ u_int32_t registry_off; /*
+ * Offset of our slot. We can't use
+ * off_t because its size depends on
+ * build settings.
+ */
+ db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */
+
+#define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */
+#define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */
+#define DB_ENV_FAILCHK 0x00000004 /* Failchk is running */
+#define DB_ENV_DIRECT_DB 0x00000008 /* DB_DIRECT_DB set */
+#define DB_ENV_DSYNC_DB 0x00000010 /* DB_DSYNC_DB set */
+#define DB_ENV_DATABASE_LOCKING 0x00000020 /* Try database-level locking */
+#define DB_ENV_MULTIVERSION 0x00000040 /* DB_MULTIVERSION set */
+#define DB_ENV_NOLOCKING 0x00000080 /* DB_NOLOCKING set */
+#define DB_ENV_NOMMAP 0x00000100 /* DB_NOMMAP set */
+#define DB_ENV_NOPANIC 0x00000200 /* Okay if panic set */
+#define DB_ENV_OVERWRITE 0x00000400 /* DB_OVERWRITE set */
+#define DB_ENV_REGION_INIT 0x00000800 /* DB_REGION_INIT set */
+#define DB_ENV_TIME_NOTGRANTED 0x00001000 /* DB_TIME_NOTGRANTED set */
+#define DB_ENV_TXN_NOSYNC 0x00002000 /* DB_TXN_NOSYNC set */
+#define DB_ENV_TXN_NOWAIT 0x00004000 /* DB_TXN_NOWAIT set */
+#define DB_ENV_TXN_SNAPSHOT 0x00008000 /* DB_TXN_SNAPSHOT set */
+#define DB_ENV_TXN_WRITE_NOSYNC 0x00010000 /* DB_TXN_WRITE_NOSYNC set */
+#define DB_ENV_YIELDCPU 0x00020000 /* DB_YIELDCPU set */
+#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */
+#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */
+ u_int32_t flags;
+
+ /* DB_ENV PUBLIC HANDLE LIST BEGIN */
+ int (*add_data_dir) __P((DB_ENV *, const char *));
+ int (*backup) __P((DB_ENV *, const char *, u_int32_t));
+ int (*cdsgroup_begin) __P((DB_ENV *, DB_TXN **));
+ int (*close) __P((DB_ENV *, u_int32_t));
+ int (*dbbackup) __P((DB_ENV *, const char *, const char *, u_int32_t));
+ int (*dbremove) __P((DB_ENV *,
+ DB_TXN *, const char *, const char *, u_int32_t));
+ int (*dbrename) __P((DB_ENV *,
+ DB_TXN *, const char *, const char *, const char *, u_int32_t));
+ void (*err) __P((const DB_ENV *, int, const char *, ...));
+ void (*errx) __P((const DB_ENV *, const char *, ...));
+ int (*failchk) __P((DB_ENV *, u_int32_t));
+ int (*fileid_reset) __P((DB_ENV *, const char *, u_int32_t));
+ int (*get_alloc) __P((DB_ENV *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+ int (*get_app_dispatch)
+ __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
+ int (*get_create_dir) __P((DB_ENV *, const char **));
+ int (*get_data_dirs) __P((DB_ENV *, const char ***));
+ int (*get_data_len) __P((DB_ENV *, u_int32_t *));
+ int (*get_backup_callbacks) __P((DB_ENV *,
+ int (**)(DB_ENV *, const char *, const char *, void **),
+ int (**)(DB_ENV *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (**)(DB_ENV *, const char *, void *)));
+ int (*get_backup_config) __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t *));
+ int (*get_encrypt_flags) __P((DB_ENV *, u_int32_t *));
+ void (*get_errcall) __P((DB_ENV *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+ void (*get_errfile) __P((DB_ENV *, FILE **));
+ void (*get_errpfx) __P((DB_ENV *, const char **));
+ int (*get_flags) __P((DB_ENV *, u_int32_t *));
+ int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
+ int (*get_home) __P((DB_ENV *, const char **));
+ int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **));
+ int (*get_isalive) __P((DB_ENV *,
+ int (**)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+ int (*get_lg_bsize) __P((DB_ENV *, u_int32_t *));
+ int (*get_lg_dir) __P((DB_ENV *, const char **));
+ int (*get_lg_filemode) __P((DB_ENV *, int *));
+ int (*get_lg_max) __P((DB_ENV *, u_int32_t *));
+ int (*get_lg_regionmax) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_conflicts) __P((DB_ENV *, const u_int8_t **, int *));
+ int (*get_lk_detect) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_max_lockers) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_max_locks) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_max_objects) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_partitions) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_priority) __P((DB_ENV *, u_int32_t, u_int32_t *));
+ int (*get_lk_tablesize) __P((DB_ENV *, u_int32_t *));
+ int (*get_memory_init) __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t *));
+ int (*get_memory_max) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*get_metadata_dir) __P((DB_ENV *, const char **));
+ int (*get_mp_max_openfd) __P((DB_ENV *, int *));
+ int (*get_mp_max_write) __P((DB_ENV *, int *, db_timeout_t *));
+ int (*get_mp_mmapsize) __P((DB_ENV *, size_t *));
+ int (*get_mp_mtxcount) __P((DB_ENV *, u_int32_t *));
+ int (*get_mp_pagesize) __P((DB_ENV *, u_int32_t *));
+ int (*get_mp_tablesize) __P((DB_ENV *, u_int32_t *));
+ void (*get_msgcall)
+ __P((DB_ENV *, void (**)(const DB_ENV *, const char *)));
+ void (*get_msgfile) __P((DB_ENV *, FILE **));
+ int (*get_open_flags) __P((DB_ENV *, u_int32_t *));
+ int (*get_shm_key) __P((DB_ENV *, long *));
+ int (*get_thread_count) __P((DB_ENV *, u_int32_t *));
+ int (*get_thread_id_fn)
+ __P((DB_ENV *, void (**)(DB_ENV *, pid_t *, db_threadid_t *)));
+ int (*get_thread_id_string_fn) __P((DB_ENV *,
+ char *(**)(DB_ENV *, pid_t, db_threadid_t, char *)));
+ int (*get_timeout) __P((DB_ENV *, db_timeout_t *, u_int32_t));
+ int (*get_tmp_dir) __P((DB_ENV *, const char **));
+ int (*get_tx_max) __P((DB_ENV *, u_int32_t *));
+ int (*get_tx_timestamp) __P((DB_ENV *, time_t *));
+ int (*get_verbose) __P((DB_ENV *, u_int32_t, int *));
+ int (*is_bigendian) __P((void));
+ int (*lock_detect) __P((DB_ENV *, u_int32_t, u_int32_t, int *));
+ int (*lock_get) __P((DB_ENV *,
+ u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *));
+ int (*lock_id) __P((DB_ENV *, u_int32_t *));
+ int (*lock_id_free) __P((DB_ENV *, u_int32_t));
+ int (*lock_put) __P((DB_ENV *, DB_LOCK *));
+ int (*lock_stat) __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
+ int (*lock_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*lock_vec) __P((DB_ENV *,
+ u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+ int (*log_archive) __P((DB_ENV *, char **[], u_int32_t));
+ int (*log_cursor) __P((DB_ENV *, DB_LOGC **, u_int32_t));
+ int (*log_file) __P((DB_ENV *, const DB_LSN *, char *, size_t));
+ int (*log_flush) __P((DB_ENV *, const DB_LSN *));
+ int (*log_get_config) __P((DB_ENV *, u_int32_t, int *));
+ int (*log_printf) __P((DB_ENV *, DB_TXN *, const char *, ...));
+ int (*log_put) __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+ int (*log_put_record) __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ DB_LOG_RECSPEC *, ...));
+ int (*log_read_record) __P((DB_ENV *, DB **,
+ void *, void *, DB_LOG_RECSPEC *, u_int32_t, void **));
+ int (*log_set_config) __P((DB_ENV *, u_int32_t, int));
+ int (*log_stat) __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
+ int (*log_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*log_verify) __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+ int (*lsn_reset) __P((DB_ENV *, const char *, u_int32_t));
+ int (*memp_fcreate) __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
+ int (*memp_register) __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t,
+ void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+ int (*memp_stat) __P((DB_ENV *,
+ DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+ int (*memp_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*memp_sync) __P((DB_ENV *, DB_LSN *));
+ int (*memp_trickle) __P((DB_ENV *, int, int *));
+ int (*mutex_alloc) __P((DB_ENV *, u_int32_t, db_mutex_t *));
+ int (*mutex_free) __P((DB_ENV *, db_mutex_t));
+ int (*mutex_get_align) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_increment) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_init) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_max) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_tas_spins) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_lock) __P((DB_ENV *, db_mutex_t));
+ int (*mutex_set_align) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_increment) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_init) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_max) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_tas_spins) __P((DB_ENV *, u_int32_t));
+ int (*mutex_stat) __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
+ int (*mutex_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*mutex_unlock) __P((DB_ENV *, db_mutex_t));
+ int (*open) __P((DB_ENV *, const char *, u_int32_t, int));
+ int (*remove) __P((DB_ENV *, const char *, u_int32_t));
+ int (*rep_elect) __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ int (*rep_flush) __P((DB_ENV *));
+ int (*rep_get_clockskew) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*rep_get_config) __P((DB_ENV *, u_int32_t, int *));
+ int (*rep_get_limit) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*rep_get_nsites) __P((DB_ENV *, u_int32_t *));
+ int (*rep_get_priority) __P((DB_ENV *, u_int32_t *));
+ int (*rep_get_request) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*rep_get_timeout) __P((DB_ENV *, int, u_int32_t *));
+ int (*rep_process_message)
+ __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+ int (*rep_set_clockskew) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*rep_set_config) __P((DB_ENV *, u_int32_t, int));
+ int (*rep_set_limit) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*rep_set_nsites) __P((DB_ENV *, u_int32_t));
+ int (*rep_set_priority) __P((DB_ENV *, u_int32_t));
+ int (*rep_set_request) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t));
+ int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *,
+ const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+ int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t));
+ int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+ int (*rep_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*rep_sync) __P((DB_ENV *, u_int32_t));
+ int (*repmgr_channel) __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+ int (*repmgr_get_ack_policy) __P((DB_ENV *, int *));
+ int (*repmgr_local_site) __P((DB_ENV *, DB_SITE **));
+ int (*repmgr_msg_dispatch) __P((DB_ENV *,
+ void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
+ u_int32_t));
+ int (*repmgr_set_ack_policy) __P((DB_ENV *, int));
+ int (*repmgr_site)
+ __P((DB_ENV *, const char *, u_int, DB_SITE**, u_int32_t));
+ int (*repmgr_site_by_eid) __P((DB_ENV *, int, DB_SITE**));
+ int (*repmgr_site_list) __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ int (*repmgr_start) __P((DB_ENV *, int, u_int32_t));
+ int (*repmgr_stat) __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+ int (*repmgr_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*set_alloc) __P((DB_ENV *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+ int (*set_app_dispatch)
+ __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int));
+ int (*set_create_dir) __P((DB_ENV *, const char *));
+ int (*set_data_dir) __P((DB_ENV *, const char *));
+ int (*set_data_len) __P((DB_ENV *, u_int32_t));
+ int (*set_backup_callbacks) __P((DB_ENV *,
+ int (*)(DB_ENV *, const char *, const char *, void **),
+ int (*)(DB_ENV *, u_int32_t,
+ u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (*)(DB_ENV *, const char *, void *)));
+ int (*set_backup_config) __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t));
+ int (*set_encrypt) __P((DB_ENV *, const char *, u_int32_t));
+ void (*set_errcall) __P((DB_ENV *,
+ void (*)(const DB_ENV *, const char *, const char *)));
+ void (*set_errfile) __P((DB_ENV *, FILE *));
+ void (*set_errpfx) __P((DB_ENV *, const char *));
+ int (*set_event_notify)
+ __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *)));
+ int (*set_feedback) __P((DB_ENV *, void (*)(DB_ENV *, int, int)));
+ int (*set_flags) __P((DB_ENV *, u_int32_t, int));
+ int (*set_intermediate_dir_mode) __P((DB_ENV *, const char *));
+ int (*set_isalive) __P((DB_ENV *,
+ int (*)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+ int (*set_lg_bsize) __P((DB_ENV *, u_int32_t));
+ int (*set_lg_dir) __P((DB_ENV *, const char *));
+ int (*set_lg_filemode) __P((DB_ENV *, int));
+ int (*set_lg_max) __P((DB_ENV *, u_int32_t));
+ int (*set_lg_regionmax) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_conflicts) __P((DB_ENV *, u_int8_t *, int));
+ int (*set_lk_detect) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_max_lockers) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_max_locks) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_max_objects) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_partitions) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_priority) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*set_lk_tablesize) __P((DB_ENV *, u_int32_t));
+ int (*set_memory_init) __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
+ int (*set_memory_max) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*set_metadata_dir) __P((DB_ENV *, const char *));
+ int (*set_mp_max_openfd) __P((DB_ENV *, int));
+ int (*set_mp_max_write) __P((DB_ENV *, int, db_timeout_t));
+ int (*set_mp_mmapsize) __P((DB_ENV *, size_t));
+ int (*set_mp_mtxcount) __P((DB_ENV *, u_int32_t));
+ int (*set_mp_pagesize) __P((DB_ENV *, u_int32_t));
+ int (*set_mp_tablesize) __P((DB_ENV *, u_int32_t));
+ void (*set_msgcall)
+ __P((DB_ENV *, void (*)(const DB_ENV *, const char *)));
+ void (*set_msgfile) __P((DB_ENV *, FILE *));
+ int (*set_paniccall) __P((DB_ENV *, void (*)(DB_ENV *, int)));
+ int (*set_shm_key) __P((DB_ENV *, long));
+ int (*set_thread_count) __P((DB_ENV *, u_int32_t));
+ int (*set_thread_id)
+ __P((DB_ENV *, void (*)(DB_ENV *, pid_t *, db_threadid_t *)));
+ int (*set_thread_id_string) __P((DB_ENV *,
+ char *(*)(DB_ENV *, pid_t, db_threadid_t, char *)));
+ int (*set_timeout) __P((DB_ENV *, db_timeout_t, u_int32_t));
+ int (*set_tmp_dir) __P((DB_ENV *, const char *));
+ int (*set_tx_max) __P((DB_ENV *, u_int32_t));
+ int (*set_tx_timestamp) __P((DB_ENV *, time_t *));
+ int (*set_verbose) __P((DB_ENV *, u_int32_t, int));
+ int (*txn_applied) __P((DB_ENV *,
+ DB_TXN_TOKEN *, db_timeout_t, u_int32_t));
+ int (*stat_print) __P((DB_ENV *, u_int32_t));
+ int (*txn_begin) __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t));
+ int (*txn_checkpoint) __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ int (*txn_recover) __P((DB_ENV *,
+ DB_PREPLIST *, long, long *, u_int32_t));
+ int (*txn_stat) __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
+ int (*txn_stat_print) __P((DB_ENV *, u_int32_t));
+ /* DB_ENV PUBLIC HANDLE LIST END */
+
+ /* DB_ENV PRIVATE HANDLE LIST BEGIN */
+ int (*prdbt) __P((DBT *, int,
+ const char *, void *, int (*)(void *, const void *), int, int));
+ /* DB_ENV PRIVATE HANDLE LIST END */
+};
+
+/*
+ * Dispatch structure for recovery, log verification and print routines. Since
+ * internal and external routines take different arguments (ENV versus DB_ENV),
+ * we need something more elaborate than a single pointer and size.
+ */
+struct __db_distab {
+ int (**int_dispatch) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ size_t int_size;
+ int (**ext_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ size_t ext_size;
+};
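+
+/*
+ * A lookup sketch (illustrative only, not code from the library): both
+ * tables are indexed by log record type, so a dispatcher holding a
+ * struct __db_distab might do something like
+ *
+ *    if (rectype < dtab.int_size && dtab.int_dispatch[rectype] != NULL)
+ *        ret = dtab.int_dispatch[rectype](env, dbtp, lsnp, op, info);
+ */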
+
+/*
+ * Log verification configuration structure.
+ */
+struct __db_logvrfy_config {
+ int continue_after_fail, verbose;
+ u_int32_t cachesize;
+ const char *temp_envhome;
+ const char *dbfile, *dbname;
+ DB_LSN start_lsn, end_lsn;
+ time_t start_time, end_time;
+};
+
+struct __db_channel {
+ CHANNEL *channel; /* Pointer to internal state details. */
+ int eid; /* Env. ID passed in constructor. */
+ db_timeout_t timeout;
+
+ /* DB_CHANNEL PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_CHANNEL *, u_int32_t));
+ int (*send_msg) __P((DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+ int (*send_request) __P((DB_CHANNEL *,
+ DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+ int (*set_timeout) __P((DB_CHANNEL *, db_timeout_t));
+ /* DB_CHANNEL PUBLIC HANDLE LIST END */
+};
+
+struct __db_site {
+ ENV *env;
+ int eid;
+ const char *host;
+ u_int port;
+ u_int32_t flags;
+
+ /* DB_SITE PUBLIC HANDLE LIST BEGIN */
+ int (*get_address) __P((DB_SITE *, const char **, u_int *));
+ int (*get_config) __P((DB_SITE *, u_int32_t, u_int32_t *));
+ int (*get_eid) __P((DB_SITE *, int *));
+ int (*set_config) __P((DB_SITE *, u_int32_t, u_int32_t));
+ int (*remove) __P((DB_SITE *));
+ int (*close) __P((DB_SITE *));
+ /* DB_SITE PUBLIC HANDLE LIST END */
+};
+
+#if DB_DBM_HSEARCH != 0
+/*******************************************************
+ * Dbm/Ndbm historic interfaces.
+ *******************************************************/
+typedef struct __db DBM;
+
+#define DBM_INSERT 0 /* Flags to dbm_store(). */
+#define DBM_REPLACE 1
+
+/*
+ * The DB support for ndbm(3) always appends this suffix to the
+ * file name to avoid overwriting the user's original database.
+ */
+#define DBM_SUFFIX ".db"
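+
+/*
+ * For example, a dbm_open of "data" operates on the file "data.db".
+ */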
+
+#if defined(_XPG4_2)
+typedef struct {
+ char *dptr;
+ size_t dsize;
+} datum;
+#else
+typedef struct {
+ char *dptr;
+ int dsize;
+} datum;
+#endif
+
+/*
+ * Translate NDBM calls into DB calls so that DB doesn't step on the
+ * application's name space.
+ */
+#define dbm_clearerr(a) __db_ndbm_clearerr@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_close(a) __db_ndbm_close@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_delete(a, b) __db_ndbm_delete@DB_VERSION_UNIQUE_NAME@(a, b)
+#define dbm_dirfno(a) __db_ndbm_dirfno@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_error(a) __db_ndbm_error@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_fetch(a, b) __db_ndbm_fetch@DB_VERSION_UNIQUE_NAME@(a, b)
+#define dbm_firstkey(a) __db_ndbm_firstkey@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_nextkey(a) __db_ndbm_nextkey@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_open(a, b, c) __db_ndbm_open@DB_VERSION_UNIQUE_NAME@(a, b, c)
+#define dbm_pagfno(a) __db_ndbm_pagfno@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_rdonly(a) __db_ndbm_rdonly@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_store(a, b, c, d) \
+ __db_ndbm_store@DB_VERSION_UNIQUE_NAME@(a, b, c, d)
+
+/*
+ * Translate DBM calls into DB calls so that DB doesn't step on the
+ * application's name space.
+ *
+ * The global variables dbrdonly, dirf and pagf were not retained when 4BSD
+ * replaced the dbm interface with ndbm, and are not supported here.
+ */
+#define dbminit(a) __db_dbm_init@DB_VERSION_UNIQUE_NAME@(a)
+#define dbmclose __db_dbm_close@DB_VERSION_UNIQUE_NAME@
+#if !defined(__cplusplus)
+#define delete(a) __db_dbm_delete@DB_VERSION_UNIQUE_NAME@(a)
+#endif
+#define fetch(a) __db_dbm_fetch@DB_VERSION_UNIQUE_NAME@(a)
+#define firstkey __db_dbm_firstkey@DB_VERSION_UNIQUE_NAME@
+#define nextkey(a) __db_dbm_nextkey@DB_VERSION_UNIQUE_NAME@(a)
+#define store(a, b) __db_dbm_store@DB_VERSION_UNIQUE_NAME@(a, b)
+
+/*******************************************************
+ * Hsearch historic interface.
+ *******************************************************/
+typedef enum {
+ FIND, ENTER
+} ACTION;
+
+typedef struct entry {
+ char *key;
+ char *data;
+} ENTRY;
+
+#define hcreate(a) __db_hcreate@DB_VERSION_UNIQUE_NAME@(a)
+#define hdestroy __db_hdestroy@DB_VERSION_UNIQUE_NAME@
+#define hsearch(a, b) __db_hsearch@DB_VERSION_UNIQUE_NAME@(a, b)
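+
+/*
+ * Usage sketch (illustrative): these calls follow the classic hsearch(3)
+ * interface, routed to DB's implementations by the macros above.
+ *
+ *    ENTRY item, *found;
+ *
+ *    (void)hcreate(128);
+ *    item.key = "fruit";
+ *    item.data = "apple";
+ *    (void)hsearch(item, ENTER);
+ *    found = hsearch(item, FIND);
+ *    hdestroy();
+ */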
+
+#endif /* DB_DBM_HSEARCH */
+
+#if defined(__cplusplus)
+}
+#endif
+
+@platform_footer@
+#endif /* !_DB_H_ */
diff --git a/src/dbinc/db_185.in b/src/dbinc/db_185.in
new file mode 100644
index 00000000..43735344
--- /dev/null
+++ b/src/dbinc/db_185.in
@@ -0,0 +1,176 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_185_H_
+#define _DB_185_H_
+
+#include <sys/types.h>
+
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * XXX
+ * Handle function prototypes and the keyword "const". This steps on name
+ * space that DB doesn't control, but all of the other solutions are worse.
+ */
+#undef __P
+#if defined(__STDC__) || defined(__cplusplus)
+#define __P(protos) protos /* ANSI C prototypes */
+#else
+#define const
+#define __P(protos) () /* K&R C preprocessor */
+#endif
+
+#define RET_ERROR -1 /* Return values. */
+#define RET_SUCCESS 0
+#define RET_SPECIAL 1
+
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+@u_int8_decl@
+@int16_decl@
+@u_int16_decl@
+@int32_decl@
+@u_int32_decl@
+#endif
+
+/*
+ * XXX
+ * SGI/IRIX already has a pgno_t.
+ */
+#ifdef __sgi
+#define pgno_t db_pgno_t
+#endif
+
+#define MAX_PAGE_NUMBER 0xffffffff /* >= # of pages in a file */
+typedef u_int32_t pgno_t;
+#define MAX_PAGE_OFFSET 65535 /* >= # of bytes in a page */
+typedef u_int16_t indx_t;
+#define MAX_REC_NUMBER 0xffffffff /* >= # of records in a tree */
+typedef u_int32_t recno_t;
+
+/* Key/data structure -- a Data-Base Thang. */
+typedef struct {
+ void *data; /* data */
+ size_t size; /* data length */
+} DBT;
+
+/* Routine flags. */
+#define R_CURSOR 1 /* del, put, seq */
+#define __R_UNUSED 2 /* UNUSED */
+#define R_FIRST 3 /* seq */
+#define R_IAFTER 4 /* put (RECNO) */
+#define R_IBEFORE 5 /* put (RECNO) */
+#define R_LAST 6 /* seq (BTREE, RECNO) */
+#define R_NEXT 7 /* seq */
+#define R_NOOVERWRITE 8 /* put */
+#define R_PREV 9 /* seq (BTREE, RECNO) */
+#define R_SETCURSOR 10 /* put (RECNO) */
+#define R_RECNOSYNC 11 /* sync (RECNO) */
+
+typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE;
+
+/* Access method description structure. */
+typedef struct __db {
+ DBTYPE type; /* Underlying db type. */
+ int (*close) __P((struct __db *));
+ int (*del) __P((const struct __db *, const DBT *, u_int));
+ int (*get) __P((const struct __db *, const DBT *, DBT *, u_int));
+ int (*put) __P((const struct __db *, DBT *, const DBT *, u_int));
+ int (*seq) __P((const struct __db *, DBT *, DBT *, u_int));
+ int (*sync) __P((const struct __db *, u_int));
+ void *internal; /* Access method private. */
+ int (*fd) __P((const struct __db *));
+} DB;
+
+#define BTREEMAGIC 0x053162
+#define BTREEVERSION 3
+
+/* Structure used to pass parameters to the btree routines. */
+typedef struct {
+#define R_DUP 0x01 /* duplicate keys */
+ u_int32_t flags;
+ u_int32_t cachesize; /* bytes to cache */
+ u_int32_t maxkeypage; /* maximum keys per page */
+ u_int32_t minkeypage; /* minimum keys per page */
+ u_int32_t psize; /* page size */
+ int (*compare) /* comparison function */
+ __P((const DBT *, const DBT *));
+ size_t (*prefix) /* prefix function */
+ __P((const DBT *, const DBT *));
+ int lorder; /* byte order */
+} BTREEINFO;
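+
+/*
+ * Usage sketch (illustrative; assumes <fcntl.h> for the open flags and
+ * <string.h> for memset):
+ *
+ *    BTREEINFO bti;
+ *    DB *dbp;
+ *
+ *    memset(&bti, 0, sizeof(bti));
+ *    bti.cachesize = 64 * 1024;
+ *    if ((dbp = dbopen("a.db",
+ *        O_CREAT | O_RDWR, 0644, DB_BTREE, &bti)) != NULL)
+ *        (void)dbp->close(dbp);
+ */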
+
+#define HASHMAGIC 0x061561
+#define HASHVERSION 2
+
+/* Structure used to pass parameters to the hashing routines. */
+typedef struct {
+ u_int32_t bsize; /* bucket size */
+ u_int32_t ffactor; /* fill factor */
+ u_int32_t nelem; /* number of elements */
+ u_int32_t cachesize; /* bytes to cache */
+ u_int32_t /* hash function */
+ (*hash) __P((const void *, size_t));
+ int lorder; /* byte order */
+} HASHINFO;
+
+/* Structure used to pass parameters to the record routines. */
+typedef struct {
+#define R_FIXEDLEN 0x01 /* fixed-length records */
+#define R_NOKEY 0x02 /* key not required */
+#define R_SNAPSHOT 0x04 /* snapshot the input */
+ u_int32_t flags;
+ u_int32_t cachesize; /* bytes to cache */
+ u_int32_t psize; /* page size */
+ int lorder; /* byte order */
+ size_t reclen; /* record length (fixed-length records) */
+ u_char bval; /* delimiting byte (variable-length records) */
+ char *bfname; /* btree file name */
+} RECNOINFO;
+
+/* Re-define the user's dbopen calls. */
+#define dbopen __db185_open@DB_VERSION_UNIQUE_NAME@
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_185_H_ */
diff --git a/src/dbinc/db_am.h b/src/dbinc/db_am.h
new file mode 100644
index 00000000..f34578c4
--- /dev/null
+++ b/src/dbinc/db_am.h
@@ -0,0 +1,327 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+#ifndef _DB_AM_H_
+#define _DB_AM_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct __db_foreign_info;
+typedef struct __db_foreign_info DB_FOREIGN_INFO;
+
+/*
+ * Keep track of information for foreign keys. Used to maintain a linked list
+ * of 'primary' DBs which reference this 'foreign' DB.
+ */
+struct __db_foreign_info {
+ DB *dbp;
+ u_int32_t flags;
+ int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+
+ /*
+ * List entries for foreign key.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_ENTRY(__db) s_links;
+ */
+ struct {
+ struct __db_foreign_info *le_next;
+ struct __db_foreign_info **le_prev;
+ } f_links;
+};
+
+/*
+ * IS_ENV_AUTO_COMMIT --
+ * Auto-commit test for environment operations: DbEnv::{open,remove,rename}
+ */
+#define IS_ENV_AUTO_COMMIT(env, txn, flags) \
+ (LF_ISSET(DB_AUTO_COMMIT) || \
+ (((txn) == NULL || F_ISSET((txn), TXN_FAMILY)) && \
+ F_ISSET((env)->dbenv, DB_ENV_AUTO_COMMIT) && \
+ !LF_ISSET(DB_NO_AUTO_COMMIT)))
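+
+/*
+ * For example (illustrative): after
+ *
+ *    dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);
+ *
+ * an environment operation invoked with a NULL txn satisfies this test,
+ * unless the caller also passed DB_NO_AUTO_COMMIT.
+ */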
+
+/*
+ * IS_DB_AUTO_COMMIT --
+ * Auto-commit test for database operations.
+ */
+#define IS_DB_AUTO_COMMIT(dbp, txn) \
+ (((txn) == NULL || F_ISSET((txn), TXN_FAMILY)) && \
+ F_ISSET((dbp), DB_AM_TXN))
+
+/*
+ * STRIP_AUTO_COMMIT --
+ * Releases after 4.3 no longer require DB operations to specify the
+ * DB_AUTO_COMMIT flag, but the API continues to allow it to be specified.
+ */
+#define STRIP_AUTO_COMMIT(f) FLD_CLR((f), DB_AUTO_COMMIT)
+
+/* DB recovery operation codes. */
+#define DB_ADD_DUP 1
+#define DB_REM_DUP 2
+#define DB_ADD_BIG 3
+#define DB_REM_BIG 4
+#define DB_ADD_PAGE_COMPAT 5 /* Compatibility for 4.2 db_relink */
+#define DB_REM_PAGE_COMPAT 6 /* Compatibility for 4.2 db_relink */
+#define DB_APPEND_BIG 7
+#define DB_ADD_HEAP 8
+#define DB_REM_HEAP 9
+
+#define OP_MODE_SHIFT 8
+#define OP_PAGE_MASK 0xff
+
+#define OP_SET(mode, page) (((mode) << OP_MODE_SHIFT) | (TYPE(page)))
+#define OP_MODE_GET(mode) ((mode) >> OP_MODE_SHIFT)
+#define OP_PAGE_GET(mode) ((mode) & OP_PAGE_MASK)
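+
+/*
+ * Worked example (assuming a page whose TYPE() is 5): OP_SET(DB_ADD_BIG,
+ * page) yields (3 << 8) | 5 == 0x0305; OP_MODE_GET(0x0305) recovers 3
+ * (DB_ADD_BIG) and OP_PAGE_GET(0x0305) recovers 5.
+ */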
+
+/*
+ * Standard initialization and shutdown macros for all recovery functions.
+ */
+#define REC_INTRO(func, ip, do_cursor) do { \
+ argp = NULL; \
+ dbc = NULL; \
+ file_dbp = NULL; \
+ COMPQUIET(mpf, NULL); /* Not all recovery routines use mpf. */\
+ if ((ret = func(env, &file_dbp, \
+ (info != NULL) ? ((DB_TXNHEAD *)info)->td : NULL, \
+ dbtp->data, &argp)) != 0) { \
+ if (ret == DB_DELETED) { \
+ ret = 0; \
+ goto done; \
+ } \
+ goto out; \
+ } \
+ if (do_cursor) { \
+ if ((ret = __db_cursor(file_dbp, \
+ ip, NULL, &dbc, DB_RECOVER)) != 0) \
+ goto out; \
+ } \
+ mpf = file_dbp->mpf; \
+} while (0)
+
+#define REC_CLOSE { \
+ int __t_ret; \
+ if (argp != NULL) \
+ __os_free(env, argp); \
+ if (dbc != NULL && \
+ (__t_ret = __dbc_close(dbc)) != 0 && ret == 0) \
+ ret = __t_ret; \
+ } \
+ return (ret)
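+
+/*
+ * A skeleton (illustrative; the __example_* names are hypothetical) showing
+ * how these macros assemble a recovery function:
+ *
+ *    int
+ *    __example_recover(env, dbtp, lsnp, op, info)
+ *        ENV *env;
+ *        DBT *dbtp;
+ *        DB_LSN *lsnp;
+ *        db_recops op;
+ *        void *info;
+ *    {
+ *        __example_args *argp;
+ *        DB *file_dbp;
+ *        DBC *dbc;
+ *        DB_MPOOLFILE *mpf;
+ *        int ret;
+ *
+ *        REC_PRINT(__example_print);
+ *        REC_INTRO(__example_read, NULL, 0);
+ *        ... compare *lsnp with the page LSN, redo or undo ...
+ *        *lsnp = argp->prev_lsn;
+ *        ret = 0;
+ *    done:
+ *    out:  REC_CLOSE;
+ *    }
+ */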
+
+/*
+ * No-op versions of the same macros.
+ */
+#define REC_NOOP_INTRO(func) do { \
+ argp = NULL; \
+ if ((ret = func(env, dbtp->data, &argp)) != 0) \
+ return (ret); \
+} while (0)
+#define REC_NOOP_CLOSE \
+ if (argp != NULL) \
+ __os_free(env, argp); \
+ return (ret)
+
+/*
+ * Macro for reading pages during recovery. In most cases we
+ * want to avoid an error if the page is not found during rollback.
+ */
+#define REC_FGET(mpf, ip, pgno, pagep, cont) \
+ if ((ret = __memp_fget(mpf, \
+ &(pgno), ip, NULL, 0, pagep)) != 0) { \
+ if (ret != DB_PAGE_NOTFOUND) { \
+ ret = __db_pgerr(file_dbp, pgno, ret); \
+ goto out; \
+ } else \
+ goto cont; \
+ }
+#define REC_DIRTY(mpf, ip, priority, pagep) \
+ if ((ret = __memp_dirty(mpf, \
+ pagep, ip, NULL, priority, DB_MPOOL_EDIT)) != 0) { \
+ ret = __db_pgerr(file_dbp, PGNO(*(pagep)), ret); \
+ goto out; \
+ }
+
+/*
+ * Standard debugging macro for all recovery functions.
+ */
+#ifdef DEBUG_RECOVER
+#define REC_PRINT(func) \
+ (void)func(env, dbtp, lsnp, op, info);
+#else
+#define REC_PRINT(func)
+#endif
+
+/*
+ * Actions to __db_lget
+ */
+#define LCK_ALWAYS 1 /* Lock even for off page dup cursors */
+#define LCK_COUPLE 2 /* Lock Couple */
+#define LCK_COUPLE_ALWAYS 3 /* Lock Couple even in txn. */
+#define LCK_DOWNGRADE 4 /* Downgrade the lock. (internal) */
+#define LCK_ROLLBACK 5 /* Lock even if in rollback */
+
+/*
+ * If doing transactions we have to hold the locks associated with a data item
+ * from a page for the entire transaction. However, we don't have to hold the
+ * locks associated with walking the tree. Distinguish between the two so that
+ * we don't tie up the internal pages of the tree longer than necessary.
+ */
+#define __LPUT(dbc, lock) \
+ __ENV_LPUT((dbc)->env, lock)
+
+#define __ENV_LPUT(env, lock) \
+ (LOCK_ISSET(lock) ? __lock_put(env, &(lock)) : 0)
+
+/*
+ * __TLPUT -- transactional lock put
+ * If the lock is valid then
+ * If we are not in a transaction put the lock.
+ * Else if the cursor is doing dirty reads and this was a read then
+ * put the lock.
+ * Else if the db is supporting dirty reads and this is a write then
+ * downgrade it.
+ * Else do nothing.
+ */
+#define __TLPUT(dbc, lock) \
+ (LOCK_ISSET(lock) ? __db_lput(dbc, &(lock)) : 0)
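+
+/*
+ * For example (illustrative): a cursor descending the tree drops each
+ * interior-page lock with __LPUT once the child page is locked, while the
+ * lock protecting the data item it modifies is released through __TLPUT,
+ * which keeps it until the enclosing transaction resolves.
+ */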
+
+/*
+ * Check whether a database is a primary (that is, has associated secondaries).
+ */
+#define DB_IS_PRIMARY(dbp) (LIST_FIRST(&(dbp)->s_secondaries) != NULL)
+/*
+ * A database must be treated as read-only if it was explicitly opened
+ * that way, or if we are a client in a replicated environment and the
+ * user did not specify DB_TXN_NOT_DURABLE.
+ */
+#define DB_IS_READONLY(dbp) \
+ (F_ISSET(dbp, DB_AM_RDONLY) || \
+ (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)))
+
+#ifdef HAVE_COMPRESSION
+/*
+ * Check whether a database is compressed (btree only)
+ */
+#define DB_IS_COMPRESSED(dbp) \
+ (((BTREE *)(dbp)->bt_internal)->bt_compress != NULL)
+#endif
+
+/*
+ * We copy the key out if there's any chance the key in the database is not
+ * the same as the user-specified key. If there is a custom comparator we
+ * return a key, as the user-specified key might be a partial key, containing
+ * only the unique identifier. [#13572] [#15770]
+ *
+ * The test for (flags != 0) is necessary for Db.{get,pget}, but it's not
+ * legal to pass a non-zero flags value to Dbc.{get,pget}.
+ *
+ * We need to split out the hash component, since it is possible to build
+ * without hash support enabled, in which case the dereference of the hash
+ * internals would be a null pointer access.
+ */
+#ifdef HAVE_HASH
+#define DB_RETURNS_A_KEY_HASH(dbp) \
+ ((HASH *)(dbp)->h_internal)->h_compare != NULL
+#else
+#define DB_RETURNS_A_KEY_HASH(dbp) 0
+#endif
+#define DB_RETURNS_A_KEY(dbp, flags) \
+ (((flags) != 0 && (flags) != DB_GET_BOTH && \
+ (flags) != DB_GET_BOTH_RANGE && (flags) != DB_SET) || \
+ ((BTREE *)(dbp)->bt_internal)->bt_compare != __bam_defcmp ||\
+ DB_RETURNS_A_KEY_HASH(dbp))
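+
+/*
+ * Example (illustrative): with a custom bt_compare that orders records by a
+ * leading 4-byte identifier only, an application may search with just that
+ * identifier; the stored key (identifier plus payload) differs from the
+ * search key, so the stored key must be copied back out.
+ */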
+
+/*
+ * For portability, primary keys that are record numbers are stored in
+ * secondaries in the same byte order as the secondary database. As a
+ * consequence, we need to swap the byte order of these keys before attempting
+ * to use them for lookups in the primary. We also need to swap user-supplied
+ * primary keys that are used in secondary lookups (for example, with the
+ * DB_GET_BOTH flag on a secondary get).
+ */
+#include "dbinc/db_swap.h"
+
+#define SWAP_IF_NEEDED(sdbp, pkey) \
+ do { \
+ if (((sdbp)->s_primary->type == DB_QUEUE || \
+ (sdbp)->s_primary->type == DB_RECNO) && \
+ F_ISSET((sdbp), DB_AM_SWAP)) \
+ P_32_SWAP((pkey)->data); \
+ } while (0)
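+
+/*
+ * Worked example (illustrative): record number 1 stored as a primary key in
+ * a little-endian secondary occupies the bytes 01 00 00 00.  If that
+ * secondary is later opened on a big-endian host, DB_AM_SWAP is set and
+ * P_32_SWAP reverses the bytes before the lookup in the primary.
+ */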
+
+/*
+ * Cursor adjustment:
+ * Return the first DB handle in the sorted ENV list of DB
+ * handles that has a matching file ID.
+ */
+#define FIND_FIRST_DB_MATCH(env, dbp, tdbp) do { \
+ for ((tdbp) = (dbp); \
+ TAILQ_PREV((tdbp), __dblist, dblistlinks) != NULL && \
+ TAILQ_PREV((tdbp), \
+ __dblist, dblistlinks)->adj_fileid == (dbp)->adj_fileid;\
+ (tdbp) = TAILQ_PREV((tdbp), __dblist, dblistlinks)) \
+ ; \
+} while (0)
+
+/*
+ * Macros used to implement a binary search algorithm. Shared between the
+ * btree and hash implementations.
+ */
+#define DB_BINARY_SEARCH_FOR(base, limit, nument, adjust) \
+ for (base = 0, limit = (nument) / (db_indx_t)(adjust); \
+ (limit) != 0; (limit) >>= 1)
+
+#define DB_BINARY_SEARCH_INCR(index, base, limit, adjust) \
+ index = (base) + (((limit) >> 1) * (adjust))
+
+#define DB_BINARY_SEARCH_SHIFT_BASE(index, base, limit, adjust) do { \
+ base = (index) + (adjust); \
+ --(limit); \
+} while (0)
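+
+/*
+ * A composition sketch (illustrative), loosely following the btree search
+ * loop; compare() and item() stand in for the real comparator and page
+ * accessor:
+ *
+ *    DB_BINARY_SEARCH_FOR(base, limit, nument, adjust) {
+ *        DB_BINARY_SEARCH_INCR(indx, base, limit, adjust);
+ *        if ((cmp = compare(key, item(indx))) == 0)
+ *            break;
+ *        if (cmp > 0)
+ *            DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, adjust);
+ *    }
+ */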
+
+/*
+ * Sequence macros, shared between sequence.c and seq_stat.c
+ */
+#define SEQ_IS_OPEN(seq) ((seq)->seq_key.data != NULL)
+
+#define SEQ_ILLEGAL_AFTER_OPEN(seq, name) \
+ if (SEQ_IS_OPEN(seq)) \
+ return (__db_mi_open((seq)->seq_dbp->env, name, 1));
+
+#define SEQ_ILLEGAL_BEFORE_OPEN(seq, name) \
+ if (!SEQ_IS_OPEN(seq)) \
+ return (__db_mi_open((seq)->seq_dbp->env, name, 0));
+
+/*
+ * Flags to __db_chk_meta.
+ */
+#define DB_CHK_META 0x01 /* Checksum the meta page. */
+#define DB_CHK_NOLSN 0x02 /* Don't check the LSN. */
+#define DB_CHK_ONLY 0x04 /* Only do the checksum. */
+#define DB_SKIP_CHK 0x08 /* Don't checksum or decrypt the meta page. */
+
+/*
+ * Flags to __db_truncate_page.
+ */
+#define DB_EXCH_FREE 0x01 /* Free the old page. */
+#define DB_EXCH_PARENT 0x02 /* There is a parent to update. */
+
+/* We usually want to do these operations. */
+#define DB_EXCH_DEFAULT (DB_EXCH_FREE | DB_EXCH_PARENT)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc/db_dispatch.h"
+#include "dbinc_auto/db_auto.h"
+#include "dbinc_auto/crdel_auto.h"
+#include "dbinc_auto/db_ext.h"
+#endif /* !_DB_AM_H_ */
diff --git a/src/dbinc/db_cxx.in b/src/dbinc/db_cxx.in
new file mode 100644
index 00000000..84fc0f88
--- /dev/null
+++ b/src/dbinc/db_cxx.in
@@ -0,0 +1,1523 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CXX_H_
+#define _DB_CXX_H_
+//
+// C++ assumptions:
+//
+// To ensure portability to many platforms, both new and old, we make
+// few assumptions about the C++ compiler and library. For example,
+// we do not expect STL, templates or namespaces to be available. The
+// "newest" C++ feature used is exceptions, which are used liberally
+// to transmit error information. Even the use of exceptions can be
+// disabled at runtime; to do so, pass the DB_CXX_NO_EXCEPTIONS flag
+// to the DbEnv or Db constructor.
+//
+// C++ naming conventions:
+//
+// - All top level class names start with Db.
+// - All class members start with lower case letter.
+// - All private data members are suffixed with underscore.
+// - Use underscores to divide names into multiple words.
+// - Simple data accessors are named with get_ or set_ prefix.
+// - All method names are taken from names of functions in the C
+// layer of db (usually by dropping a prefix like "db_").
+// These methods have the same argument types and order,
+// other than dropping the explicit arg that acts as "this".
+//
+// As a rule, each DbFoo object has exactly one underlying DB_FOO struct
+// (defined in db.h) associated with it. In some cases, we inherit directly
+// from the DB_FOO structure to make this relationship explicit. Often,
+// the underlying C layer allocates and deallocates these structures, so
+// there is no easy way to add any data to the DbFoo class. When you see
+// a comment about whether data is permitted to be added, this is what
+// is going on. Of course, if we need to add data to such C++ classes
+// in the future, we will arrange to have an indirect pointer to the
+// DB_FOO struct (as some of the classes already have).
+//
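+// For example (an illustrative pairing, following the conventions above),
+// the C call
+//	dbenv->txn_begin(dbenv, parent, &txn, flags)
+// corresponds to the C++ method call
+//	dbenv.txn_begin(parent, &txn, flags);
+//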
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Forward declarations
+//
+
+#include <stdarg.h>
+
+@cxx_have_stdheaders@
+#ifdef HAVE_CXX_STDHEADERS
+#include <iostream>
+#include <exception>
+#define __DB_STD(x) std::x
+#else
+#include <iostream.h>
+#include <exception.h>
+#define __DB_STD(x) x
+#endif
+
+#include "db.h"
+
+class Db; // forward
+class Dbc; // forward
+class DbChannel; // forward
+class DbEnv; // forward
+class DbHeapRecordId; // forward
+class DbInfo; // forward
+class DbLock; // forward
+class DbLogc; // forward
+class DbLsn; // forward
+class DbMpoolFile; // forward
+class DbPreplist; // forward
+class DbSequence; // forward
+class DbSite; // forward
+class Dbt; // forward
+class DbTxn; // forward
+
+class DbMultipleIterator; // forward
+class DbMultipleKeyDataIterator; // forward
+class DbMultipleRecnoDataIterator; // forward
+class DbMultipleDataIterator; // forward
+
+class DbException; // forward
+class DbDeadlockException; // forward
+class DbLockNotGrantedException; // forward
+class DbMemoryException; // forward
+class DbRepHandleDeadException; // forward
+class DbRunRecoveryException; // forward
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Turn off inappropriate compiler warnings
+//
+
+#ifdef _MSC_VER
+
+// These are level 4 warnings that are explicitly disabled.
+// With Visual C++, warnings above level 3 are not shown by default
+// unless you use /W4. But we like to compile with the highest warning
+// level to catch other errors.
+//
+// 4201: nameless struct/union
+// triggered by standard include file <winnt.h>
+//
+// 4514: unreferenced inline function has been removed
+// certain include files in MSVC define methods that are not called
+//
+#pragma warning(push)
+#pragma warning(disable: 4201 4514)
+
+#endif
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Mechanisms for declaring classes
+//
+
+//
+// Every class defined in this file has an _exported next to the class name.
+// This is needed for WinTel machines so that the class methods can
+// be exported or imported in a DLL as appropriate. Users of the DLL
+// use the define DB_USE_DLL. When the DLL is built, DB_CREATE_DLL
+// must be defined.
+//
+#if defined(_MSC_VER)
+
+# if defined(DB_CREATE_DLL)
+# define _exported __declspec(dllexport) // creator of dll
+# elif defined(DB_USE_DLL)
+# define _exported __declspec(dllimport) // user of dll
+# else
+# define _exported // static lib creator or user
+# endif
+
+#else /* _MSC_VER */
+
+# define _exported
+
+#endif /* _MSC_VER */
+
+// Some interfaces can be customized by allowing users to define
+// callback functions. For performance and logistical reasons, some
+// callback functions must be declared in extern "C" blocks. For others,
+// we allow you to declare the callbacks in C++ or C (or an extern "C"
+// block) as you wish. See the set methods for the callbacks for
+// the choices.
+//
+extern "C" {
+ typedef void * (*db_malloc_fcn_type)
+ (size_t);
+ typedef void * (*db_realloc_fcn_type)
+ (void *, size_t);
+ typedef void (*db_free_fcn_type)
+ (void *);
+ typedef int (*bt_compare_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef int (*dup_compare_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef int (*h_compare_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/
+ (DB *, const void *, u_int32_t);
+ typedef int (*pgin_fcn_type)
+ (DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie);
+ typedef int (*pgout_fcn_type)
+ (DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie);
+}
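+
+// For instance (a sketch, not part of the header proper), a comparison
+// callback meant for the deprecated Db::set_bt_compare(bt_compare_fcn_type)
+// overload can be declared in an extern "C" block; dbp is unused here:
+//
+//	extern "C" int
+//	my_bt_compare(DB *dbp, const DBT *a, const DBT *b)
+//	{
+//		size_t len = a->size < b->size ? a->size : b->size;
+//		int cmp = memcmp(a->data, b->data, len);
+//		return (cmp != 0 ? cmp : (int)a->size - (int)b->size);
+//	}
+//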
+
+//
+// Represents a database table = a set of keys with associated values.
+//
+class _exported Db
+{
+ friend class DbEnv;
+
+public:
+ Db(DbEnv*, u_int32_t); // Create a Db object.
+ virtual ~Db(); // Calls close() if the user hasn't.
+
+ // These methods exactly match those in the C interface.
+ //
+ virtual int associate(DbTxn *txn, Db *secondary, int (*callback)
+ (Db *, const Dbt *, const Dbt *, Dbt *), u_int32_t flags);
+ virtual int associate_foreign(Db *foreign, int (*callback)
+ (Db *, const Dbt *, Dbt *, const Dbt *, int *), u_int32_t flags);
+ virtual int close(u_int32_t flags);
+ virtual int compact(DbTxn *txnid, Dbt *start,
+ Dbt *stop, DB_COMPACT *c_data, u_int32_t flags, Dbt *end);
+ virtual int cursor(DbTxn *txnid, Dbc **cursorp, u_int32_t flags);
+ virtual int del(DbTxn *txnid, Dbt *key, u_int32_t flags);
+ virtual void err(int, const char *, ...);
+ virtual void errx(const char *, ...);
+ virtual int exists(DbTxn *txnid, Dbt *key, u_int32_t flags);
+ virtual int fd(int *fdp);
+ virtual int get(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags);
+ virtual int get_alloc(
+ db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *);
+ virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t));
+ virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_bt_compress(
+ int (**)(
+ Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
+ int (**)(Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *));
+ virtual int get_bt_minkey(u_int32_t *);
+ virtual int get_bt_prefix(size_t (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_byteswapped(int *);
+ virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
+ virtual int get_create_dir(const char **);
+ virtual int get_dbname(const char **, const char **);
+ virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_encrypt_flags(u_int32_t *);
+ virtual void get_errcall(
+ void (**)(const DbEnv *, const char *, const char *));
+ virtual void get_errfile(FILE **);
+ virtual void get_errpfx(const char **);
+ virtual int get_feedback(void (**)(Db *, int, int));
+ virtual int get_flags(u_int32_t *);
+ virtual int get_heapsize(u_int32_t *, u_int32_t *);
+ virtual int get_heap_regionsize(u_int32_t *);
+ virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_h_ffactor(u_int32_t *);
+ virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t));
+ virtual int get_h_nelem(u_int32_t *);
+ virtual int get_lk_exclusive(bool *, bool *);
+ virtual int get_lorder(int *);
+ virtual void get_msgcall(void (**)(const DbEnv *, const char *));
+ virtual void get_msgfile(FILE **);
+ virtual int get_multiple();
+ virtual int get_open_flags(u_int32_t *);
+ virtual int get_pagesize(u_int32_t *);
+ virtual int get_partition_callback(
+ u_int32_t *, u_int32_t (**)(Db *, Dbt *key));
+ virtual int get_partition_dirs(const char ***);
+ virtual int get_partition_keys(u_int32_t *, Dbt **);
+ virtual int get_priority(DB_CACHE_PRIORITY *);
+ virtual int get_q_extentsize(u_int32_t *);
+ virtual int get_re_delim(int *);
+ virtual int get_re_len(u_int32_t *);
+ virtual int get_re_pad(int *);
+ virtual int get_re_source(const char **);
+ virtual int get_transactional();
+ virtual int get_type(DBTYPE *);
+ virtual int join(Dbc **curslist, Dbc **dbcp, u_int32_t flags);
+ virtual int key_range(DbTxn *, Dbt *, DB_KEY_RANGE *, u_int32_t);
+ virtual int open(DbTxn *txnid,
+ const char *, const char *subname, DBTYPE, u_int32_t, int);
+ virtual int pget(DbTxn *txnid,
+ Dbt *key, Dbt *pkey, Dbt *data, u_int32_t flags);
+ virtual int put(DbTxn *, Dbt *, Dbt *, u_int32_t);
+ virtual int remove(const char *, const char *, u_int32_t);
+ virtual int rename(const char *, const char *, const char *, u_int32_t);
+ virtual int set_alloc(
+ db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type);
+ virtual void set_app_private(void *);
+ virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t));
+ virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/
+ virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_bt_compress(
+ int (*)
+ (Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
+ int (*)(Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *));
+ virtual int set_bt_minkey(u_int32_t);
+ virtual int set_bt_prefix(bt_prefix_fcn_type); /*deprecated*/
+ virtual int set_bt_prefix(size_t (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_cachesize(u_int32_t, u_int32_t, int);
+ virtual int set_create_dir(const char *);
+ virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/
+ virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_encrypt(const char *, u_int32_t);
+ virtual void set_errcall(
+ void (*)(const DbEnv *, const char *, const char *));
+ virtual void set_errfile(FILE *);
+ virtual void set_errpfx(const char *);
+ virtual int set_feedback(void (*)(Db *, int, int));
+ virtual int set_flags(u_int32_t);
+ virtual int set_heapsize(u_int32_t, u_int32_t);
+ virtual int set_heap_regionsize(u_int32_t);
+ virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/
+ virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_h_ffactor(u_int32_t);
+ virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/
+ virtual int set_h_hash(u_int32_t (*)(Db *, const void *, u_int32_t));
+ virtual int set_h_nelem(u_int32_t);
+ virtual int set_lk_exclusive(bool);
+ virtual int set_lorder(int);
+ virtual void set_msgcall(void (*)(const DbEnv *, const char *));
+ virtual void set_msgfile(FILE *);
+ virtual int set_pagesize(u_int32_t);
+ virtual int set_paniccall(void (*)(DbEnv *, int));
+ virtual int set_partition(
+ u_int32_t, Dbt *, u_int32_t (*)(Db *, Dbt *));
+ virtual int set_partition_dirs(const char **);
+ virtual int set_priority(DB_CACHE_PRIORITY);
+ virtual int set_q_extentsize(u_int32_t);
+ virtual int set_re_delim(int);
+ virtual int set_re_len(u_int32_t);
+ virtual int set_re_pad(int);
+ virtual int set_re_source(const char *);
+ virtual int sort_multiple(Dbt *, Dbt *, u_int32_t);
+ virtual int stat(DbTxn *, void *sp, u_int32_t flags);
+ virtual int stat_print(u_int32_t flags);
+ virtual int sync(u_int32_t flags);
+ virtual int truncate(DbTxn *, u_int32_t *, u_int32_t);
+ virtual int upgrade(const char *name, u_int32_t flags);
+ virtual int verify(
+ const char *, const char *, __DB_STD(ostream) *, u_int32_t);
+
+ // These additional methods are not in the C interface, and
+ // are only available for C++.
+ //
+ virtual void *get_app_private() const;
+ virtual __DB_STD(ostream) *get_error_stream();
+ virtual void set_error_stream(__DB_STD(ostream) *);
+ virtual __DB_STD(ostream) *get_message_stream();
+ virtual void set_message_stream(__DB_STD(ostream) *);
+
+ virtual DbEnv *get_env();
+ virtual DbMpoolFile *get_mpf();
+
+ virtual ENV *get_ENV()
+ {
+ return imp_->env;
+ }
+
+ virtual DB *get_DB()
+ {
+ return imp_;
+ }
+
+ virtual const DB *get_const_DB() const
+ {
+ return imp_;
+ }
+
+ static Db* get_Db(DB *db)
+ {
+ return (Db *)db->api_internal;
+ }
+
+ static const Db* get_const_Db(const DB *db)
+ {
+ return (const Db *)db->api_internal;
+ }
+
+ u_int32_t get_create_flags() const
+ {
+ return construct_flags_;
+ }
+
+private:
+ // no copying
+ Db(const Db &);
+ Db &operator = (const Db &);
+
+ void cleanup();
+ int initialize();
+ int error_policy();
+
+ // instance data
+ DB *imp_;
+ DbEnv *dbenv_;
+ DbMpoolFile *mpf_;
+ int construct_error_;
+ u_int32_t flags_;
+ u_int32_t construct_flags_;
+
+ static int alt_close(DB *, u_int32_t);
+
+public:
+ // These are public only because they need to be called
+ // via C callback functions. They should never be used by
+ // external users of this class.
+ //
+ int (*append_recno_callback_)(Db *, Dbt *, db_recno_t);
+ int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *);
+ int (*associate_foreign_callback_)
+ (Db *, const Dbt *, Dbt *, const Dbt *, int *);
+ int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*bt_compress_callback_)(
+ Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *);
+ int (*bt_decompress_callback_)(
+ Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *);
+ size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *);
+ u_int32_t (*db_partition_callback_)(Db *, Dbt *);
+ int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ void (*feedback_callback_)(Db *, int, int);
+ int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t);
+};
+
+//
+// Cursor
+//
+class _exported Dbc : protected DBC
+{
+ friend class Db;
+
+public:
+ int close();
+ int cmp(Dbc *other_csr, int *result, u_int32_t flags);
+ int count(db_recno_t *countp, u_int32_t flags);
+ int del(u_int32_t flags);
+ int dup(Dbc** cursorp, u_int32_t flags);
+ int get(Dbt* key, Dbt *data, u_int32_t flags);
+ int get_priority(DB_CACHE_PRIORITY *priorityp);
+ int pget(Dbt* key, Dbt* pkey, Dbt *data, u_int32_t flags);
+ int put(Dbt* key, Dbt *data, u_int32_t flags);
+ int set_priority(DB_CACHE_PRIORITY priority);
+
+private:
+ // No data is permitted in this class (see comment at top)
+
+ // Note: use Db::cursor() to get pointers to a Dbc,
+ // and call Dbc::close() rather than delete to release them.
+ //
+ Dbc();
+ ~Dbc();
+
+ // no copying
+ Dbc(const Dbc &);
+ Dbc &operator = (const Dbc &);
+};
+
+//
+// A channel in replication group
+//
+class _exported DbChannel
+{
+ friend class DbEnv;
+
+public:
+ int close();
+ int send_msg(Dbt *msg, u_int32_t nmsg, u_int32_t flags);
+ int send_request(Dbt *request, u_int32_t nrequest, Dbt *response,
+ db_timeout_t timeout, u_int32_t flags);
+ int set_timeout(db_timeout_t timeout);
+
+ virtual DB_CHANNEL *get_DB_CHANNEL()
+ {
+ return imp_;
+ }
+
+ virtual const DB_CHANNEL *get_const_DB_CHANNEL() const
+ {
+ return imp_;
+ }
+
+private:
+ DbChannel();
+ virtual ~DbChannel();
+
+ // no copying
+ DbChannel(const DbChannel &);
+ DbChannel &operator = (const DbChannel &);
+ DB_CHANNEL *imp_;
+ DbEnv *dbenv_;
+};
+
+//
+// Berkeley DB environment class. Provides functions for opening databases.
+// Users of this library can use this class as a starting point for
+// developing a DB application: derive an application class from this
+// one and add application control logic.
+//
+// Note that if you use the default constructor, you must explicitly
+// call appinit() before any other db activity (e.g. opening files)
+//
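+// A minimal usage sketch (illustrative; the home path is hypothetical and
+// error handling is omitted):
+//
+//	DbEnv env(0);
+//	env.set_cachesize(0, 1024 * 1024, 1);
+//	env.open("/home/db", DB_CREATE | DB_INIT_MPOOL, 0);
+//	...
+//	env.close(0);
+//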
+class _exported DbEnv
+{
+ friend class Db;
+ friend class DbLock;
+ friend class DbMpoolFile;
+
+public:
+ // After using this constructor, you can set any needed
+ // parameters for the environment using the set_* methods.
+ // Then call open() to finish initializing the environment
+ // and attaching it to underlying files.
+ //
+ DbEnv(u_int32_t flags);
+
+ virtual ~DbEnv();
+
+ // These methods match those in the C interface.
+ //
+ virtual int add_data_dir(const char *);
+ virtual int backup(const char *target, u_int32_t flags);
+ virtual int cdsgroup_begin(DbTxn **tid);
+ virtual int close(u_int32_t);
+ virtual int dbbackup(
+ const char *dbfile, const char *target, u_int32_t flags);
+ virtual int dbremove(DbTxn *txn, const char *name, const char *subdb,
+ u_int32_t flags);
+ virtual int dbrename(DbTxn *txn, const char *name, const char *subdb,
+ const char *newname, u_int32_t flags);
+ virtual void err(int, const char *, ...);
+ virtual void errx(const char *, ...);
+ virtual int failchk(u_int32_t);
+ virtual int fileid_reset(const char *, u_int32_t);
+ virtual int get_alloc(db_malloc_fcn_type *, db_realloc_fcn_type *,
+ db_free_fcn_type *);
+ virtual void *get_app_private() const;
+ virtual int get_home(const char **);
+ virtual int get_open_flags(u_int32_t *);
+ virtual int open(const char *, u_int32_t, int);
+ virtual int remove(const char *, u_int32_t);
+ virtual int stat_print(u_int32_t flags);
+
+ virtual int set_alloc(db_malloc_fcn_type, db_realloc_fcn_type,
+ db_free_fcn_type);
+ virtual void set_app_private(void *);
+ virtual int get_backup_callbacks(
+ int (**)(DbEnv *, const char *, const char *, void **),
+ int (**)(DbEnv *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (**)(DbEnv *, const char *, void *));
+ virtual int set_backup_callbacks(
+ int (*)(DbEnv *, const char *, const char *, void **),
+ int (*)(DbEnv *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (*)(DbEnv *, const char *, void *));
+ virtual int get_backup_config(DB_BACKUP_CONFIG, u_int32_t *);
+ virtual int set_backup_config(DB_BACKUP_CONFIG, u_int32_t);
+ virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
+ virtual int set_cachesize(u_int32_t, u_int32_t, int);
+ virtual int get_cache_max(u_int32_t *, u_int32_t *);
+ virtual int set_cache_max(u_int32_t, u_int32_t);
+ virtual int get_create_dir(const char **);
+ virtual int set_create_dir(const char *);
+ virtual int get_data_dirs(const char ***);
+ virtual int set_data_dir(const char *);
+ virtual int get_encrypt_flags(u_int32_t *);
+ virtual int get_intermediate_dir_mode(const char **);
+ virtual int set_intermediate_dir_mode(const char *);
+ virtual int get_isalive(
+ int (**)(DbEnv *, pid_t, db_threadid_t, u_int32_t));
+ virtual int set_isalive(
+ int (*)(DbEnv *, pid_t, db_threadid_t, u_int32_t));
+ virtual int set_encrypt(const char *, u_int32_t);
+ virtual void get_errcall(
+ void (**)(const DbEnv *, const char *, const char *));
+ virtual void set_errcall(
+ void (*)(const DbEnv *, const char *, const char *));
+ virtual void get_errfile(FILE **);
+ virtual void set_errfile(FILE *);
+ virtual void get_errpfx(const char **);
+ virtual void set_errpfx(const char *);
+ virtual int set_event_notify(void (*)(DbEnv *, u_int32_t, void *));
+ virtual int get_flags(u_int32_t *);
+ virtual int set_flags(u_int32_t, int);
+ virtual bool is_bigendian();
+ virtual int lsn_reset(const char *, u_int32_t);
+ virtual int get_feedback(void (**)(DbEnv *, int, int));
+ virtual int set_feedback(void (*)(DbEnv *, int, int));
+ virtual int get_lg_bsize(u_int32_t *);
+ virtual int set_lg_bsize(u_int32_t);
+ virtual int get_lg_dir(const char **);
+ virtual int set_lg_dir(const char *);
+ virtual int get_lg_filemode(int *);
+ virtual int set_lg_filemode(int);
+ virtual int get_lg_max(u_int32_t *);
+ virtual int set_lg_max(u_int32_t);
+ virtual int get_lg_regionmax(u_int32_t *);
+ virtual int set_lg_regionmax(u_int32_t);
+ virtual int get_lk_conflicts(const u_int8_t **, int *);
+ virtual int set_lk_conflicts(u_int8_t *, int);
+ virtual int get_lk_detect(u_int32_t *);
+ virtual int set_lk_detect(u_int32_t);
+ virtual int get_lk_max_lockers(u_int32_t *);
+ virtual int set_lk_max_lockers(u_int32_t);
+ virtual int get_lk_max_locks(u_int32_t *);
+ virtual int set_lk_max_locks(u_int32_t);
+ virtual int get_lk_max_objects(u_int32_t *);
+ virtual int set_lk_max_objects(u_int32_t);
+ virtual int get_lk_partitions(u_int32_t *);
+ virtual int set_lk_partitions(u_int32_t);
+ virtual int get_lk_priority(u_int32_t, u_int32_t *);
+ virtual int set_lk_priority(u_int32_t, u_int32_t);
+ virtual int get_lk_tablesize(u_int32_t *);
+ virtual int set_lk_tablesize(u_int32_t);
+ virtual int get_memory_init(DB_MEM_CONFIG, u_int32_t *);
+ virtual int set_memory_init(DB_MEM_CONFIG, u_int32_t);
+ virtual int get_memory_max(u_int32_t *, u_int32_t *);
+ virtual int set_memory_max(u_int32_t, u_int32_t);
+ virtual int get_metadata_dir(const char **);
+ virtual int set_metadata_dir(const char *);
+ virtual int get_mp_mmapsize(size_t *);
+ virtual int set_mp_mmapsize(size_t);
+ virtual int get_mp_max_openfd(int *);
+ virtual int set_mp_max_openfd(int);
+ virtual int get_mp_max_write(int *, db_timeout_t *);
+ virtual int set_mp_max_write(int, db_timeout_t);
+ virtual int get_mp_pagesize(u_int32_t *);
+ virtual int set_mp_pagesize(u_int32_t);
+ virtual int get_mp_tablesize(u_int32_t *);
+ virtual int set_mp_tablesize(u_int32_t);
+ virtual void get_msgcall(void (**)(const DbEnv *, const char *));
+ virtual void set_msgcall(void (*)(const DbEnv *, const char *));
+ virtual void get_msgfile(FILE **);
+ virtual void set_msgfile(FILE *);
+ virtual int set_paniccall(void (*)(DbEnv *, int));
+ virtual int get_shm_key(long *);
+ virtual int set_shm_key(long);
+ virtual int get_timeout(db_timeout_t *, u_int32_t);
+ virtual int set_timeout(db_timeout_t, u_int32_t);
+ virtual int get_tmp_dir(const char **);
+ virtual int set_tmp_dir(const char *);
+ virtual int get_tx_max(u_int32_t *);
+ virtual int set_tx_max(u_int32_t);
+ virtual int get_app_dispatch(
+ int (**)(DbEnv *, Dbt *, DbLsn *, db_recops));
+ virtual int set_app_dispatch(int (*)(DbEnv *,
+ Dbt *, DbLsn *, db_recops));
+ virtual int get_tx_timestamp(time_t *);
+ virtual int set_tx_timestamp(time_t *);
+ virtual int get_verbose(u_int32_t which, int *);
+ virtual int set_verbose(u_int32_t which, int);
+
+ // Version information. Static methods, can be called at any time.
+ //
+ static char *version(int *major, int *minor, int *patch);
+ static char *full_version(int *family, int *release,
+ int *major, int *minor, int *patch);
+
+ // Convert DB errors to strings
+ static char *strerror(int);
+
+ // If an error is detected and the error call function
+ // or stream is set, a message is dispatched or printed.
+ // If a prefix is set, each message is prefixed.
+ //
+ // You can use set_errcall() or set_errfile() above to control
+ // error functionality. Alternatively, you can call
+ // set_error_stream() to force all errors to a C++ stream.
+ // It is unwise to mix these approaches.
+ //
+ virtual __DB_STD(ostream) *get_error_stream();
+ virtual void set_error_stream(__DB_STD(ostream) *);
+ virtual __DB_STD(ostream) *get_message_stream();
+ virtual void set_message_stream(__DB_STD(ostream) *);
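+
+	// A minimal usage sketch (illustrative; the prefix string and
+	// stream choice are assumptions, not part of this header):
+	//
+	//	env.set_errpfx("myapp");
+	//	env.set_error_stream(&std::cerr);
+	//	// Subsequent errors print as "myapp: <message>" on stderr.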
+
+	// For internal use only.
+ static void runtime_error(DbEnv *dbenv, const char *caller, int err,
+ int error_policy);
+ static void runtime_error_dbt(DbEnv *dbenv, const char *caller, Dbt *dbt,
+ int error_policy);
+ static void runtime_error_lock_get(DbEnv *dbenv, const char *caller,
+ int err, db_lockop_t op, db_lockmode_t mode,
+ Dbt *obj, DbLock lock, int index,
+ int error_policy);
+
+ // Lock functions
+ //
+ virtual int lock_detect(u_int32_t flags, u_int32_t atype, int *aborted);
+ virtual int lock_get(u_int32_t locker, u_int32_t flags, Dbt *obj,
+ db_lockmode_t lock_mode, DbLock *lock);
+ virtual int lock_id(u_int32_t *idp);
+ virtual int lock_id_free(u_int32_t id);
+ virtual int lock_put(DbLock *lock);
+ virtual int lock_stat(DB_LOCK_STAT **statp, u_int32_t flags);
+ virtual int lock_stat_print(u_int32_t flags);
+ virtual int lock_vec(u_int32_t locker, u_int32_t flags,
+ DB_LOCKREQ list[], int nlist, DB_LOCKREQ **elistp);
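+
+	// A hedged sketch of direct lock-subsystem use (the object name
+	// and handle names are illustrative assumptions):
+	//
+	//	u_int32_t locker;
+	//	DbLock lock;
+	//	Dbt obj((void *)"object-name", 11);
+	//	env.lock_id(&locker);
+	//	env.lock_get(locker, 0, &obj, DB_LOCK_WRITE, &lock);
+	//	// ...critical section...
+	//	env.lock_put(&lock);
+	//	env.lock_id_free(locker);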
+
+ // Log functions
+ //
+ virtual int log_archive(char **list[], u_int32_t flags);
+ static int log_compare(const DbLsn *lsn0, const DbLsn *lsn1);
+ virtual int log_cursor(DbLogc **cursorp, u_int32_t flags);
+ virtual int log_file(DbLsn *lsn, char *namep, size_t len);
+ virtual int log_flush(const DbLsn *lsn);
+ virtual int log_get_config(u_int32_t, int *);
+ virtual int log_put(DbLsn *lsn, const Dbt *data, u_int32_t flags);
+ virtual int log_printf(DbTxn *, const char *, ...);
+ virtual int log_set_config(u_int32_t, int);
+ virtual int log_stat(DB_LOG_STAT **spp, u_int32_t flags);
+ virtual int log_stat_print(u_int32_t flags);
+ virtual int log_verify(DB_LOG_VERIFY_CONFIG *);
+
+ // Mpool functions
+ //
+ virtual int memp_fcreate(DbMpoolFile **dbmfp, u_int32_t flags);
+ virtual int memp_register(int ftype,
+ pgin_fcn_type pgin_fcn,
+ pgout_fcn_type pgout_fcn);
+ virtual int memp_stat(DB_MPOOL_STAT
+ **gsp, DB_MPOOL_FSTAT ***fsp, u_int32_t flags);
+ virtual int memp_stat_print(u_int32_t flags);
+ virtual int memp_sync(DbLsn *lsn);
+ virtual int memp_trickle(int pct, int *nwrotep);
+
+	// Mutex functions
+ //
+ virtual int mutex_alloc(u_int32_t, db_mutex_t *);
+ virtual int mutex_free(db_mutex_t);
+ virtual int mutex_get_align(u_int32_t *);
+ virtual int mutex_get_increment(u_int32_t *);
+ virtual int mutex_get_init(u_int32_t *);
+ virtual int mutex_get_max(u_int32_t *);
+ virtual int mutex_get_tas_spins(u_int32_t *);
+ virtual int mutex_lock(db_mutex_t);
+ virtual int mutex_set_align(u_int32_t);
+ virtual int mutex_set_increment(u_int32_t);
+ virtual int mutex_set_init(u_int32_t);
+ virtual int mutex_set_max(u_int32_t);
+ virtual int mutex_set_tas_spins(u_int32_t);
+ virtual int mutex_stat(DB_MUTEX_STAT **, u_int32_t);
+ virtual int mutex_stat_print(u_int32_t);
+ virtual int mutex_unlock(db_mutex_t);
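+
+	// An illustrative pairing of the mutex calls above (the flag
+	// choice is an assumption):
+	//
+	//	db_mutex_t m;
+	//	env.mutex_alloc(DB_MUTEX_PROCESS_ONLY, &m);
+	//	env.mutex_lock(m);
+	//	// ...critical section...
+	//	env.mutex_unlock(m);
+	//	env.mutex_free(m);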
+
+ // Transaction functions
+ //
+ virtual int txn_begin(DbTxn *pid, DbTxn **tid, u_int32_t flags);
+ virtual int txn_checkpoint(u_int32_t kbyte, u_int32_t min,
+ u_int32_t flags);
+ virtual int txn_recover(DbPreplist *preplist, long count,
+ long *retp, u_int32_t flags);
+ virtual int txn_stat(DB_TXN_STAT **statp, u_int32_t flags);
+ virtual int txn_stat_print(u_int32_t flags);
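+
+	// A hedged sketch of the usual transaction lifecycle (the Db
+	// handle and the key/data Dbts are assumptions):
+	//
+	//	DbTxn *txn;
+	//	env.txn_begin(NULL, &txn, 0);
+	//	if (db->put(txn, &key, &data, 0) == 0)
+	//		txn->commit(0);
+	//	else
+	//		txn->abort();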
+
+ // Replication functions
+ //
+ virtual int rep_elect(u_int32_t, u_int32_t, u_int32_t);
+ virtual int rep_flush();
+ virtual int rep_process_message(Dbt *, Dbt *, int, DbLsn *);
+ virtual int rep_start(Dbt *, u_int32_t);
+ virtual int rep_stat(DB_REP_STAT **statp, u_int32_t flags);
+ virtual int rep_stat_print(u_int32_t flags);
+ virtual int rep_get_clockskew(u_int32_t *, u_int32_t *);
+ virtual int rep_set_clockskew(u_int32_t, u_int32_t);
+ virtual int rep_get_limit(u_int32_t *, u_int32_t *);
+ virtual int rep_set_limit(u_int32_t, u_int32_t);
+ virtual int rep_set_transport(int, int (*)(DbEnv *,
+ const Dbt *, const Dbt *, const DbLsn *, int, u_int32_t));
+ virtual int rep_set_request(u_int32_t, u_int32_t);
+ virtual int rep_get_request(u_int32_t *, u_int32_t *);
+ virtual int get_thread_count(u_int32_t *);
+ virtual int set_thread_count(u_int32_t);
+ virtual int get_thread_id_fn(
+ void (**)(DbEnv *, pid_t *, db_threadid_t *));
+ virtual int set_thread_id(void (*)(DbEnv *, pid_t *, db_threadid_t *));
+ virtual int get_thread_id_string_fn(
+ char *(**)(DbEnv *, pid_t, db_threadid_t, char *));
+ virtual int set_thread_id_string(char *(*)(DbEnv *,
+ pid_t, db_threadid_t, char *));
+ virtual int rep_set_config(u_int32_t, int);
+ virtual int rep_get_config(u_int32_t, int *);
+ virtual int rep_sync(u_int32_t flags);
+
+ // Advanced replication functions
+ //
+ virtual int rep_get_nsites(u_int32_t *n);
+ virtual int rep_set_nsites(u_int32_t n);
+ virtual int rep_get_priority(u_int32_t *priorityp);
+ virtual int rep_set_priority(u_int32_t priority);
+ virtual int rep_get_timeout(int which, db_timeout_t *timeout);
+ virtual int rep_set_timeout(int which, db_timeout_t timeout);
+ virtual int repmgr_channel(int eid, DbChannel **channel,
+ u_int32_t flags);
+ virtual int repmgr_get_ack_policy(int *policy);
+ virtual int repmgr_set_ack_policy(int policy);
+ virtual int repmgr_local_site(DbSite **site);
+ virtual int repmgr_msg_dispatch(void (*) (DbEnv *,
+ DbChannel *, Dbt *, u_int32_t, u_int32_t), u_int32_t flags);
+ virtual int repmgr_site(const char *host, u_int port, DbSite **site,
+ u_int32_t flags);
+ virtual int repmgr_site_by_eid(int eid, DbSite **site);
+ virtual int repmgr_site_list(u_int *countp, DB_REPMGR_SITE **listp);
+ virtual int repmgr_start(int nthreads, u_int32_t flags);
+ virtual int repmgr_stat(DB_REPMGR_STAT **statp, u_int32_t flags);
+ virtual int repmgr_stat_print(u_int32_t flags);
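+
+	// A hedged sketch of bringing up a replication manager site
+	// (host, port, and thread count are illustrative assumptions):
+	//
+	//	DbSite *site;
+	//	env.repmgr_site("localhost", 5001, &site, 0);
+	//	site->set_config(DB_LOCAL_SITE, 1);
+	//	site->close();
+	//	env.repmgr_start(3, DB_REP_MASTER);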
+
+ // Conversion functions
+ //
+ virtual ENV *get_ENV()
+ {
+ return imp_->env;
+ }
+
+ virtual DB_ENV *get_DB_ENV()
+ {
+ return imp_;
+ }
+
+ virtual const DB_ENV *get_const_DB_ENV() const
+ {
+ return imp_;
+ }
+
+ static DbEnv* get_DbEnv(DB_ENV *dbenv)
+ {
+ return dbenv ? (DbEnv *)dbenv->api1_internal : 0;
+ }
+
+ static const DbEnv* get_const_DbEnv(const DB_ENV *dbenv)
+ {
+ return dbenv ? (const DbEnv *)dbenv->api1_internal : 0;
+ }
+
+ u_int32_t get_create_flags() const
+ {
+ return construct_flags_;
+ }
+
+ // For internal use only.
+ static DbEnv* wrap_DB_ENV(DB_ENV *dbenv);
+
+ // These are public only because they need to be called
+ // via C functions. They should never be called by users
+ // of this class.
+ //
+ static int _app_dispatch_intercept(DB_ENV *dbenv, DBT *dbt, DB_LSN *lsn,
+ db_recops op);
+ static int _backup_close_intercept(DB_ENV *dbenv,
+ const char *dbname, void *handle);
+ static int _backup_open_intercept(DB_ENV *dbenv,
+ const char *dbname, const char *target, void **handle);
+ static int _backup_write_intercept(DB_ENV *dbenv, u_int32_t off_gbytes,
+ u_int32_t off_bytes, u_int32_t size, u_int8_t *buf, void *handle);
+ static void _paniccall_intercept(DB_ENV *dbenv, int errval);
+ static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct);
+ static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *);
+ static int _isalive_intercept(DB_ENV *dbenv, pid_t pid,
+ db_threadid_t thrid, u_int32_t flags);
+ static int _rep_send_intercept(DB_ENV *dbenv, const DBT *cntrl,
+ const DBT *data, const DB_LSN *lsn, int id, u_int32_t flags);
+ static void _stream_error_function(const DB_ENV *dbenv,
+ const char *prefix, const char *message);
+ static void _stream_message_function(const DB_ENV *dbenv,
+ const char *message);
+ static void _thread_id_intercept(DB_ENV *dbenv, pid_t *pidp,
+ db_threadid_t *thridp);
+ static char *_thread_id_string_intercept(DB_ENV *dbenv, pid_t pid,
+ db_threadid_t thrid, char *buf);
+ static void _message_dispatch_intercept(DB_ENV *dbenv,
+ DB_CHANNEL *dbchannel, DBT *request, u_int32_t nrequest,
+ u_int32_t cb_flags);
+
+private:
+ void cleanup();
+ int initialize(DB_ENV *dbenv);
+ int error_policy();
+
+ // For internal use only.
+ DbEnv(DB_ENV *, u_int32_t flags);
+
+ // no copying
+ DbEnv(const DbEnv &);
+ void operator = (const DbEnv &);
+
+ // instance data
+ DB_ENV *imp_;
+ int construct_error_;
+ u_int32_t construct_flags_;
+ __DB_STD(ostream) *error_stream_;
+ __DB_STD(ostream) *message_stream_;
+
+ int (*app_dispatch_callback_)(DbEnv *, Dbt *, DbLsn *, db_recops);
+ int (*backup_close_callback_)(DbEnv *, const char *, void *);
+ int (*backup_open_callback_)(
+ DbEnv *, const char *, const char *, void **);
+ int (*backup_write_callback_)(
+ DbEnv *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *);
+ int (*isalive_callback_)(DbEnv *, pid_t, db_threadid_t, u_int32_t);
+ void (*error_callback_)(const DbEnv *, const char *, const char *);
+ void (*feedback_callback_)(DbEnv *, int, int);
+ void (*message_callback_)(const DbEnv *, const char *);
+ void (*paniccall_callback_)(DbEnv *, int);
+ void (*event_func_callback_)(DbEnv *, u_int32_t, void *);
+ int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *,
+ const DbLsn *, int, u_int32_t);
+ void (*thread_id_callback_)(DbEnv *, pid_t *, db_threadid_t *);
+ char *(*thread_id_string_callback_)(DbEnv *, pid_t, db_threadid_t,
+ char *);
+ void (*message_dispatch_callback_)(DbEnv *, DbChannel *, Dbt *,
+ u_int32_t, u_int32_t);
+};
+
+//
+// Heap record id
+//
+class _exported DbHeapRecordId : private DB_HEAP_RID
+{
+public:
+ db_pgno_t get_pgno() const { return pgno; }
+ void set_pgno(db_pgno_t value) { pgno = value; }
+
+ db_indx_t get_indx() const { return indx; }
+ void set_indx(db_indx_t value) { indx = value; }
+
+ DB_HEAP_RID *get_DB_HEAP_RID() { return (DB_HEAP_RID *)this; }
+ const DB_HEAP_RID *get_const_DB_HEAP_RID() const
+ { return (const DB_HEAP_RID *)this; }
+
+ static DbHeapRecordId* get_DbHeapRecordId(DB_HEAP_RID *rid)
+ { return (DbHeapRecordId *)rid; }
+ static const DbHeapRecordId* get_const_DbHeapRecordId(DB_HEAP_RID *rid)
+ { return (const DbHeapRecordId *)rid; }
+
+ DbHeapRecordId(db_pgno_t pgno, db_indx_t indx);
+ DbHeapRecordId();
+ ~DbHeapRecordId();
+ DbHeapRecordId(const DbHeapRecordId &);
+ DbHeapRecordId &operator = (const DbHeapRecordId &);
+};
+
+//
+// Lock
+//
+class _exported DbLock
+{
+ friend class DbEnv;
+
+public:
+ DbLock();
+ DbLock(const DbLock &);
+ DbLock &operator = (const DbLock &);
+
+protected:
+ // We can add data to this class if needed
+ // since its contained class is not allocated by db.
+ // (see comment at top)
+
+ DbLock(DB_LOCK);
+ DB_LOCK lock_;
+};
+
+//
+// Log cursor
+//
+class _exported DbLogc : protected DB_LOGC
+{
+ friend class DbEnv;
+
+public:
+ int close(u_int32_t _flags);
+ int get(DbLsn *lsn, Dbt *data, u_int32_t _flags);
+ int version(u_int32_t *versionp, u_int32_t _flags);
+
+private:
+ // No data is permitted in this class (see comment at top)
+
+	// Note: use DbEnv::log_cursor() to get pointers to a DbLogc,
+	// and call DbLogc::close() rather than delete to release them.
+ //
+ DbLogc();
+ ~DbLogc();
+
+ // no copying
+	DbLogc(const DbLogc &);
+	DbLogc &operator = (const DbLogc &);
+};
+
+//
+// Log sequence number
+//
+class _exported DbLsn : public DB_LSN
+{
+ friend class DbEnv; // friendship needed to cast to base class
+ friend class DbLogc; // friendship needed to cast to base class
+};
+
+//
+// Memory pool file
+//
+class _exported DbMpoolFile
+{
+ friend class DbEnv;
+ friend class Db;
+
+public:
+ int close(u_int32_t flags);
+ int get(db_pgno_t *pgnoaddr, DbTxn *txn, u_int32_t flags, void *pagep);
+ int get_clear_len(u_int32_t *len);
+ int get_fileid(u_int8_t *fileid);
+ int get_flags(u_int32_t *flagsp);
+ int get_ftype(int *ftype);
+ int get_last_pgno(db_pgno_t *pgnop);
+ int get_lsn_offset(int32_t *offsetp);
+ int get_maxsize(u_int32_t *gbytes, u_int32_t *bytes);
+ int get_pgcookie(DBT *dbt);
+ int get_priority(DB_CACHE_PRIORITY *priorityp);
+ int get_transactional(void);
+ int open(const char *file, u_int32_t flags, int mode, size_t pagesize);
+ int put(void *pgaddr, DB_CACHE_PRIORITY priority, u_int32_t flags);
+ int set_clear_len(u_int32_t len);
+ int set_fileid(u_int8_t *fileid);
+ int set_flags(u_int32_t flags, int onoff);
+ int set_ftype(int ftype);
+ int set_lsn_offset(int32_t offset);
+ int set_maxsize(u_int32_t gbytes, u_int32_t bytes);
+ int set_pgcookie(DBT *dbt);
+ int set_priority(DB_CACHE_PRIORITY priority);
+ int sync();
+
+ virtual DB_MPOOLFILE *get_DB_MPOOLFILE()
+ {
+ return imp_;
+ }
+
+ virtual const DB_MPOOLFILE *get_const_DB_MPOOLFILE() const
+ {
+ return imp_;
+ }
+
+private:
+ DB_MPOOLFILE *imp_;
+
+ // We can add data to this class if needed
+ // since it is implemented via a pointer.
+ // (see comment at top)
+
+ // Note: use DbEnv::memp_fcreate() to get pointers to a DbMpoolFile,
+ // and call DbMpoolFile::close() rather than delete to release them.
+ //
+ DbMpoolFile();
+
+ // Shut g++ up.
+protected:
+ virtual ~DbMpoolFile();
+
+private:
+ // no copying
+ DbMpoolFile(const DbMpoolFile &);
+ void operator = (const DbMpoolFile &);
+};
+
+//
+// This is filled in and returned by the DbEnv::txn_recover() method.
+//
+class _exported DbPreplist
+{
+public:
+ DbTxn *txn;
+ u_int8_t gid[DB_GID_SIZE];
+};
+
+//
+// A sequence record in a database
+//
+class _exported DbSequence
+{
+public:
+ DbSequence(Db *db, u_int32_t flags);
+ virtual ~DbSequence();
+
+ int open(DbTxn *txnid, Dbt *key, u_int32_t flags);
+ int initial_value(db_seq_t value);
+ int close(u_int32_t flags);
+ int remove(DbTxn *txnid, u_int32_t flags);
+ int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags);
+ int stat_print(u_int32_t flags);
+
+ int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags);
+ int get_cachesize(int32_t *sizep);
+ int set_cachesize(int32_t size);
+ int get_flags(u_int32_t *flagsp);
+ int set_flags(u_int32_t flags);
+ int get_range(db_seq_t *minp, db_seq_t *maxp);
+ int set_range(db_seq_t min, db_seq_t max);
+
+ Db *get_db();
+ Dbt *get_key();
+
+ virtual DB_SEQUENCE *get_DB_SEQUENCE()
+ {
+ return imp_;
+ }
+
+ virtual const DB_SEQUENCE *get_const_DB_SEQUENCE() const
+ {
+ return imp_;
+ }
+
+ static DbSequence* get_DbSequence(DB_SEQUENCE *seq)
+ {
+ return (DbSequence *)seq->api_internal;
+ }
+
+ static const DbSequence* get_const_DbSequence(const DB_SEQUENCE *seq)
+ {
+ return (const DbSequence *)seq->api_internal;
+ }
+
+ // For internal use only.
+ static DbSequence* wrap_DB_SEQUENCE(DB_SEQUENCE *seq);
+
+private:
+ DbSequence(DB_SEQUENCE *seq);
+ // no copying
+ DbSequence(const DbSequence &);
+ DbSequence &operator = (const DbSequence &);
+
+ DB_SEQUENCE *imp_;
+ DBT key_;
+};
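+
+// A hedged usage sketch (the Db handle `db` and key Dbt are
+// assumptions, not part of this header):
+//
+//	DbSequence seq(db, 0);
+//	seq.initial_value(1);
+//	seq.open(NULL, &key, DB_CREATE);
+//	db_seq_t id;
+//	seq.get(NULL, 1, &id, 0);	// atomically allocate the next id
+//	seq.close(0);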
+
+//
+// A site in a replication group
+//
+class _exported DbSite
+{
+ friend class DbEnv;
+
+public:
+ int close();
+ int get_address(const char **hostp, u_int *port);
+ int get_config(u_int32_t which, u_int32_t *value);
+ int get_eid(int *eidp);
+ int remove();
+ int set_config(u_int32_t which, u_int32_t value);
+
+ virtual DB_SITE *get_DB_SITE()
+ {
+ return imp_;
+ }
+
+ virtual const DB_SITE *get_const_DB_SITE() const
+ {
+ return imp_;
+ }
+
+private:
+ DbSite();
+ virtual ~DbSite();
+
+ // no copying
+ DbSite(const DbSite &);
+ DbSite &operator = (const DbSite &);
+ DB_SITE *imp_;
+};
+
+//
+// Transaction
+//
+class _exported DbTxn
+{
+ friend class DbEnv;
+
+public:
+ int abort();
+ int commit(u_int32_t flags);
+ int discard(u_int32_t flags);
+ u_int32_t id();
+ int get_name(const char **namep);
+ int get_priority(u_int32_t *priorityp);
+ int prepare(u_int8_t *gid);
+ int set_name(const char *name);
+ int set_priority(u_int32_t priority);
+ int set_timeout(db_timeout_t timeout, u_int32_t flags);
+
+ virtual DB_TXN *get_DB_TXN()
+ {
+ return imp_;
+ }
+
+ virtual const DB_TXN *get_const_DB_TXN() const
+ {
+ return imp_;
+ }
+
+ static DbTxn* get_DbTxn(DB_TXN *txn)
+ {
+ return (DbTxn *)txn->api_internal;
+ }
+
+ static const DbTxn* get_const_DbTxn(const DB_TXN *txn)
+ {
+ return (const DbTxn *)txn->api_internal;
+ }
+
+ // For internal use only.
+ static DbTxn* wrap_DB_TXN(DB_TXN *txn);
+ void remove_child_txn(DbTxn *kid);
+ void add_child_txn(DbTxn *kid);
+
+ void set_parent(DbTxn *ptxn)
+ {
+ parent_txn_ = ptxn;
+ }
+
+private:
+ DB_TXN *imp_;
+
+	// We use a TAILQ to store this object's child DbTxn objects, and
+	// each child's "parent_txn_" points back to this DbTxn object.
+ //
+	// If imp_ has a parent transaction that is not wrapped by the
+	// DbTxn class, parent_txn_ will be NULL, since we don't need to
+	// maintain the parent-child relationship. The relationship only
+	// helps to delete unresolved children when the parent is resolved.
+ DbTxn *parent_txn_;
+
+ // We can add data to this class if needed
+ // since it is implemented via a pointer.
+ // (see comment at top)
+
+ // Note: use DbEnv::txn_begin() to get pointers to a DbTxn,
+	// and call DbTxn::abort() or DbTxn::commit() rather than
+ // delete to release them.
+ //
+ DbTxn(DbTxn *ptxn);
+ // For internal use only.
+ DbTxn(DB_TXN *txn, DbTxn *ptxn);
+ virtual ~DbTxn();
+
+ // no copying
+ DbTxn(const DbTxn &);
+ void operator = (const DbTxn &);
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__children, DbTxn) children;
+ */
+ struct __children {
+ DbTxn *tqh_first;
+ DbTxn **tqh_last;
+ } children;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(DbTxn) child_entry;
+ */
+ struct {
+ DbTxn *tqe_next;
+ DbTxn **tqe_prev;
+ } child_entry;
+};
+
+//
+// A chunk of data, maybe a key or value.
+//
+class _exported Dbt : private DBT
+{
+ friend class Db;
+ friend class Dbc;
+ friend class DbEnv;
+ friend class DbLogc;
+ friend class DbSequence;
+
+public:
+ // key/data
+ void *get_data() const { return data; }
+ void set_data(void *value) { data = value; }
+
+ // key/data length
+ u_int32_t get_size() const { return size; }
+ void set_size(u_int32_t value) { size = value; }
+
+ // RO: length of user buffer.
+ u_int32_t get_ulen() const { return ulen; }
+ void set_ulen(u_int32_t value) { ulen = value; }
+
+ // RO: get/put record length.
+ u_int32_t get_dlen() const { return dlen; }
+ void set_dlen(u_int32_t value) { dlen = value; }
+
+ // RO: get/put record offset.
+ u_int32_t get_doff() const { return doff; }
+ void set_doff(u_int32_t value) { doff = value; }
+
+ // flags
+ u_int32_t get_flags() const { return flags; }
+ void set_flags(u_int32_t value) { flags = value; }
+
+ // Conversion functions
+ DBT *get_DBT() { return (DBT *)this; }
+ const DBT *get_const_DBT() const { return (const DBT *)this; }
+
+ static Dbt* get_Dbt(DBT *dbt) { return (Dbt *)dbt; }
+ static const Dbt* get_const_Dbt(const DBT *dbt)
+ { return (const Dbt *)dbt; }
+
+ Dbt(void *data, u_int32_t size);
+ Dbt();
+ ~Dbt();
+ Dbt(const Dbt &);
+ Dbt &operator = (const Dbt &);
+
+private:
+ // Note: no extra data appears in this class (other than
+ // inherited from DBT) since we need DBT and Dbt objects
+	// to have interchangeable pointers.
+ //
+ // When subclassing this class, remember that callback
+ // methods like bt_compare, bt_prefix, dup_compare may
+ // internally manufacture DBT objects (which later are
+ // cast to Dbt), so such callbacks might receive objects
+ // not of your subclassed type.
+};
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// multiple key/data/recno iterator classes
+//
+
+// DbMultipleIterator is a shared private base class for the three types
+// of bulk-return iterator; it should never be instantiated directly,
+// but it handles the functionality shared by its subclasses.
+class _exported DbMultipleIterator
+{
+public:
+ DbMultipleIterator(const Dbt &dbt);
+protected:
+ u_int8_t *data_;
+ u_int32_t *p_;
+};
+
+class _exported DbMultipleKeyDataIterator : private DbMultipleIterator
+{
+public:
+ DbMultipleKeyDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {}
+ bool next(Dbt &key, Dbt &data);
+};
+
+class _exported DbMultipleRecnoDataIterator : private DbMultipleIterator
+{
+public:
+ DbMultipleRecnoDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {}
+ bool next(db_recno_t &recno, Dbt &data);
+};
+
+class _exported DbMultipleDataIterator : private DbMultipleIterator
+{
+public:
+ DbMultipleDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {}
+ bool next(Dbt &data);
+};
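+
+// A hedged sketch of consuming a bulk-return buffer with the iterators
+// above (the cursor and buffer setup are illustrative assumptions):
+//
+//	Dbt key, buf;
+//	buf.set_data(bulk_mem);
+//	buf.set_ulen(bulk_len);		// a multiple of 1024, >= page size
+//	buf.set_flags(DB_DBT_USERMEM);
+//	while (dbc->get(&key, &buf, DB_MULTIPLE_KEY | DB_NEXT) == 0) {
+//		DbMultipleKeyDataIterator it(buf);
+//		Dbt k, d;
+//		while (it.next(k, d))
+//			;	// process one key/data pair
+//	}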
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// multiple key/data/recno builder classes
+//
+
+// DbMultipleBuilder is a shared private base class for the three types
+// of bulk buffer builders; it should never be instantiated directly,
+// but it handles the functionality shared by its subclasses.
+class _exported DbMultipleBuilder
+{
+public:
+ DbMultipleBuilder(Dbt &dbt);
+protected:
+ Dbt &dbt_;
+ void *p_;
+};
+
+class _exported DbMultipleDataBuilder : DbMultipleBuilder
+{
+public:
+ DbMultipleDataBuilder(Dbt &dbt) : DbMultipleBuilder(dbt) {}
+ bool append(void *dbuf, size_t dlen);
+ bool reserve(void *&ddest, size_t dlen);
+};
+
+class _exported DbMultipleKeyDataBuilder : DbMultipleBuilder
+{
+public:
+ DbMultipleKeyDataBuilder(Dbt &dbt) : DbMultipleBuilder(dbt) {}
+ bool append(void *kbuf, size_t klen, void *dbuf, size_t dlen);
+ bool reserve(void *&kdest, size_t klen, void *&ddest, size_t dlen);
+};
+
+class _exported DbMultipleRecnoDataBuilder
+{
+public:
+ DbMultipleRecnoDataBuilder(Dbt &dbt);
+ bool append(db_recno_t recno, void *dbuf, size_t dlen);
+ bool reserve(db_recno_t recno, void *&ddest, size_t dlen);
+protected:
+ Dbt &dbt_;
+ void *p_;
+};
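+
+// A hedged sketch of filling a bulk buffer for a multi-item put (the
+// memory, lengths, and handles are illustrative assumptions; the data
+// Dbt is ignored for DB_MULTIPLE_KEY puts):
+//
+//	Dbt buf, unused;
+//	buf.set_data(bulk_mem);
+//	buf.set_ulen(bulk_len);
+//	buf.set_flags(DB_DBT_USERMEM);
+//	DbMultipleKeyDataBuilder b(buf);
+//	b.append(k1, k1len, d1, d1len);
+//	b.append(k2, k2len, d2, d2len);
+//	db->put(NULL, &buf, &unused, DB_MULTIPLE_KEY);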
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Exception classes
+//
+
+// Almost any error in the DB library throws a DbException.
+// Every exception should be considered an abnormality
+// (e.g. bug, misuse of DB, file system error).
+//
+class _exported DbException : public __DB_STD(exception)
+{
+public:
+ virtual ~DbException() throw();
+ DbException(int err);
+ DbException(const char *description);
+ DbException(const char *description, int err);
+ DbException(const char *prefix, const char *description, int err);
+ int get_errno() const;
+ virtual const char *what() const throw();
+ DbEnv *get_env() const;
+ void set_env(DbEnv *dbenv);
+
+ DbException(const DbException &);
+ DbException &operator = (const DbException &);
+
+private:
+ void describe(const char *prefix, const char *description);
+
+ char *what_;
+ int err_; // errno
+ DbEnv *dbenv_;
+};
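+
+// A hedged sketch of the usual catch pattern (handle names are
+// illustrative assumptions):
+//
+//	try {
+//		db->put(txn, &key, &data, 0);
+//	} catch (DbDeadlockException &) {
+//		txn->abort();	// then retry the transaction
+//	} catch (DbException &e) {
+//		std::cerr << e.what() << std::endl;
+//	}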
+
+//
+// A specific sort of exception that occurs when
+// an operation is aborted to resolve a deadlock.
+//
+class _exported DbDeadlockException : public DbException
+{
+public:
+ virtual ~DbDeadlockException() throw();
+ DbDeadlockException(const char *description);
+
+ DbDeadlockException(const DbDeadlockException &);
+ DbDeadlockException &operator = (const DbDeadlockException &);
+};
+
+//
+// A specific sort of exception that occurs when
+// a lock is not granted, e.g. by lock_get or lock_vec.
+// Note that the Dbt returned by get_obj() is only valid as long as the
+// Dbt used in the offending call.
+//
+class _exported DbLockNotGrantedException : public DbException
+{
+public:
+ virtual ~DbLockNotGrantedException() throw();
+ DbLockNotGrantedException(const char *prefix, db_lockop_t op,
+ db_lockmode_t mode, const Dbt *obj, const DbLock lock, int index);
+ DbLockNotGrantedException(const char *description);
+
+ DbLockNotGrantedException(const DbLockNotGrantedException &);
+ DbLockNotGrantedException &operator =
+ (const DbLockNotGrantedException &);
+
+ db_lockop_t get_op() const;
+ db_lockmode_t get_mode() const;
+ const Dbt* get_obj() const;
+ DbLock *get_lock() const;
+ int get_index() const;
+
+private:
+ db_lockop_t op_;
+ db_lockmode_t mode_;
+ const Dbt *obj_;
+ DbLock *lock_;
+ int index_;
+};
+
+//
+// A specific sort of exception that occurs when
+// user declared memory is insufficient in a Dbt.
+//
+class _exported DbMemoryException : public DbException
+{
+public:
+ virtual ~DbMemoryException() throw();
+ DbMemoryException(Dbt *dbt);
+ DbMemoryException(const char *prefix, Dbt *dbt);
+
+ DbMemoryException(const DbMemoryException &);
+ DbMemoryException &operator = (const DbMemoryException &);
+
+ Dbt *get_dbt() const;
+private:
+ Dbt *dbt_;
+};
+
+//
+// A specific sort of exception that occurs when a change of replication
+// master requires that all handles be re-opened.
+//
+class _exported DbRepHandleDeadException : public DbException
+{
+public:
+ virtual ~DbRepHandleDeadException() throw();
+ DbRepHandleDeadException(const char *description);
+
+ DbRepHandleDeadException(const DbRepHandleDeadException &);
+ DbRepHandleDeadException &operator = (const DbRepHandleDeadException &);
+};
+
+//
+// A specific sort of exception that occurs when
+// recovery is required before continuing DB activity.
+//
+class _exported DbRunRecoveryException : public DbException
+{
+public:
+ virtual ~DbRunRecoveryException() throw();
+ DbRunRecoveryException(const char *description);
+
+ DbRunRecoveryException(const DbRunRecoveryException &);
+ DbRunRecoveryException &operator = (const DbRunRecoveryException &);
+};
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Restore default compiler warnings
+//
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif /* !_DB_CXX_H_ */
diff --git a/src/dbinc/db_dispatch.h b/src/dbinc/db_dispatch.h
new file mode 100644
index 00000000..b6382871
--- /dev/null
+++ b/src/dbinc/db_dispatch.h
@@ -0,0 +1,97 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_DISPATCH_H_
+#define _DB_DISPATCH_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Declarations and typedefs for the list of transaction IDs used during
+ * recovery. This is a generic list used to pass along whatever information
+ * we need during recovery.
+ */
+typedef enum {
+ TXNLIST_DELETE,
+ TXNLIST_LSN,
+ TXNLIST_TXNID
+} db_txnlist_type;
+
+#define DB_TXNLIST_MASK(hp, n) ((n) % (hp)->nslots)
+struct __db_txnhead {
+ void *td; /* If abort, the detail for the txn. */
+ DB_THREAD_INFO *thread_info; /* Thread information. */
+ u_int32_t maxid; /* Maximum transaction id. */
+ DB_LSN maxlsn; /* Maximum commit lsn. */
+ DB_LSN ckplsn; /* LSN of last retained checkpoint. */
+ DB_LSN trunc_lsn; /* Lsn to which we are going to truncate;
+ * make sure we abort anyone after this. */
+ u_int32_t generation; /* Current generation number. */
+ u_int32_t gen_alloc; /* Number of generations allocated. */
+ struct {
+ u_int32_t generation;
+ u_int32_t txn_min;
+ u_int32_t txn_max;
+ } *gen_array; /* Array of txnids associated with a gen. */
+ u_int nslots;
+ LIST_HEAD(__db_headlink, __db_txnlist) head[1];
+};
+
+#define DB_LSN_STACK_SIZE 4
+struct __db_txnlist {
+ db_txnlist_type type;
+ LIST_ENTRY(__db_txnlist) links;
+ union {
+ struct {
+ u_int32_t txnid;
+ u_int32_t generation;
+ u_int32_t status;
+ } t;
+ struct {
+ u_int32_t stack_size;
+ u_int32_t stack_indx;
+ DB_LSN *lsn_stack;
+ } l;
+ } u;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_DISPATCH_H_ */
diff --git a/src/dbinc/db_int.in b/src/dbinc/db_int.in
new file mode 100644
index 00000000..42439107
--- /dev/null
+++ b/src/dbinc/db_int.in
@@ -0,0 +1,1162 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_INT_H_
+#define _DB_INT_H_
+
+/*******************************************************
+ * Berkeley DB ANSI/POSIX include files.
+ *******************************************************/
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#include <sys/types.h>
+#ifdef DIAG_MVCC
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+
+#if defined(HAVE_REPLICATION_THREADS)
+#ifdef HAVE_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#ifdef HAVE_VXWORKS
+#include <selectLib.h>
+#endif
+#endif
+
+#if TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#include <time.h>
+#else
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+#endif
+
+#ifdef HAVE_VXWORKS
+#include <net/uio.h>
+#else
+#include <sys/uio.h>
+#endif
+
+#if defined(HAVE_REPLICATION_THREADS)
+#ifdef HAVE_SYS_SOCKET_H
+#include <sys/socket.h>
+#endif
+#include <netinet/in.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#endif
+
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#endif /* !HAVE_SYSTEM_INCLUDE_FILES */
+
+#ifdef DB_WIN32
+#include "dbinc/win_db.h"
+#endif
+
+#ifdef HAVE_DBM
+#undef DB_DBM_HSEARCH
+#define DB_DBM_HSEARCH 1
+#endif
+
+#include "db.h"
+#include "clib_port.h"
+
+#include "dbinc/queue.h"
+#include "dbinc/shqueue.h"
+#include "dbinc/perfmon.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * The Windows compiler needs to be told about structures that are available
+ * outside a dll.
+ */
+#if defined(DB_WIN32) && defined(_MSC_VER) && \
+ !defined(DB_CREATE_DLL) && !defined(_LIB)
+#define __DB_IMPORT __declspec(dllimport)
+#else
+#define __DB_IMPORT
+#endif
+
+/*******************************************************
+ * Forward structure declarations.
+ *******************************************************/
+struct __db_commit_info; typedef struct __db_commit_info DB_COMMIT_INFO;
+struct __db_reginfo_t; typedef struct __db_reginfo_t REGINFO;
+struct __db_txnhead; typedef struct __db_txnhead DB_TXNHEAD;
+struct __db_txnlist; typedef struct __db_txnlist DB_TXNLIST;
+struct __vrfy_childinfo;typedef struct __vrfy_childinfo VRFY_CHILDINFO;
+struct __vrfy_dbinfo; typedef struct __vrfy_dbinfo VRFY_DBINFO;
+struct __vrfy_pageinfo; typedef struct __vrfy_pageinfo VRFY_PAGEINFO;
+
+struct __db_log_verify_info;
+struct __txn_verify_info;
+struct __lv_filereg_info;
+struct __lv_ckp_info;
+struct __lv_timestamp_info;
+typedef struct __db_log_verify_info DB_LOG_VRFY_INFO;
+typedef struct __txn_verify_info VRFY_TXN_INFO;
+typedef struct __lv_filereg_info VRFY_FILEREG_INFO;
+typedef struct __lv_filelife VRFY_FILELIFE;
+typedef struct __lv_ckp_info VRFY_CKP_INFO;
+typedef struct __lv_timestamp_info VRFY_TIMESTAMP_INFO;
+
+/*
+ * TXNINFO_HANDLER --
+ * Callback function pointer type for __iterate_txninfo.
+ */
+typedef int (*TXNINFO_HANDLER) __P((DB_LOG_VRFY_INFO *, VRFY_TXN_INFO *, void *));
+
+typedef SH_TAILQ_HEAD(__hash_head) DB_HASHTAB;
+
+/*******************************************************
+ * General purpose constants and macros.
+ *******************************************************/
+#undef FALSE
+#define FALSE 0
+#undef TRUE
+#define TRUE (!FALSE)
+
+#define MEGABYTE 1048576
+#define GIGABYTE 1073741824
+
+#define NS_PER_MS 1000000 /* Nanoseconds in a millisecond */
+#define NS_PER_US 1000 /* Nanoseconds in a microsecond */
+#define NS_PER_SEC 1000000000 /* Nanoseconds in a second */
+#define US_PER_MS 1000 /* Microseconds in a millisecond */
+#define US_PER_SEC 1000000 /* Microseconds in a second */
+#define MS_PER_SEC 1000 /* Milliseconds in a second */
+
+#define RECNO_OOB 0 /* Illegal record number. */
+
+/*
+ * Define a macro which has no runtime effect, yet avoids triggering empty
+ * statement compiler warnings. Use it as the text of conditionally-null macros.
+ */
+#define NOP_STATEMENT do { } while (0)
+
+/* Test for a power-of-two (tests true for zero, which doesn't matter here). */
+#define POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0)
+
+/* Test for valid page sizes. */
+#define DB_MIN_PGSIZE 0x000200 /* Minimum page size (512). */
+#define DB_MAX_PGSIZE 0x010000 /* Maximum page size (65536). */
+#define IS_VALID_PAGESIZE(x) \
+ (POWER_OF_TWO(x) && (x) >= DB_MIN_PGSIZE && ((x) <= DB_MAX_PGSIZE))
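+
+/*
+ * Worked examples (illustrative, not from the original source):
+ * IS_VALID_PAGESIZE(4096) is true; IS_VALID_PAGESIZE(3000) is false
+ * (not a power of two); IS_VALID_PAGESIZE(256) is false (too small).
+ */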
+
+/* Minimum number of pages cached, by default. */
+#define DB_MINPAGECACHE 16
+
+/*
+ * If we are unable to determine the underlying filesystem block size, use
+ * 8K on the grounds that most OS's use less than 8K for a VM page size.
+ */
+#define DB_DEF_IOSIZE (8 * 1024)
+
+/* Align an integer to a specific boundary. */
+#undef DB_ALIGN
+#define DB_ALIGN(v, bound) \
+ (((v) + (bound) - 1) & ~(((uintmax_t)(bound)) - 1))
+
+/* Increment a pointer to a specific boundary. */
+#undef ALIGNP_INC
+#define ALIGNP_INC(p, bound) \
+ (void *)(((uintptr_t)(p) + (bound) - 1) & ~(((uintptr_t)(bound)) - 1))
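+
+/*
+ * Worked example (illustrative): DB_ALIGN(13, 8) == 16 and
+ * DB_ALIGN(16, 8) == 16; ALIGNP_INC rounds pointers up the same way.
+ */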
+
+/*
+ * DB_ALIGN8 adjusts structure alignments to make sure shared structures have
+ * fixed sizes and field offsets on both 32-bit and 64-bit platforms when
+ * HAVE_MIXED_SIZE_ADDRESSING is defined.
+ */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+#define DB_ALIGN8 @DB_STRUCT_ALIGN8@
+#else
+#define DB_ALIGN8
+#endif
+
+/*
+ * Berkeley DB uses the va_copy macro from C99; not all compilers provide
+ * it, so add a simple implementation compatible with pre-C99 compilers.
+ */
+#ifndef va_copy
+#define va_copy(d, s) ((d) = (s))
+#endif
+
+/*
+ * Print an address as a u_long (a u_long is the largest type we can print
+ * portably). Most 64-bit systems have made longs 64-bits, so this should
+ * work.
+ */
+#define P_TO_ULONG(p) ((u_long)(uintptr_t)(p))
+
+/*
+ * Convert a pointer to an integral value.
+ *
+ * The (u_int16_t)(uintptr_t) cast avoids warnings: the (uintptr_t) cast
+ * converts the value to an integral type, and the (u_int16_t) cast converts
+ * it to a small integral type so we don't get complaints when we assign the
+ * final result to an integral type smaller than uintptr_t.
+ */
+#define P_TO_UINT32(p) ((u_int32_t)(uintptr_t)(p))
+#define P_TO_UINT16(p) ((u_int16_t)(uintptr_t)(p))
+#define P_TO_ROFF(p) ((roff_t)(uintptr_t)(p))
+
+/* The converse of P_TO_ROFF() above. */
+#define ROFF_TO_P(roff) ((void *)(uintptr_t)(roff))
+
+/*
+ * There are several on-page structures that are declared to have a number of
+ * fields followed by a variable length array of items. The structure size
+ * without including the variable length array or the address of the first of
+ * those elements can be found using SSZ.
+ *
+ * This macro can also be used to find the offset of a structure element in a
+ * structure. This is used in various places to copy structure elements from
+ * unaligned memory references, e.g., pointers into a packed page.
+ *
+ * There are two versions because compilers object if you take the address of
+ * an array.
+ */
+#undef SSZ
+#define SSZ(name, field) P_TO_UINT16(&(((name *)0)->field))
+
+#undef SSZA
+#define SSZA(name, field) P_TO_UINT16(&(((name *)0)->field[0]))
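+
+/*
+ * Worked example (illustrative): given
+ *	typedef struct { u_int32_t a; u_int8_t b[1]; } EX;
+ * SSZ(EX, a) yields 0 and SSZA(EX, b) yields 4 -- the same results
+ * offsetof() would give for those fields.
+ */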
+
+/* Structure used to print flag values. */
+typedef struct __fn {
+ u_int32_t mask; /* Flag value. */
+ const char *name; /* Flag name. */
+} FN;
+
+/* Set, clear and test flags. */
+#define FLD_CLR(fld, f) (fld) &= ~(f)
+#define FLD_ISSET(fld, f) ((fld) & (f))
+#define FLD_SET(fld, f) (fld) |= (f)
+#define F_CLR(p, f) (p)->flags &= ~(f)
+#define F_ISSET(p, f) ((p)->flags & (f))
+#define F_SET(p, f) (p)->flags |= (f)
+#define F2_CLR(p, f) ((p)->flags2 &= ~(f))
+#define F2_ISSET(p, f) ((p)->flags2 & (f))
+#define F2_SET(p, f) ((p)->flags2 |= (f))
+#define LF_CLR(f) ((flags) &= ~(f))
+#define LF_ISSET(f) ((flags) & (f))
+#define LF_SET(f) ((flags) |= (f))
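+
+/*
+ * Illustrative use (DB_AM_DUP is just an example flag): given a
+ * structure with a "flags" field, F_SET(dbp, DB_AM_DUP) sets the bit
+ * and F_ISSET(dbp, DB_AM_DUP) tests it; the LF_* variants operate on
+ * a local variable named "flags".
+ */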
+
+/*
+ * Calculate a percentage. The values can overflow 32-bit integer arithmetic
+ * so we use floating point.
+ *
+ * When calculating a bytes-vs-page size percentage, we're getting the inverse
+ * of the percentage in all cases, that is, we want 100 minus the percentage we
+ * calculate.
+ */
+#define DB_PCT(v, total) \
+ ((int)((total) == 0 ? 0 : ((double)(v) * 100) / (total)))
+#define DB_PCT_PG(v, total, pgsize) \
+ ((int)((total) == 0 ? 0 : \
+ 100 - ((double)(v) * 100) / (((double)total) * (pgsize))))
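+
+/*
+ * Worked example (illustrative): DB_PCT(1, 4) == 25, and
+ * DB_PCT(3, 0) == 0 rather than dividing by zero.
+ */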
+
+/*
+ * Statistics update shared memory and so are expensive -- don't update the
+ * values unless we're going to display the results.
+ * When performance monitoring is enabled, the changed value can be published
+ * (via DTrace or SystemTap) along with another associated value or two.
+ */
+#undef STAT
+#ifdef HAVE_STATISTICS
+#define STAT(x) x
+#define STAT_ADJUST(env, cat, subcat, val, amount, id) \
+ do { \
+ (val) += (amount); \
+ STAT_PERFMON2((env), cat, subcat, (val), (id)); \
+ } while (0)
+#define STAT_ADJUST_VERB(env, cat, subcat, val, amount, id1, id2) \
+ do { \
+ (val) += (amount); \
+ STAT_PERFMON3((env), cat, subcat, (val), (id1), (id2)); \
+ } while (0)
+#define STAT_INC(env, cat, subcat, val, id) \
+ STAT_ADJUST(env, cat, subcat, (val), 1, (id))
+#define STAT_INC_VERB(env, cat, subcat, val, id1, id2) \
+ STAT_ADJUST_VERB((env), cat, subcat, (val), 1, (id1), (id2))
+/*
+ * STAT_DEC() subtracts one rather than adding (-1) with STAT_ADJUST(); the
+ * latter might generate a compilation warning for an unsigned value.
+ */
+#define STAT_DEC(env, cat, subcat, val, id) \
+ do { \
+ (val)--; \
+ STAT_PERFMON2((env), cat, subcat, (val), (id)); \
+ } while (0)
+/* N.B.: Add a verbose version of STAT_DEC() when needed. */
+
+#define STAT_SET(env, cat, subcat, val, newval, id) \
+ do { \
+ (val) = (newval); \
+ STAT_PERFMON2((env), cat, subcat, (val), (id)); \
+ } while (0)
+#define STAT_SET_VERB(env, cat, subcat, val, newval, id1, id2) \
+ do { \
+ (val) = (newval); \
+ STAT_PERFMON3((env), cat, subcat, (val), (id1), (id2)); \
+ } while (0)
+#else
+#define STAT(x) NOP_STATEMENT
+#define STAT_ADJUST(env, cat, subcat, val, amt, id) NOP_STATEMENT
+#define STAT_ADJUST_VERB(env, cat, subcat, val, amt, id1, id2) NOP_STATEMENT
+#define STAT_INC(env, cat, subcat, val, id) NOP_STATEMENT
+#define STAT_INC_VERB(env, cat, subcat, val, id1, id2) NOP_STATEMENT
+#define STAT_DEC(env, cat, subcat, val, id) NOP_STATEMENT
+#define STAT_SET(env, cat, subcat, val, newval, id) NOP_STATEMENT
+#define STAT_SET_VERB(env, cat, subcat, val, newval, id1, id2) NOP_STATEMENT
+#endif
+
+#if defined HAVE_SIMPLE_THREAD_TYPE
+#define DB_THREADID_INIT(t) COMPQUIET((t), 0)
+#else
+#define DB_THREADID_INIT(t) memset(&(t), 0, sizeof(t))
+#endif
+
+/*
+ * These macros are used when an error condition is first noticed. They allow
+ * one to be notified (via e.g. DTrace, SystemTap, ...) when an error occurs
+ * deep inside DB, rather than when it is returned back through the API.
+ *
+ * The second actual argument to these is the second part of the error or
+ * warning event name. They work when 'errcode' is a symbolic name, e.g.,
+ * EINVAL or DB_LOCK_DEADLOCK, not a variable. Noticing system call failures
+ * would be handled by tracing on syscall exit, e.g., when it returns < 0.
+ */
+#define ERR_ORIGIN(env, errcode) \
+ (PERFMON0(env, error, errcode), errcode)
+
+#define ERR_ORIGIN_MSG(env, errcode, msg) \
+ (PERFMON1(env, error, errcode, msg), errcode)
+
+#define WARNING_ORIGIN(env, errcode) \
+ (PERFMON0(env, warning, errcode), errcode)
+
+/*
+ * Structure used for callback message aggregation.
+ *
+ * Display values in XXX_stat_print calls.
+ */
+typedef struct __db_msgbuf {
+ char *buf; /* Heap allocated buffer. */
+ char *cur; /* Current end of message. */
+ size_t len; /* Allocated length of buffer. */
+} DB_MSGBUF;
+#define DB_MSGBUF_INIT(a) do { \
+ (a)->buf = (a)->cur = NULL; \
+ (a)->len = 0; \
+} while (0)
+#define DB_MSGBUF_FLUSH(env, a) do { \
+ if ((a)->buf != NULL) { \
+ if ((a)->cur != (a)->buf) \
+ __db_msg(env, "%s", (a)->buf); \
+ __os_free(env, (a)->buf); \
+ DB_MSGBUF_INIT(a); \
+ } \
+} while (0)
+#define DB_MSGBUF_REP_FLUSH(env, a, diag_msg, regular_msg) do { \
+ if ((a)->buf != NULL) { \
+ if ((a)->cur != (a)->buf && diag_msg) \
+ __db_repmsg(env, "%s", (a)->buf); \
+ if (regular_msg) \
+ DB_MSGBUF_FLUSH(env, a); \
+ else { \
+ __os_free(env, (a)->buf); \
+ DB_MSGBUF_INIT(a); \
+ } \
+ } \
+} while (0)
+#define STAT_FMT(msg, fmt, type, v) do { \
+ DB_MSGBUF __mb; \
+ DB_MSGBUF_INIT(&__mb); \
+ __db_msgadd(env, &__mb, fmt, (type)(v)); \
+ __db_msgadd(env, &__mb, "\t%s", msg); \
+ DB_MSGBUF_FLUSH(env, &__mb); \
+} while (0)
+#define STAT_HEX(msg, v) \
+ __db_msg(env, "%#lx\t%s", (u_long)(v), msg)
+#define STAT_ISSET(msg, p) \
+ __db_msg(env, "%sSet\t%s", (p) == NULL ? "!" : " ", msg)
+#define STAT_LONG(msg, v) \
+ __db_msg(env, "%ld\t%s", (long)(v), msg)
+#define STAT_LSN(msg, lsnp) \
+ __db_msg(env, "%lu/%lu\t%s", \
+ (u_long)(lsnp)->file, (u_long)(lsnp)->offset, msg)
+#define STAT_POINTER(msg, v) \
+ __db_msg(env, "%#lx\t%s", P_TO_ULONG(v), msg)
+#define STAT_STRING(msg, p) do { \
+ const char *__p = p; /* p may be a function call. */ \
+ __db_msg(env, "%s\t%s", __p == NULL ? "!Set" : __p, msg); \
+} while (0)
+#define STAT_ULONG(msg, v) \
+ __db_msg(env, "%lu\t%s", (u_long)(v), msg)
+
+/*
+ * The following macros are used to control how error and message strings are
+ * output by Berkeley DB. There are essentially three different controls
+ * available:
+ * - Default behavior is to output error strings with their unique identifiers.
+ * - If HAVE_STRIPPED_MESSAGES is enabled, a unique identifier along with any
+ * parameters to the error string will be output.
+ * - If HAVE_LOCALIZATION is defined, and the '_()' macro is implemented, a
+ * gettext or ICU style translation will be done.
+ *
+ * Each new string that will be output should be wrapped in a DB_STR* macro.
+ * There are three versions of this macro for different scenarios:
+ * - DB_STR for strings that need an identifier, and don't have any argument.
+ * - DB_STR_A for strings that need an identifier, and have argument(s).
+ * - DB_STR_P for strings that don't need an identifier, and don't have
+ * arguments.
+ *
+ * Error message IDs are automatically assigned by dist/s_message_id script.
+ */
+#ifdef HAVE_LOCALIZATION
+#define _(msg) msg /* Replace with localization function. */
+#else
+#define _(msg) msg
+#endif
+
+#ifdef HAVE_STRIPPED_MESSAGES
+#define DB_STR_C(msg, fmt) fmt
+#else
+#define DB_STR_C(msg, fmt) _(msg)
+#endif
+
+#define DB_MSGID(id) "BDB" id
+
+#define DB_STR(id, msg) DB_MSGID(id) " " DB_STR_C(msg, "")
+
+#define DB_STR_A(id, msg, fmt) DB_MSGID(id) " " DB_STR_C(msg, fmt)
+
+#define DB_STR_P(msg) _(msg)
+
+/*
+ * There are quite a few places in Berkeley DB where we want to initialize
+ * a DBT from a string or other random pointer type, using a length typed
+ * to size_t in most cases. This macro avoids a lot of casting. The macro
+ * comes in two flavors because we often want to clear the DBT first.
+ */
+#define DB_SET_DBT(dbt, d, s) do { \
+ (dbt).data = (void *)(d); \
+ (dbt).size = (u_int32_t)(s); \
+} while (0)
+#define DB_INIT_DBT(dbt, d, s) do { \
+ memset(&(dbt), 0, sizeof(dbt)); \
+ DB_SET_DBT(dbt, d, s); \
+} while (0)
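+
+/*
+ * Illustrative use (the key string is an assumption):
+ *
+ *	DBT key;
+ *	DB_INIT_DBT(key, "customer", sizeof("customer") - 1);
+ */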
+
+/*******************************************************
+ * API return values
+ *******************************************************/
+/*
+ * Return values that are OK for each different call. Most calls have a
+ * standard "a return of 0 is the only OK value", but some, like db->get,
+ * have DB_NOTFOUND as a return value that really isn't an error.
+ */
+#define DB_RETOK_STD(ret) ((ret) == 0)
+#define DB_RETOK_DBCDEL(ret) ((ret) == 0 || (ret) == DB_KEYEMPTY || \
+ (ret) == DB_NOTFOUND)
+#define DB_RETOK_DBCGET(ret) ((ret) == 0 || (ret) == DB_KEYEMPTY || \
+ (ret) == DB_NOTFOUND)
+#define DB_RETOK_DBCPUT(ret) ((ret) == 0 || (ret) == DB_KEYEXIST || \
+ (ret) == DB_NOTFOUND)
+#define DB_RETOK_DBDEL(ret) DB_RETOK_DBCDEL(ret)
+#define DB_RETOK_DBGET(ret) DB_RETOK_DBCGET(ret)
+#define DB_RETOK_DBPUT(ret) ((ret) == 0 || (ret) == DB_KEYEXIST)
+#define DB_RETOK_EXISTS(ret) DB_RETOK_DBCGET(ret)
+#define DB_RETOK_LGGET(ret) ((ret) == 0 || (ret) == DB_NOTFOUND)
+#define DB_RETOK_MPGET(ret) ((ret) == 0 || (ret) == DB_PAGE_NOTFOUND)
+#define DB_RETOK_REPPMSG(ret) ((ret) == 0 || \
+ (ret) == DB_REP_IGNORE || \
+ (ret) == DB_REP_ISPERM || \
+ (ret) == DB_REP_NEWMASTER || \
+ (ret) == DB_REP_NEWSITE || \
+ (ret) == DB_REP_NOTPERM || \
+ (ret) == DB_REP_WOULDROLLBACK)
+#define DB_RETOK_REPMGR_LOCALSITE(ret) ((ret) == 0 || (ret) == DB_NOTFOUND)
+#define DB_RETOK_REPMGR_START(ret) ((ret) == 0 || (ret) == DB_REP_IGNORE)
+#define DB_RETOK_TXNAPPLIED(ret) ((ret) == 0 || \
+ (ret) == DB_NOTFOUND || \
+ (ret) == DB_TIMEOUT || \
+ (ret) == DB_KEYEMPTY)
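+
+/*
+ * Illustrative use (a hypothetical call site): treat DB_NOTFOUND from
+ * a get as a non-error, but report anything else:
+ *
+ *	ret = dbp->get(dbp, txn, &key, &data, 0);
+ *	if (!DB_RETOK_DBGET(ret))
+ *		goto err;
+ */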
+
+/* Find a reasonable operation-not-supported error. */
+#ifdef EOPNOTSUPP
+#define DB_OPNOTSUP EOPNOTSUPP
+#else
+#ifdef ENOTSUP
+#define DB_OPNOTSUP ENOTSUP
+#else
+#define DB_OPNOTSUP EINVAL
+#endif
+#endif
+
+/*******************************************************
+ * Files.
+ *******************************************************/
+/*
+ * We use 1024 as the maximum path length. It's too hard to figure out what
+ * the real path length is, as it was traditionally stored in <sys/param.h>,
+ * and that file isn't always available.
+ */
+#define DB_MAXPATHLEN 1024
+
+#define PATH_DOT "." /* Current working directory. */
+ /* Path separator character(s). */
+#define PATH_SEPARATOR "@PATH_SEPARATOR@"
+
+/*******************************************************
+ * Environment.
+ *******************************************************/
+/* Type passed to __db_appname(). */
+typedef enum {
+ DB_APP_NONE=0, /* No type (region). */
+ DB_APP_DATA, /* Data file. */
+ DB_APP_LOG, /* Log file. */
+ DB_APP_META, /* Persistent metadata file. */
+ DB_APP_RECOVER, /* We are in recovery. */
+ DB_APP_TMP /* Temporary file. */
+} APPNAME;
+
+/*
+ * A set of macros to check if various functionality has been configured.
+ *
+ * ALIVE_ON The is_alive function is configured.
+ * CDB_LOCKING CDB product locking.
+ * CRYPTO_ON Security has been configured.
+ * LOCKING_ON Locking has been configured.
+ * LOGGING_ON Logging has been configured.
+ * MUTEX_ON Mutexes have been configured.
+ * MPOOL_ON Memory pool has been configured.
+ * REP_ON Replication has been configured.
+ * TXN_ON Transactions have been configured.
+ *
+ * REP_ON is more complex than most: if the BDB library was compiled without
+ * replication support, ENV->rep_handle will be NULL; if the BDB library has
+ * replication support, but it was not configured, the region reference will
+ * be NULL.
+ */
+#define ALIVE_ON(env) ((env)->dbenv->is_alive != NULL)
+#define CDB_LOCKING(env) F_ISSET(env, ENV_CDB)
+#define CRYPTO_ON(env) ((env)->crypto_handle != NULL)
+#define LOCKING_ON(env) ((env)->lk_handle != NULL)
+#define LOGGING_ON(env) ((env)->lg_handle != NULL)
+#define MPOOL_ON(env) ((env)->mp_handle != NULL)
+#define MUTEX_ON(env) ((env)->mutex_handle != NULL)
+#define REP_ON(env) \
+ ((env)->rep_handle != NULL && (env)->rep_handle->region != NULL)
+#define TXN_ON(env) ((env)->tx_handle != NULL)
+
+/*
+ * STD_LOCKING Standard locking, that is, locking was configured and CDB
+ * was not. We do not do locking in off-page duplicate trees,
+ * so we check for that in the cursor first.
+ */
+#define STD_LOCKING(dbc) \
+ (!F_ISSET(dbc, DBC_OPD) && \
+ !CDB_LOCKING((dbc)->env) && LOCKING_ON((dbc)->env))
+
+/*
+ * IS_RECOVERING: The system is running recovery.
+ */
+#define IS_RECOVERING(env) \
+ (LOGGING_ON(env) && F_ISSET((env)->lg_handle, DBLOG_RECOVER))
+
+/* Initialization methods are often illegal before/after open is called. */
+#define ENV_ILLEGAL_AFTER_OPEN(env, name) \
+ if (F_ISSET((env), ENV_OPEN_CALLED)) \
+ return (__db_mi_open(env, name, 1));
+#define ENV_ILLEGAL_BEFORE_OPEN(env, name) \
+ if (!F_ISSET((env), ENV_OPEN_CALLED)) \
+ return (__db_mi_open(env, name, 0));
+
+/* We're not actually user hostile, honest. */
+#define ENV_REQUIRES_CONFIG(env, handle, i, flags) \
+ if (handle == NULL) \
+ return (__env_not_config(env, i, flags));
+#define ENV_REQUIRES_CONFIG_XX(env, handle, i, flags) \
+ if ((env)->handle->region == NULL) \
+ return (__env_not_config(env, i, flags));
+#define ENV_NOT_CONFIGURED(env, handle, i, flags) \
+ if (F_ISSET((env), ENV_OPEN_CALLED)) \
+ ENV_REQUIRES_CONFIG(env, handle, i, flags)
+
+#define ENV_ENTER_RET(env, ip, ret) do { \
+ ret = 0; \
+ PANIC_CHECK_RET(env, ret); \
+ if (ret == 0) { \
+ if ((env)->thr_hashtab == NULL) \
+ ip = NULL; \
+ else \
+ ret = __env_set_state(env, &(ip), THREAD_ACTIVE);\
+ } \
+} while (0)
+
+#define ENV_ENTER(env, ip) do { \
+ int __ret; \
+ ip = NULL; \
+ ENV_ENTER_RET(env, ip, __ret); \
+ if (__ret != 0) \
+ return (__ret); \
+} while (0)
+
+#define FAILCHK_THREAD(env, ip) do { \
+ if ((ip) != NULL) \
+ (ip)->dbth_state = THREAD_FAILCHK; \
+} while (0)
+
+#define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip)
+
+#ifdef DIAGNOSTIC
+#define ENV_LEAVE(env, ip) do { \
+ if ((ip) != NULL) { \
+ DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \
+ (ip)->dbth_state == THREAD_FAILCHK)); \
+ (ip)->dbth_state = THREAD_OUT; \
+ } \
+} while (0)
+#else
+#define ENV_LEAVE(env, ip) do { \
+ if ((ip) != NULL) \
+ (ip)->dbth_state = THREAD_OUT; \
+} while (0)
+#endif
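+
+/*
+ * Illustrative pairing (a hypothetical call site; __env_do_work is
+ * not a real function): public API entry points bracket their work
+ * with ENV_ENTER()/ENV_LEAVE():
+ *
+ *	DB_THREAD_INFO *ip;
+ *	ENV_ENTER(env, ip);
+ *	ret = __env_do_work(env);
+ *	ENV_LEAVE(env, ip);
+ *	return (ret);
+ */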
+#ifdef DIAGNOSTIC
+#define CHECK_THREAD(env) do { \
+ if ((env)->thr_hashtab != NULL) \
+ (void)__env_set_state(env, NULL, THREAD_VERIFY); \
+} while (0)
+#ifdef HAVE_STATISTICS
+#define CHECK_MTX_THREAD(env, mtx) do { \
+ if (mtx->alloc_id != MTX_MUTEX_REGION && \
+ mtx->alloc_id != MTX_ENV_REGION && \
+ mtx->alloc_id != MTX_APPLICATION) \
+ CHECK_THREAD(env); \
+} while (0)
+#else
+#define CHECK_MTX_THREAD(env, mtx) NOP_STATEMENT
+#endif
+#else
+#define CHECK_THREAD(env) NOP_STATEMENT
+#define CHECK_MTX_THREAD(env, mtx) NOP_STATEMENT
+#endif
+
+typedef enum {
+ THREAD_SLOT_NOT_IN_USE=0,
+ THREAD_OUT,
+ THREAD_ACTIVE,
+ THREAD_BLOCKED,
+ THREAD_BLOCKED_DEAD,
+ THREAD_FAILCHK,
+ THREAD_VERIFY
+} DB_THREAD_STATE;
+
+typedef struct __pin_list {
+ roff_t b_ref; /* offset to buffer. */
+ int region; /* region containing buffer. */
+} PIN_LIST;
+#define PINMAX 4
+
+struct __db_thread_info { /* SHARED */
+ pid_t dbth_pid;
+ db_threadid_t dbth_tid;
+ DB_THREAD_STATE dbth_state;
+ SH_TAILQ_ENTRY dbth_links;
+ /*
+ * The next field contains the (process local) reference to the XA
+ * transaction currently associated with this thread of control.
+ */
+ SH_TAILQ_HEAD(__dbth_xatxn) dbth_xatxn;
+ u_int32_t dbth_xa_status;
+ /*
+ * The following fields track which buffers this thread of
+ * control has pinned in the mpool buffer cache.
+ */
+ u_int16_t dbth_pincount; /* Number of pins for this thread. */
+ u_int16_t dbth_pinmax; /* Number of slots allocated. */
+ roff_t dbth_pinlist; /* List of pins. */
+ PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */
+#ifdef DIAGNOSTIC
+ roff_t dbth_locker; /* Current locker for this thread. */
+ u_int32_t dbth_check_off; /* Count of number of LOCK_OFF calls. */
+#endif
+};
+#ifdef DIAGNOSTIC
+#define LOCK_CHECK_OFF(ip) if ((ip) != NULL) \
+ (ip)->dbth_check_off++
+
+#define LOCK_CHECK_ON(ip) if ((ip) != NULL) \
+ (ip)->dbth_check_off--
+
+#define LOCK_CHECK(dbc, pgno, mode, type) \
+ DB_ASSERT((dbc)->dbp->env, (dbc)->locker == NULL || \
+ __db_haslock((dbc)->dbp->env, \
+ (dbc)->locker, (dbc)->dbp->mpf, pgno, mode, type) == 0)
+#else
+#define LOCK_CHECK_OFF(ip) NOP_STATEMENT
+#define LOCK_CHECK_ON(ip) NOP_STATEMENT
+#define LOCK_CHECK(dbc, pgno, mode, type) NOP_STATEMENT
+#endif
+
+typedef struct __env_thread_info {
+ u_int32_t thr_count;
+ u_int32_t thr_init;
+ u_int32_t thr_max;
+ u_int32_t thr_nbucket;
+ roff_t thr_hashoff;
+} THREAD_INFO;
+
+#define DB_EVENT(env, e, einfo) do { \
+ DB_ENV *__dbenv = (env)->dbenv; \
+ if (__dbenv->db_event_func != NULL) \
+ __dbenv->db_event_func(__dbenv, e, einfo); \
+} while (0)
+
+typedef struct __flag_map {
+ u_int32_t inflag, outflag;
+} FLAG_MAP;
+
+typedef struct __db_backup_handle {
+ int (*open) __P((DB_ENV *, const char *, const char *, void **));
+ int (*write) __P((DB_ENV *,
+ u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *));
+ int (*close) __P((DB_ENV *, const char *, void *));
+ u_int32_t size;
+ u_int32_t read_count;
+ u_int32_t read_sleep;
+#define BACKUP_WRITE_DIRECT 0x0001
+ int flags;
+} DB_BACKUP;
+
+/*
+ * Internal database environment structure.
+ *
+ * This is the private database environment handle. The public environment
+ * handle is the DB_ENV structure. The library owns this structure, the user
+ * owns the DB_ENV structure. The reason there are two structures is that
+ * the user's configuration outlives any particular DB_ENV->open call, and
+ * separate structures allow us to easily discard internal information without
+ * discarding the user's configuration.
+ */
+struct __env {
+ DB_ENV *dbenv; /* Linked DB_ENV structure */
+
+ /*
+ * The ENV structure can be used concurrently, so field access is
+ * protected.
+ */
+ db_mutex_t mtx_env; /* ENV structure mutex */
+
+ /*
+ * Some fields are included in the ENV structure rather than in the
+ * DB_ENV structure because they are only set as arguments to the
+ * DB_ENV->open method. In other words, because of the historic API,
+ * not for any rational reason.
+ *
+ * Arguments to DB_ENV->open.
+ */
+ char *db_home; /* Database home */
+ u_int32_t open_flags; /* Flags */
+ int db_mode; /* Default open permissions */
+
+ pid_t pid_cache; /* Cached process ID */
+
+ DB_FH *lockfhp; /* fcntl(2) locking file handle */
+
+ DB_LOCKER *env_lref; /* Locker in non-threaded handles */
+
+ DB_DISTAB recover_dtab; /* Dispatch table for recover funcs */
+
+ int dir_mode; /* Intermediate directory perms. */
+
+#define ENV_DEF_DATA_LEN 100
+ u_int32_t data_len; /* Data length in __db_prbytes. */
+
+ /* Thread tracking */
+ u_int32_t thr_nbucket; /* Number of hash buckets */
+ DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */
+
+ /*
+ * List of open DB handles for this ENV, used for cursor
+ * adjustment. Must be protected for multi-threaded support.
+ */
+ db_mutex_t mtx_dblist;
+ int db_ref; /* DB handle reference count */
+ TAILQ_HEAD(__dblist, __db) dblist;
+
+ /*
+ * List of open file handles for this ENV. Must be protected
+ * for multi-threaded support.
+ */
+ TAILQ_HEAD(__fdlist, __fh_t) fdlist;
+
+ db_mutex_t mtx_mt; /* Mersenne Twister mutex */
+ int mti; /* Mersenne Twister index */
+ u_long *mt; /* Mersenne Twister state vector */
+
+ DB_CIPHER *crypto_handle; /* Crypto handle */
+ DB_LOCKTAB *lk_handle; /* Lock handle */
+ DB_LOG *lg_handle; /* Log handle */
+ DB_MPOOL *mp_handle; /* Mpool handle */
+ DB_MUTEXMGR *mutex_handle; /* Mutex handle */
+ DB_REP *rep_handle; /* Replication handle */
+ DB_TXNMGR *tx_handle; /* Txn handle */
+
+ DB_BACKUP *backup_handle; /* Database copy configuration. */
+
+ /*
+ * XA support.
+ */
+ int xa_rmid; /* XA Resource Manager ID */
+ int xa_ref; /* XA Reference count */
+ TAILQ_ENTRY(__env) links; /* XA environments */
+
+ /* Application callback to copy data to/from a custom data source */
+#define DB_USERCOPY_GETDATA 0x0001
+#define DB_USERCOPY_SETDATA 0x0002
+ int (*dbt_usercopy)
+ __P((DBT *, u_int32_t, void *, u_int32_t, u_int32_t));
+
+ int (*log_verify_wrap) __P((ENV *, const char *, u_int32_t,
+ const char *, const char *, time_t, time_t, u_int32_t, u_int32_t,
+ u_int32_t, u_int32_t, int, int));
+
+ REGINFO *reginfo; /* REGINFO structure reference */
+
+#define DB_TEST_ELECTINIT 1 /* after __rep_elect_init */
+#define DB_TEST_ELECTVOTE1 2 /* after sending VOTE1 */
+#define DB_TEST_NO_PAGES 3 /* before sending PAGE */
+#define DB_TEST_POSTDESTROY 4 /* after destroy op */
+#define DB_TEST_POSTLOG 5 /* after logging all pages */
+#define DB_TEST_POSTLOGMETA 6 /* after logging meta in btree */
+#define DB_TEST_POSTOPEN 7 /* after __os_open */
+#define DB_TEST_POSTSYNC 8 /* after syncing the log */
+#define DB_TEST_PREDESTROY 9 /* before destroy op */
+#define DB_TEST_PREOPEN 10 /* before __os_open */
+#define DB_TEST_REPMGR_PERM 11 /* repmgr perm/archiving tests */
+#define DB_TEST_SUBDB_LOCKS 12 /* subdb locking tests */
+ int test_abort; /* Abort value for testing */
+ int test_check; /* Checkpoint value for testing */
+ int test_copy; /* Copy value for testing */
+
+#define ENV_CDB 0x00000001 /* DB_INIT_CDB */
+#define ENV_DBLOCAL 0x00000002 /* Environment for a private DB */
+#define ENV_LITTLEENDIAN 0x00000004 /* Little endian system. */
+#define ENV_LOCKDOWN 0x00000008 /* DB_LOCKDOWN set */
+#define ENV_NO_OUTPUT_SET 0x00000010 /* No output channel set */
+#define ENV_OPEN_CALLED 0x00000020 /* DB_ENV->open called */
+#define ENV_PRIVATE 0x00000040 /* DB_PRIVATE set */
+#define ENV_RECOVER_FATAL 0x00000080 /* Doing fatal recovery in env */
+#define ENV_REF_COUNTED 0x00000100 /* Region references this handle */
+#define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */
+#define ENV_THREAD 0x00000400 /* DB_THREAD set */
+#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */
+ u_int32_t flags;
+};
+
+/*******************************************************
+ * Database Access Methods.
+ *******************************************************/
+/*
+ * DB_IS_THREADED --
+ * The database handle is free-threaded (was opened with DB_THREAD).
+ */
+#define DB_IS_THREADED(dbp) \
+ ((dbp)->mutex != MUTEX_INVALID)
+
+/* Initialization methods are often illegal before/after open is called. */
+#define DB_ILLEGAL_AFTER_OPEN(dbp, name) \
+ if (F_ISSET((dbp), DB_AM_OPEN_CALLED)) \
+ return (__db_mi_open((dbp)->env, name, 1));
+#define DB_ILLEGAL_BEFORE_OPEN(dbp, name) \
+ if (!F_ISSET((dbp), DB_AM_OPEN_CALLED)) \
+ return (__db_mi_open((dbp)->env, name, 0));
+/* Some initialization methods are illegal if the environment isn't local. */
+#define DB_ILLEGAL_IN_ENV(dbp, name) \
+ if (!F_ISSET((dbp)->env, ENV_DBLOCAL)) \
+ return (__db_mi_env((dbp)->env, name));
+#define DB_ILLEGAL_METHOD(dbp, flags) { \
+ int __ret; \
+ if ((__ret = __dbh_am_chk(dbp, flags)) != 0) \
+ return (__ret); \
+}
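+
+/*
+ * Editor's sketch (illustrative): the guards above sit at the top of the
+ * configuration methods; a hypothetical method would read:
+ *
+ * int
+ * __example_db_set_foo(DB *dbp, u_int32_t foo)
+ * {
+ *	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_foo");
+ *	... record "foo" in the handle ...
+ *	return (0);
+ * }
+ */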
+
+/*
+ * Common DBC->internal fields. Each access method adds additional fields
+ * to this list, but the initial fields are common.
+ */
+#define __DBC_INTERNAL \
+ DBC *opd; /* Off-page duplicate cursor. */\
+ DBC *pdbc; /* Pointer to parent cursor. */ \
+ \
+ void *page; /* Referenced page. */ \
+ u_int32_t part; /* Partition number. */ \
+ db_pgno_t root; /* Tree root. */ \
+ db_pgno_t pgno; /* Referenced page number. */ \
+ db_indx_t indx; /* Referenced key item index. */\
+ \
+ /* Streaming -- cache last position. */ \
+ db_pgno_t stream_start_pgno; /* Last start pgno. */ \
+ u_int32_t stream_off; /* Current offset. */ \
+ db_pgno_t stream_curr_pgno; /* Current overflow page. */ \
+ \
+ DB_LOCK lock; /* Cursor lock. */ \
+ db_lockmode_t lock_mode; /* Lock mode. */
+
+struct __dbc_internal {
+ __DBC_INTERNAL
+};
+
+/* Actions that __db_master_update can take. */
+typedef enum { MU_REMOVE, MU_RENAME, MU_OPEN, MU_MOVE } mu_action;
+
+/*
+ * Access-method-common macro for determining whether a cursor
+ * has been initialized.
+ */
+#ifdef HAVE_PARTITION
+#define IS_INITIALIZED(dbc) (DB_IS_PARTITIONED((dbc)->dbp) ? \
+ ((PART_CURSOR *)(dbc)->internal)->sub_cursor != NULL && \
+ ((PART_CURSOR *)(dbc)->internal)->sub_cursor-> \
+ internal->pgno != PGNO_INVALID : \
+ (dbc)->internal->pgno != PGNO_INVALID)
+#else
+#define IS_INITIALIZED(dbc) ((dbc)->internal->pgno != PGNO_INVALID)
+#endif
+
+/* Free the callback-allocated buffer, if necessary, hanging off of a DBT. */
+#define FREE_IF_NEEDED(env, dbt) \
+ if (F_ISSET((dbt), DB_DBT_APPMALLOC)) { \
+ __os_ufree((env), (dbt)->data); \
+ F_CLR((dbt), DB_DBT_APPMALLOC); \
+ }
+
+/*
+ * Use memory belonging to object "owner" to return the results of
+ * any no-DBT-flag get ops on cursor "dbc".
+ */
+#define SET_RET_MEM(dbc, owner) \
+ do { \
+ (dbc)->rskey = &(owner)->my_rskey; \
+ (dbc)->rkey = &(owner)->my_rkey; \
+ (dbc)->rdata = &(owner)->my_rdata; \
+ } while (0)
+
+/* Make "dest" use the return-data memory that "src" is currently set to use. */
+#define COPY_RET_MEM(src, dest) \
+ do { \
+ (dest)->rskey = (src)->rskey; \
+ (dest)->rkey = (src)->rkey; \
+ (dest)->rdata = (src)->rdata; \
+ } while (0)
+
+/* Reset the returned-memory pointers to their defaults. */
+#define RESET_RET_MEM(dbc) \
+ do { \
+ (dbc)->rskey = &(dbc)->my_rskey; \
+ (dbc)->rkey = &(dbc)->my_rkey; \
+ (dbc)->rdata = &(dbc)->my_rdata; \
+ } while (0)
+
+#define COMPACT_TRUNCATE(c_data) do { \
+ if (c_data->compact_truncate > 1) \
+ c_data->compact_truncate--; \
+} while (0)
+
+/*******************************************************
+ * Mpool.
+ *******************************************************/
+/*
+ * File types for DB access methods. Negative numbers are reserved to DB.
+ */
+#define DB_FTYPE_SET -1 /* Call pgin/pgout functions. */
+#define DB_FTYPE_NOTSET 0 /* Don't call... */
+#define DB_LSN_OFF_NOTSET -1 /* Not yet set. */
+#define DB_CLEARLEN_NOTSET UINT32_MAX /* Not yet set. */
+
+/* Structure used as the DB pgin/pgout pgcookie. */
+typedef struct __dbpginfo {
+ u_int32_t db_pagesize; /* Underlying page size. */
+ u_int32_t flags; /* Some DB_AM flags needed. */
+ DBTYPE type; /* DB type */
+} DB_PGINFO;
+
+/*******************************************************
+ * Log.
+ *******************************************************/
+/* Initialize an LSN to 'zero'. */
+#define ZERO_LSN(LSN) do { \
+ (LSN).file = 0; \
+ (LSN).offset = 0; \
+} while (0)
+#define IS_ZERO_LSN(LSN) ((LSN).file == 0 && (LSN).offset == 0)
+
+#define IS_INIT_LSN(LSN) ((LSN).file == 1 && (LSN).offset == 0)
+#define INIT_LSN(LSN) do { \
+ (LSN).file = 1; \
+ (LSN).offset = 0; \
+} while (0)
+
+#define MAX_LSN(LSN) do { \
+ (LSN).file = UINT32_MAX; \
+ (LSN).offset = UINT32_MAX; \
+} while (0)
+#define IS_MAX_LSN(LSN) \
+ ((LSN).file == UINT32_MAX && (LSN).offset == UINT32_MAX)
+
+/* If logging is turned off, smash the lsn. */
+#define LSN_NOT_LOGGED(LSN) do { \
+ (LSN).file = 0; \
+ (LSN).offset = 1; \
+} while (0)
+#define IS_NOT_LOGGED_LSN(LSN) \
+ ((LSN).file == 0 && (LSN).offset == 1)
+
+/*
+ * LOG_COMPARE -- compare two LSNs.
+ */
+#define LOG_COMPARE(lsn0, lsn1) \
+ ((lsn0)->file != (lsn1)->file ? \
+ ((lsn0)->file < (lsn1)->file ? -1 : 1) : \
+ ((lsn0)->offset != (lsn1)->offset ? \
+ ((lsn0)->offset < (lsn1)->offset ? -1 : 1) : 0))
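+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * LOG_COMPARE orders LSNs the way strcmp orders strings -- first by
+ * file, then by offset within the file.
+ */
+static int
+__example_lsn_newer(const DB_LSN *a, const DB_LSN *b)
+{
+	/* Non-zero iff "a" is strictly later in the log than "b". */
+	return (LOG_COMPARE(a, b) > 0);
+}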
+
+/*******************************************************
+ * Txn.
+ *******************************************************/
+#define DB_NONBLOCK(C) ((C)->txn != NULL && F_ISSET((C)->txn, TXN_NOWAIT))
+#define NOWAIT_FLAG(txn) \
+ ((txn) != NULL && F_ISSET((txn), TXN_NOWAIT) ? DB_LOCK_NOWAIT : 0)
+#define IS_REAL_TXN(txn) \
+ ((txn) != NULL && !F_ISSET(txn, TXN_FAMILY))
+#define IS_SUBTRANSACTION(txn) \
+ ((txn) != NULL && (txn)->parent != NULL)
+
+/* Checks for existence of an XA transaction in access method interfaces. */
+#define XA_CHECK_TXN(ip, txn) \
+ if ((ip) != NULL && (txn) == NULL) { \
+ (txn) = SH_TAILQ_FIRST(&(ip)->dbth_xatxn, __db_txn); \
+ DB_ASSERT(env, txn == NULL || \
+ txn->xa_thr_status == TXN_XA_THREAD_ASSOCIATED); \
+ }
+
+/* Ensure that there is no XA transaction active. */
+#define XA_NO_TXN(ip, retval) { \
+ DB_TXN *__txn; \
+ retval = 0; \
+ if ((ip) != NULL) { \
+ __txn = SH_TAILQ_FIRST(&(ip)->dbth_xatxn, __db_txn); \
+ if (__txn != NULL && \
+ __txn->xa_thr_status == TXN_XA_THREAD_ASSOCIATED) \
+ retval = EINVAL; \
+ } \
+}
+
+/*******************************************************
+ * Crypto.
+ *******************************************************/
+#define DB_IV_BYTES 16 /* Bytes per IV */
+#define DB_MAC_KEY 20 /* Bytes per MAC checksum */
+
+/*******************************************************
+ * Compression
+ *******************************************************/
+#define CMP_INT_SPARE_VAL 0xFC /* Smallest byte value that the integer
+ compression algorithm doesn't use */
+
+#if defined(__cplusplus)
+}
+#endif
+
+/*******************************************************
+ * Remaining general DB includes.
+ *******************************************************/
+@db_int_def@
+
+#include "dbinc/globals.h"
+#include "dbinc/clock.h"
+#include "dbinc/debug.h"
+#include "dbinc/region.h"
+#include "dbinc_auto/env_ext.h"
+#include "dbinc/mutex.h"
+#ifdef HAVE_REPLICATION_THREADS
+#include "dbinc/repmgr.h"
+#endif
+#include "dbinc/rep.h"
+#include "dbinc/os.h"
+#include "dbinc_auto/clib_ext.h"
+#include "dbinc_auto/common_ext.h"
+
+/*******************************************************
+ * Remaining Log.
+ * These need to be defined after the general includes
+ * because they need rep.h from above.
+ *******************************************************/
+/*
+ * Test if the environment is currently logging changes. If we're in recovery
+ * or we're a replication client, we don't need to log changes because they're
+ * already in the log, even though we have a fully functional log system.
+ */
+#define DBENV_LOGGING(env) \
+ (LOGGING_ON(env) && !IS_REP_CLIENT(env) && (!IS_RECOVERING(env)))
+
+/*
+ * Test if we need to log a change. By default, we don't log operations without
+ * associated transactions, unless DIAGNOSTIC, DEBUG_ROP or DEBUG_WOP is
+ * defined. In those builds we want log records for read/write operations
+ * because, when we are trying to debug something, more information is
+ * always better.
+ *
+ * The DBC_RECOVER flag is set when we're in abort, as well as during recovery;
+ * thus DBC_LOGGING may be false for a particular dbc even when DBENV_LOGGING
+ * is true.
+ *
+ * We explicitly use LOGGING_ON/IS_REP_CLIENT here because we don't want to pull
+ * in the log headers, which IS_RECOVERING (and thus DBENV_LOGGING) rely on, and
+ * because DBC_RECOVER should be set anytime IS_RECOVERING would be true.
+ *
+ * If we're not in recovery (i.e., a master doing an abort or a client
+ * applying a txn), then a client's only path through here is on an internal
+ * operation, and a master's only path through here is a transactional
+ * operation. Detect if either is not the case.
+ */
+#if defined(DIAGNOSTIC) || defined(DEBUG_ROP) || defined(DEBUG_WOP)
+#define DBC_LOGGING(dbc) __dbc_logging(dbc)
+#else
+#define DBC_LOGGING(dbc) \
+ ((dbc)->txn != NULL && LOGGING_ON((dbc)->env) && \
+ !F_ISSET((dbc), DBC_RECOVER) && !IS_REP_CLIENT((dbc)->env))
+#endif
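+
+/*
+ * Editor's sketch (illustrative): the idiom used throughout the access
+ * methods.  "__example_op_log" stands in for a generated log routine and
+ * is hypothetical; when we aren't logging, the page LSN is smashed so
+ * recovery never tries to match it against a log record:
+ *
+ * if (DBC_LOGGING(dbc)) {
+ *	if ((ret = __example_op_log(dbc, &lsn, pgno)) != 0)
+ *		return (ret);
+ * } else
+ *	LSN_NOT_LOGGED(lsn);
+ */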
+
+#endif /* !_DB_INT_H_ */
diff --git a/src/dbinc/db_join.h b/src/dbinc/db_join.h
new file mode 100644
index 00000000..aecf059a
--- /dev/null
+++ b/src/dbinc/db_join.h
@@ -0,0 +1,37 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_JOIN_H_
+#define _DB_JOIN_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Joins use a join cursor that is similar to a regular DB cursor except
+ * that it only supports c_get and c_close functionality. Also, it does
+ * not support the full range of flags for get.
+ */
+typedef struct __join_cursor {
+ u_int8_t *j_exhausted; /* Array of flags; is cursor i exhausted? */
+ DBC **j_curslist; /* Array of cursors in the join: constant. */
+ DBC **j_fdupcurs; /* Cursors w/ first instances of current dup. */
+ DBC **j_workcurs; /* Scratch cursor copies to muck with. */
+ DB *j_primary; /* Primary dbp. */
+ DBT j_key; /* Used to do lookups. */
+ DBT j_rdata; /* Memory used for data return. */
+ u_int32_t j_ncurs; /* How many cursors do we have? */
+#define JOIN_RETRY 0x01 /* Error on primary get; re-return same key. */
+ u_int32_t flags;
+} JOIN_CURSOR;
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_JOIN_H_ */
diff --git a/src/dbinc/db_page.h b/src/dbinc/db_page.h
new file mode 100644
index 00000000..2d4de2e5
--- /dev/null
+++ b/src/dbinc/db_page.h
@@ -0,0 +1,841 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_PAGE_H_
+#define _DB_PAGE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * DB page formats.
+ *
+ * !!!
+ * This implementation requires that values within the following structures
+ * NOT be padded -- note, ANSI C permits random padding within structures.
+ * If your compiler pads randomly you can just forget ever making DB run on
+ * your system. In addition, no data type can require larger alignment than
+ * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
+ *
+ * Note that key/data lengths are often stored in db_indx_t's -- this is
+ * not accidental, nor does it limit the key/data size. If the key/data
+ * item fits on a page, it's guaranteed to be small enough to fit into a
+ * db_indx_t, and storing it in one saves space.
+ */
+
+#define PGNO_INVALID 0 /* Invalid page number in any database. */
+#define PGNO_BASE_MD 0 /* Base database: metadata page number. */
+
+/* Page types. */
+#define P_INVALID 0 /* Invalid page type. */
+#define __P_DUPLICATE 1 /* Duplicate. DEPRECATED in 3.1 */
+#define P_HASH_UNSORTED 2 /* Hash pages created pre 4.6. DEPRECATED */
+#define P_IBTREE 3 /* Btree internal. */
+#define P_IRECNO 4 /* Recno internal. */
+#define P_LBTREE 5 /* Btree leaf. */
+#define P_LRECNO 6 /* Recno leaf. */
+#define P_OVERFLOW 7 /* Overflow. */
+#define P_HASHMETA 8 /* Hash metadata page. */
+#define P_BTREEMETA 9 /* Btree metadata page. */
+#define P_QAMMETA 10 /* Queue metadata page. */
+#define P_QAMDATA 11 /* Queue data page. */
+#define P_LDUP 12 /* Off-page duplicate leaf. */
+#define P_HASH 13 /* Sorted hash page. */
+#define P_HEAPMETA 14 /* Heap metadata page. */
+#define P_HEAP 15 /* Heap data page. */
+#define P_IHEAP 16 /* Heap internal. */
+#define P_PAGETYPE_MAX 17
+/* Flag to __db_new */
+#define P_DONTEXTEND 0x8000 /* Don't allocate if there are no free pages. */
+
+/*
+ * When we create pages in mpool, we ask mpool to clear some number of bytes
+ * in the header. This number must be at least as big as the regular page
+ * headers and cover enough of the btree and hash meta-data pages to obliterate
+ * the page type.
+ */
+#define DB_PAGE_DB_LEN 32
+#define DB_PAGE_QUEUE_LEN 0
+
+/************************************************************************
+ GENERIC METADATA PAGE HEADER
+ *
+ * !!!
+ * The magic and version numbers have to be in the same place in all versions
+ * of the metadata page as the application may not have upgraded the database.
+ ************************************************************************/
+typedef struct _dbmeta33 {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int8_t encrypt_alg; /* 24: Encryption algorithm. */
+ u_int8_t type; /* 25: Page type. */
+#define DBMETA_CHKSUM 0x01
+#define DBMETA_PART_RANGE 0x02
+#define DBMETA_PART_CALLBACK 0x04
+ u_int8_t metaflags; /* 26: Meta-only flags */
+ u_int8_t unused1; /* 27: Unused. */
+ u_int32_t free; /* 28-31: Free list page number. */
+ db_pgno_t last_pgno; /* 32-35: Page number of last page in db. */
+ u_int32_t nparts; /* 36-39: Number of partitions. */
+ u_int32_t key_count; /* 40-43: Cached key count. */
+ u_int32_t record_count; /* 44-47: Cached record count. */
+ u_int32_t flags; /* 48-51: Flags: unique to each AM. */
+ /* 52-71: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA33, DBMETA;
+
+/************************************************************************
+ BTREE METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _btmeta33 {
+#define BTM_DUP 0x001 /* Duplicates. */
+#define BTM_RECNO 0x002 /* Recno tree. */
+#define BTM_RECNUM 0x004 /* Btree: maintain record count. */
+#define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */
+#define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */
+#define BTM_SUBDB 0x020 /* Subdatabases. */
+#define BTM_DUPSORT 0x040 /* Duplicates are sorted. */
+#define BTM_COMPRESS 0x080 /* Compressed. */
+#define BTM_MASK 0x0ff
+ DBMETA dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t unused1; /* 72-75: Unused space. */
+ u_int32_t minkey; /* 76-79: Btree: Minkey. */
+ u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
+ u_int32_t root; /* 88-91: Root page. */
+ u_int32_t unused2[92]; /* 92-459: Unused space. */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+
+ /*
+ * Minimum page size is 512.
+ */
+} BTMETA33, BTMETA;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta33 {
+#define DB_HASH_DUP 0x01 /* Duplicates. */
+#define DB_HASH_SUBDB 0x02 /* Subdatabases. */
+#define DB_HASH_DUPSORT 0x04 /* Duplicates are sorted. */
+ DBMETA dbmeta; /* 00-71: Generic meta-data page header. */
+
+ u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 76-79: Modulo mask into table */
+ u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 84-87: Fill factor */
+ u_int32_t nelem; /* 88-91: Number of keys in hash table */
+ u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */
+#define NCACHED 32 /* number of spare points */
+ /* 96-223: Spare pages for overflow */
+ u_int32_t spares[NCACHED];
+ u_int32_t unused[59]; /* 224-459: Unused space */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+
+ /*
+ * Minimum page size is 512.
+ */
+} HMETA33, HMETA;
+
+/************************************************************************
+ HEAP METADATA PAGE LAYOUT
+*************************************************************************/
+/*
+ * Heap Meta data page structure
+ *
+ */
+typedef struct _heapmeta {
+ DBMETA dbmeta; /* 00-71: Generic meta-data header. */
+
+ db_pgno_t curregion; /* 72-75: Current region pgno. */
+ u_int32_t nregions; /* 76-79: Number of regions. */
+ u_int32_t gbytes; /* 80-83: GBytes for fixed size heap. */
+ u_int32_t bytes; /* 84-87: Bytes for fixed size heap. */
+ u_int32_t region_size; /* 88-91: Max region size. */
+ u_int32_t unused2[92]; /* 92-459: Unused space. */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+
+ /*
+ * Minimum page size is 512.
+ */
+} HEAPMETA;
+
+/************************************************************************
+ QUEUE METADATA PAGE LAYOUT
+ ************************************************************************/
+/*
+ * QAM Meta data page structure
+ *
+ */
+typedef struct _qmeta33 {
+ DBMETA dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t first_recno; /* 72-75: First not deleted record. */
+ u_int32_t cur_recno; /* 76-79: Next recno to be allocated. */
+ u_int32_t re_len; /* 80-83: Fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Fixed-length record pad. */
+ u_int32_t rec_page; /* 88-91: Records Per Page. */
+ u_int32_t page_ext; /* 92-95: Pages per extent */
+
+ u_int32_t unused[91]; /* 96-459: Unused space */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+ /*
+ * Minimum page size is 512.
+ */
+} QMETA33, QMETA;
+
+/*
+ * DBMETASIZE is a constant used by __db_file_setup and DB->verify
+ * as a buffer which is guaranteed to be larger than any possible
+ * metadata page size and smaller than any disk sector.
+ */
+#define DBMETASIZE 512
+
+/************************************************************************
+ BTREE/HASH MAIN PAGE LAYOUT
+ ************************************************************************/
+/*
+ * +-----------------------------------+
+ * | lsn | pgno | prev pgno |
+ * +-----------------------------------+
+ * | next pgno | entries | hf offset |
+ * +-----------------------------------+
+ * | level | type | chksum |
+ * +-----------------------------------+
+ * | iv | index | free --> |
+ * +-----------+-----------------------+
+ * | F R E E A R E A |
+ * +-----------------------------------+
+ * | <-- free | item |
+ * +-----------------------------------+
+ * | item | item | item |
+ * +-----------------------------------+
+ *
+ * The page header proper is 26 bytes (SIZEOF_PAGE, below), followed by
+ * possibly 20 bytes of checksum and possibly 16 bytes of IV (+ 2 bytes
+ * for alignment), and the following indices
+ * are guaranteed to be two-byte aligned. If we aren't doing crypto or
+ * checksumming the bytes are reclaimed for data storage.
+ *
+ * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
+ * key for inp[1]'s data. All other types of pages only contain single items.
+ */
+typedef struct __pg_chksum {
+ u_int8_t unused[2]; /* 26-27: For alignment */
+ u_int8_t chksum[4]; /* 28-31: Checksum */
+} PG_CHKSUM;
+
+typedef struct __pg_crypto {
+ u_int8_t unused[2]; /* 26-27: For alignment */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ /* !!!
+ * Must be 16-byte aligned for crypto
+ */
+} PG_CRYPTO;
+
+typedef struct _db_page {
+ DB_LSN lsn; /* 00-07: Log sequence number. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ db_pgno_t prev_pgno; /* 12-15: Previous page number. */
+ db_pgno_t next_pgno; /* 16-19: Next page number. */
+ db_indx_t entries; /* 20-21: Number of items on the page. */
+ db_indx_t hf_offset; /* 22-23: High free byte page offset. */
+
+ /*
+ * The btree levels are numbered from the leaf to the root, starting
+ * with 1, so the leaf is level 1, its parent is level 2, and so on.
+ * We maintain this level on all btree pages, but the only place that
+ * we actually need it is on the root page. It would not be difficult
+ * to hide the byte on the root page once it becomes an internal page,
+ * so we could get this byte back if we needed it for something else.
+ */
+#define LEAFLEVEL 1
+#define MAXBTREELEVEL 255
+ u_int8_t level; /* 24: Btree tree level. */
+ u_int8_t type; /* 25: Page type. */
+} PAGE;
+
+/*
+ * With many compilers sizeof(PAGE) == 28, while SIZEOF_PAGE == 26.
+ * We add other things directly after the page header and so need
+ * SIZEOF_PAGE. When taking sizeof(PAGE), many compilers pad the
+ * result out to the next 4-byte boundary.
+ */
+#define SIZEOF_PAGE 26
+/*
+ * !!!
+ * DB_AM_ENCRYPT always implies DB_AM_CHKSUM so that must come first.
+ */
+#define P_INP(dbp, pg) \
+ ((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE + \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? sizeof(PG_CRYPTO) : \
+ (F_ISSET((dbp), DB_AM_CHKSUM) ? sizeof(PG_CHKSUM) : 0))))
+
+#define P_IV(dbp, pg) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \
+ SIZEOF_PAGE + SSZA(PG_CRYPTO, iv)) \
+ : NULL)
+
+#define P_CHKSUM(dbp, pg) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \
+ SIZEOF_PAGE + SSZA(PG_CRYPTO, chksum)) : \
+ (F_ISSET((dbp), DB_AM_CHKSUM) ? ((u_int8_t *)(pg) + \
+ SIZEOF_PAGE + SSZA(PG_CHKSUM, chksum)) \
+ : NULL))
+
+/* PAGE element macros. */
+#define LSN(p) (((PAGE *)p)->lsn)
+#define PGNO(p) (((PAGE *)p)->pgno)
+#define PREV_PGNO(p) (((PAGE *)p)->prev_pgno)
+#define NEXT_PGNO(p) (((PAGE *)p)->next_pgno)
+#define NUM_ENT(p) (((PAGE *)p)->entries)
+#define HOFFSET(p) (((PAGE *)p)->hf_offset)
+#define LEVEL(p) (((PAGE *)p)->level)
+#define TYPE(p) (((PAGE *)p)->type)
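+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical): the
+ * inp[] array returned by P_INP holds the page offsets of the items, so
+ * finding an item is an array lookup.
+ */
+static u_int8_t *
+__example_item_ptr(DB *dbp, PAGE *h, u_int32_t indx)
+{
+	db_indx_t *inp;
+
+	inp = P_INP(dbp, h);	/* Skips checksum/IV space when present. */
+	return ((u_int8_t *)h + inp[indx]);	/* Same as P_ENTRY, below. */
+}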
+
+/************************************************************************
+ HEAP PAGE LAYOUT
+ ************************************************************************/
+#define HEAPPG_NORMAL 26
+#define HEAPPG_CHKSUM 48
+#define HEAPPG_SEC 64
+
+/*
+ * +0-----------2------------4-----------6-----------7+
+ * | lsn |
+ * +-------------------------+------------------------+
+ * | pgno | high_pgno |
+ * +-------------+-----------+-----------+------------+
+ * | high_indx | free_indx | entries | hf offset |
+ * +-------+-----+-----------+-----------+------------+
+ * |unused2|type | unused3 | ...chksum... |
+ * +-------+-----+-----------+------------------------+
+ * | ...iv... | offset table / free space map |
+ * +-------------+------------------------------------+
+ * |free-> F R E E A R E A |
+ * +--------------------------------------------------+
+ * | <-- free | item |
+ * +-------------------------+------------------------+
+ * | item | item |
+ * +-------------------------+------------------------+
+ *
+ * The page layout of both heap internal and data pages. If not using
+ * crypto, iv will be overwritten with data. If not using checksumming,
+ * unused3 and chksum will also be overwritten with data and data will start at
+ * 26. Note that this layout lets us re-use a lot of the PAGE element macros
+ * defined above.
+ */
+typedef struct _heappg {
+ DB_LSN lsn; /* 00-07: Log sequence number. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t high_pgno; /* 12-15: Highest page in region. */
+ u_int16_t high_indx; /* 16-17: Highest index in the offset table. */
+ db_indx_t free_indx; /* 18-19: First available index. */
+ db_indx_t entries; /* 20-21: Number of items on the page. */
+ db_indx_t hf_offset; /* 22-23: High free byte page offset. */
+ u_int8_t unused2[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused3[2]; /* 26-27: Never used, just checksum alignment. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+} HEAPPG;
+
+/* First possible pages for heap: page 0 is the metapage, page 1 the first region page. */
+#define FIRST_HEAP_RPAGE 1
+#define FIRST_HEAP_DPAGE 2
+
+typedef struct __heaphdr {
+#define HEAP_RECSPLIT 0x01 /* Heap data record is split */
+#define HEAP_RECFIRST 0x02 /* First piece of a split record */
+#define HEAP_RECLAST 0x04 /* Last piece of a split record */
+ u_int8_t flags; /* 00: Flags describing record. */
+ u_int8_t unused; /* 01: Padding. */
+ u_int16_t size; /* 02-03: The size of the stored data piece. */
+} HEAPHDR;
+
+typedef struct __heaphdrsplt {
+ HEAPHDR std_hdr; /* 00-03: The standard data header */
+ u_int32_t tsize; /* 04-07: Total record size, 1st piece only */
+ db_pgno_t nextpg; /* 08-11: RID.pgno of the next record piece */
+ db_indx_t nextindx; /* 12-13: RID.indx of the next record piece */
+ u_int16_t unused; /* 14-15: Padding. */
+} HEAPSPLITHDR;
+
+#define HEAP_HDRSIZE(hdr) \
+ (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : sizeof(HEAPHDR))
+
+#define HEAPPG_SZ(dbp) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? HEAPPG_SEC : \
+ F_ISSET((dbp), DB_AM_CHKSUM) ? HEAPPG_CHKSUM : HEAPPG_NORMAL)
+
+/* Each byte in the bitmap describes 4 pages (2 bits per page). */
+#define HEAP_REGION_COUNT(dbp, size) (((size) - HEAPPG_SZ(dbp)) * 4)
+#define HEAP_DEFAULT_REGION_MAX(dbp) \
+ (HEAP_REGION_COUNT(dbp, (u_int32_t)8 * 1024))
+#define HEAP_REGION_SIZE(dbp) (((HEAP*) (dbp)->heap_internal)->region_size)
+
+/* Figure out which region a given page belongs to. */
+#define HEAP_REGION_PGNO(dbp, p) \
+ ((((p) - 1) / (HEAP_REGION_SIZE(dbp) + 1)) * \
+ (HEAP_REGION_SIZE(dbp) + 1) + 1)
+/* Translate a region pgno to region number */
+#define HEAP_REGION_NUM(dbp, pgno) \
+ ((((pgno) - 1) / (HEAP_REGION_SIZE((dbp)) + 1)) + 1)
+/*
+ * Given an internal heap page and a page number relative to that page,
+ * return the bits from the map describing free space on the nth page.
+ * Each byte in the map
+ * describes 4 pages. Point at the correct byte and mask the correct 2 bits.
+ */
+#define HEAP_SPACE(dbp, pg, n) \
+ (HEAP_SPACEMAP((dbp), (pg))[(n) / 4] >> (2 * ((n) % 4)) & 3)
+
+#define HEAP_SETSPACE(dbp, pg, n, b) do { \
+ HEAP_SPACEMAP((dbp), (pg))[(n) / 4] &= ~(3 << (2 * ((n) % 4))); \
+ HEAP_SPACEMAP((dbp), (pg))[(n) / 4] |= ((b & 3) << (2 * ((n) % 4))); \
+} while (0)
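+
+/*
+ * Editor's worked example (illustrative): for page n == 5 the bits live
+ * in map byte 5 / 4 == 1, at bit offset 2 * (5 % 4) == 2.  HEAP_SPACE
+ * shifts that byte right by 2 and masks with 3 to recover the 2-bit
+ * value; HEAP_SETSPACE clears those same 2 bits and ORs in the new one.
+ */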
+
+/* Return the bitmap describing free space on heap data pages. */
+#define HEAP_SPACEMAP(dbp, pg) ((u_int8_t *)P_INP((dbp), (pg)))
+
+/* Return the offset table for a heap data page. */
+#define HEAP_OFFSETTBL(dbp, pg) P_INP((dbp), (pg))
+
+/*
+ * Calculate the % of a page a given size occupies and translate that to the
+ * corresponding bitmap value.
+ */
+#define HEAP_CALCSPACEBITS(dbp, sz, space) do { \
+ (space) = 100 * (sz) / (dbp)->pgsize; \
+ if ((space) <= HEAP_PG_FULL_PCT) \
+ (space) = HEAP_PG_FULL; \
+ else if ((space) <= HEAP_PG_GT66_PCT) \
+ (space) = HEAP_PG_GT66; \
+ else if ((space) <= HEAP_PG_GT33_PCT) \
+ (space) = HEAP_PG_GT33; \
+ else \
+ (space) = HEAP_PG_LT33; \
+} while (0)
+
+/* Return the amount of free space on a heap data page. */
+#define HEAP_FREESPACE(dbp, p) \
+ (HOFFSET(p) - HEAPPG_SZ(dbp) - \
+ (NUM_ENT(p) == 0 ? 0 : ((HEAP_HIGHINDX(p) + 1) * sizeof(db_indx_t))))
+
+/* The maximum amount of data that can fit on an empty heap data page. */
+#define HEAP_MAXDATASIZE(dbp) \
+ ((dbp)->pgsize - HEAPPG_SZ(dbp) - sizeof(db_indx_t))
+
+#define HEAP_FREEINDX(p) (((HEAPPG *)p)->free_indx)
+#define HEAP_HIGHINDX(p) (((HEAPPG *)p)->high_indx)
+
+/* True if we have a page that deals with heap */
+#define HEAPTYPE(h) \
+ (TYPE(h) == P_HEAPMETA || TYPE(h) == P_HEAP || TYPE(h) == P_IHEAP)
+
+/************************************************************************
+ QUEUE MAIN PAGE LAYOUT
+ ************************************************************************/
+/*
+ * Sizes of page below. Used to reclaim space if not doing
+ * crypto or checksumming. If you change the QPAGE below you
+ * MUST adjust this too.
+ */
+#define QPAGE_NORMAL 28
+#define QPAGE_CHKSUM 48
+#define QPAGE_SEC 64
+
+typedef struct _qpage {
+ DB_LSN lsn; /* 00-07: Log sequence number. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t unused0[3]; /* 12-23: Unused. */
+ u_int8_t unused1[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused2[2]; /* 26-27: Unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+} QPAGE;
+
+#define QPAGE_SZ(dbp) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? QPAGE_SEC : \
+ F_ISSET((dbp), DB_AM_CHKSUM) ? QPAGE_CHKSUM : QPAGE_NORMAL)
+/*
+ * !!!
+ * The next_pgno and prev_pgno fields are not maintained for btree and recno
+ * internal pages. Doing so only provides a minor performance improvement,
+ * it's hard to do when deleting internal pages, and it increases the chance
+ * of deadlock during deletes and splits because we have to re-link pages at
+ * more than the leaf level.
+ *
+ * !!!
+ * The btree/recno access method needs db_recno_t bytes of space on the root
+ * page to specify how many records are stored in the tree. (The alternative
+ * is to store the number of records in the meta-data page, which will create
+ * a second hot spot in trees being actively modified, or to recalculate it from
+ * the BINTERNAL fields on each access.) Overload the PREV_PGNO field.
+ */
+#define RE_NREC(p) \
+ ((TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) ? PREV_PGNO(p) : \
+ (db_pgno_t)(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : NUM_ENT(p)))
+#define RE_NREC_ADJ(p, adj) \
+ PREV_PGNO(p) += adj;
+#define RE_NREC_SET(p, num) \
+ PREV_PGNO(p) = (num);
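+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * because of the overload above, the record count of a subtree is read
+ * the same way no matter which page type is in hand.
+ */
+static db_recno_t
+__example_page_nrecs(PAGE *h)
+{
+	return ((db_recno_t)RE_NREC(h));
+}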
+
+/*
+ * Initialize a page.
+ *
+ * !!!
+ * Don't modify the page's LSN, code depends on it being unchanged after a
+ * P_INIT call.
+ */
+#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \
+ PGNO(pg) = (n); \
+ PREV_PGNO(pg) = (pg_prev); \
+ NEXT_PGNO(pg) = (pg_next); \
+ NUM_ENT(pg) = (0); \
+ HOFFSET(pg) = (db_indx_t)(pg_size); \
+ LEVEL(pg) = (btl); \
+ TYPE(pg) = (pg_type); \
+} while (0)
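+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * formatting a freshly allocated buffer of dbp->pgsize bytes as an
+ * empty btree leaf.
+ */
+static void
+__example_init_leaf(DB *dbp, PAGE *h, db_pgno_t pgno)
+{
+	P_INIT(h, dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID,
+	    LEAFLEVEL, P_LBTREE);
+}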
+
+/* Page header length (offset to first index). */
+#define P_OVERHEAD(dbp) P_TO_UINT16(P_INP(dbp, 0))
+
+/* First free byte. */
+#define LOFFSET(dbp, pg) \
+ (P_OVERHEAD(dbp) + NUM_ENT(pg) * sizeof(db_indx_t))
+
+/* Free space on a regular page. */
+#define P_FREESPACE(dbp, pg) (HOFFSET(pg) - LOFFSET(dbp, pg))
+
+/* Get a pointer to the bytes at a specific index. */
+#define P_ENTRY(dbp, pg, indx) ((u_int8_t *)pg + P_INP(dbp, pg)[indx])
+
+/************************************************************************
+ OVERFLOW PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
+ * store a page number (the first page of the overflow item) and a length
+ * (the total length of the overflow item). The overflow item consists of
+ * some number of overflow pages, linked by the next_pgno field of the page.
+ * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
+ *
+ * Overflow page overloads:
+ * The amount of overflow data stored on each page is stored in the
+ * hf_offset field.
+ *
+ * The implementation reference counts overflow items as it's possible
+ * for them to be promoted onto btree internal pages. The reference
+ * count is stored in the entries field.
+ */
+#define OV_LEN(p) (((PAGE *)p)->hf_offset)
+#define OV_REF(p) (((PAGE *)p)->entries)
+
+/* Maximum number of bytes that you can put on an overflow page. */
+#define P_MAXSPACE(dbp, psize) ((psize) - P_OVERHEAD(dbp))
+
+/* Free space on an overflow page. */
+#define P_OVFLSPACE(dbp, psize, pg) (P_MAXSPACE(dbp, psize) - HOFFSET(pg))
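+
+/*
+ * Editor's sketch (illustrative): reading an overflow item is a walk of
+ * the page chain, copying OV_LEN(p) bytes from each page until the next
+ * page number is PGNO_INVALID.  The mpool fetch/release of each page is
+ * elided here:
+ *
+ * for (pgno = first_pgno; pgno != PGNO_INVALID; pgno = NEXT_PGNO(p)) {
+ *	... fetch page "pgno" into "p" via mpool ...
+ *	memcpy(dest, (u_int8_t *)p + P_OVERHEAD(dbp), OV_LEN(p));
+ *	dest += OV_LEN(p);
+ * }
+ */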
+
+/************************************************************************
+ HASH PAGE LAYOUT
+ ************************************************************************/
+
+/* Each index references a group of bytes on the page. */
+#define H_KEYDATA 1 /* Key/data item. */
+#define H_DUPLICATE 2 /* Duplicate key/data item. */
+#define H_OFFPAGE 3 /* Overflow key/data item. */
+#define H_OFFDUP 4 /* Overflow page of duplicates. */
+
+/*
+ * !!!
+ * Items on hash pages are (potentially) unaligned, so we can never cast the
+ * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
+ * we do with B+tree on-page structures. Because we frequently want the type
+ * field, which requires no alignment and is in the same location in all three
+ * structures, there's a pair of macros.
+ */
+#define HPAGE_PTYPE(p) (*(u_int8_t *)p)
+#define HPAGE_TYPE(dbp, pg, indx) (*P_ENTRY(dbp, pg, indx))
+
+/*
+ * The first and second types are H_KEYDATA and H_DUPLICATE, represented
+ * by the HKEYDATA structure:
+ *
+ * +-----------------------------------+
+ * | type | key/data ... |
+ * +-----------------------------------+
+ *
+ * For duplicates, the data field encodes duplicate elements in the data
+ * field:
+ *
+ * +---------------------------------------------------------------+
+ * | type | len1 | element1 | len1 | len2 | element2 | len2 |
+ * +---------------------------------------------------------------+
+ *
+ * Thus, by keeping track of the offset in the element, we can do both
+ * backward and forward traversal.
+ */
+typedef struct _hkeydata {
+ u_int8_t type; /* 00: Page type. */
+ u_int8_t data[1]; /* Variable length key/data item. */
+} HKEYDATA;
+#define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data))
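+
+/*
+ * Editor's sketch (illustrative): stepping forward through an on-page
+ * duplicate set.  Each element is bracketed by two copies of its length,
+ * and the lengths may be unaligned, hence the memcpy:
+ *
+ * p = HKEYDATA_DATA(P_ENTRY(dbp, pg, indx));
+ * memcpy(&len, p, sizeof(db_indx_t));
+ * element = p + sizeof(db_indx_t);
+ * next = element + len + sizeof(db_indx_t);
+ */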
+
+/*
+ * The length of any HKEYDATA item. Note that indx is an element index,
+ * not a PAIR index.
+ */
+#define LEN_HITEM(dbp, pg, pgsize, indx) \
+ (((indx) == 0 ? (pgsize) : \
+ (P_INP(dbp, pg)[(indx) - 1])) - (P_INP(dbp, pg)[indx]))
+
+#define LEN_HKEYDATA(dbp, pg, psize, indx) \
+ (db_indx_t)(LEN_HITEM(dbp, pg, psize, indx) - HKEYDATA_SIZE(0))
+
+/*
+ * Page space required to add a new HKEYDATA item to the page, with and
+ * without the index value.
+ */
+#define HKEYDATA_SIZE(len) \
+ ((len) + SSZA(HKEYDATA, data))
+#define HKEYDATA_PSIZE(len) \
+ (HKEYDATA_SIZE(len) + sizeof(db_indx_t))
+
+/* Put a HKEYDATA item at the location referenced by a page entry. */
+#define PUT_HKEYDATA(pe, kd, len, etype) { \
+ ((HKEYDATA *)(pe))->type = etype; \
+ memcpy((u_int8_t *)(pe) + sizeof(u_int8_t), kd, len); \
+}
+
+/*
+ * Macros that describe the page layout in terms of key-data pairs.
+ */
+#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
+#define H_KEYINDEX(indx) (indx)
+#define H_DATAINDEX(indx) ((indx) + 1)
+#define H_PAIRKEY(dbp, pg, indx) P_ENTRY(dbp, pg, H_KEYINDEX(indx))
+#define H_PAIRDATA(dbp, pg, indx) P_ENTRY(dbp, pg, H_DATAINDEX(indx))
+#define H_PAIRSIZE(dbp, pg, psize, indx) \
+ (LEN_HITEM(dbp, pg, psize, H_KEYINDEX(indx)) + \
+ LEN_HITEM(dbp, pg, psize, H_DATAINDEX(indx)))
+#define LEN_HDATA(dbp, p, psize, indx) \
+ LEN_HKEYDATA(dbp, p, psize, H_DATAINDEX(indx))
+#define LEN_HKEY(dbp, p, psize, indx) \
+ LEN_HKEYDATA(dbp, p, psize, H_KEYINDEX(indx))
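+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * pulling both lengths of a pair whose key sits at element index "indx".
+ * Only valid when the items are of type H_KEYDATA.
+ */
+static void
+__example_pair_lens(DB *dbp, PAGE *pg, u_int32_t indx,
+    db_indx_t *klenp, db_indx_t *dlenp)
+{
+	*klenp = LEN_HKEY(dbp, pg, dbp->pgsize, indx);
+	*dlenp = LEN_HDATA(dbp, pg, dbp->pgsize, indx);
+}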
+
+/*
+ * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
+ */
+typedef struct _hoffpage {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t unused[3]; /* 01-03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Offpage page number. */
+ u_int32_t tlen; /* 08-11: Total length of item. */
+} HOFFPAGE;
+
+#define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
+#define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
+
+/*
+ * Page space required to add a new HOFFPAGE item to the page, with and
+ * without the index value.
+ */
+#define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
+#define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t))
+
+/*
+ * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
+ */
+typedef struct _hoffdup {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t unused[3]; /* 01-03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Offpage page number. */
+} HOFFDUP;
+#define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
+
+/*
+ * Page space required to add a new HOFFDUP item to the page, with and
+ * without the index value.
+ */
+#define HOFFDUP_SIZE (sizeof(HOFFDUP))
+
+/************************************************************************
+ BTREE PAGE LAYOUT
+ ************************************************************************/
+
+/* Each index references a group of bytes on the page. */
+#define B_KEYDATA 1 /* Key/data item. */
+#define B_DUPLICATE 2 /* Duplicate key/data item. */
+#define B_OVERFLOW 3 /* Overflow key/data item. */
+
+/*
+ * We have to store a deleted entry flag in the page. The reason is complex,
+ * but the simple version is that we can't delete on-page items referenced by
+ * a cursor -- the return order of subsequent insertions might be wrong. The
+ * delete flag is an overload of the top bit of the type byte.
+ */
+#define B_DELETE (0x80)
+#define B_DCLR(t) (t) &= ~B_DELETE
+#define B_DSET(t) (t) |= B_DELETE
+#define B_DISSET(t) ((t) & B_DELETE)
+
+#define B_TYPE(t) ((t) & ~B_DELETE)
+#define B_TSET(t, type) ((t) = B_TYPE(type))
+#define B_TSET_DELETED(t, type) ((t) = (type) | B_DELETE)
+
+/*
+ * The first type is B_KEYDATA, represented by the BKEYDATA structure:
+ */
+typedef struct _bkeydata {
+ db_indx_t len; /* 00-01: Key/data item length. */
+ u_int8_t type; /* 02: Page type AND DELETE FLAG. */
+ u_int8_t data[1]; /* Variable length key/data item. */
+} BKEYDATA;
+
+/* Get a BKEYDATA item for a specific index. */
+#define GET_BKEYDATA(dbp, pg, indx) \
+ ((BKEYDATA *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new BKEYDATA item to the page, with and
+ * without the index value. The (u_int16_t) cast avoids warnings: DB_ALIGN
+ * casts to uintmax_t, the cast converts it to a small integral type so we
+ * don't get complaints when we assign the final result to an integral type
+ * smaller than uintmax_t.
+ */
+#define BKEYDATA_SIZE(len) \
+ (u_int16_t)DB_ALIGN((len) + SSZA(BKEYDATA, data), sizeof(u_int32_t))
+#define BKEYDATA_PSIZE(len) \
+ (BKEYDATA_SIZE(len) + sizeof(db_indx_t))
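+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical): the
+ * usual scan pattern -- mask off the delete bit before switching on the
+ * type, and skip items a cursor has marked deleted.
+ */
+static int
+__example_item_is_live_keydata(DB *dbp, PAGE *pg, u_int32_t indx)
+{
+	BKEYDATA *bk;
+
+	bk = GET_BKEYDATA(dbp, pg, indx);
+	if (B_DISSET(bk->type))
+		return (0);
+	return (B_TYPE(bk->type) == B_KEYDATA);
+}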
+
+/*
+ * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
+ * by the BOVERFLOW structure.
+ */
+typedef struct _boverflow {
+ db_indx_t unused1; /* 00-01: Padding, unused. */
+ u_int8_t type; /* 02: Page type AND DELETE FLAG. */
+ u_int8_t unused2; /* 03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Next page number. */
+ u_int32_t tlen; /* 08-11: Total length of item. */
+} BOVERFLOW;
+
+/* Get a BOVERFLOW item for a specific index. */
+#define GET_BOVERFLOW(dbp, pg, indx) \
+ ((BOVERFLOW *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new BOVERFLOW item to the page, with and
+ * without the index value.
+ */
+#define BOVERFLOW_SIZE \
+ ((u_int16_t)DB_ALIGN(sizeof(BOVERFLOW), sizeof(u_int32_t)))
+#define BOVERFLOW_PSIZE \
+ (BOVERFLOW_SIZE + sizeof(db_indx_t))
+
+#define BITEM_SIZE(bk) \
+ (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \
+ BKEYDATA_SIZE((bk)->len))
+
+#define BITEM_PSIZE(bk) \
+ (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \
+ BKEYDATA_PSIZE((bk)->len))
+
+/*
+ * Btree leaf and hash page layouts group indices in sets of two, one for the
+ * key and one for the data. Everything else does it in sets of one to save
+ * space. Use the following macros so that it's obvious what's going on.
+ */
+#define O_INDX 1
+#define P_INDX 2
+
+/************************************************************************
+ BTREE INTERNAL PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Btree internal entry.
+ */
+typedef struct _binternal {
+ db_indx_t len; /* 00-01: Key/data item length. */
+ u_int8_t type; /* 02: Page type AND DELETE FLAG. */
+ u_int8_t unused; /* 03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Page number of referenced page. */
+ db_recno_t nrecs; /* 08-11: Subtree record count. */
+ u_int8_t data[1]; /* Variable length key item. */
+} BINTERNAL;
+
+/* Get a BINTERNAL item for a specific index. */
+#define GET_BINTERNAL(dbp, pg, indx) \
+ ((BINTERNAL *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new BINTERNAL item to the page, with and
+ * without the index value.
+ */
+#define BINTERNAL_SIZE(len) \
+ (u_int16_t)DB_ALIGN((len) + SSZA(BINTERNAL, data), sizeof(u_int32_t))
+#define BINTERNAL_PSIZE(len) \
+ (BINTERNAL_SIZE(len) + sizeof(db_indx_t))
+
+/************************************************************************
+ RECNO INTERNAL PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * The recno internal entry.
+ */
+typedef struct _rinternal {
+ db_pgno_t pgno; /* 00-03: Page number of referenced page. */
+ db_recno_t nrecs; /* 04-07: Subtree record count. */
+} RINTERNAL;
+
+/* Get a RINTERNAL item for a specific index. */
+#define GET_RINTERNAL(dbp, pg, indx) \
+ ((RINTERNAL *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new RINTERNAL item to the page, with and
+ * without the index value.
+ */
+#define RINTERNAL_SIZE \
+ (u_int16_t)DB_ALIGN(sizeof(RINTERNAL), sizeof(u_int32_t))
+#define RINTERNAL_PSIZE \
+ (RINTERNAL_SIZE + sizeof(db_indx_t))
+
+typedef struct __pglist {
+ db_pgno_t pgno, next_pgno;
+ DB_LSN lsn;
+} db_pglist_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_PAGE_H_ */
diff --git a/src/dbinc/db_swap.h b/src/dbinc/db_swap.h
new file mode 100644
index 00000000..352ae227
--- /dev/null
+++ b/src/dbinc/db_swap.h
@@ -0,0 +1,262 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_SWAP_H_
+#define _DB_SWAP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Little endian <==> big endian 64-bit swap macros.
+ * M_64_SWAP swap a memory location
+ * P_64_COPY copy potentially unaligned 4 byte quantities
+ * P_64_SWAP swap a referenced memory location
+ */
+#undef M_64_SWAP
+#define M_64_SWAP(a) { \
+ u_int64_t _tmp; \
+ _tmp = (u_int64_t)a; \
+ ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \
+ ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \
+ ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \
+ ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \
+ ((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \
+ ((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \
+ ((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \
+ ((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \
+}
+#undef P_64_COPY
+#define P_64_COPY(a, b) { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \
+ ((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \
+ ((u_int8_t *)b)[4] = ((u_int8_t *)a)[4]; \
+ ((u_int8_t *)b)[5] = ((u_int8_t *)a)[5]; \
+ ((u_int8_t *)b)[6] = ((u_int8_t *)a)[6]; \
+ ((u_int8_t *)b)[7] = ((u_int8_t *)a)[7]; \
+}
+#undef P_64_SWAP
+#define P_64_SWAP(a) { \
+ u_int64_t _tmp; \
+ P_64_COPY(a, &_tmp); \
+ ((u_int8_t *)a)[0] = ((u_int8_t *)&_tmp)[7]; \
+ ((u_int8_t *)a)[1] = ((u_int8_t *)&_tmp)[6]; \
+ ((u_int8_t *)a)[2] = ((u_int8_t *)&_tmp)[5]; \
+ ((u_int8_t *)a)[3] = ((u_int8_t *)&_tmp)[4]; \
+ ((u_int8_t *)a)[4] = ((u_int8_t *)&_tmp)[3]; \
+ ((u_int8_t *)a)[5] = ((u_int8_t *)&_tmp)[2]; \
+ ((u_int8_t *)a)[6] = ((u_int8_t *)&_tmp)[1]; \
+ ((u_int8_t *)a)[7] = ((u_int8_t *)&_tmp)[0]; \
+}
+
+/*
+ * Little endian <==> big endian 32-bit swap macros.
+ * P_32_COPY copy potentially unaligned 4 byte quantities
+ * P_32_COPYSWAP copy and swap potentially unaligned 4 byte quantities
+ * P_32_SWAP swap a referenced memory location
+ * M_32_SWAP swap a memory location
+ */
+#undef P_32_COPY
+#define P_32_COPY(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \
+ ((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \
+} while (0)
+#undef P_32_COPYSWAP
+#define P_32_COPYSWAP(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[3]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[2]; \
+ ((u_int8_t *)b)[2] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[3] = ((u_int8_t *)a)[0]; \
+} while (0)
+#undef P_32_SWAP
+#define P_32_SWAP(a) do { \
+ u_int32_t _tmp; \
+ P_32_COPY(a, &_tmp); \
+ P_32_COPYSWAP(&_tmp, a); \
+} while (0)
+#undef M_32_SWAP
+#define M_32_SWAP(a) P_32_SWAP(&a)
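+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * M_32_SWAP reverses the bytes in place, so 0x11223344 becomes
+ * 0x44332211.
+ */
+static u_int32_t
+__example_swap32(u_int32_t v)
+{
+	M_32_SWAP(v);
+	return (v);
+}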
+
+/*
+ * Little endian <==> big endian 16-bit swap macros.
+ * P_16_COPY copy potentially unaligned 2 byte quantities
+ * P_16_COPYSWAP copy and swap potentially unaligned 2 byte quantities
+ * P_16_SWAP swap a referenced memory location
+ * M_16_SWAP swap a memory location
+ */
+#undef P_16_COPY
+#define P_16_COPY(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+} while (0)
+#undef P_16_COPYSWAP
+#define P_16_COPYSWAP(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[0]; \
+} while (0)
+#undef P_16_SWAP
+#define P_16_SWAP(a) do { \
+ u_int16_t _tmp; \
+ P_16_COPY(a, &_tmp); \
+ P_16_COPYSWAP(&_tmp, a); \
+} while (0)
+#undef M_16_SWAP
+#define M_16_SWAP(a) P_16_SWAP(&a)
+
+#undef SWAP32
+#define SWAP32(p) { \
+ P_32_SWAP(p); \
+ (p) += sizeof(u_int32_t); \
+}
+#undef SWAP16
+#define SWAP16(p) { \
+ P_16_SWAP(p); \
+ (p) += sizeof(u_int16_t); \
+}
+
+/*
+ * Berkeley DB has local versions of htonl() and ntohl() that operate on
+ * pointers to the right size memory locations; the portability magic for
+ * finding the real system functions isn't worth the effort.
+ */
+#undef DB_HTONL_SWAP
+#define DB_HTONL_SWAP(env, p) do { \
+ if (F_ISSET((env), ENV_LITTLEENDIAN)) \
+ P_32_SWAP(p); \
+} while (0)
+#undef DB_NTOHL_SWAP
+#define DB_NTOHL_SWAP(env, p) do { \
+ if (F_ISSET((env), ENV_LITTLEENDIAN)) \
+ P_32_SWAP(p); \
+} while (0)
+
+#undef DB_NTOHL_COPYIN
+#define DB_NTOHL_COPYIN(env, i, p) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)&(i); \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ tmp[3] = *p++; \
+ tmp[2] = *p++; \
+ tmp[1] = *p++; \
+ tmp[0] = *p++; \
+ } else { \
+ memcpy(&i, p, sizeof(u_int32_t)); \
+ p = (u_int8_t *)p + sizeof(u_int32_t); \
+ } \
+} while (0)
+
+#undef DB_NTOHS_COPYIN
+#define DB_NTOHS_COPYIN(env, i, p) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)&(i); \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ tmp[1] = *p++; \
+ tmp[0] = *p++; \
+ } else { \
+ memcpy(&i, p, sizeof(u_int16_t)); \
+ p = (u_int8_t *)p + sizeof(u_int16_t); \
+ } \
+} while (0)
+
+#undef DB_HTONL_COPYOUT
+#define DB_HTONL_COPYOUT(env, p, i) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)p; \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ *tmp++ = ((u_int8_t *)&(i))[3]; \
+ *tmp++ = ((u_int8_t *)&(i))[2]; \
+ *tmp++ = ((u_int8_t *)&(i))[1]; \
+ *tmp++ = ((u_int8_t *)&(i))[0]; \
+ } else \
+ memcpy(p, &i, sizeof(u_int32_t)); \
+ p = (u_int8_t *)p + sizeof(u_int32_t); \
+} while (0)
+
+#undef DB_HTONS_COPYOUT
+#define DB_HTONS_COPYOUT(env, p, i) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)p; \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ *tmp++ = ((u_int8_t *)&(i))[1]; \
+ *tmp++ = ((u_int8_t *)&(i))[0]; \
+ } else \
+ memcpy(p, &i, sizeof(u_int16_t)); \
+ p = (u_int8_t *)p + sizeof(u_int16_t); \
+} while (0)
+
+/*
+ * Helper macros for swapped logs. We write logs in little endian format to
+ * minimize disruption on x86 when upgrading from native byte order to
+ * platform-independent logs.
+ */
+#define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN)
+
+#define LOGCOPY_32(env, x, p) do { \
+ if (LOG_SWAPPED(env)) \
+ P_32_COPYSWAP((p), (x)); \
+ else \
+ memcpy((x), (p), sizeof(u_int32_t)); \
+} while (0)
+
+#define LOGCOPY_16(env, x, p) do { \
+ if (LOG_SWAPPED(env)) \
+ P_16_COPYSWAP((p), (x)); \
+ else \
+ memcpy((x), (p), sizeof(u_int16_t)); \
+} while (0)
+
+#define LOGCOPY_TOLSN(env, lsnp, p) do { \
+ LOGCOPY_32((env), &(lsnp)->file, (p)); \
+ LOGCOPY_32((env), &(lsnp)->offset, \
+ (u_int8_t *)(p) + sizeof(u_int32_t)); \
+} while (0)
+
+#define LOGCOPY_FROMLSN(env, p, lsnp) do { \
+ LOGCOPY_32((env), (p), &(lsnp)->file); \
+ LOGCOPY_32((env), \
+ (u_int8_t *)(p) + sizeof(u_int32_t), &(lsnp)->offset); \
+} while (0)
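+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * unmarshalling the LSN stored at "bp" in a serialized log record image,
+ * swapping only when the host is big-endian.
+ */
+static void
+__example_read_lsn(ENV *env, u_int8_t *bp, DB_LSN *lsnp)
+{
+	LOGCOPY_TOLSN(env, lsnp, bp);
+}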
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_SWAP_H_ */
diff --git a/src/dbinc/db_upgrade.h b/src/dbinc/db_upgrade.h
new file mode 100644
index 00000000..45fb624d
--- /dev/null
+++ b/src/dbinc/db_upgrade.h
@@ -0,0 +1,248 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_UPGRADE_H_
+#define _DB_UPGRADE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines the metadata pages from the previous release.
+ * These structures are only used to upgrade old versions of databases.
+ */
+
+/* Structures from the 3.1 release */
+typedef struct _dbmeta31 {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int8_t unused1[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused2[2]; /* 26-27: Unused. */
+ u_int32_t free; /* 28-31: Free list page number. */
+ DB_LSN unused3; /* 32-39: Unused. */
+ u_int32_t key_count; /* 40-43: Cached key count. */
+ u_int32_t record_count; /* 44-47: Cached record count. */
+ u_int32_t flags; /* 48-51: Flags: unique to each AM. */
+ /* 52-71: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA31;
+
+typedef struct _btmeta31 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t maxkey; /* 72-75: Btree: Maxkey. */
+ u_int32_t minkey; /* 76-79: Btree: Minkey. */
+ u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
+ u_int32_t root; /* 88-91: Root page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} BTMETA31;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta31 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data page header. */
+
+ u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 76-79: Modulo mask into table */
+ u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 84-87: Fill factor */
+ u_int32_t nelem; /* 88-91: Number of keys in hash table */
+ u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */
+#define NCACHED 32 /* number of spare points */
+ /* 96-223: Spare pages for overflow */
+ u_int32_t spares[NCACHED];
+
+ /*
+ * Minimum page size is 256.
+ */
+} HMETA31;
+
+/*
+ * QAM Meta data page structure
+ *
+ */
+typedef struct _qmeta31 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t start; /* 72-75: Start offset. */
+ u_int32_t first_recno; /* 76-79: First not deleted record. */
+ u_int32_t cur_recno; /* 80-83: Last recno allocated. */
+ u_int32_t re_len; /* 84-87: Fixed-length record length. */
+ u_int32_t re_pad; /* 88-91: Fixed-length record pad. */
+ u_int32_t rec_page; /* 92-95: Records Per Page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} QMETA31;
+/* Structures from the 3.2 release */
+typedef struct _qmeta32 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t first_recno; /* 72-75: First not deleted record. */
+ u_int32_t cur_recno; /* 76-79: Last recno allocated. */
+ u_int32_t re_len; /* 80-83: Fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Fixed-length record pad. */
+ u_int32_t rec_page; /* 88-91: Records Per Page. */
+ u_int32_t page_ext; /* 92-95: Pages per extent */
+
+ /*
+ * Minimum page size is 128.
+ */
+} QMETA32;
+
+/* Structures from the 3.0 release */
+
+typedef struct _dbmeta30 {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int8_t unused1[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused2[2]; /* 26-27: Unused. */
+ u_int32_t free; /* 28-31: Free list page number. */
+ u_int32_t flags; /* 32-35: Flags: unique to each AM. */
+ /* 36-55: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA30;
+
+/************************************************************************
+ BTREE METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _btmeta30 {
+ DBMETA30 dbmeta; /* 00-55: Generic meta-data header. */
+
+ u_int32_t maxkey; /* 56-59: Btree: Maxkey. */
+ u_int32_t minkey; /* 60-63: Btree: Minkey. */
+ u_int32_t re_len; /* 64-67: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 68-71: Recno: fixed-length record pad. */
+ u_int32_t root; /* 72-75: Root page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} BTMETA30;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta30 {
+ DBMETA30 dbmeta; /* 00-55: Generic meta-data page header. */
+
+ u_int32_t max_bucket; /* 56-59: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 60-63: Modulo mask into table */
+ u_int32_t low_mask; /* 64-67: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 68-71: Fill factor */
+ u_int32_t nelem; /* 72-75: Number of keys in hash table */
+ u_int32_t h_charkey; /* 76-79: Value of hash(CHARKEY) */
+#define NCACHED30 32 /* number of spare points */
+ /* 80-207: Spare pages for overflow */
+ u_int32_t spares[NCACHED30];
+
+ /*
+ * Minimum page size is 256.
+ */
+} HMETA30;
+
+/************************************************************************
+ QUEUE METADATA PAGE LAYOUT
+ ************************************************************************/
+/*
+ * QAM metadata page structure.
+ */
+typedef struct _qmeta30 {
+ DBMETA30 dbmeta; /* 00-55: Generic meta-data header. */
+
+ u_int32_t start; /* 56-59: Start offset. */
+ u_int32_t first_recno; /* 60-63: First not deleted record. */
+ u_int32_t cur_recno; /* 64-67: Last recno allocated. */
+ u_int32_t re_len; /* 68-71: Fixed-length record length. */
+ u_int32_t re_pad; /* 72-75: Fixed-length record pad. */
+ u_int32_t rec_page; /* 76-79: Records Per Page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} QMETA30;
+
+/* Structures from Release 2.x */
+
+/************************************************************************
+ BTREE METADATA PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Btree metadata page layout:
+ */
+typedef struct _btmeta2X {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int32_t maxkey; /* 24-27: Btree: Maxkey. */
+ u_int32_t minkey; /* 28-31: Btree: Minkey. */
+ u_int32_t free; /* 32-35: Free list page number. */
+ u_int32_t flags; /* 36-39: Flags. */
+ u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */
+ /* 48-67: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} BTMETA2X;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Hash metadata page layout:
+ */
+/* Hash Table Information */
+typedef struct hashhdr { /* Disk resident portion */
+ DB_LSN lsn; /* 00-07: LSN of the header page */
+ db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */
+ u_int32_t magic; /* 12-15: Magic NO for hash tables */
+ u_int32_t version; /* 16-19: Version ID */
+ u_int32_t pagesize; /* 20-23: Bucket/Page Size */
+ u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */
+ u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */
+ u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 36-39: Modulo mask into table */
+ u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 44-47: Fill factor */
+ u_int32_t nelem; /* 48-51: Number of keys in hash table */
+ u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */
+ u_int32_t flags; /* 56-59: Allow duplicates. */
+#define NCACHED2X 32 /* number of spare points */
+ /* 60-187: Spare pages for overflow */
+ u_int32_t spares[NCACHED2X];
+ /* 188-207: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+
+ /*
+ * Minimum page size is 256.
+ */
+} HASHHDR;
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_UPGRADE_H_ */
diff --git a/src/dbinc/db_verify.h b/src/dbinc/db_verify.h
new file mode 100644
index 00000000..68acbf6c
--- /dev/null
+++ b/src/dbinc/db_verify.h
@@ -0,0 +1,210 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_VERIFY_H_
+#define _DB_VERIFY_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Structures and macros for the storage and retrieval of all information
+ * needed for inter-page verification of a database.
+ */
+
+/*
+ * EPRINT is the macro for error printing. Takes as an arg the arg set
+ * for DB->err.
+ */
+#define EPRINT(x) do { \
+ if (!LF_ISSET(DB_SALVAGE)) \
+ __db_errx x; \
+} while (0)
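+
+/* Note: LF_ISSET here assumes the conventional "flags" variable in scope. */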
+
+/* Complain about a totally zeroed page where we don't expect one. */
+#define ZEROPG_ERR_PRINT(dbenv, pgno, str) do { \
+ EPRINT(((dbenv), DB_STR_A("0501", \
+ "Page %lu: %s is of inappropriate type %lu", "%lu %s %lu"), \
+ (u_long)(pgno), str, (u_long)P_INVALID)); \
+ EPRINT(((dbenv), DB_STR_A("0502", \
+ "Page %lu: totally zeroed page", \
+ "%lu"), (u_long)(pgno))); \
+} while (0)
+
+/*
+ * Note that 0 is, in general, a valid pgno, despite equaling PGNO_INVALID;
+ * we have to test it separately where it's not appropriate.
+ */
+#define IS_VALID_PGNO(x) ((x) <= vdp->last_pgno)
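+
+/* Note: the macro assumes a VRFY_DBINFO pointer named "vdp" in scope. */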
+
+/*
+ * VRFY_DBINFO is the fundamental structure; it either represents the database
+ * of subdatabases, or the sole database if there are no subdatabases.
+ */
+struct __vrfy_dbinfo {
+ DB_THREAD_INFO *thread_info;
+ /* Info about this database in particular. */
+ DBTYPE type;
+
+ /* List of subdatabase meta pages, if any. */
+ LIST_HEAD(__subdbs, __vrfy_childinfo) subdbs;
+
+ /* Transaction handle for CDS group. */
+ DB_TXN *txn;
+
+ /* File-global info--stores VRFY_PAGEINFOs for each page. */
+ DB *pgdbp;
+
+ /* Child database--stores VRFY_CHILDINFOs of each page. */
+ DB *cdbp;
+
+ /* Page info structures currently in use. */
+ LIST_HEAD(__activepips, __vrfy_pageinfo) activepips;
+
+ /*
+ * DB we use to keep track of which pages are linked somehow
+ * during verification. 0 is the default, "unseen"; 1 is seen.
+ */
+ DB *pgset;
+
+ /*
+ * This is a database we use during salvaging to keep track of which
+ * overflow and dup pages we need to come back to at the end and print
+ * with key "UNKNOWN". Pages which print with a good key get set
+ * to SALVAGE_IGNORE; others get set, as appropriate, to SALVAGE_LDUP,
+ * SALVAGE_LRECNODUP, SALVAGE_OVERFLOW for normal db overflow pages,
+ * and SALVAGE_BTREE, SALVAGE_LRECNO, and SALVAGE_HASH for subdb
+ * pages.
+ */
+#define SALVAGE_INVALID 0
+#define SALVAGE_IGNORE 1
+#define SALVAGE_LDUP 2
+#define SALVAGE_IBTREE 3
+#define SALVAGE_OVERFLOW 4
+#define SALVAGE_LBTREE 5
+#define SALVAGE_HASH 6
+#define SALVAGE_LRECNO 7
+#define SALVAGE_LRECNODUP 8
+ DB *salvage_pages;
+
+ db_pgno_t last_pgno;
+ db_pgno_t meta_last_pgno;
+ db_pgno_t pgs_remaining; /* For dbp->db_feedback(). */
+
+ /*
+ * These are used during __bam_vrfy_subtree to keep track, while
+ * walking up and down the Btree structure, of the prev- and next-page
+ * chain of leaf pages and verify that it's intact. Also, make sure
+ * that this chain contains pages of only one type.
+ */
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+ u_int8_t leaf_type;
+
+ /* Queue needs these to verify data pages in the first pass. */
+ u_int32_t re_pad; /* Record pad character. */
+ u_int32_t re_len; /* Record length. */
+ u_int32_t rec_page;
+ u_int32_t page_ext;
+ u_int32_t first_recno;
+ u_int32_t last_recno;
+ int nextents;
+ db_pgno_t *extents;
+
+#define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */
+#define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */
+#define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */
+#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */
+#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */
+#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and
+ set things up for it. */
+ u_int32_t flags;
+}; /* VRFY_DBINFO */
+
+/*
+ * The amount of state information we need per-page is small enough that
+ * it's not worth the trouble to define separate structures for each
+ * possible type of page, and since we're doing verification with these we
+ * have to be open to the possibility that page N will be of a completely
+ * unexpected type anyway. So we define one structure here with all the
+ * info we need for inter-page verification.
+ */
+struct __vrfy_pageinfo {
+ u_int8_t type;
+ u_int8_t bt_level;
+ u_int8_t unused1;
+ u_int8_t unused2;
+ db_pgno_t pgno;
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+
+ /* meta pages */
+ db_pgno_t root;
+ db_pgno_t free; /* Free list head. */
+
+ db_indx_t entries; /* Actual number of entries. */
+ u_int16_t unused;
+ db_recno_t rec_cnt; /* Record count. */
+ u_int32_t re_pad; /* Record pad character. */
+ u_int32_t re_len; /* Record length. */
+ u_int32_t bt_minkey;
+ u_int32_t h_ffactor;
+ u_int32_t h_nelem;
+
+ /* overflow pages */
+ /*
+ * Note that refcount is the refcount for an overflow page; pi_refcount
+ * is this structure's own refcount!
+ */
+ u_int32_t refcount;
+ u_int32_t olen;
+
+#define VRFY_DUPS_UNSORTED 0x0001 /* Have to flag the negative! */
+#define VRFY_HAS_CHKSUM 0x0002
+#define VRFY_HAS_DUPS 0x0004
+#define VRFY_HAS_DUPSORT 0x0008 /* Has the flag set. */
+#define VRFY_HAS_PART_RANGE 0x0010 /* Has the flag set. */
+#define VRFY_HAS_PART_CALLBACK 0x0020 /* Has the flag set. */
+#define VRFY_HAS_RECNUMS 0x0040
+#define VRFY_HAS_SUBDBS 0x0080
+#define VRFY_INCOMPLETE 0x0100 /* Meta or item order checks incomp. */
+#define VRFY_IS_ALLZEROES 0x0200 /* Hash page we haven't touched? */
+#define VRFY_IS_FIXEDLEN 0x0400
+#define VRFY_IS_RECNO 0x0800
+#define VRFY_IS_RRECNO 0x1000
+#define VRFY_OVFL_LEAFSEEN 0x2000
+#define VRFY_HAS_COMPRESS 0x4000
+#define VRFY_NONEXISTENT 0x8000
+ u_int32_t flags;
+
+ LIST_ENTRY(__vrfy_pageinfo) links;
+ u_int32_t pi_refcount;
+}; /* VRFY_PAGEINFO */
+
+struct __vrfy_childinfo {
+ /* The following fields are set by the caller of __db_vrfy_childput. */
+ db_pgno_t pgno;
+
+#define V_DUPLICATE 1 /* off-page dup metadata */
+#define V_OVERFLOW 2 /* overflow page */
+#define V_RECNO 3 /* btree internal or leaf page */
+ u_int32_t type;
+ db_recno_t nrecs; /* record count on a btree subtree */
+ u_int32_t tlen; /* ovfl. item total size */
+
+ /* The following field is maintained by __db_vrfy_childput. */
+ u_int32_t refcnt; /* # of times parent points to child. */
+
+ LIST_ENTRY(__vrfy_childinfo) links;
+}; /* VRFY_CHILDINFO */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_VERIFY_H_ */
diff --git a/src/dbinc/debug.h b/src/dbinc/debug.h
new file mode 100644
index 00000000..a8da000d
--- /dev/null
+++ b/src/dbinc/debug.h
@@ -0,0 +1,283 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_DEBUG_H_
+#define _DB_DEBUG_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Turn on additional error checking via __attribute__ in gcc 3.X; on
+ * compilers without __attribute__ support (gcc before 2.5, or non-gcc
+ * compilers), define the keyword away.
+ */
+#if !defined(__GNUC__) || __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 5)
+#define __attribute__(s)
+#endif
+
+/*
+ * When running with #DIAGNOSTIC defined, we smash memory and do memory
+ * guarding with a special byte value.
+ */
+#define CLEAR_BYTE 0xdb
+#define GUARD_BYTE 0xdc
+
+/*
+ * DB assertions.
+ *
+ * Use __STDC__ rather than STDC_HEADERS; the #e construct is ANSI C specific.
+ */
+#if defined(DIAGNOSTIC) && defined(__STDC__)
+#define DB_ASSERT(env, e) \
+ ((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__))
+#else
+#define DB_ASSERT(env, e) NOP_STATEMENT
+#endif
+
+/*
+ * "Shut that bloody compiler up!"
+ *
+ * Unused, or not-used-yet variable. We need to write and then read the
+ * variable, some compilers are too bloody clever by half.
+ */
+#define COMPQUIET(n, v) do { \
+ (n) = (v); \
+ (n) = (n); \
+} while (0)
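+
+/*
+ * For example (illustrative):
+ *	COMPQUIET(ret, 0);
+ * quiets "set but not used" warnings for a variable that is only
+ * consumed on some compilation paths.
+ */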
+
+/*
+ * Purify and other run-time tools complain about uninitialized reads/writes
+ * of structure fields whose only purpose is padding, as well as when heap
+ * memory that was never initialized is written to disk.
+ */
+#ifdef UMRW
+#define UMRW_SET(v) (v) = 0
+#else
+#define UMRW_SET(v) NOP_STATEMENT
+#endif
+
+/*
+ * Errors are in one of two areas: a Berkeley DB error, or a system-level
+ * error. We use db_strerror to translate the former and __os_strerror to
+ * translate the latter.
+ */
+typedef enum {
+ DB_ERROR_NOT_SET=0,
+ DB_ERROR_SET=1,
+ DB_ERROR_SYSTEM=2
+} db_error_set_t;
+
+/*
+ * Message handling. Use a macro instead of a function because va_list
+ * references to variadic arguments cannot be reset to the beginning of the
+ * variadic argument list (and then rescanned), by functions other than the
+ * original routine that took the variadic list of arguments.
+ */
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) != NULL && (dbenv)->db_errcall != NULL) \
+ __db_errcall(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, and it's an application error message \
+ * using {DbEnv,Db}.{err,errx} or the application has never \
+ * configured an output channel, default by writing to stderr. \
+ */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_errfile != NULL || \
+ ((dbenv)->db_errcall == NULL && \
+ ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
+ __db_errfile(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+}
+#else
+#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap); \
+ if ((dbenv) != NULL && (dbenv)->db_errcall != NULL) \
+ __db_errcall(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, and it's an application error message \
+ * using {DbEnv,Db}.{err,errx} or the application has never \
+ * configured an output channel, default by writing to stderr. \
+ */ \
+ va_start(__ap); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_errfile != NULL || \
+ ((dbenv)->db_errcall == NULL && \
+ ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
+ __db_errfile(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+}
+#endif
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#define DB_REAL_MSG(dbenv, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) != NULL && (dbenv)->db_msgcall != NULL) \
+ __db_msgcall(dbenv, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, write to stdout. \
+ */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_msgfile != NULL || \
+ (dbenv)->db_msgcall == NULL) { \
+ __db_msgfile(dbenv, fmt, __ap); \
+ } \
+ va_end(__ap); \
+}
+#else
+#define DB_REAL_MSG(dbenv, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap); \
+ if ((dbenv) != NULL && (dbenv)->db_msgcall != NULL) \
+ __db_msgcall(dbenv, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, write to stdout. \
+ */ \
+ va_start(__ap); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_msgfile != NULL || \
+ (dbenv)->db_msgcall == NULL) { \
+ __db_msgfile(dbenv, fmt, __ap); \
+ } \
+ va_end(__ap); \
+}
+#endif
+
+/*
+ * Debugging macro to log operations.
+ * If DEBUG_WOP is defined, log operations that modify the database.
+ * If DEBUG_ROP is defined, log operations that read the database.
+ *
+ * C cursor (DBC)
+ * T txn
+ * O operation (string)
+ * K key
+ * A data
+ * F flags
+ */
+#define LOG_OP(C, T, O, K, A, F) { \
+ DB_LSN __lsn; \
+ DBT __op; \
+ if (DBC_LOGGING((C))) { \
+ memset(&__op, 0, sizeof(__op)); \
+ __op.data = O; \
+ __op.size = (u_int32_t)strlen(O) + 1; \
+ (void)__db_debug_log((C)->env, T, &__lsn, 0, \
+ &__op, (C)->dbp->log_filename->id, K, A, F); \
+ } \
+}
+#ifdef DEBUG_ROP
+#define DEBUG_LREAD(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F)
+#else
+#define DEBUG_LREAD(C, T, O, K, A, F)
+#endif
+#ifdef DEBUG_WOP
+#define DEBUG_LWRITE(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F)
+#else
+#define DEBUG_LWRITE(C, T, O, K, A, F)
+#endif
+
+/*
+ * Hook for testing recovery at various places in the create/delete paths.
+ * Hook for testing subdb locks.
+ */
+#if CONFIG_TEST
+#define DB_TEST_SUBLOCKS(env, flags) do { \
+ if ((env)->test_abort == DB_TEST_SUBDB_LOCKS) \
+ (flags) |= DB_LOCK_NOWAIT; \
+} while (0)
+
+#define DB_ENV_TEST_RECOVERY(env, val, ret, name) do { \
+ int __ret; \
+ PANIC_CHECK((env)); \
+ if ((env)->test_copy == (val)) { \
+ /* COPY the FILE */ \
+ if ((__ret = __db_testcopy((env), NULL, (name))) != 0) \
+ (ret) = __env_panic((env), __ret); \
+ } \
+ if ((env)->test_abort == (val)) { \
+ /* ABORT the TXN */ \
+ (env)->test_abort = 0; \
+ (ret) = EINVAL; \
+ goto db_tr_err; \
+ } \
+} while (0)
+
+#define DB_TEST_RECOVERY(dbp, val, ret, name) do { \
+ ENV *__env = (dbp)->env; \
+ int __ret; \
+ PANIC_CHECK(__env); \
+ if (__env->test_copy == (val)) { \
+ /* Copy the file. */ \
+ if (F_ISSET((dbp), \
+ DB_AM_OPEN_CALLED) && (dbp)->mpf != NULL) \
+ (void)__db_sync(dbp); \
+ if ((__ret = \
+ __db_testcopy(__env, (dbp), (name))) != 0) \
+ (ret) = __env_panic(__env, __ret); \
+ } \
+ if (__env->test_abort == (val)) { \
+ /* Abort the transaction. */ \
+ __env->test_abort = 0; \
+ (ret) = EINVAL; \
+ goto db_tr_err; \
+ } \
+} while (0)
+
+#define DB_TEST_RECOVERY_LABEL db_tr_err:
+
+#define DB_TEST_SET(field, val) do { \
+ if (field == (val)) \
+ goto db_tr_err; \
+} while (0)
+
+#define DB_TEST_WAIT(env, val) \
+ if ((val) != 0) \
+ __os_yield((env), (u_long)(val), 0)
+#else
+#define DB_TEST_SUBLOCKS(env, flags)
+#define DB_ENV_TEST_RECOVERY(env, val, ret, name)
+#define DB_TEST_RECOVERY(dbp, val, ret, name)
+#define DB_TEST_RECOVERY_LABEL
+#define DB_TEST_SET(field, val)
+#define DB_TEST_WAIT(env, val)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_DEBUG_H_ */
diff --git a/src/dbinc/fop.h b/src/dbinc/fop.h
new file mode 100644
index 00000000..94f27f9f
--- /dev/null
+++ b/src/dbinc/fop.h
@@ -0,0 +1,32 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_FOP_H_
+#define _DB_FOP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define MAKE_INMEM(D) do { \
+ F_SET((D), DB_AM_INMEM); \
+ (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 1); \
+} while (0)
+
+#define CLR_INMEM(D) do { \
+ F_CLR((D), DB_AM_INMEM); \
+ (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \
+} while (0)
+
+#include "dbinc_auto/fileops_auto.h"
+#include "dbinc_auto/fileops_ext.h"
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_FOP_H_ */
diff --git a/src/dbinc/globals.h b/src/dbinc/globals.h
new file mode 100644
index 00000000..95e5c118
--- /dev/null
+++ b/src/dbinc/globals.h
@@ -0,0 +1,105 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_GLOBALS_H_
+#define _DB_GLOBALS_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************************
+ * Global variables.
+ *
+ * Held in a single structure to minimize the name-space pollution.
+ *******************************************************/
+#ifdef HAVE_VXWORKS
+#include "semLib.h"
+#endif
+
+typedef struct __db_globals {
+#ifdef HAVE_VXWORKS
+ u_int32_t db_global_init; /* VxWorks: inited */
+ SEM_ID db_global_lock; /* VxWorks: global semaphore */
+#endif
+#ifdef DB_WIN32
+#ifndef DB_WINCE
+ /*
+ * These fields are used by the Windows implementation of mutexes.
+ * Usually they are initialized by the first DB API call to lock a
+ * mutex. If that would result in the mutexes being inaccessible by
+ * other threads (e.g., ones which have lesser privileges) the
+ * application may first call db_env_set_win_security().
+ */
+ SECURITY_DESCRIPTOR win_default_sec_desc;
+ SECURITY_ATTRIBUTES win_default_sec_attr;
+#endif
+ SECURITY_ATTRIBUTES *win_sec_attr;
+#endif
+
+ /* TAILQ_HEAD(__envq, __dbenv) envq; */
+ struct __envq {
+ struct __env *tqh_first;
+ struct __env **tqh_last;
+ } envq;
+
+ char *db_line; /* DB display string. */
+
+ char error_buf[40]; /* Error string buffer. */
+
+ int uid_init; /* srand set in UID generator */
+
+ u_long rand_next; /* rand/srand value */
+
+ u_int32_t fid_serial; /* file id counter */
+
+ int db_errno; /* Errno value if not available */
+
+ size_t num_active_pids; /* number of entries in active_pids */
+
+ size_t size_active_pids; /* allocated size of active_pids */
+
+ pid_t *active_pids; /* array of active pids */
+
+ char *saved_errstr; /* saved error string from backup */
+
+ /* Underlying OS interface jump table. */
+ void (*j_assert) __P((const char *, const char *, int));
+ int (*j_close) __P((int));
+ void (*j_dirfree) __P((char **, int));
+ int (*j_dirlist) __P((const char *, char ***, int *));
+ int (*j_exists) __P((const char *, int *));
+ void (*j_free) __P((void *));
+ int (*j_fsync) __P((int));
+ int (*j_ftruncate) __P((int, off_t));
+ int (*j_ioinfo) __P((const char *,
+ int, u_int32_t *, u_int32_t *, u_int32_t *));
+ void *(*j_malloc) __P((size_t));
+ int (*j_file_map) __P((DB_ENV *, char *, size_t, int, void **));
+ int (*j_file_unmap) __P((DB_ENV *, void *));
+ int (*j_open) __P((const char *, int, ...));
+ ssize_t (*j_pread) __P((int, void *, size_t, off_t));
+ ssize_t (*j_pwrite) __P((int, const void *, size_t, off_t));
+ ssize_t (*j_read) __P((int, void *, size_t));
+ void *(*j_realloc) __P((void *, size_t));
+ int (*j_region_map) __P((DB_ENV *, char *, size_t, int *, void **));
+ int (*j_region_unmap) __P((DB_ENV *, void *));
+ int (*j_rename) __P((const char *, const char *));
+ int (*j_seek) __P((int, off_t, int));
+ int (*j_unlink) __P((const char *));
+ ssize_t (*j_write) __P((int, const void *, size_t));
+ int (*j_yield) __P((u_long, u_long));
+} DB_GLOBALS;
+
+extern DB_GLOBALS __db_global_values;
+#define DB_GLOBAL(v) __db_global_values.v
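+
+/*
+ * For example, DB_GLOBAL(fid_serial) expands to
+ * __db_global_values.fid_serial.
+ */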
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_GLOBALS_H_ */
diff --git a/src/dbinc/hash.h b/src/dbinc/hash.h
new file mode 100644
index 00000000..f485128a
--- /dev/null
+++ b/src/dbinc/hash.h
@@ -0,0 +1,173 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_HASH_H_
+#define _DB_HASH_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Hash internal structure. */
+typedef struct hash_t {
+ db_pgno_t meta_pgno; /* Page number of the meta data page. */
+ u_int32_t revision; /* Revision of subdb metadata. */
+ u_int32_t h_ffactor; /* Fill factor. */
+ u_int32_t h_nelem; /* Number of elements. */
+ /* Hash and compare functions. */
+ u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t));
+ int (*h_compare) __P((DB *, const DBT *, const DBT *));
+} HASH;
+
+/* Cursor structure definitions. */
+typedef struct cursor_t {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* Hash private part */
+
+ /* Per-thread information */
+ DB_LOCK hlock; /* Metadata page lock. */
+ HMETA *hdr; /* Pointer to meta-data page. */
+ PAGE *split_buf; /* Temporary buffer for splits. */
+
+ /* Hash cursor information */
+ db_pgno_t bucket; /* Bucket we are traversing. */
+ db_pgno_t lbucket; /* Bucket for which we are locked. */
+ db_indx_t dup_off; /* Offset within a duplicate set. */
+ db_indx_t dup_len; /* Length of current duplicate. */
+ db_indx_t dup_tlen; /* Total length of duplicate entry. */
+ u_int32_t seek_size; /* Number of bytes we need for add. */
+ db_pgno_t seek_found_page;/* Page on which we can insert. */
+ db_indx_t seek_found_indx;/* Insert position for item. */
+ u_int32_t order; /* Relative order among deleted curs. */
+
+#define H_CONTINUE 0x0001 /* Join--search strictly fwd for data */
+#define H_CONTRACT 0x0002 /* Table contracted.*/
+#define H_DELETED 0x0004 /* Cursor item is deleted. */
+#define H_DUPONLY 0x0008 /* Dups only; do not change key. */
+#define H_EXPAND 0x0010 /* Table expanded. */
+#define H_ISDUP 0x0020 /* Cursor is within duplicate set. */
+#define H_NEXT_NODUP 0x0040 /* Get next non-dup entry. */
+#define H_NOMORE 0x0080 /* No more entries in bucket. */
+#define H_OK 0x0100 /* Request succeeded. */
+ u_int32_t flags;
+} HASH_CURSOR;
+
+/* Test string. */
+#define CHARKEY "%$sniglet^&"
+
+/* Overflow management */
+/*
+ * The spares table indicates the page number at which each doubling begins.
+ * From this page number we subtract the number of buckets already allocated
+ * so that we can do a simple addition to calculate the page number here.
+ */
+#define BS_TO_PAGE(bucket, spares) \
+ ((bucket) + (spares)[__db_log2((bucket) + 1)])
+#define BUCKET_TO_PAGE(I, B) (BS_TO_PAGE((B), (I)->hdr->spares))
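+
+/*
+ * For illustration: bucket 5 belongs to doubling __db_log2(5 + 1) == 3,
+ * so it lives on page 5 + spares[3]; each spares[] entry already has
+ * the number of buckets allocated before its doubling subtracted out.
+ */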
+
+/* Constraints on how much data goes on a page. */
+
+#define MINFILL 4
+#define ISBIG(I, N) (((N) > ((I)->hdr->dbmeta.pagesize / MINFILL)) ? 1 : 0)
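+
+/*
+ * For example, with a 4096-byte page and MINFILL of 4, any item larger
+ * than 1024 bytes is "big" and is stored off-page.
+ */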
+
+/* Shorthands for accessing structure */
+#define NDX_INVALID 0xFFFF
+#define BUCKET_INVALID 0xFFFFFFFF
+
+/* On-page duplicates are stored as a string of size-data-size triples. */
+#define DUP_SIZE(len) ((len) + 2 * sizeof(db_indx_t))
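+
+/*
+ * For illustration: with a 2-byte db_indx_t, a 10-byte duplicate
+ * occupies DUP_SIZE(10) == 14 bytes on the page.
+ */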
+
+/* Log messages types (these are subtypes within a record type) */
+/* These bits are obsolete and are only needed for down rev logs. */
+#define PAIR_KEYMASK 0x1
+#define PAIR_DATAMASK 0x2
+#define PAIR_DUPMASK 0x4
+#define PAIR_MASK 0xf
+#define PAIR_ISKEYBIG(N) (N & PAIR_KEYMASK)
+#define PAIR_ISDATABIG(N) (N & PAIR_DATAMASK)
+#define PAIR_ISDATADUP(N) (N & PAIR_DUPMASK)
+#define OPCODE_OF(N) (N & ~PAIR_MASK)
+
+/* Operators for hash recover routines. */
+#define PUTPAIR 0x20
+#define DELPAIR 0x30
+#define PUTOVFL 0x40
+#define DELOVFL 0x50
+#define HASH_UNUSED1 0x60
+#define HASH_UNUSED2 0x70
+#define SPLITOLD 0x80
+#define SPLITNEW 0x90
+#define SORTPAGE 0x100
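+
+/*
+ * For example, a down-rev log subtype of (PUTPAIR | PAIR_KEYMASK),
+ * i.e. 0x21, marked a put whose key was a big (off-page) item;
+ * OPCODE_OF(0x21) recovers PUTPAIR.
+ */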
+
+/* Flags to control behavior of __ham_del_pair */
+#define HAM_DEL_NO_CURSOR 0x01 /* Don't do any cursor adjustment */
+#define HAM_DEL_NO_RECLAIM 0x02 /* Don't reclaim empty pages */
+/* Just delete onpage items (even if they are references to off-page items). */
+#define HAM_DEL_IGNORE_OFFPAGE 0x04
+
+typedef enum {
+ DB_HAM_CURADJ_DEL = 1,
+ DB_HAM_CURADJ_ADD = 2,
+ DB_HAM_CURADJ_ADDMOD = 3,
+ DB_HAM_CURADJ_DELMOD = 4
+} db_ham_curadj;
+
+typedef enum {
+ DB_HAM_CHGPG = 1,
+ DB_HAM_DELFIRSTPG = 2,
+ DB_HAM_DELMIDPG = 3,
+ DB_HAM_DELLASTPG = 4,
+ DB_HAM_DUP = 5,
+ DB_HAM_SPLIT = 6
+} db_ham_mode;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/hash_auto.h"
+#include "dbinc_auto/hash_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_HASH_H_ */
diff --git a/src/dbinc/heap.h b/src/dbinc/heap.h
new file mode 100644
index 00000000..ca3407e0
--- /dev/null
+++ b/src/dbinc/heap.h
@@ -0,0 +1,59 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _DB_HEAP_H_
+#define _DB_HEAP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Forward structure declarations. */
+struct __heap; typedef struct __heap HEAP;
+struct __heap_cursor; typedef struct __heap_cursor HEAP_CURSOR;
+
+/*
+ * The in-memory, per-heap data structure.
+ */
+struct __heap { /* Heap access method. */
+
+ u_int32_t gbytes; /* Initial heap size: gigabytes component. */
+ u_int32_t bytes; /* Initial heap size: bytes component. */
+ u_int32_t region_size; /* Size of each region. */
+
+ db_pgno_t curregion; /* The region of the next insert. */
+ db_pgno_t maxpgno; /* Maximum page number of a fixed size heap. */
+ int curpgindx; /* The last used offset in the region's space bitmap. */
+};
+
+struct __heap_cursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* Heap private part */
+
+ u_int32_t flags;
+};
+
+#define HEAP_PG_FULL 3 /* No space on page. */
+#define HEAP_PG_GT66 2 /* Page greater than 66% full */
+#define HEAP_PG_GT33 1 /* Page greater than 33% full */
+#define HEAP_PG_LT33 0 /* Page less than 33% full */
+
+#define HEAP_PG_FULL_PCT 5 /* Less than 5% of page is free. */
+#define HEAP_PG_GT66_PCT 33 /* Less than 33% of page is free. */
+#define HEAP_PG_GT33_PCT 66 /* Less than 66% of page is free. */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/heap_auto.h"
+#include "dbinc_auto/heap_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_HEAP_H_ */
diff --git a/src/dbinc/hmac.h b/src/dbinc/hmac.h
new file mode 100644
index 00000000..2a495b17
--- /dev/null
+++ b/src/dbinc/hmac.h
@@ -0,0 +1,39 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_HMAC_H_
+#define _DB_HMAC_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Algorithm specific information.
+ */
+/*
+ * SHA1 checksumming
+ */
+typedef struct {
+ u_int32_t state[5];
+ u_int32_t count[2];
+ unsigned char buffer[64];
+} SHA1_CTX;
+
+/*
+ * AES assumes the SHA1 checksumming (also called MAC)
+ */
+#define DB_MAC_MAGIC "mac derivation key magic value"
+#define DB_ENC_MAGIC "encryption and decryption key value magic"
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/hmac_ext.h"
+#endif /* !_DB_HMAC_H_ */
diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h
new file mode 100644
index 00000000..eab51832
--- /dev/null
+++ b/src/dbinc/lock.h
@@ -0,0 +1,326 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_LOCK_H_
+#define _DB_LOCK_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DB_LOCK_DEFAULT_N 1000 /* Default # of locks in region. */
+
+/*
+ * The locker id space is divided between the transaction manager and the lock
+ * manager. Lock IDs start at 1 and go to DB_LOCK_MAXID. Txn IDs start at
+ * DB_LOCK_MAXID + 1 and go up to TXN_MAXIMUM.
+ */
+#define DB_LOCK_INVALIDID 0
+#define DB_LOCK_MAXID 0x7fffffff
+
+/*
+ * A locker's deadlock resolution priority is stored as a 32 bit unsigned
+ * integer. The maximum priority is DB_LOCK_MAXPRIORITY and the default
+ * priority is DB_LOCK_DEFPRIORITY.
+ */
+#define DB_LOCK_DEFPRIORITY 100
+#define DB_LOCK_MAXPRIORITY UINT32_MAX
+
+/*
+ * Out of band value for a lock. Locks contain an offset into a lock region,
+ * so we use an invalid region offset to indicate an invalid or unset lock.
+ */
+#define LOCK_INVALID INVALID_ROFF
+#define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID)
+#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID)
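+
+/*
+ * Typical use (illustrative):
+ *	DB_LOCK lock;
+ *	LOCK_INIT(lock);
+ *	...
+ *	if (LOCK_ISSET(lock))
+ *		(void)__lock_put(env, &lock);
+ */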
+
+/*
+ * Macro to identify a write lock for the purpose of counting locks
+ * for the NUMWRITES option to deadlock detection.
+ */
+#define IS_WRITELOCK(m) \
+ ((m) == DB_LOCK_WRITE || (m) == DB_LOCK_WWRITE || \
+ (m) == DB_LOCK_IWRITE || (m) == DB_LOCK_IWR)
+
+/*
+ * Macros to lock/unlock the lock region as a whole. Mostly used for
+ * initialization.
+ */
+#define LOCK_REGION_LOCK(env) \
+ MUTEX_LOCK(env, ((DB_LOCKREGION *) \
+ (env)->lk_handle->reginfo.primary)->mtx_region)
+#define LOCK_REGION_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((DB_LOCKREGION *) \
+ (env)->lk_handle->reginfo.primary)->mtx_region)
+
+/*
+ * DB_LOCKREGION --
+ * The lock shared region.
+ */
+
+typedef struct __db_lockregion { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ u_int32_t need_dd; /* flag for deadlock detector */
+ u_int32_t detect; /* run dd on every conflict */
+ db_timespec next_timeout; /* next time to expire a lock */
+ db_mutex_t mtx_dd; /* mutex for lock object dd list. */
+ db_mutex_t mtx_lockers; /* mutex for locker allocation. */
+ SH_TAILQ_HEAD(__dobj) dd_objs; /* objects with waiters */
+ /* free locker header */
+ roff_t locker_mem_off; /* block memory for lockers */
+ SH_TAILQ_HEAD(__flocker) free_lockers;
+ SH_TAILQ_HEAD(__lkrs) lockers; /* list of lockers */
+
+ db_timeout_t lk_timeout; /* timeout for locks. */
+ db_timeout_t tx_timeout; /* timeout for txns. */
+
+ u_int32_t locker_t_size; /* size of locker hash table */
+ u_int32_t object_t_size; /* size of object hash table */
+ u_int32_t part_t_size; /* number of partitions */
+
+ roff_t conf_off; /* offset of conflicts array */
+ roff_t obj_off; /* offset of object hash table */
+ roff_t part_off; /* offset of partition array */
+ roff_t stat_off; /* offset to object hash stats */
+ roff_t locker_off; /* offset of locker hash table */
+
+ u_int32_t lock_id; /* Current lock(er) id to allocate. */
+ u_int32_t cur_maxid; /* Current max lock(er) id. */
+ u_int32_t nlockers; /* Current number of lockers. */
+ int32_t nmodes; /* Number of modes in conflict table. */
+ DB_LOCK_STAT stat; /* stats about locking. */
+} DB_LOCKREGION;
+
+/*
+ * Since we will store DBTs in shared memory, we need the equivalent of a
+ * DBT that will work in shared memory.
+ */
+typedef struct __sh_dbt { /* SHARED */
+ u_int32_t size; /* Byte length. */
+ roff_t off; /* Region offset. */
+} SH_DBT;
+
+#define SH_DBT_PTR(p) ((void *)(((u_int8_t *)(p)) + (p)->off))
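+
+/*
+ * The offset is self-relative: SH_DBT_PTR yields the address "off"
+ * bytes past the SH_DBT structure itself, so the macro works no matter
+ * where the shared region happens to be mapped.
+ */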
+
+/*
+ * Object structures; these live in the object hash table.
+ */
+typedef struct __db_lockobj { /* SHARED */
+ u_int32_t indx; /* Hash index of this object. */
+ u_int32_t generation; /* Generation of this object. */
+ SH_DBT lockobj; /* Identifies object locked. */
+ SH_TAILQ_ENTRY links; /* Links for free list or hash list. */
+ SH_TAILQ_ENTRY dd_links; /* Links for dd list. */
+ SH_TAILQ_HEAD(__waitl) waiters; /* List of waiting locks. */
+ SH_TAILQ_HEAD(__holdl) holders; /* List of held locks. */
+ /* Declare room in the object to hold
+ * typical DB lock structures so that
+ * we do not have to allocate them from
+ * shalloc at run-time. */
+ u_int8_t objdata[sizeof(struct __db_ilock)];
+} DB_LOCKOBJ;
+
+/*
+ * Locker structures; these live in the locker hash table.
+ */
+struct __db_locker { /* SHARED */
+ u_int32_t id; /* Locker id. */
+
+ pid_t pid; /* Process owning locker ID */
+ db_threadid_t tid; /* Thread owning locker ID */
+ db_mutex_t mtx_locker; /* Mutex to block on. */
+
+ u_int32_t dd_id; /* Deadlock detector id. */
+
+ u_int32_t nlocks; /* Number of locks held. */
+ u_int32_t nwrites; /* Number of write locks held. */
+ u_int32_t priority; /* Deadlock resolution priority. */
+ u_int32_t nrequest; /* number of requests. */
+
+ roff_t master_locker; /* Locker of master transaction. */
+ roff_t parent_locker; /* Parent of this child. */
+ SH_LIST_HEAD(_child) child_locker; /* List of descendant txns;
+ only used in a "master"
+ txn. */
+ SH_LIST_ENTRY child_link; /* Links transactions in the family;
+ elements of the child_locker
+ list. */
+ SH_TAILQ_ENTRY links; /* Links for free and hash list. */
+ SH_TAILQ_ENTRY ulinks; /* Links in-use list. */
+ SH_LIST_HEAD(_held) heldby; /* Locks held by this locker. */
+ db_timespec lk_expire; /* When current lock expires. */
+ db_timespec tx_expire; /* When this txn expires. */
+ db_timeout_t lk_timeout; /* How long do we let locks live. */
+
+#define DB_LOCKER_DIRTY 0x0001 /* Has write locks. */
+#define DB_LOCKER_INABORT 0x0002 /* Is aborting, don't abort again. */
+#define DB_LOCKER_TIMEOUT 0x0004 /* Has timeout set. */
+#define DB_LOCKER_FAMILY_LOCKER 0x0008 /* Part of a family of lockers. */
+#define DB_LOCKER_HANDLE_LOCKER 0x0010 /* Not associated with a thread. */
+ u_int32_t flags;
+};
+
+/*
+ * Map a hash index into a partition.
+ */
+#define LOCK_PART(reg, ndx) ((ndx) % (reg)->part_t_size)
+
+/*
+ * Structure that contains information about a lock table partition.
+ */
+typedef struct __db_lockpart{ /* SHARED */
+ db_mutex_t mtx_part; /* mutex for partition*/
+ /* free lock header */
+ SH_TAILQ_HEAD(__flock) free_locks;
+ /* free obj header */
+ SH_TAILQ_HEAD(__fobj) free_objs;
+ roff_t lock_mem_off; /* block memory for locks */
+ roff_t lockobj_mem_off;/* block memory for lockobjs */
+#ifdef HAVE_STATISTICS
+ DB_LOCK_PSTAT part_stat; /* Partition stats. */
+#endif
+} DB_LOCKPART;
+
+#define FREE_LOCKS(lt, part) ((lt)->part_array[part].free_locks)
+#define FREE_OBJS(lt, part) ((lt)->part_array[part].free_objs)
+
+/*
+ * DB_LOCKTAB --
+ * The primary library lock data structure (i.e., the one referenced
+ * by the environment, as opposed to the internal one laid out in the region.)
+ */
+struct __db_locktab {
+ ENV *env; /* Environment. */
+ REGINFO reginfo; /* Region information. */
+ u_int8_t *conflicts; /* Pointer to conflict matrix. */
+ DB_LOCKPART *part_array; /* Beginning of partition array. */
+#ifdef HAVE_STATISTICS
+ DB_LOCK_HSTAT *obj_stat; /* Object hash stats array. */
+#endif
+ DB_HASHTAB *obj_tab; /* Beginning of object hash table. */
+ DB_HASHTAB *locker_tab; /* Beginning of locker hash table. */
+};
+
+/*
+ * Test for conflicts.
+ *
+ * Cast HELD and WANTED to ints, they are usually db_lockmode_t enums.
+ */
+#define CONFLICTS(T, R, HELD, WANTED) \
+ (T)->conflicts[((int)HELD) * (R)->nmodes + ((int)WANTED)]
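+
+/*
+ * For illustration: the matrix is stored row-major with nmodes columns
+ * per row, so CONFLICTS(lt, region, DB_LOCK_READ, DB_LOCK_WRITE) is the
+ * entry saying whether a requested write conflicts with a held read.
+ */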
+
+#define OBJ_LINKS_VALID(L) ((L)->links.stqe_prev != -1)
+
+struct __db_lock { /* SHARED */
+ /*
+ * Wait on mutex to wait on lock. You reference your own mutex with
+ * ID 0 and others reference your mutex with ID 1.
+ */
+ db_mutex_t mtx_lock;
+
+ roff_t holder; /* Who holds this lock. */
+ u_int32_t gen; /* Generation count. */
+ SH_TAILQ_ENTRY links; /* Free or holder/waiter list. */
+ SH_LIST_ENTRY locker_links; /* List of locks held by a locker. */
+ u_int32_t refcount; /* Reference count the lock. */
+ db_lockmode_t mode; /* What sort of lock. */
+ roff_t obj; /* Relative offset of object struct. */
+ u_int32_t indx; /* Hash index of this object. */
+ db_status_t status; /* Status of this lock. */
+};
+
+/*
+ * Flag values for __lock_put_internal:
+ * DB_LOCK_DOALL: Unlock all references in this lock (instead of only 1).
+ * DB_LOCK_FREE: Free the lock (used in checklocker).
+ * DB_LOCK_NOPROMOTE: Don't bother running promotion when releasing locks
+ * (used by __lock_put_internal).
+ * DB_LOCK_UNLINK: Remove from the locker links (used in checklocker).
+ * Make sure that these do not conflict with the interface flags because
+ * we pass some of those around.
+ */
+#define DB_LOCK_DOALL 0x010000
+#define DB_LOCK_FREE 0x040000
+#define DB_LOCK_NOPROMOTE 0x080000
+#define DB_LOCK_UNLINK 0x100000
+#define DB_LOCK_ONEWAITER 0x400000
+
+/*
+ * Macros to get/release different types of mutexes.
+ */
+/*
+ * Operations on lock objects must be protected by a mutex, either on their
+ * partition or on the lock region. Lock structures associated with that
+ * object are protected as well. Each partition has a free list of objects
+ * and lock structures protected by that mutex. We want to avoid getting
+ * multiple mutexes, particularly in __lock_vec, when there is only a
+ * single partition. If there is only one partition, then all the calls
+ * to LOCK_SYSTEM_LOCK(UNLOCK) actually acquire(release) a lock system
+ * wide mutex and MUTEX_LOCK(UNLOCK)_PARTITION are no-ops. If the number
+ * of partitions is greater than one, then LOCK_SYSTEM_LOCK(UNLOCK) is a
+ * no-op, and MUTEX_LOCK(UNLOCK)_PARTITION acquire a mutex on a particular
+ * partition of the lock table.
+ */
+#define LOCK_SYSTEM_LOCK(lt, reg) do { \
+ if ((reg)->part_t_size == 1) \
+ MUTEX_LOCK((lt)->env, (reg)->mtx_region); \
+} while (0)
+#define LOCK_SYSTEM_UNLOCK(lt, reg) do { \
+ if ((reg)->part_t_size == 1) \
+ MUTEX_UNLOCK((lt)->env, (reg)->mtx_region); \
+} while (0)
+#define MUTEX_LOCK_PARTITION(lt, reg, p) do { \
+ if ((reg)->part_t_size != 1) \
+ MUTEX_LOCK((lt)->env, (lt)->part_array[p].mtx_part); \
+} while (0)
+#define MUTEX_UNLOCK_PARTITION(lt, reg, p) do { \
+ if ((reg)->part_t_size != 1) \
+ MUTEX_UNLOCK((lt)->env, (lt)->part_array[p].mtx_part); \
+} while (0)
+
+#define OBJECT_LOCK(lt, reg, obj, ndx) do { \
+ ndx = __lock_ohash(obj) % (reg)->object_t_size; \
+ MUTEX_LOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx)); \
+} while (0)
+
+#define OBJECT_LOCK_NDX(lt, reg, ndx) \
+ MUTEX_LOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx));
+
+#define OBJECT_UNLOCK(lt, reg, ndx) \
+ MUTEX_UNLOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx));
+
+/*
+ * Protect the object deadlock detector queue and the locker allocation
+ * and active queues
+ */
+#define LOCK_DD(env, region) \
+ MUTEX_LOCK(env, (region)->mtx_dd)
+#define UNLOCK_DD(env, region) \
+ MUTEX_UNLOCK(env, (region)->mtx_dd)
+#define LOCK_LOCKERS(env, region) \
+ MUTEX_LOCK(env, (region)->mtx_lockers)
+#define UNLOCK_LOCKERS(env, region) \
+ MUTEX_UNLOCK(env, (region)->mtx_lockers)
+
+/*
+ * __lock_locker_hash --
+ * Hash function for entering lockers into the locker hash table.
+ * Since these are simply 32-bit unsigned integers at the moment,
+ * just return the locker value.
+ */
+#define __lock_locker_hash(locker) (locker)
+#define LOCKER_HASH(lt, reg, locker, ndx) \
+ ndx = __lock_locker_hash(locker) % (reg)->locker_t_size;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/lock_ext.h"
+#endif /* !_DB_LOCK_H_ */
diff --git a/src/dbinc/log.h b/src/dbinc/log.h
new file mode 100644
index 00000000..c4dea6fc
--- /dev/null
+++ b/src/dbinc/log.h
@@ -0,0 +1,463 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_LOG_H_
+#define _DB_LOG_H_
+
+#include "dbinc/db_swap.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************************
+ * DBREG:
+ * The DB file register code keeps track of open files. It's stored
+ * in the log subsystem's shared region, and so appears in the log.h
+ * header file, but is logically separate.
+ * The dbp may not be open if we are recovering the abort of a create.
+ *******************************************************/
+/*
+ * The per-process table that maps log file-id's to DB structures.
+ */
+typedef struct __db_entry {
+ DB *dbp; /* Open dbp for this file id. */
+ int deleted; /* File was not found during open. */
+} DB_ENTRY;
+
+/*
+ * FNAME --
+ * File name and id.
+ */
+struct __fname {
+ SH_TAILQ_ENTRY q; /* File name queue. */
+
+ pid_t pid; /* Process that owns this. */
+ int32_t id; /* Logging file id. */
+ int32_t old_id; /* Saved logging file id. */
+ DBTYPE s_type; /* Saved DB type. */
+
+ roff_t fname_off; /* File name offset. */
+ roff_t dname_off; /* Database name offset. */
+ db_pgno_t meta_pgno; /* Page number of the meta page. */
+ u_int8_t ufid[DB_FILE_ID_LEN]; /* Unique file id. */
+
+ u_int32_t create_txnid; /*
+ * Txn ID of the DB create, stored so
+ * we can log it at register time.
+ */
+ db_mutex_t mutex; /* mutex from db handle. */
+ /* Number of txns referencing, plus 1 for the db handle. */
+ u_int32_t txn_ref;
+
+#define DB_FNAME_CLOSED 0x01 /* DBP was closed. */
+#define DB_FNAME_DURABLE 0x02 /* File is durable. */
+#define DB_FNAME_INMEM 0x04 /* File is in memory. */
+#define DB_FNAME_NOTLOGGED 0x08 /* Log of close failed. */
+#define DB_FNAME_RECOVER 0x10 /* File was opened by recovery code. */
+#define DB_FNAME_RESTORED 0x20 /* File may be in restored txn. */
+#define DB_FNAME_DBREG_MASK 0xf000 /* These bits come from DBREG below. */
+ u_int32_t flags;
+};
+
+/* File open/close register log record opcodes. */
+#define DBREG_CHKPNT 1 /* Checkpoint: file name/id dump. */
+#define DBREG_CLOSE 2 /* File close. */
+#define DBREG_OPEN 3 /* File open. */
+#define DBREG_PREOPEN 4 /* Open in mpool only. */
+#define DBREG_RCLOSE 5 /* File close after recovery. */
+#define DBREG_REOPEN 6 /* Open for in-memory database. */
+#define DBREG_XCHKPNT 7 /* Checkpoint of exclusive file. */
+#define DBREG_XOPEN 8 /* File exclusive open. */
+#define DBREG_XREOPEN 9 /* File exclusive open in-memory. */
+
+/* These bits are logged so db_printlog can handle page data. */
+#define DBREG_OP_MASK 0xf /* Opcode mask */
+#define DBREG_BIGEND 0x1000 /* Db Big endian. */
+#define DBREG_CHKSUM 0x2000 /* Db is checksummed. */
+#define DBREG_ENCRYPT 0x4000 /* Db is encrypted. */
+#define DBREG_EXCL 0x8000 /* Db is exclusive. */
+
+/*******************************************************
+ * LOG:
+ * The log subsystem information.
+ *******************************************************/
+struct __hdr; typedef struct __hdr HDR;
+struct __log; typedef struct __log LOG;
+struct __log_persist; typedef struct __log_persist LOGP;
+
+#define LFPREFIX "log." /* Log file name prefix. */
+#define LFNAME "log.%010d" /* Log file name template. */
+#define LFNAME_V1 "log.%05d" /* Log file name template, rev 1. */
+#define IS_LOG_FILE(name) (strncmp(name, LFPREFIX, sizeof(LFPREFIX) - 1) == 0)
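+
+/* For example, LFNAME formats log file number 1 as "log.0000000001". */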
+
+#define LG_MAX_DEFAULT (10 * MEGABYTE) /* 10 MB. */
+#define LG_MAX_INMEM (256 * 1024) /* 256 KB. */
+#define LG_BSIZE_INMEM (1 * MEGABYTE) /* 1 MB. */
+
+/*
+ * Allocate a few bytes under a power-of-two value. BDB doesn't care if it's
+ * a power-of-two or not, and requesting slightly under a power-of-two allows
+ * stupid allocators to avoid wasting space.
+ */
+#define LG_BASE_REGION_SIZE (130000) /* 128KB - 1072B */
+#define LG_BSIZE_DEFAULT (32000) /* 32 KB - 768B */
+#define LG_CURSOR_BUF_SIZE (32000) /* 32 KB - 768B */
+
+/*
+ * DB_LOG
+ * Per-process log structure.
+ */
+struct __db_log {
+ /*
+ * These fields need to be protected for multi-threaded support.
+ */
+ db_mutex_t mtx_dbreg; /* Mutex for thread protection. */
+
+ DB_ENTRY *dbentry; /* Recovery file-id mapping. */
+#define DB_GROW_SIZE 64
+ int32_t dbentry_cnt; /* Entries. Grows by DB_GROW_SIZE. */
+
+ /*
+ * These fields are only accessed when the region lock is held, so
+ * they do not have to be protected by the thread lock as well.
+ */
+ u_int32_t lfname; /* Log file "name". */
+ DB_FH *lfhp; /* Log file handle. */
+ time_t lf_timestamp; /* Log file timestamp. */
+
+ u_int8_t *bufp; /* Region buffer. */
+
+ /* These fields are not thread protected. */
+ ENV *env; /* Environment */
+ REGINFO reginfo; /* Region information. */
+
+#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */
+#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */
+#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */
+#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears
+ * to be deleted. */
+#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */
+#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */
+#define DBLOG_RECOVER 0x40 /* We are in recovery. */
+#define DBLOG_ZERO 0x80 /* Zero fill the log. */
+#define DBLOG_VERIFYING 0x100 /* The log is being verified. */
+ u_int32_t flags;
+};
+
+/*
+ * HDR --
+ * Log record header.
+ */
+struct __hdr {
+ u_int32_t prev; /* Previous offset. */
+ u_int32_t len; /* Current length. */
+ u_int8_t chksum[DB_MAC_KEY]; /* Current checksum. */
+ u_int8_t iv[DB_IV_BYTES]; /* IV */
+ u_int32_t orig_size; /* Original size of log record */
+ /* !!! - 'size' is not written to log, must be last in hdr */
+ size_t size; /* Size of header to use */
+};
+
+/*
+ * LOG_HDR_SUM -- XOR in prev and len
+ * This helps avoid the race of misreading the log while it
+ * is being updated.
+ */
+#define LOG_HDR_SUM(crypto, hdr, sum) do { \
+ if (crypto) { \
+ ((u_int32_t *)sum)[0] ^= ((HDR *)hdr)->prev; \
+ ((u_int32_t *)sum)[1] ^= ((HDR *)hdr)->len; \
+ } else { \
+ ((u_int32_t *)sum)[0] ^= \
+ ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len; \
+ } \
+} while (0)
+
+/*
+ * We use HDR internally, and then when we write out, we write out
+ * prev, len, and then a 4-byte checksum if normal operation or
+ * a crypto-checksum and IV and original size if running in crypto
+ * mode. We must store the original size in case we pad. Set the
+ * size when we set up the header. We compute a DB_MAC_KEY size
+ * checksum regardless, but we can safely just use the first 4 bytes.
+ */
+#define HDR_NORMAL_SZ 12
+#define HDR_CRYPTO_SZ (12 + DB_MAC_KEY + DB_IV_BYTES)
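+
+/*
+ * For illustration: with the usual DB_MAC_KEY of 20 bytes and
+ * DB_IV_BYTES of 16, HDR_CRYPTO_SZ comes to 48 bytes versus
+ * HDR_NORMAL_SZ's 12.
+ */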
+
+struct __log_persist {
+ u_int32_t magic; /* DB_LOGMAGIC */
+ u_int32_t version; /* DB_LOGVERSION */
+
+ u_int32_t log_size; /* Log file size. */
+ u_int32_t notused; /* Historically the log file mode. */
+};
+
+/* Macros to lock/unlock the log region as a whole. */
+#define LOG_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((LOG *) \
+ (env)->lg_handle->reginfo.primary)->mtx_region)
+#define LOG_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((LOG *) \
+ (env)->lg_handle->reginfo.primary)->mtx_region)
+
+/*
+ * LOG --
+ * Shared log region. One of these is allocated in shared memory,
+ * and describes the log.
+ */
+struct __log { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ db_mutex_t mtx_filelist; /* Mutex guarding file name list. */
+
+ LOGP persist; /* Persistent information. */
+
+ SH_TAILQ_HEAD(__fq1) fq; /* List of file names. */
+ int32_t fid_max; /* Max fid allocated. */
+ roff_t free_fid_stack; /* Stack of free file ids. */
+ u_int32_t free_fids; /* Height of free fid stack. */
+ u_int32_t free_fids_alloced; /* N free fid slots allocated. */
+
+ /*
+ * The lsn LSN is the file offset that we're about to write and which
+ * we will return to the user.
+ */
+ DB_LSN lsn; /* LSN at current file offset. */
+
+ /*
+ * The f_lsn LSN is the LSN (returned to the user) that "owns" the
+ * first byte of the buffer. If the record associated with the LSN
+ * spans buffers, it may not reflect the physical file location of
+ * the first byte of the buffer.
+ */
+ DB_LSN f_lsn; /* LSN of first byte in the buffer. */
+ db_size_t b_off; /* Current offset in the buffer. */
+ u_int32_t w_off; /* Current write offset in the file. */
+ u_int32_t len; /* Length of the last record. */
+
+ DB_LSN active_lsn; /* Oldest active LSN in the buffer. */
+ db_size_t a_off; /* Offset in the buffer of first active
+ file. */
+
+ /*
+ * The s_lsn LSN is the last LSN that we know is on disk, not just
+ * written, but synced. This field is protected by the flush mutex
+ * rather than by the region mutex.
+ */
+ db_mutex_t mtx_flush; /* Mutex guarding flushing. */
+ int32_t in_flush; /* Log flush in progress. */
+ DB_LSN s_lsn; /* LSN of the last sync. */
+
+ DB_LOG_STAT stat; /* Log statistics. */
+
+ /*
+ * This timestamp is updated anytime someone unlinks log
+ * files. This can happen when calling __log_vtruncate
+ * or replication internal init when it unlinks log files.
+ *
+ * The timestamp is used so that other processes that might
+ * have file handles to log files know to close/reopen them
+ * so they're not potentially writing to now-removed files.
+ */
+ time_t timestamp; /* Log trunc timestamp. */
+
+ /*
+ * !!!
+ * NOTE: the next group of fields are NOT protected by the log
+ * region lock. They are protected by REP->mtx_clientdb. If you
+ * need access to both, you must acquire REP->mtx_clientdb
+ * before acquiring the log region lock.
+ *
+ * The waiting_lsn is used by the replication system. It is the
+ * first LSN that we are holding without putting in the log, because
+ * we received one or more log records out of order. Associated with
+ * the waiting_lsn is the number of log records that we still have to
+ * receive before we decide that we should request it again.
+ *
+ * The max_wait_lsn is used to control retransmission in the face
+ * of dropped messages. If we are requesting all records from the
+ * current gap (i.e., chunk of the log that we are missing), then
+ * the max_wait_lsn contains the first LSN that we are known to have
+ * in the __db.rep.db. If we requested only a single record, then
+ * the max_wait_lsn has the LSN of that record we requested.
+ */
+ /* BEGIN fields protected by rep->mtx_clientdb. */
+ DB_LSN waiting_lsn; /* First log record after a gap. */
+ DB_LSN verify_lsn; /* LSN we are waiting to verify. */
+ DB_LSN prev_ckp; /* LSN of ckp preceding verify_lsn. */
+ DB_LSN max_wait_lsn; /* Maximum LSN requested. */
+ DB_LSN max_perm_lsn; /* Maximum PERMANENT LSN processed. */
+ db_timespec max_lease_ts; /* Maximum Lease timestamp seen. */
+ db_timespec wait_ts; /* Time to wait before requesting. */
+ db_timespec rcvd_ts; /* Initial received time to wait. */
+ db_timespec last_ts; /* Last time of insert in temp db. */
+ /*
+ * The ready_lsn is also used by the replication system. It is the
+ * next LSN we expect to receive. It's normally equal to "lsn",
+ * except at the beginning of a log file, at which point it's set
+ * to the LSN of the first record of the new file (after the
+ * header), rather than to 0.
+ */
+ DB_LSN ready_lsn;
+ /*
+ * The bulk_buf is used by replication for bulk transfer. While this
+ * is protected by REP->mtx_clientdb, this doesn't contend with the
+ * above fields because the above are used by clients and the bulk
+ * fields below are used by a master.
+ */
+ roff_t bulk_buf; /* Bulk transfer buffer in region. */
+ roff_t bulk_off; /* Current offset into bulk buffer. */
+ u_int32_t bulk_len; /* Length of buffer. */
+ u_int32_t bulk_flags; /* Bulk buffer flags. */
+ /* END fields protected by rep->mtx_clientdb. */
+
+ /*
+ * During initialization, the log system walks forward through the
+ * last log file to find its end. If it runs into a checkpoint
+ * while it's doing so, it caches it here so that the transaction
+ * system doesn't need to walk through the file again on its
+ * initialization.
+ */
+ DB_LSN cached_ckp_lsn;
+
+ u_int32_t regionmax; /* Configured size of the region. */
+
+ roff_t buffer_off; /* Log buffer offset in the region. */
+ u_int32_t buffer_size; /* Log buffer size. */
+
+ u_int32_t log_size; /* Log file's size. */
+ u_int32_t log_nsize; /* Next log file's size. */
+
+ int filemode; /* Log file permissions mode. */
+
+ /*
+ * DB_LOG_AUTOREMOVE and DB_LOG_INMEMORY: not protected by a mutex,
+ * all we care about is if they're zero or non-zero.
+ */
+ int32_t db_log_autoremove;
+ int32_t db_log_inmemory;
+
+ u_int32_t ncommit; /* Number of txns waiting to commit. */
+	DB_LSN t_lsn; /* LSN of first commit. */
+ SH_TAILQ_HEAD(__commit) commits;/* list of txns waiting to commit. */
+ SH_TAILQ_HEAD(__free) free_commits;/* free list of commit structs. */
+
+ /*
+ * In-memory logs maintain a list of the start positions of all log
+ * files currently active in the in-memory buffer. This is to make the
+ * lookup from LSN to log buffer offset efficient.
+ */
+ SH_TAILQ_HEAD(__logfile) logfiles;
+ SH_TAILQ_HEAD(__free_logfile) free_logfiles;
+};
+
+/*
+ * __db_commit structure --
+ * One of these is allocated for each transaction waiting to commit.
+ */
+struct __db_commit {
+ db_mutex_t mtx_txnwait; /* Mutex for txn to wait on. */
+ DB_LSN lsn; /* LSN of commit record. */
+ SH_TAILQ_ENTRY links; /* Either on free or waiting list. */
+
+#define DB_COMMIT_FLUSH 0x0001 /* Flush the log when you wake up. */
+ u_int32_t flags;
+};
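+
+/*
+ * Group-commit sketch (illustrative only, not the actual txn code):
+ * a committing thread queues one of these on LOG->commits and sleeps
+ * on its mutex until the thread flushing the log wakes it:
+ *
+ *	commit->lsn = my_commit_lsn;
+ *	SH_TAILQ_INSERT_HEAD(&lp->commits, commit, links, __db_commit);
+ *	lp->ncommit++;
+ *	MUTEX_WAIT(env, commit->mtx_txnwait, timeout);
+ *	(the flusher walks lp->commits and wakes each waiter whose
+ *	LSN is now on disk)
+ */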
+
+/*
+ * Check for the proper progression of Log Sequence Numbers.
+ * If we are rolling forward the LSN on the page must be greater
+ * than or equal to the previous LSN in log record.
+ * We ignore NOT LOGGED LSNs: they mean the user did an unlogged
+ * update, and we should eventually see a log record that matches
+ * and can continue forward from there.
+ * A ZERO LSN implies a page that was allocated prior to the recovery
+ * start point and then truncated later in the log. An allocation of a
+ * page after this page will extend the file, leaving a hole. We want to
+ * ignore this page until it is truncated again.
+ */
+
+#define CHECK_LSN(e, redo, cmp, lsn, prev) \
+ if (DB_REDO(redo) && (cmp) < 0 && \
+ ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \
+ IS_REP_CLIENT(e))) { \
+ ret = __db_check_lsn(e, lsn, prev); \
+ goto out; \
+ }
+#define CHECK_ABORT(e, redo, cmp, lsn, prev) \
+ if (redo == DB_TXN_ABORT && (cmp) != 0 && \
+ ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \
+ IS_REP_CLIENT(e))) { \
+ ret = __db_check_lsn(e, lsn, prev); \
+ goto out; \
+ }
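+
+/*
+ * Typical use in a recovery function (a sketch; argp and pagep stand
+ * in for the usual unmarshalled log record and page variables):
+ *
+ *	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ *	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ *	if (cmp_p == 0 && DB_REDO(op)) {
+ *		(redo the operation, then roll the page LSN forward)
+ *	}
+ */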
+
+/*
+ * __db_filestart --
+ *	The start position of a log file within the in-memory log
+ *	buffer. One of these is kept on the LOG->logfiles list for
+ *	each log file currently held in the buffer.
+ */
+struct __db_filestart {
+ u_int32_t file;
+ size_t b_off;
+
+ SH_TAILQ_ENTRY links; /* Either on free or waiting list. */
+};
+
+/*
+ * RINGBUF_LEN --
+ *	The number of bytes in use between offsets "start" and "end" of
+ *	the circular in-memory log buffer, accounting for wrap-around.
+ */
+#define RINGBUF_LEN(lp, start, end) \
+	((start) < (end) ? \
+	    (end) - (start) : (lp)->buffer_size - ((start) - (end)))
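+
+/*
+ * For example, with a 1000-byte buffer, start == 900 and end == 100
+ * means the data has wrapped: RINGBUF_LEN yields 1000 - (900 - 100)
+ * == 200 bytes in use. With start == 100 and end == 900 it yields
+ * the unwrapped 800 bytes.
+ */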
+
+/*
+ * Internal macro used by the generated logging routines: set the
+ * pointer to the transaction family's begin_lsn unless begin_lsn
+ * has already been set, and always return a pointer to the
+ * transaction's last_lsn as well.
+ */
+#undef DB_SET_TXN_LSNP
+#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) do { \
+ DB_LSN *__lsnp; \
+ TXN_DETAIL *__td; \
+ __td = (txn)->td; \
+ *(llsnp) = &__td->last_lsn; \
+ while (__td->parent != INVALID_ROFF) \
+ __td = R_ADDR(&(txn)->mgrp->reginfo, __td->parent); \
+ __lsnp = &__td->begin_lsn; \
+ if (IS_ZERO_LSN(*__lsnp)) \
+ *(blsnp) = __lsnp; \
+} while (0)
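+
+/*
+ * Usage sketch (as in the generated __*_log functions; the variable
+ * names here are illustrative):
+ *
+ *	DB_LSN *begin_lsnp, *last_lsnp;
+ *	begin_lsnp = NULL;
+ *	DB_SET_TXN_LSNP(txn, &begin_lsnp, &last_lsnp);
+ *	(begin_lsnp is non-NULL only if the family's begin_lsn still
+ *	needs to be filled in; last_lsnp always points at last_lsn)
+ */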
+
+/*
+ * Status codes indicating the validity of a log file examined by
+ * __log_valid().
+ */
+typedef enum {
+ DB_LV_INCOMPLETE,
+ DB_LV_NONEXISTENT,
+ DB_LV_NORMAL,
+ DB_LV_OLD_READABLE,
+ DB_LV_OLD_UNREADABLE
+} logfile_validity;
+
+/*
+ * All log records have these fields.
+ */
+typedef struct __log_rec_hdr {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+} LOG_REC_HEADER;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/log_ext.h"
+#include "dbinc_auto/dbreg_auto.h"
+#include "dbinc_auto/dbreg_ext.h"
+#endif /* !_DB_LOG_H_ */
diff --git a/src/dbinc/log_verify.h b/src/dbinc/log_verify.h
new file mode 100644
index 00000000..fa90ace4
--- /dev/null
+++ b/src/dbinc/log_verify.h
@@ -0,0 +1,207 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+#ifndef _DB_LOG_VERIFY_H_
+#define _DB_LOG_VERIFY_H_
+
+#include "db_config.h"
+#include "db_int.h"
+
+/*
+ * Log verification handle; one is shared among all verification
+ * functions during a single verification run.
+ */
+struct __db_log_verify_info {
+ DB_ENV *dbenv; /* The database environment. */
+ DB *txninfo; /* (txnid, __txn_verify_info) map. */
+ DB *ckps; /* (ckp lrid, __ckpinfo) map. */
+ DB *fileregs; /* (file-uid, __file_reg_info) map. */
+ DB *fnameuid; /* (fname, fuid), secondary db of fileregs. */
+ /* (dbreg-id, __file_reg_info) map, NOT the sec db for fileregs. */
+ DB *dbregids;
+ DB *pgtxn; /* (fileid-pageno, txnid) map. */
+ DB *txnpg; /* (txnid, fileid-pageno), sec db of pgtxn. */
+ /* lsn, (time-stamp, logtype(txn_regop or txn_ckp)) map. */
+ DB *lsntime;
+ /* Secondary db of lsntime, use timestamp as secindex. */
+ DB *timelsn;
+
+ /* Time range database, (u_int32_t, __lv_txnrange) db. */
+ DB *txnrngs;
+ /* Store abort txn (lsn, txnid) map. */
+ DB *txnaborts;
+ DB_LSN last_lsn; /* Lsn of last log record we verified. */
+ /* The number of active, abort, commit and prepared txns. */
+ u_int32_t ntxn_active, ntxn_abort, ntxn_commit, ntxn_prep;
+ u_int32_t nckp; /* The number of checkpoint log records. */
+	/*
+	 * Target database file unique id. Set when verifying only the
+	 * log records of a single database.
+	 */
+ u_int8_t target_dbid[DB_FILE_ID_LEN];
+	u_int32_t non_txnup_cnt;/* Number of non-transactional log records. */
+	u_int32_t unknown_logrec_cnt;/* Number of unknown log records. */
+	u_int32_t external_logrec_cnt;/* Number of external log records. */
+	/*
+	 * (Log type, record count) map. Type ids are contiguous
+	 * integers, so an array of 256 slots is large enough.
+	 */
+ u_int32_t lrtypes[256];
+ u_int32_t aborted_txnid;/* The last aborted txnid. */
+ DB_LSN aborted_txnlsn; /* Last aborted txn's last log. */
+	DB_LSN valid_lsn; /* Unset DB_LOG_VERIFY_PARTIAL when we reach this LSN. */
+	char *logtype_names[256];/* The type name string of each log type. */
+ const DB_LOG_VERIFY_CONFIG *lv_config;
+ DB_THREAD_INFO *ip;
+ u_int32_t flags; /* The result of the verification. */
+};
+
+/* Transaction information. */
+struct __txn_verify_info {
+#define TXN_VERIFY_INFO_FIXSIZE (4 * sizeof(DB_LSN) + 9 * sizeof(u_int32_t))
+#define TXN_VERIFY_INFO_TOTSIZE(s) \
+ (TXN_VERIFY_INFO_FIXSIZE + (s).num_recycle * sizeof(DB_LSN) + \
+ __lv_dbt_arrsz((s).fileups, (s).filenum) + \
+ sizeof(int32_t) * (s).filenum)
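+
+/*
+ * For example (a sketch; __lv_dbt_arrsz is the helper summing the
+ * marshalled sizes of a DBT array): a txn with num_recycle == 2 and
+ * filenum == 3 occupies
+ *
+ *	TXN_VERIFY_INFO_FIXSIZE + 2 * sizeof(DB_LSN) +
+ *	    __lv_dbt_arrsz((s).fileups, 3) + 3 * sizeof(int32_t)
+ *
+ * bytes when marshalled into the txninfo database.
+ */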
+
+ u_int32_t txnid; /* The key, also stored in data here. */
+ u_int32_t ptxnid; /* The parent txn id. */
+
+ DB_LSN first_lsn; /* Lsn of the first log record of this txn. */
+ DB_LSN last_lsn; /* Last lsn of the txn. */
+ DB_LSN prep_lsn; /* txn_prepare's lsn.*/
+ DB_LSN cur_lsn; /* The lsn of the latest db op of this txn. */
+
+ u_int32_t num_recycle; /* The number of recycle lsns. */
+ u_int32_t filenum; /* The number of files updated. */
+
+#define TXN_STAT_ACTIVE 0
+#define TXN_STAT_ABORT 1
+#define TXN_STAT_COMMIT 2
+#define TXN_STAT_PREPARE 3
+ u_int32_t status; /* Txn status */
+
+ /* The number of active, abort and commit children. */
+ u_int32_t nchild_active;
+ u_int32_t nchild_abort;
+ u_int32_t nchild_commit;
+
+ u_int32_t flags; /* Copied from the DB_TXN::flags member. */
+
+ DB_LSN *recycle_lsns; /* The array of txn_recycle records' lsns. */
+ /* The array of file unique ids of files updated by this txn. */
+ DBT *fileups;
+ int32_t *dbregid;/* The array of dbreg file ids updated by this txn. */
+};
+
+/* Database file information. */
+struct __lv_filereg_info {
+#define FILE_REG_INFO_FIXSIZE (sizeof(u_int32_t))
+#define FILE_REG_INFO_TOTSIZE(s) (FILE_REG_INFO_FIXSIZE + (s).fileid.size + \
+ sizeof((s).fileid.size) + sizeof(int32_t) * (s).regcnt + \
+ strlen((s).fname) + 1)
+
+ u_int32_t regcnt; /* The number of dbregids for this file-uid. */
+ int32_t *dbregids;
+ DBT fileid; /* The file unique id. */
+ char *fname; /* Database file name. */
+};
+
+/* Database file dbreg_register information. */
+struct __lv_filelife {
+ int32_t dbregid; /* The primary key. */
+ DBTYPE dbtype; /* The database type. */
+ u_int32_t lifetime; /* DBREG_CHKPNT, DBREG_CLOSE, DBREG_OPEN, DBREG_XCHKPNT, DBREG_XOPEN */
+	db_pgno_t meta_pgno; /* The meta page number. */
+ u_int8_t fileid[DB_FILE_ID_LEN];
+ DB_LSN lsn; /* The lsn of log updating lifetime. */
+};
+
+/* Checkpoint information. */
+struct __lv_ckp_info {
+ int32_t timestamp;
+ DB_LSN lsn, ckplsn; /* Lsn member is the primary key. */
+};
+
+/*
+ * General information from log records which have timestamps.
+ * We use it to do time range verifications. Such information is
+ * acquired when backward-playing the logs before verification.
+ */
+struct __lv_timestamp_info {
+ DB_LSN lsn; /* The primary key. */
+ int32_t timestamp; /* The secondary key. */
+
+ /*
+ * The log types containing a time stamp, so far only txn_ckp
+ * and txn_regop types.
+ */
+ u_int32_t logtype;
+};
+
+/*
+ * Transaction ranges. Such information is acquired when backward-playing the
+ * logs before verification. Can be used to find aborted txns.
+ */
+struct __lv_txnrange {
+ /*
+	/*
+	 * Transaction ID, the primary key. The db storing records of this
+	 * type must allow duplicates, since txnids may be reused.
+	 */
+ u_int32_t txnid;
+
+ /*
+ * The parent txn id, ptxnid is the parent of txnid
+ * during [begin, end].
+ */
+ u_int32_t ptxnid;
+
+	/*
+	 * The first and last lsn. The end lsn is used to sort duplicate
+	 * data items because it is seen before begin in a backward
+	 * playback, and [begin, end] intervals never overlap.
+	 */
+ */
+ DB_LSN begin, end;
+
+ int32_t when_commit;/* The time of the commit, 0 if aborted. */
+};
+
+/* Parameter types for __iterate_txninfo function. */
+struct __add_recycle_params {
+ u_int32_t min, max;/* The recycled txnid range. */
+ /* The array of txn info to update into db. */
+ VRFY_TXN_INFO **ti2u;
+ u_int32_t ti2ui, ti2ul;/* The effective length and array length. */
+ DB_LSN recycle_lsn;
+};
+
+struct __ckp_verify_params {
+ DB_LSN lsn, ckp_lsn;
+ ENV *env;
+};
+
+/* Helper macros. */
+#define LOGTYPE_NAME(lvh, type) (lvh->logtype_names[type] == NULL ? \
+ NULL : lvh->logtype_names[type] + 3)
+#define NUMCMP(i1, i2) ((i1) > (i2) ? 1 : ((i1) < (i2) ? -1 : 0))
+
+#define INVAL_DBREGID -1
+
+/*
+ * During recovery, DBREG_CHKPNT and DBREG_XCHKPNT can be treated as
+ * open operations; each is eventually followed by a DBREG_RCLOSE or
+ * DBREG_CLOSE.
+ */
+#define IS_DBREG_OPEN(opcode) (opcode == DBREG_OPEN || opcode == \
+ DBREG_PREOPEN || opcode == DBREG_REOPEN || opcode == DBREG_CHKPNT \
+ || opcode == DBREG_XCHKPNT || opcode == DBREG_XOPEN || \
+ opcode == DBREG_XREOPEN)
+#define IS_DBREG_CLOSE(opcode) (opcode == DBREG_CLOSE || opcode == DBREG_RCLOSE)
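+
+/*
+ * Sketch of how the predicates above are used when scanning
+ * dbreg_register log records (argp names the unmarshalled record):
+ *
+ *	if (IS_DBREG_OPEN(argp->opcode))
+ *		(the dbreg id maps to an open file from here on)
+ *	else if (IS_DBREG_CLOSE(argp->opcode))
+ *		(the file's lifetime ends at this record)
+ */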
+
+#define IS_LOG_VRFY_SUPPORTED(version) ((version) == DB_LOGVERSION)
+
+#endif /* !_DB_LOG_VERIFY_H_ */
diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h
new file mode 100644
index 00000000..9a10c6d9
--- /dev/null
+++ b/src/dbinc/mp.h
@@ -0,0 +1,700 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MP_H_
+#define _DB_MP_H_
+
+#include "dbinc/atomic.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct __bh; typedef struct __bh BH;
+struct __bh_frozen_p; typedef struct __bh_frozen_p BH_FROZEN_PAGE;
+struct __bh_frozen_a; typedef struct __bh_frozen_a BH_FROZEN_ALLOC;
+struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH;
+struct __db_mpool_fstat_int;
+typedef struct __db_mpool_fstat_int DB_MPOOL_FSTAT_INT;
+struct __db_mpreg; typedef struct __db_mpreg DB_MPREG;
+struct __mpool; typedef struct __mpool MPOOL;
+
+/* We require at least 20KB of cache. */
+#define DB_CACHESIZE_MIN (20 * 1024)
+
+/*
+ * DB_MPOOLFILE initialization methods cannot be called after open is
+ * called; other methods cannot be called before open is called.
+ */
+#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \
+ return (__db_mi_open((dbmfp)->env, name, 1));
+#define MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name) \
+ if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) \
+ return (__db_mi_open((dbmfp)->env, name, 0));
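+
+/*
+ * For example, a DB_MPOOLFILE configuration method guards itself like
+ * this (a representative sketch of the pattern in mp_fmethod.c):
+ *
+ *	MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len");
+ *	dbmfp->clear_len = clear_len;
+ *	return (0);
+ */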
+
+/*
+ * Cache flush operations, plus modifiers.
+ */
+#define DB_SYNC_ALLOC 0x0001 /* Flush for allocation. */
+#define DB_SYNC_CACHE 0x0002 /* Flush entire cache. */
+#define DB_SYNC_CHECKPOINT 0x0004 /* Checkpoint. */
+#define DB_SYNC_FILE 0x0008 /* Flush file. */
+#define DB_SYNC_INTERRUPT_OK 0x0010 /* Allow interrupt and return OK. */
+#define DB_SYNC_QUEUE_EXTENT 0x0020 /* Flush a queue file with extents. */
+#define DB_SYNC_SUPPRESS_WRITE 0x0040 /* Ignore max-write configuration. */
+#define DB_SYNC_TRICKLE 0x0080 /* Trickle sync. */
+
+/*
+ * DB_MPOOL --
+ * Per-process memory pool structure.
+ */
+struct __db_mpool {
+ /* These fields need to be protected for multi-threaded support. */
+ db_mutex_t mutex; /* Thread mutex. */
+
+ /*
+ * DB_MPREG structure for the DB pgin/pgout routines.
+ *
+ * Linked list of application-specified pgin/pgout routines.
+ */
+ DB_MPREG *pg_inout;
+ LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
+
+ /* List of DB_MPOOLFILE's. */
+ TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
+
+ /*
+ * The env and reginfo fields are not thread protected, as they are
+ * initialized during mpool creation, and not modified again.
+ */
+ ENV *env; /* Enclosing environment. */
+ REGINFO *reginfo; /* Underlying cache regions. */
+};
+
+/*
+ * DB_MPREG --
+ * DB_MPOOL registry of pgin/pgout functions.
+ */
+struct __db_mpreg {
+ LIST_ENTRY(__db_mpreg) q; /* Linked list. */
+
+ int32_t ftype; /* File type. */
+ /* Pgin, pgout routines. */
+ int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+};
+
+/*
+ * File hashing --
+ *	We hash each file to a hash bucket based on its fileid
+ *	or, in the case of in-memory files, its name.
+ */
+
+/* Number of file hash buckets, a small prime number. */
+#define MPOOL_FILE_BUCKETS 17
+
+#define FHASH(id, len) __ham_func5(NULL, id, (u_int32_t)(len))
+
+#define FNBUCKET(id, len) \
+ (FHASH(id, len) % MPOOL_FILE_BUCKETS)
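+
+/*
+ * E.g., an on-disk file hashes by its unique file id and an in-memory
+ * file by its name (a sketch):
+ *
+ *	bucket = FNBUCKET(fileid, DB_FILE_ID_LEN);
+ *	bucket = FNBUCKET(path, strlen(path));
+ */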
+
+/* Macros to lock/unlock the mpool region as a whole. */
+#define MPOOL_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((MPOOL *) \
+ (env)->mp_handle->reginfo[0].primary)->mtx_region)
+#define MPOOL_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((MPOOL *) \
+ (env)->mp_handle->reginfo[0].primary)->mtx_region)
+
+/* Macros to lock/unlock a specific mpool region. */
+#define MPOOL_REGION_LOCK(env, infop) \
+ MUTEX_LOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
+#define MPOOL_REGION_UNLOCK(env, infop) \
+ MUTEX_UNLOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
+
+/*
+ * MPOOL --
+ * Shared memory pool region.
+ */
+struct __mpool { /* SHARED */
+	/*
+	 * The memory pool can be broken up into individual pieces/files.
+	 * There are two reasons for this: first, on Solaris you can
+	 * allocate only a little more than 2GB of memory in a contiguous
+	 * chunk, and I expect to see more systems with similar issues.
+	 * Second, applications can add/remove pieces to dynamically
+	 * resize the cache.
+	 *
+	 * While this structure is duplicated in each piece of the cache,
+	 * the first of these pieces/files describes the entire pool, and
+	 * each subsequent one describes only its own piece of the cache.
+	 */
+ db_mutex_t mtx_region; /* Region mutex. */
+ db_mutex_t mtx_resize; /* Resizing mutex. */
+
+ /*
+ * The lsn field and list of underlying MPOOLFILEs are thread protected
+ * by the region lock.
+ */
+ DB_LSN lsn; /* Maximum checkpoint LSN. */
+
+ /* Configuration information: protected by the region lock. */
+ u_int32_t max_nreg; /* Maximum number of regions. */
+ u_int32_t gbytes; /* Number of gigabytes in cache. */
+ u_int32_t bytes; /* Number of bytes in cache. */
+ u_int32_t pagesize; /* Default page size. */
+ db_size_t mp_mmapsize; /* Maximum file size for mmap. */
+ int32_t mp_maxopenfd; /* Maximum open file descriptors. */
+ int32_t mp_maxwrite; /* Maximum buffers to write. */
+ db_timeout_t mp_maxwrite_sleep; /* Sleep after writing max buffers. */
+
+ /*
+ * The number of regions and the total number of hash buckets across
+ * all regions.
+ * These fields are not protected by a mutex because we assume that we
+ * can read a 32-bit value atomically. They are only modified by cache
+ * resizing which holds the mpool resizing mutex to ensure that
+ * resizing is single-threaded. See the comment in mp_resize.c for
+ * more information.
+ */
+ u_int32_t nreg; /* Number of underlying REGIONS. */
+ u_int32_t nbuckets; /* Total number of hash buckets. */
+
+ /*
+	 * The regids field is protected by the resize mutex.
+ */
+ roff_t regids; /* Array of underlying REGION Ids. */
+
+ roff_t ftab; /* Hash table of files. */
+
+ /*
+ * The following fields describe the per-cache portion of the region.
+ *
+ * The htab and htab_buckets fields are not thread protected as they
+ * are initialized during mpool creation, and not modified again.
+ *
+ * The last_checked, lru_priority, and lru_generation fields are thread
+ * protected by the region lock.
+ */
+ roff_t htab; /* Hash table offset. */
+ u_int32_t htab_buckets; /* Number of hash table entries. */
+ u_int32_t last_checked; /* Last bucket checked for free. */
+ u_int32_t lru_priority; /* Priority counter for buffer LRU. */
+ u_int32_t lru_generation; /* Allocation race condition detector. */
+ u_int32_t htab_mutexes; /* Number of hash mutexes per region. */
+
+ /*
+ * The pages field keeps track of the number of pages in the cache
+ * and is protected by the region lock. It is accessed for reading
+ * without the lock to return statistics.
+ */
+ u_int32_t pages; /* Number of pages in the cache. */
+
+ /*
+ * The stat fields are not thread protected, and cannot be trusted.
+ */
+ DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
+
+ /*
+ * We track page puts so that we can decide when allocation is never
+ * going to succeed. We don't lock the field, all we care about is
+ * if it changes.
+ */
+ u_int32_t put_counter; /* Count of page put calls. */
+
+ /*
+ * Cache flush operations take a long time...
+ *
+ * Some cache flush operations want to ignore the app's configured
+ * max-write parameters (they are trying to quickly shut down an
+ * environment, for example). We can't specify that as an argument
+ * to the cache region functions, because we may decide to ignore
+ * the max-write configuration after the cache operation has begun.
+	 * If the DB_MEMP_SUPPRESS_WRITE flag is set in config_flags,
+	 * ignore the application's max-write configuration.
+ *
+ * We may want to interrupt cache flush operations in high-availability
+ * configurations.
+ */
+#define DB_MEMP_SUPPRESS_WRITE 0x01
+#define DB_MEMP_SYNC_INTERRUPT 0x02
+ u_int32_t config_flags;
+
+ /* Free frozen buffer headers, protected by the region lock. */
+ SH_TAILQ_HEAD(__free_frozen) free_frozen;
+
+ /* Allocated blocks of frozen buffer headers. */
+ SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
+};
+
+/*
+ * NREGION --
+ * Select a cache region given the bucket number.
+ */
+#define NREGION(mp, bucket) \
+ ((bucket) / (mp)->htab_buckets)
+
+/*
+ * MP_HASH --
+ * We make the assumption that early pages of the file are more likely
+ * to be retrieved than the later pages, which means the top bits will
+ * be more interesting for hashing as they're less likely to collide.
+ * That said, since 512 8KB pages represent a 4MB file, only reasonably
+ * large files will have page numbers with any but the bottom 9
+ * bits set. We XOR in the MPOOL offset of the MPOOLFILE that backs the
+ * page, since that should also be unique for the page. We don't want
+ * to do anything very fancy -- speed is more important to us than using
+ * good hashing.
+ *
+ * Since moving to a dynamic hash, which boils down to using some of the
+ * least significant bits of the hash value, we no longer want to use a
+ * simple shift here, because it's likely with a bit shift that mf_offset
+ * will be ignored, and pages from different files end up in the same
+ * hash bucket. Use a nearby prime instead.
+ */
+#define MP_HASH(mf_offset, pgno) \
+ ((((pgno) << 8) ^ (pgno)) ^ (((u_int32_t) mf_offset) * 509))
+
+/*
+ * Inline the calculation of the mask, since we can't reliably store the mask
+ * with the number of buckets in the region.
+ *
+ * This is equivalent to:
+ * mask = (1 << __db_log2(nbuckets)) - 1;
+ */
+#define MP_MASK(nbuckets, mask) do { \
+ for (mask = 1; mask < (nbuckets); mask = (mask << 1) | 1) \
+ ; \
+} while (0)
+
+#define MP_HASH_BUCKET(hash, nbuckets, mask, bucket) do { \
+ (bucket) = (hash) & (mask); \
+ if ((bucket) >= (nbuckets)) \
+ (bucket) &= ((mask) >> 1); \
+} while (0)
+
+#define MP_BUCKET(mf_offset, pgno, nbuckets, bucket) do { \
+ u_int32_t __mask; \
+ MP_MASK(nbuckets, __mask); \
+ MP_HASH_BUCKET(MP_HASH(mf_offset, pgno), nbuckets, \
+ __mask, bucket); \
+} while (0)
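+
+/*
+ * Worked example (illustrative arithmetic only): with nbuckets == 37,
+ * MP_MASK computes mask == 63. If MP_HASH(mf_offset, pgno) == 100:
+ *
+ *	bucket = 100 & 63 == 36;	36 < 37, so it is kept.
+ *
+ * Were the hash 38, then 38 & 63 == 38 >= 37, so the bucket is folded
+ * with the smaller mask: 38 & (63 >> 1) == 38 & 31 == 6.
+ */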
+
+/*
+ * MP_GET_REGION --
+ * Select the region for a given page.
+ */
+#define MP_GET_REGION(dbmfp, pgno, infopp, ret) do { \
+ DB_MPOOL *__t_dbmp; \
+ MPOOL *__t_mp; \
+ \
+ __t_dbmp = dbmfp->env->mp_handle; \
+ __t_mp = __t_dbmp->reginfo[0].primary; \
+ if (__t_mp->max_nreg == 1) { \
+ *(infopp) = &__t_dbmp->reginfo[0]; \
+ } else \
+ ret = __memp_get_bucket((dbmfp)->env, \
+ (dbmfp)->mfp, (pgno), (infopp), NULL, NULL); \
+} while (0)
+
+/*
+ * MP_GET_BUCKET --
+ * Select and lock the bucket for a given page.
+ */
+#define MP_GET_BUCKET(env, mfp, pgno, infopp, hp, bucket, ret) do { \
+ DB_MPOOL *__t_dbmp; \
+ MPOOL *__t_mp; \
+ roff_t __t_mf_offset; \
+ \
+ __t_dbmp = (env)->mp_handle; \
+ __t_mp = __t_dbmp->reginfo[0].primary; \
+ if (__t_mp->max_nreg == 1) { \
+ *(infopp) = &__t_dbmp->reginfo[0]; \
+ __t_mf_offset = R_OFFSET(*(infopp), (mfp)); \
+ MP_BUCKET(__t_mf_offset, \
+ (pgno), __t_mp->nbuckets, bucket); \
+ (hp) = R_ADDR(*(infopp), __t_mp->htab); \
+ (hp) = &(hp)[bucket]; \
+ MUTEX_READLOCK(env, (hp)->mtx_hash); \
+ ret = 0; \
+ } else \
+ ret = __memp_get_bucket((env), \
+ (mfp), (pgno), (infopp), &(hp), &(bucket)); \
+} while (0)
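+
+/*
+ * Callers use the macro like this (a sketch; on success the hash
+ * bucket mutex is held read-locked and must be released):
+ *
+ *	MP_GET_BUCKET(env, mfp, pgno, &infop, hp, bucket, ret);
+ *	if (ret != 0)
+ *		return (ret);
+ *	(search hp->hash_bucket for the BH of pgno)
+ *	MUTEX_UNLOCK(env, hp->mtx_hash);
+ */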
+
+struct __db_mpool_hash {
+ db_mutex_t mtx_hash; /* Per-bucket mutex. */
+
+ DB_HASHTAB hash_bucket; /* Head of bucket. */
+
+ db_atomic_t hash_page_dirty;/* Count of dirty pages. */
+
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t hash_io_wait; /* Count of I/O waits. */
+ u_int32_t hash_frozen; /* Count of frozen buffers. */
+ u_int32_t hash_thawed; /* Count of thawed buffers. */
+ u_int32_t hash_frozen_freed;/* Count of freed frozen buffers. */
+#endif
+
+ DB_LSN old_reader; /* Oldest snapshot reader (cached). */
+
+ u_int32_t flags;
+};
+
+/*
+ * Mpool file statistics structure for use in shared memory.
+ * This structure must contain the same fields as the __db_mpool_fstat struct
+ * except for any pointer fields that are filled in only when the struct is
+ * being populated for output through the API.
+ */
+struct __db_mpool_fstat_int { /* SHARED */
+ u_int32_t st_pagesize; /* Page size. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_map; /* Pages from mapped files. */
+ uintmax_t st_cache_hit; /* Pages found in the cache. */
+ uintmax_t st_cache_miss; /* Pages not found in the cache. */
+ uintmax_t st_page_create; /* Pages created in the cache. */
+ uintmax_t st_page_in; /* Pages read in. */
+ uintmax_t st_page_out; /* Pages written out. */
+ uintmax_t st_backup_spins; /* Number of spins by a backup. */
+#endif
+};
+
+/*
+ * The base mpool priority is 1/4th of the name space, or just under 2^30. When
+ * the LRU priority counter is about to wrap (within a 128-entry 'red zone'
+ * area) we adjust everybody down so that no one is larger than the new LRU
+ * priority.
+ */
+#define MPOOL_LRU_MAX UINT32_MAX
+#define MPOOL_LRU_REDZONE (MPOOL_LRU_MAX - 128)
+#define MPOOL_LRU_BASE (MPOOL_LRU_MAX / 4)
+#define MPOOL_LRU_DECREMENT (MPOOL_LRU_MAX - MPOOL_LRU_BASE)
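+
+/*
+ * Worked example of the wrap handling (illustrative arithmetic):
+ * when lru_priority crosses MPOOL_LRU_REDZONE (UINT32_MAX - 128),
+ * every buffer's priority is reduced by MPOOL_LRU_DECREMENT, i.e. by
+ * UINT32_MAX - UINT32_MAX / 4, so the hottest buffers land at about
+ * MPOOL_LRU_BASE (just under 2^30) and aging can continue.
+ */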
+
+/*
+ * Mpool priorities from low to high. Defined in terms of fractions of the
+ * buffers in the pool.
+ */
+#define MPOOL_PRI_VERY_LOW -1 /* Dead duck. Check and set to 0. */
+#define MPOOL_PRI_LOW -2 /* Low. */
+#define MPOOL_PRI_DEFAULT 0 /* No adjustment -- special case.*/
+#define MPOOL_PRI_HIGH 10 /* With the dirty buffers. */
+#define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */
+#define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */
+
+/*
+ * MPOOLFILE --
+ * Shared DB_MPOOLFILE information.
+ */
+struct __mpoolfile { /* SHARED */
+ db_mutex_t mutex; /* MPOOLFILE mutex. */
+
+#ifndef HAVE_ATOMICFILEREAD
+ /* Information to synchronize backups. */
+ u_int32_t backup_in_progress; /* Backup running. */
+ pid_t pid; /* Process doing backup. */
+ db_threadid_t tid; /* Thread doing backup. */
+ db_atomic_t writers; /* Number of current writers. */
+	db_mutex_t mtx_write; /* Block writers while updating. */
+ db_pgno_t low_pgno, high_pgno;/* Low and high backup range.*/
+#endif
+
+ /* Protected by MPOOLFILE mutex. */
+	u_int32_t revision; /* Bumped on any subdb movement. */
+ u_int32_t mpf_cnt; /* Ref count: DB_MPOOLFILEs. */
+ u_int32_t neutral_cnt; /* Ref count: refs that don't care about
+ * MVCC or DURABLE. That is, read-only
+ * or write behind references.
+ */
+ u_int32_t block_cnt; /* Ref count: blocks in cache. */
+ db_pgno_t last_pgno; /* Last page in the file. */
+ db_pgno_t last_flushed_pgno; /* Last page flushed to disk. */
+ db_pgno_t orig_last_pgno; /* Original last page in the file. */
+ db_pgno_t maxpgno; /* Maximum page number. */
+ u_int8_t excl_lockout; /* Internal exclusive db lockout. */
+
+ roff_t path_off; /* File name location. */
+
+ /* Protected by hash bucket mutex. */
+ SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */
+
+ /*
+ * The following are used for file compaction processing.
+ * They are only used when a thread is in the process
+ * of trying to move free pages to the end of the file.
+ * Other threads may look here when freeing a page.
+ * Protected by a lock on the metapage.
+ */
+ u_int32_t free_ref; /* Refcount to freelist. */
+ u_int32_t free_cnt; /* Count of free pages. */
+ db_size_t free_size; /* Allocated size of free list. */
+ roff_t free_list; /* Offset to free list. */
+
+ /*
+ * We normally don't lock the deadfile field when we read it since we
+ * only care if the field is zero or non-zero. We do lock on read when
+ * searching for a matching MPOOLFILE -- see that code for more detail.
+ */
+ int32_t deadfile; /* Dirty pages can be discarded. */
+
+	u_int32_t bucket; /* Hash bucket for this file. */
+
+ /*
+ * None of the following fields are thread protected.
+ *
+ * There are potential races with the ftype field because it's read
+ * without holding a lock. However, it has to be set before adding
+ * any buffers to the cache that depend on it being set, so there
+ * would need to be incorrect operation ordering to have a problem.
+ */
+ int32_t ftype; /* File type. */
+
+ /*
+ * There are potential races with the priority field because it's read
+ * without holding a lock. However, a collision is unlikely and if it
+ * happens is of little consequence.
+ */
+ int32_t priority; /* Priority when unpinning buffer. */
+
+ /*
+ * There are potential races with the file_written field (many threads
+ * may be writing blocks at the same time), and with no_backing_file
+ * and unlink_on_close fields, as they may be set while other threads
+ * are reading them. However, we only care if the field value is zero
+ * or non-zero, so don't lock the memory.
+ *
+ * !!!
+ * Theoretically, a 64-bit architecture could put two of these fields
+ * in a single memory operation and we could race. I have never seen
+ * an architecture where that's a problem, and I believe Java requires
+ * that to never be the case.
+ *
+ * File_written is set whenever a buffer is marked dirty in the cache.
+ * It can be cleared in some cases, after all dirty buffers have been
+ * written AND the file has been flushed to disk.
+ */
+ int32_t file_written; /* File was written. */
+ int32_t no_backing_file; /* Never open a backing file. */
+ int32_t unlink_on_close; /* Unlink file on last close. */
+ db_atomic_t multiversion; /* Number of DB_MULTIVERSION handles. */
+
+ /*
+ * We do not protect the statistics in "stat" because of the cost of
+ * the mutex in the get/put routines. There is a chance that a count
+ * will get lost.
+ */
+ DB_MPOOL_FSTAT_INT stat; /* Per-file mpool statistics. */
+
+ /*
+ * The remaining fields are initialized at open and never subsequently
+ * modified.
+ */
+ int32_t lsn_off; /* Page's LSN offset. */
+ u_int32_t clear_len; /* Bytes to clear on page create. */
+
+ roff_t fileid_off; /* File ID string location. */
+
+ u_int32_t pagesize; /* Underlying pagesize. */
+ roff_t pgcookie_len; /* Pgin/pgout cookie length. */
+ roff_t pgcookie_off; /* Pgin/pgout cookie location. */
+
+ /*
+ * The flags are initialized at open and never subsequently modified.
+ */
+#define MP_CAN_MMAP 0x001 /* If the file can be mmap'd. */
+#define MP_DATABASE_LOCKING 0x002 /* Lock in exclusive mode. */
+#define MP_DIRECT 0x004 /* No OS buffering. */
+#define MP_DURABLE_UNKNOWN 0x008 /* We don't care about durability. */
+#define MP_EXTENT 0x010 /* Extent file. */
+#define MP_FAKE_DEADFILE 0x020 /* Deadfile field: fake flag. */
+#define MP_FAKE_FILEWRITTEN 0x040 /* File_written field: fake flag. */
+#define MP_FAKE_NB 0x080 /* No_backing_file field: fake flag. */
+#define MP_FAKE_UOC 0x100 /* Unlink_on_close field: fake flag. */
+#define MP_NOT_DURABLE 0x200 /* File is not durable. */
+#define MP_TEMP 0x400 /* Backing file is a temporary. */
+ u_int32_t flags;
+
+ db_pgno_t fe_watermark; /* File extension watermark. */
+ u_int32_t fe_txnid; /* Transaction that set watermark. */
+ u_int32_t fe_nlws; /* Number of log writes suppressed. */
+};
+
+/*
+ * Flags to __memp_bh_free.
+ */
+#define BH_FREE_FREEMEM 0x01
+#define BH_FREE_REUSE 0x02
+#define BH_FREE_UNLOCKED 0x04
+
+/*
+ * BH --
+ * Buffer header.
+ */
+struct __bh { /* SHARED */
+	db_mutex_t mtx_buf; /* Shared/exclusive mutex. */
+ db_atomic_t ref; /* Reference count. */
+#define BH_REFCOUNT(bhp) atomic_read(&(bhp)->ref)
+
+#define BH_CALLPGIN 0x001 /* Convert the page before use. */
+#define BH_DIRTY 0x002 /* Page is modified. */
+#define BH_DIRTY_CREATE 0x004 /* Page was created as dirty. */
+#define BH_DISCARD 0x008 /* Page is useless. */
+#define BH_EXCLUSIVE 0x010 /* Exclusive access acquired. */
+#define BH_FREED 0x020 /* Page was freed. */
+#define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */
+#define BH_TRASH 0x080 /* Page is garbage. */
+#define BH_THAWED 0x100 /* Page was thawed. */
+ u_int16_t flags;
+
+ u_int32_t priority; /* Priority. */
+ SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
+
+ db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
+ roff_t mf_offset; /* Associated MPOOLFILE offset. */
+ u_int32_t bucket; /* Hash bucket containing header. */
+ int region; /* Region containing header. */
+
+ roff_t td_off; /* MVCC: creating TXN_DETAIL offset. */
+ SH_CHAIN_ENTRY vc; /* MVCC: version chain. */
+#ifdef DIAG_MVCC
+ u_int16_t align_off; /* Alignment offset for diagnostics.*/
+#endif
+
+ /*
+ * !!!
+ * This array must be at least size_t aligned -- the DB access methods
+ * put PAGE and other structures into it, and then access them directly.
+ * (We guarantee size_t alignment to applications in the documentation,
+ * too.)
+ */
+ DB_ALIGN8 u_int8_t buf[1]; /* Variable length data. */
+};
+
+/*
+ * BH_FROZEN_PAGE --
+ * Data used to find a frozen buffer header.
+ */
+struct __bh_frozen_p {
+ BH header;
+ db_pgno_t spgno; /* Page number in freezer file. */
+};
+
+/*
+ * BH_FROZEN_ALLOC --
+ * Frozen buffer headers are allocated a page at a time in general. This
+ * structure is allocated at the beginning of the page so that the
+ * allocation chunks can be tracked and freed (for private environments).
+ */
+struct __bh_frozen_a {
+ SH_TAILQ_ENTRY links;
+};
+
+#define MULTIVERSION(dbp) atomic_read(&(dbp)->mpf->mfp->multiversion)
+
+#define PAGE_TO_BH(p) (BH *)((u_int8_t *)(p) - SSZA(BH, buf))
+#define IS_DIRTY(p) \
+ (F_ISSET(PAGE_TO_BH(p), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE))
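+
+/*
+ * E.g., given the page pointer an access method got back from
+ * __memp_fget, the buffer header sits immediately before the page
+ * data (a sketch):
+ *
+ *	BH *bhp;
+ *	bhp = PAGE_TO_BH(pagep);	(pagep == bhp->buf)
+ *	if (F_ISSET(bhp, BH_DIRTY))
+ *		(the page has been modified in cache)
+ */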
+
+#define BH_OWNER(env, bhp) \
+ ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off))
+
+#define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \
+ (bhp)->td_off != INVALID_ROFF && \
+ (txn)->td == BH_OWNER(env, bhp))
+
+#define VISIBLE_LSN(env, bhp) \
+ (&BH_OWNER(env, bhp)->visible_lsn)
+
+/*
+ * Make a copy of the buffer's visible LSN, one field at a time. We rely on the
+ * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is
+ * set during commit or abort to the current LSN.
+ *
+ * If we race with a commit / abort, we may see either the file or the offset
+ * still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK,
+ * since we had to take the log region lock to allocate the read LSN so we were
+ * never going to see this buffer anyway.
+ */
+#define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \
+ (bhp->td_off == INVALID_ROFF || \
+ ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \
+ (vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \
+ LOG_COMPARE((read_lsnp), &(vlsn)) >= 0))
+
+#define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? \
+ BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\
+ BH_VISIBLE(env, bhp, &(old_lsn), vlsn))
+
+#define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \
+ F_ISSET(dbc->txn, TXN_SNAPSHOT) && MULTIVERSION(dbc->dbp) && \
+ dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))
+
+#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
+#define VM_PAGESIZE 4096
+#define MVCC_BHSIZE(mfp, sz) do { \
+ sz += VM_PAGESIZE + sizeof(BH); \
+ if (mfp->pagesize < VM_PAGESIZE) \
+ sz += VM_PAGESIZE - mfp->pagesize; \
+} while (0)
+
+#define MVCC_BHALIGN(p) do { \
+ BH *__bhp; \
+ void *__orig = (p); \
+ p = ALIGNP_INC(p, VM_PAGESIZE); \
+ if ((u_int8_t *)p < (u_int8_t *)__orig + sizeof(BH)) \
+ p = (u_int8_t *)p + VM_PAGESIZE; \
+ __bhp = (BH *)((u_int8_t *)p - SSZA(BH, buf)); \
+ DB_ASSERT(env, \
+ ((uintptr_t)__bhp->buf & (VM_PAGESIZE - 1)) == 0); \
+ DB_ASSERT(env, \
+ (u_int8_t *)__bhp >= (u_int8_t *)__orig); \
+ DB_ASSERT(env, (u_int8_t *)p + mfp->pagesize < \
+ (u_int8_t *)__orig + len); \
+ __bhp->align_off = \
+ (u_int16_t)((u_int8_t *)__bhp - (u_int8_t *)__orig); \
+ p = __bhp; \
+} while (0)
+
+#define MVCC_BHUNALIGN(bhp) do { \
+ (bhp) = (BH *)((u_int8_t *)(bhp) - (bhp)->align_off); \
+} while (0)
+
+#ifdef linux
+#define MVCC_MPROTECT(buf, sz, mode) do { \
+ int __ret = mprotect((buf), (sz), (mode)); \
+ DB_ASSERT(env, __ret == 0); \
+} while (0)
+#else
+#define MVCC_MPROTECT(buf, sz, mode) do { \
+ if (!F_ISSET(env, ENV_PRIVATE | ENV_SYSTEM_MEM)) { \
+ int __ret = mprotect((buf), (sz), (mode)); \
+ DB_ASSERT(env, __ret == 0); \
+ } \
+} while (0)
+#endif /* linux */
+
+#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
+#define MVCC_BHSIZE(mfp, sz) do {} while (0)
+#define MVCC_BHALIGN(p) do {} while (0)
+#define MVCC_BHUNALIGN(bhp) do {} while (0)
+#define MVCC_MPROTECT(buf, size, mode) do {} while (0)
+#endif
+
+/*
+ * Flags to __memp_ftruncate.
+ */
+#define MP_TRUNC_NOCACHE 0x01
+#define MP_TRUNC_RECOVER 0x02
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/mp_ext.h"
+#endif /* !_DB_MP_H_ */
diff --git a/src/dbinc/mutex.h b/src/dbinc/mutex.h
new file mode 100644
index 00000000..b699142c
--- /dev/null
+++ b/src/dbinc/mutex.h
@@ -0,0 +1,305 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MUTEX_H_
+#define _DB_MUTEX_H_
+
+#ifdef HAVE_MUTEX_SUPPORT
+/* The inlined trylock calls need access to the details of mutexes. */
+#define LOAD_ACTUAL_MUTEX_CODE
+#include "dbinc/mutex_int.h"
+
+#ifndef HAVE_SHARED_LATCHES
+ #error "Shared latches are required in DB 4.8 and above"
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * By default, spin 50 times per processor when failing to acquire a
+ * test-and-set mutex; we have anecdotal evidence that this is a
+ * reasonable value.
+ */
+#define MUTEX_SPINS_PER_PROCESSOR 50
+
+/*
+ * Mutexes are represented by unsigned, 32-bit integral values. As the
+ * OOB value is 0, mutexes can be initialized by zero-ing out the memory
+ * in which they reside.
+ */
+#define MUTEX_INVALID 0
+
+/*
+ * We track mutex allocations by ID.
+ */
+#define MTX_APPLICATION 1
+#define MTX_ATOMIC_EMULATION 2
+#define MTX_DB_HANDLE 3
+#define MTX_ENV_DBLIST 4
+#define MTX_ENV_EXCLDBLIST 5
+#define MTX_ENV_HANDLE 6
+#define MTX_ENV_REGION 7
+#define MTX_LOCK_REGION 8
+#define MTX_LOGICAL_LOCK 9
+#define MTX_LOG_FILENAME 10
+#define MTX_LOG_FLUSH 11
+#define MTX_LOG_HANDLE 12
+#define MTX_LOG_REGION 13
+#define MTX_MPOOLFILE_HANDLE 14
+#define MTX_MPOOL_BH 15
+#define MTX_MPOOL_FH 16
+#define MTX_MPOOL_FILE_BUCKET 17
+#define MTX_MPOOL_HANDLE 18
+#define MTX_MPOOL_HASH_BUCKET 19
+#define MTX_MPOOL_REGION 20
+#define MTX_MUTEX_REGION 21
+#define MTX_MUTEX_TEST 22
+#define MTX_REP_CHKPT 23
+#define MTX_REP_DATABASE 24
+#define MTX_REP_DIAG 25
+#define MTX_REP_EVENT 26
+#define MTX_REP_REGION 27
+#define MTX_REP_START 28
+#define MTX_REP_WAITER 29
+#define MTX_REPMGR 30
+#define MTX_SEQUENCE 31
+#define MTX_TWISTER 32
+#define MTX_TCL_EVENTS 33
+#define MTX_TXN_ACTIVE 34
+#define MTX_TXN_CHKPT 35
+#define MTX_TXN_COMMIT 36
+#define MTX_TXN_MVCC 37
+#define MTX_TXN_REGION 38
+
+#define MTX_MAX_ENTRY 38
+
+/* The following macros are defined on some platforms, e.g. QNX. */
+#undef __mutex_init
+#undef __mutex_lock
+#undef __mutex_timedlock
+#undef __mutex_unlock
+#undef __mutex_destroy
+#undef __mutex_trylock
+
+/* Redirect mutex calls to the correct functions. */
+#if !defined(HAVE_MUTEX_HYBRID) && ( \
+ defined(HAVE_MUTEX_PTHREADS) || \
+ defined(HAVE_MUTEX_SOLARIS_LWP) || \
+ defined(HAVE_MUTEX_UI_THREADS))
+#define __mutex_init(a, b, c) __db_pthread_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_pthread_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_pthread_mutex_lock(a, b, c)
+#define __mutex_unlock(a, b) __db_pthread_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_pthread_mutex_destroy(a, b)
+#define __mutex_trylock(a, b) __db_pthread_mutex_trylock(a, b)
+/*
+ * These trylock versions do not support DB_ENV_FAILCHK. Callers that
+ * loop checking mutexes held by dead processes or threads might spin.
+ * These have ANSI-style definitions because this file can be included by
+ * C++ files, and extern "C" affects linkage only, not argument typing.
+ */
+static inline int __db_pthread_mutex_trylock(ENV *env, db_mutex_t mutex)
+{
+ int ret;
+ DB_MUTEX *mutexp;
+ if (!MUTEX_ON(env) || F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+ mutexp = MUTEXP_SET(env, mutex);
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ ret = pthread_rwlock_trywrlock(&mutexp->u.rwlock);
+ else
+#endif
+ ret = pthread_mutex_trylock(&mutexp->u.m.mutex);
+ if (ret == EBUSY)
+ ret = DB_LOCK_NOTGRANTED;
+ else if (ret == 0) {
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ env->dbenv->thread_id(env->dbenv, &mutexp->pid, &mutexp->tid);
+ STAT_INC(env,
+ mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+ }
+ return (ret);
+}
+#ifdef HAVE_SHARED_LATCHES
+#define __mutex_rdlock(a, b) __db_pthread_mutex_readlock(a, b)
+#define __mutex_tryrdlock(a, b) __db_pthread_mutex_tryreadlock(a, b)
+static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
+{
+ int ret;
+ DB_MUTEX *mutexp;
+ if (!MUTEX_ON(env) || F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+ mutexp = MUTEXP_SET(env, mutex);
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ ret = pthread_rwlock_tryrdlock(&mutexp->u.rwlock);
+ else
+ return (EINVAL);
+ if (ret == EBUSY)
+ ret = DB_LOCK_NOTGRANTED;
+#ifdef HAVE_STATISTICS
+ if (ret == 0)
+ STAT_INC(env,
+ mutex, set_rd_nowait, mutexp->mutex_set_nowait, mutex);
+#endif
+ return (ret);
+}
+#endif
+#elif defined(HAVE_MUTEX_WIN32) || defined(HAVE_MUTEX_WIN32_GCC)
+#define __mutex_init(a, b, c) __db_win32_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_win32_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_win32_mutex_lock(a, b, c)
+#define __mutex_trylock(a, b) __db_win32_mutex_trylock(a, b)
+#define __mutex_unlock(a, b) __db_win32_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_win32_mutex_destroy(a, b)
+#ifdef HAVE_SHARED_LATCHES
+#define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b)
+#define __mutex_tryrdlock(a, b) __db_win32_mutex_tryreadlock(a, b)
+#endif
+#elif defined(HAVE_MUTEX_FCNTL)
+#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_fcntl_lock(a, b, c)
+#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b)
+#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b)
+#else
+#define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_tas_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_tas_mutex_lock(a, b, c)
+#define __mutex_trylock(a, b) __db_tas_mutex_trylock(a, b)
+#define __mutex_unlock(a, b) __db_tas_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_tas_mutex_destroy(a, b)
+#if defined(HAVE_SHARED_LATCHES)
+#define __mutex_rdlock(a, b) __db_tas_mutex_readlock(a, b)
+#define __mutex_tryrdlock(a,b) __db_tas_mutex_tryreadlock(a, b)
+#endif
+#endif
+
+/*
+ * When there is no method to get a shared latch, fall back to
+ * implementing __mutex_rdlock() as getting an exclusive one.
+ * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL.
+ */
+#ifndef __mutex_rdlock
+#define __mutex_rdlock(a, b) __mutex_lock(a, b)
+#endif
+#ifndef __mutex_tryrdlock
+#define __mutex_tryrdlock(a, b) __mutex_trylock(a, b)
+#endif
+
+/*
+ * Lock/unlock a mutex. If the mutex was never required, the thread of
+ * control can proceed without it.
+ *
+ * We never fail to acquire or release a mutex without panicking. Simplify
+ * the macros to always return a panic value rather than saving the actual
+ * return value of the mutex routine.
+ */
+#ifdef HAVE_MUTEX_SUPPORT
+#define MUTEX_LOCK(env, mutex) do { \
+ if ((mutex) != MUTEX_INVALID && \
+ __mutex_lock(env, mutex) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+
+/*
+ * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success,
+ * or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk.
+ */
+#define MUTEX_TRYLOCK(env, mutex) \
+ (((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex))
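+
+/*
+ * For example (a sketch of the required error handling):
+ *
+ *	if ((ret = MUTEX_TRYLOCK(env, mutex)) != 0) {
+ *		if (ret != DB_LOCK_NOTGRANTED)
+ *			return (ret);	(e.g., DB_RUNRECOVERY)
+ *		(the mutex is busy; take the slow path)
+ *	}
+ */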
+
+/*
+ * Acquire a DB_MUTEX_SHARED "mutex" in shared mode.
+ */
+#define MUTEX_READLOCK(env, mutex) do { \
+ if ((mutex) != MUTEX_INVALID && \
+ __mutex_rdlock(env, mutex) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+#define MUTEX_TRY_READLOCK(env, mutex) \
+ ((mutex) != MUTEX_INVALID ? __mutex_tryrdlock(env, mutex) : 0)
+
+#define MUTEX_UNLOCK(env, mutex) do { \
+ if ((mutex) != MUTEX_INVALID && \
+ __mutex_unlock(env, mutex) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+
+#define MUTEX_WAIT(env, mutex, duration) do { \
+ int __ret; \
+ if ((mutex) != MUTEX_INVALID && \
+ (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \
+ __ret != DB_TIMEOUT) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+#else
+/*
+ * There are calls to lock/unlock mutexes outside of #ifdef's -- replace
+ * the call with something the compiler can discard, but which will make
+ * if-then-else blocks work correctly.
+ */
+#define MUTEX_LOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex)
+#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex)
+#define MUTEX_WAIT(env, mutex, duration) (mutex) = (mutex)
+#endif
+
+/*
+ * Berkeley DB ports may require single-threading at places in the code.
+ */
+#ifdef HAVE_MUTEX_VXWORKS
+#include "taskLib.h"
+/*
+ * Use the taskLock() mutex to eliminate a race where two tasks are
+ * trying to initialize the global lock at the same time.
+ */
+#define DB_BEGIN_SINGLE_THREAD do { \
+ if (DB_GLOBAL(db_global_init)) \
+ (void)semTake(DB_GLOBAL(db_global_lock), WAIT_FOREVER); \
+ else { \
+ taskLock(); \
+ if (DB_GLOBAL(db_global_init)) { \
+ taskUnlock(); \
+ (void)semTake(DB_GLOBAL(db_global_lock), \
+ WAIT_FOREVER); \
+ continue; \
+ } \
+ DB_GLOBAL(db_global_lock) = \
+ semBCreate(SEM_Q_FIFO, SEM_EMPTY); \
+ if (DB_GLOBAL(db_global_lock) != NULL) \
+ DB_GLOBAL(db_global_init) = 1; \
+ taskUnlock(); \
+ } \
+} while (DB_GLOBAL(db_global_init) == 0)
+#define DB_END_SINGLE_THREAD (void)semGive(DB_GLOBAL(db_global_lock))
+#endif
+
+/*
+ * Single-threading defaults to a no-op.
+ */
+#ifndef DB_BEGIN_SINGLE_THREAD
+#define DB_BEGIN_SINGLE_THREAD
+#endif
+#ifndef DB_END_SINGLE_THREAD
+#define DB_END_SINGLE_THREAD
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/mutex_ext.h"
+#endif /* !_DB_MUTEX_H_ */
diff --git a/src/dbinc/mutex_int.h b/src/dbinc/mutex_int.h
new file mode 100644
index 00000000..b9bccdf7
--- /dev/null
+++ b/src/dbinc/mutex_int.h
@@ -0,0 +1,1070 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MUTEX_INT_H_
+#define _DB_MUTEX_INT_H_
+
+#include "dbinc/atomic.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Mutexes and Shared Latches
+ *
+ * Mutexes may be test-and-set (spinning & yielding when busy),
+ * native versions (pthreads, WaitForSingleObject)
+ * or a hybrid which has the lower no-contention overhead of test-and-set
+ * mutexes, using operating system calls only to block and wake up.
+ *
+ * Hybrid exclusive-only mutexes include a 'tas' field.
+ * Hybrid DB_MUTEX_SHARED latches also include a 'shared' field.
+ */
+
+/*********************************************************************
+ * POSIX.1 pthreads interface.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_PTHREADS)
+/*
+ * Pthreads-based mutexes (exclusive-only) and latches (possibly shared)
+ * have the same MUTEX_FIELDS union. Different parts of the union are used
+ * depending on:
+ * - whether HAVE_SHARED_LATCHES is defined, and
+ * - if HAVE_SHARED_LATCHES, whether this particular instance of a mutex
+ *   is a shared latch (DB_MUTEX_SHARED).
+ *
+ * The rwlock part of the union is used *only* for non-hybrid shared latches;
+ * in all other cases the mutex and cond fields are the only ones used.
+ *
+ *	configuration			which fields of the union are used
+ *					mutex	cond	rwlock	tas
+ *	Native mutexes			  y	  y
+ *	Hybrid mutexes			  y	  y		 y
+ *	Native shared latches				  y
+ *	Hybrid shared latches		  y	  y		 y
+ *
+ * They all have a condition variable which is used only for
+ * DB_MUTEX_SELF_BLOCK waits.
+ *
+ * There can be no self-blocking shared latches: the pthread_cond_wait()
+ * would require getting a pthread_mutex_t, and it would not make sense
+ * anyway.
+ */
+#define MUTEX_FIELDS \
+ union { \
+ struct { \
+ pthread_mutex_t mutex; /* Mutex */ \
+ pthread_cond_t cond; /* Condition variable */ \
+ } m; \
+ pthread_rwlock_t rwlock; /* Read/write lock */ \
+ } u;
+
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+#define RET_SET_PTHREAD_LOCK(mutexp, ret) do { \
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+ RET_SET((pthread_rwlock_wrlock(&(mutexp)->u.rwlock)), \
+ ret); \
+ else \
+ RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \
+} while (0)
+#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+ RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \
+ ret); \
+ else \
+ RET_SET((pthread_mutex_trylock(&(mutexp)->u.m.mutex)), \
+ ret); \
+} while (0)
+#else
+#define RET_SET_PTHREAD_LOCK(mutexp, ret) \
+ RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret);
+#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \
+ RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret);
+#endif
+#endif
+
+#ifdef HAVE_MUTEX_UI_THREADS
+#include <thread.h>
+#endif
+
+/*********************************************************************
+ * Solaris lwp threads interface.
+ *
+ * !!!
+ * We use LWP mutexes on Solaris instead of UI or POSIX mutexes (both of
+ * which are available), for two reasons. First, the Solaris C library
+ * includes versions of both the UI and POSIX thread mutex interfaces, but
+ * they are broken in that they don't support inter-process locking, and
+ * there's no way to detect it, e.g., calls to configure the mutexes for
+ * inter-process locking succeed without error. So, we use LWP mutexes so
+ * that we don't fail in fairly undetectable ways because the application
+ * wasn't linked with the appropriate threads library. Second, there were
+ * bugs in SunOS 5.7 (Solaris 7) where if an application loaded the C library
+ * before loading the libthread/libpthread threads libraries (e.g., by using
+ * dlopen to load the DB library), the pwrite64 interface would be translated
+ * into a call to pwrite and DB would drop core.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+/*
+ * XXX
+ * Don't change <synch.h> to <sys/lwp.h> -- although lwp.h is listed in the
+ * Solaris manual page as the correct include to use, it causes the Solaris
+ * compiler on SunOS 2.6 to fail.
+ */
+#include <synch.h>
+
+#define MUTEX_FIELDS \
+ lwp_mutex_t mutex; /* Mutex. */ \
+ lwp_cond_t cond; /* Condition variable. */
+#endif
+
+/*********************************************************************
+ * Solaris/Unixware threads interface.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_UI_THREADS
+#include <thread.h>
+#include <synch.h>
+
+#define MUTEX_FIELDS \
+ mutex_t mutex; /* Mutex. */ \
+ cond_t cond; /* Condition variable. */
+#endif
+
+/*********************************************************************
+ * AIX C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_AIX_CHECK_LOCK
+#include <sys/atomic_op.h>
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) (!_check_lock(x, 0, 1))
+#define MUTEX_UNSET(x) _clear_lock(x, 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Apple/Darwin library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_DARWIN_SPIN_LOCK_TRY
+typedef u_int32_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+extern int _spin_lock_try(tsl_t *);
+extern void _spin_unlock(tsl_t *);
+#define MUTEX_SET(tsl) _spin_lock_try(tsl)
+#define MUTEX_UNSET(tsl) _spin_unlock(tsl)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * General C library functions (msemaphore).
+ *
+ * !!!
+ * Check for HPPA as a special case, because it requires unusual alignment,
+ * and doesn't support semaphores in malloc(3) or shmget(2) memory.
+ *
+ * !!!
+ * Do not remove the MSEM_IF_NOWAIT flag. The problem is that if a single
+ * process makes two msem_lock() calls in a row, the second one returns an
+ * error. We depend on the fact that we can lock against ourselves in the
+ * locking subsystem, where we set up a mutex so that we can block ourselves.
+ * Tested on OSF1 v4.0.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_HPPA_MSEM_INIT
+#define MUTEX_ALIGN 16
+#endif
+
+#if defined(HAVE_MUTEX_MSEM_INIT) || defined(HAVE_MUTEX_HPPA_MSEM_INIT)
+#include <sys/mman.h>
+typedef msemaphore tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) (msem_init(x, MSEM_UNLOCKED) <= (msemaphore *)0)
+#define MUTEX_SET(x) (!msem_lock(x, MSEM_IF_NOWAIT))
+#define MUTEX_UNSET(x) msem_unlock(x, 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Plan 9 library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_PLAN9
+typedef Lock tsl_t;
+
+#define MUTEX_INIT(x) (memset(x, 0, sizeof(Lock)), 0)
+#define MUTEX_SET(x) canlock(x)
+#define MUTEX_UNSET(x) unlock(x)
+#endif
+
+/*********************************************************************
+ * Reliant UNIX C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_RELIANTUNIX_INITSPIN
+#include <ulocks.h>
+typedef spinlock_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) (initspin(x, 1), 0)
+#define MUTEX_SET(x) (cspinlock(x) == 0)
+#define MUTEX_UNSET(x) spinunlock(x)
+#endif
+#endif
+
+/*********************************************************************
+ * General C library functions (POSIX 1003.1 sema_XXX).
+ *
+ * !!!
+ * Never selected by autoconfig in this release (semaphore calls are known
+ * to not work in Solaris 5.5).
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SEMA_INIT
+#include <synch.h>
+typedef sema_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_DESTROY(x) sema_destroy(x)
+#define MUTEX_INIT(x) (sema_init(x, 1, USYNC_PROCESS, NULL) != 0)
+#define MUTEX_SET(x) (sema_wait(x) == 0)
+#define MUTEX_UNSET(x) sema_post(x)
+#endif
+#endif
+
+/*********************************************************************
+ * SGI C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SGI_INIT_LOCK
+#include <abi_mutex.h>
+typedef abilock_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) (init_lock(x) != 0)
+#define MUTEX_SET(x) (!acquire_lock(x))
+#define MUTEX_UNSET(x) release_lock(x)
+#endif
+#endif
+
+/*********************************************************************
+ * Solaris C library functions.
+ *
+ * !!!
+ * These are undocumented functions, but they're the only ones that work
+ * correctly as far as we know.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SOLARIS_LOCK_TRY
+#include <sys/atomic.h>
+#define MUTEX_MEMBAR(x) membar_enter()
+#define MEMBAR_ENTER() membar_enter()
+#define MEMBAR_EXIT() membar_exit()
+#include <sys/machlock.h>
+typedef lock_t tsl_t;
+
+/*
+ * The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL.
+ * Re-declare them here to avoid warnings.
+ */
+extern int _lock_try(lock_t *);
+extern void _lock_clear(lock_t *);
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) _lock_try(x)
+#define MUTEX_UNSET(x) _lock_clear(x)
+#endif
+#endif
+
+/*********************************************************************
+ * VMS.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_VMS
+#include <sys/mman.h>
+#include <builtins.h>
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#ifdef __ALPHA
+#define MUTEX_SET(tsl) (!__TESTBITSSI(tsl, 0))
+#else /* __VAX */
+#define MUTEX_SET(tsl) (!(int)_BBSSI(0, tsl))
+#endif
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * VxWorks
+ * Use basic binary semaphores in VxWorks, as we currently do not need
+ * any special features. We do need the ability to single-thread the
+ * entire system, however, because VxWorks doesn't support the open(2)
+ * flag O_EXCL, the mechanism we normally use to single thread access
+ * when we're first looking for a DB environment.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_VXWORKS
+#include "taskLib.h"
+typedef SEM_ID tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * Uses of this MUTEX_SET() need to have a local 'nowait' variable,
+ * which determines whether to return right away when the semaphore
+ * is busy or to wait until it is available.
+ */
+#define MUTEX_SET(tsl) \
+ (semTake((*(tsl)), nowait ? NO_WAIT : WAIT_FOREVER) == OK)
+#define MUTEX_UNSET(tsl) (semGive((*tsl)))
+#define MUTEX_INIT(tsl) \
+ ((*(tsl) = semBCreate(SEM_Q_FIFO, SEM_FULL)) == NULL)
+#define MUTEX_DESTROY(tsl) semDelete(*tsl)
+#endif
+#endif
+
+/*********************************************************************
+ * Win16
+ *
+ * Win16 spinlocks are simple because we cannot possibly be preempted.
+ *
+ * !!!
+ * We should simplify this by always returning a no-need-to-lock lock
+ * when we initialize the mutex.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_WIN16
+typedef unsigned int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(tsl) (*(tsl) = 1)
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Win32 - always a hybrid mutex
+ *********************************************************************/
+#if defined(HAVE_MUTEX_WIN32) || defined(HAVE_MUTEX_WIN32_GCC)
+typedef LONG volatile tsl_t;
+#define MUTEX_FIELDS \
+ LONG nwaiters; \
+ u_int32_t id; /* ID used for creating events */ \
+
+#if defined(LOAD_ACTUAL_MUTEX_CODE)
+#define MUTEX_SET(tsl) (!InterlockedExchange((PLONG)tsl, 1))
+#define MUTEX_UNSET(tsl) InterlockedExchange((PLONG)tsl, 0)
+#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl)
+
+/*
+ * From Intel's performance tuning documentation (and see SR #6975):
+ * ftp://download.intel.com/design/perftool/cbts/appnotes/sse2/w_spinlock.pdf
+ *
+ * "For this reason, it is highly recommended that you insert the PAUSE
+ * instruction into all spin-wait code immediately. Using the PAUSE
+ * instruction does not affect the correctness of programs on existing
+ * platforms, and it improves performance on Pentium 4 processor platforms."
+ */
+#ifdef HAVE_MUTEX_WIN32
+#if !defined(_WIN64) && !defined(DB_WINCE)
+#define MUTEX_PAUSE {__asm{_emit 0xf3}; __asm{_emit 0x90}}
+#endif
+#endif
+#ifdef HAVE_MUTEX_WIN32_GCC
+#define MUTEX_PAUSE __asm__ volatile ("rep; nop" : : );
+#endif
+#endif
+#endif
+
+/*********************************************************************
+ * 68K/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_68K_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/68K: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ int __r; \
+ __asm__ volatile("tas %1; \n \
+ seq %0" \
+ : "=dm" (__r), "=m" (*__l) \
+ : "1" (*__l) \
+ ); \
+ __r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * ALPHA/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_ALPHA_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * For gcc/alpha. Returns 0 if the lock could not be acquired, 1 if it
+ * was acquired successfully.
+ */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+ register tsl_t *__l = tsl;
+ register tsl_t __r;
+ __asm__ volatile(
+ "1: ldl_l %0,%2\n"
+ " blbs %0,2f\n"
+ " or $31,1,%0\n"
+ " stl_c %0,%1\n"
+ " beq %0,3f\n"
+ " mb\n"
+ " br 3f\n"
+ "2: xor %0,%0\n"
+ "3:"
+ : "=&r"(__r), "=m"(*__l) : "1"(*__l) : "memory");
+ return __r;
+}
+
+/*
+ * Unset the mutex. Judging by the Alpha Architecture Handbook, the mb
+ * instruction may be necessary before unlocking.
+ */
+static inline int
+MUTEX_UNSET(tsl_t *tsl) {
+ __asm__ volatile(" mb\n");
+ return *tsl = 0;
+}
+
+#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl)
+#endif
+#endif
+
+/*********************************************************************
+ * Tru64/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_TRU64_CC_ASSEMBLY
+typedef volatile u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#include <alpha/builtins.h>
+#define MUTEX_SET(tsl) (__LOCK_LONG_RETRY((tsl), 1) != 0)
+#define MUTEX_UNSET(tsl) (__UNLOCK_LONG(tsl))
+
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * ARM/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_ARM_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/arm: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ int __r; \
+ __asm__ volatile( \
+ "swpb %0, %1, [%2]\n\t" \
+ "eor %0, %0, #1\n\t" \
+ : "=&r" (__r) \
+ : "r" (1), "r" (tsl) \
+ ); \
+ __r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(volatile tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * HPPA/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_HPPA_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 16
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The PA-RISC has a "load and clear" instead of a "test and set" instruction.
+ * The 32-bit word used by that instruction must be 16-byte aligned. We could
+ * use the "aligned" attribute in GCC but that doesn't work for stack variables.
+ */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ int __r; \
+ __asm__ volatile("ldcws 0(%1),%0" : "=r" (__r) : "r" (__l)); \
+ __r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(volatile tsl_t *)(tsl) = -1)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * IA64/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_IA64_GCC_ASSEMBLY
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/ia64: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ long __r; \
+ __asm__ volatile("xchg1 %0=%1,%2" : \
+ "=r"(__r), "+m"(*__l) : "r"(1)); \
+ __r ^ 1; \
+})
+
+/*
+ * Store through a "volatile" pointer so we get a store with "release"
+ * semantics.
+ */
+#define MUTEX_UNSET(tsl) (*(tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * PowerPC/gcc assembly.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_PPC_GCC_ASSEMBLY)
+typedef u_int32_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The PowerPC does a sort of pseudo-atomic locking. You set up a
+ * 'reservation' on a chunk of memory containing a mutex by loading the
+ * mutex value with LWARX. If the mutex has an 'unlocked' (arbitrary)
+ * value, you then try storing into it with STWCX. If no other process or
+ * thread broke your 'reservation' by modifying the memory containing the
+ * mutex, then the STWCX succeeds; otherwise it fails and you try to get
+ * a reservation again.
+ *
+ * While mutexes are explicitly 4 bytes, a 'reservation' applies to an
+ * entire cache line, normally 32 bytes, aligned naturally. If the mutex
+ * lives near data that gets changed a lot, there's a chance that you'll
+ * see more broken reservations than you might otherwise. The only
+ * situation in which this might be a problem is if one processor is
+ * beating on a variable in the same cache block as the mutex while another
+ * processor tries to acquire the mutex. That's bad news regardless
+ * because of the way it bashes caches, but if you can't guarantee that a
+ * mutex will reside in a relatively quiescent cache line, you might
+ * consider padding the mutex to force it to live in a cache line by
+ * itself. No, you aren't guaranteed that cache lines are 32 bytes. Some
+ * embedded processors use 16-byte cache lines, while some 64-bit
+ * processors use 128-byte cache lines. But assuming a 32-byte cache line
+ * won't get you into trouble for now.
+ *
+ * If mutex locking is a bottleneck, then you can speed it up by adding a
+ * regular LWZ load before the LWARX load, so that you can test for the
+ * common case of a locked mutex without wasting cycles making a reservation.
+ *
+ * gcc/ppc: 0 is clear, 1 is set.
+ */
+static inline int
+MUTEX_SET(int *tsl) {
+ int __r;
+ __asm__ volatile (
+"0: \n\t"
+" lwarx %0,0,%1 \n\t"
+" cmpwi %0,0 \n\t"
+" bne- 1f \n\t"
+" stwcx. %1,0,%1 \n\t"
+" isync \n\t"
+" beq+ 2f \n\t"
+" b 0b \n\t"
+"1: \n\t"
+" li %1,0 \n\t"
+"2: \n\t"
+ : "=&r" (__r), "+r" (tsl)
+ :
+ : "cr0", "memory");
+ return (int)tsl;
+}
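+
+/*
+ * Note on the return value above: on success the asm leaves 'tsl' holding
+ * the (non-zero) mutex address, while the failure path clears it to 0, so
+ * the cast produces the expected non-zero/zero result.
+ */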
+
+static inline int
+MUTEX_UNSET(tsl_t *tsl) {
+ __asm__ volatile("sync" : : : "memory");
+ return *tsl = 0;
+}
+#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl)
+#endif
+#endif
+
+/*********************************************************************
+ * OS/390 C.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_S390_CC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * cs() is declared in <stdlib.h> but is built in to the compiler.
+ * Must use LANGLVL(EXTENDED) to get its declaration.
+ */
+#define MUTEX_SET(tsl) (!cs(&zero, (tsl), 1))
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * S/390 32-bit assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_S390_GCC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/S390: 0 is clear, 1 is set. */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+	register tsl_t *__l = (tsl);
+	int __r;
+	__asm__ volatile(
+	    "    la    1,%1\n"
+	    "    lhi   0,1\n"
+	    "    l     %0,%1\n"
+	    "0:  cs    %0,0,0(1)\n"
+	    "    jl    0b"
+	    : "=&d" (__r), "+m" (*__l)
+	    : : "0", "1", "cc");
+	return !__r;
+}
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * SCO/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SCO_X86_CC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * UnixWare has threads in libthread, but OpenServer doesn't (yet).
+ *
+ * cc/x86: 0 is clear, 1 is set.
+ */
+#if defined(__USLC__)
+asm int
+_tsl_set(void *tsl)
+{
+%mem tsl
+ movl tsl, %ecx
+ movl $1, %eax
+ lock
+ xchgb (%ecx),%al
+ xorl $1,%eax
+}
+#endif
+
+#define MUTEX_SET(tsl) _tsl_set(tsl)
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Sparc/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SPARC_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#define MUTEX_ALIGN 8
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The ldstub instruction takes the location specified by its first argument
+ * (a register containing a memory address) and loads its contents into its
+ * second argument (a register) and atomically sets the contents of the
+ * specified by its first argument to a byte of 1s. (The value in the second
+ * argument is never read, but only overwritten.)
+ *
+ * Hybrid mutexes require membar #StoreLoad and #LoadStore ordering on multi-
+ * processor v9 systems.
+ *
+ * gcc/sparc: 0 is clear, 1 is set.
+ */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ register tsl_t __r; \
+ __asm__ volatile \
+ ("ldstub [%1],%0; stbar" \
+ : "=r"( __r) : "r" (__l)); \
+ !__r; \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0, MUTEX_MEMBAR(tsl))
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#define MUTEX_MEMBAR(x) \
+ ({ __asm__ volatile ("membar #StoreStore|#StoreLoad|#LoadStore"); })
+#define MEMBAR_ENTER() \
+ ({ __asm__ volatile ("membar #StoreStore|#StoreLoad"); })
+#define MEMBAR_EXIT() \
+ ({ __asm__ volatile ("membar #StoreStore|#LoadStore"); })
+#endif
+#endif
+
+/*********************************************************************
+ * UTS/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_UTS_CC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) (!uts_lock(x, 1))
+#define MUTEX_UNSET(x) (*(x) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * MIPS/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_MIPS_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * For gcc/MIPS. Returns 0 if the lock could not be acquired, 1 if it
+ * was acquired successfully.
+ */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+ register tsl_t *__l = tsl;
+ register tsl_t __r, __t;
+ __asm__ volatile(
+ " .set push \n"
+ " .set mips2 \n"
+ " .set noreorder \n"
+ " .set nomacro \n"
+ "1: ll %0, %3 \n"
+ " ori %2, %0, 1 \n"
+ " sc %2, %1 \n"
+ " beqzl %2, 1b \n"
+ " nop \n"
+ " andi %2, %0, 1 \n"
+ " sync \n"
+ " .set reorder \n"
+ " .set pop \n"
+ : "=&r" (__t), "=m" (*tsl), "=&r" (__r)
+ : "m" (*tsl)
+ : "memory");
+ return (!__r);
+}
+
+static inline void
+MUTEX_UNSET(tsl_t *tsl) {
+ __asm__ volatile(
+ " .set noreorder \n"
+ " sync \n"
+ " sw $0, %0 \n"
+ " .set reorder \n"
+ : "=m" (*tsl)
+ : "m" (*tsl)
+ : "memory");
+}
+
+#define MUTEX_INIT(tsl) (*(tsl) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * x86/gcc (32- and 64-bit) assembly.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_X86_GCC_ASSEMBLY) || \
+ defined(HAVE_MUTEX_X86_64_GCC_ASSEMBLY)
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/x86: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ tsl_t __r; \
+ __asm__ volatile("movb $1, %b0\n\t" \
+ "xchgb %b0,%1" \
+ : "=&q" (__r) \
+ : "m" (*(tsl_t *)(tsl)) \
+ : "memory", "cc"); \
+ !__r; /* return 1 on success, 0 on failure */ \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+/*
+ * We need to pass a valid address to generate the memory barrier
+ * otherwise PURIFY will complain. Use something referenced recently
+ * and initialized.
+ */
+#if defined(HAVE_MUTEX_X86_GCC_ASSEMBLY)
+#define MUTEX_MEMBAR(addr) \
+ ({ __asm__ volatile ("lock; addl $0, %0" ::"m" (addr): "memory"); 1; })
+#else
+#define MUTEX_MEMBAR(addr) \
+ ({ __asm__ volatile ("mfence" ::: "memory"); 1; })
+#endif
+
+/*
+ * From Intel's performance tuning documentation (and see SR #6975):
+ * ftp://download.intel.com/design/perftool/cbts/appnotes/sse2/w_spinlock.pdf
+ *
+ * "For this reason, it is highly recommended that you insert the PAUSE
+ * instruction into all spin-wait code immediately. Using the PAUSE
+ * instruction does not affect the correctness of programs on existing
+ * platforms, and it improves performance on Pentium 4 processor platforms."
+ */
+#define MUTEX_PAUSE __asm__ volatile ("rep; nop" : : );
+#endif
+#endif
+
+/* End of operating system & hardware architecture-specific definitions */
+
+/*
+ * Mutex alignment defaults to sizeof(unsigned int).
+ *
+ * !!!
+ * Various systems require different alignments for mutexes (the worst we've
+ * seen so far is 16 bytes on some HP architectures). Malloc(3) is assumed
+ * to return reasonable alignment; all other mutex users must ensure proper
+ * alignment locally.
+ */
+#ifndef MUTEX_ALIGN
+#define MUTEX_ALIGN sizeof(unsigned int)
+#endif
+
+/*
+ * Mutex destruction defaults to a no-op.
+ */
+#ifndef MUTEX_DESTROY
+#define MUTEX_DESTROY(x)
+#endif
+
+/*
+ * Mutex pause defaults to a no-op.
+ */
+#ifndef MUTEX_PAUSE
+#define MUTEX_PAUSE
+#endif
+
+/*
+ * If no native atomic support is available then use mutexes to
+ * emulate atomic increment, decrement, and compare-and-exchange.
+ * The address of the atomic value selects which of a small number
+ * of mutexes to use to protect the updates.
+ * The number of mutexes should be somewhat larger than the number of
+ * processors in the system in order to minimize unnecessary contention.
+ * Unless it has already been defined (e.g., in db_config.h), it defaults
+ * to a single mutex, which serializes all emulated atomic operations.
+ */
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) && \
+ !defined(MAX_ATOMIC_MUTEXES)
+#define MAX_ATOMIC_MUTEXES 1
+#endif
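+
+/*
+ * Hypothetical selection sketch (names illustrative only): hash the
+ * address of the atomic value down to one of the emulation mutexes, e.g.
+ *
+ *	mutex = mtxregion->mtx_atomic[
+ *	    ((uintptr_t)v >> 2) % MAX_ATOMIC_MUTEXES];
+ *	MUTEX_LOCK(env, mutex);
+ *	(atomically update *v here)
+ *	MUTEX_UNLOCK(env, mutex);
+ */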
+
+/*
+ * DB_MUTEXMGR --
+ * The mutex manager encapsulates the mutex system.
+ */
+struct __db_mutexmgr {
+ /* These fields are never updated after creation, so not protected. */
+ DB_ENV *dbenv; /* Environment */
+ REGINFO reginfo; /* Region information */
+
+ void *mutex_array; /* Base of the mutex array */
+};
+
+/* Macros to lock/unlock the mutex region as a whole. */
+#define MUTEX_SYSTEM_LOCK(dbenv) \
+ MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \
+ (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
+#define MUTEX_SYSTEM_UNLOCK(dbenv) \
+ MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \
+ (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
+
+/*
+ * DB_MUTEXREGION --
+ * The primary mutex data structure in the shared memory region.
+ */
+typedef struct __db_mutexregion { /* SHARED */
+ /* These fields are initialized at create time and never modified. */
+ roff_t mutex_off_alloc;/* Offset of mutex array */
+ roff_t mutex_off; /* Adjusted offset of mutex array */
+ db_size_t mutex_size; /* Size of the aligned mutex */
+ roff_t thread_off; /* Offset of the thread area. */
+
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ /* Protected using the region mutex. */
+ db_mutex_t mutex_next; /* Next free mutex */
+
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ /* Mutexes for emulating atomic operations. */
+ db_mutex_t mtx_atomic[MAX_ATOMIC_MUTEXES];
+#endif
+
+ DB_MUTEX_STAT stat; /* Mutex statistics */
+} DB_MUTEXREGION;
+
+#ifdef HAVE_MUTEX_SUPPORT
+struct __db_mutex_t { /* SHARED */ /* Mutex. */
+#ifdef MUTEX_FIELDS
+ MUTEX_FIELDS /* Opaque thread mutex structures. */
+#endif
+#ifndef HAVE_MUTEX_FCNTL
+#if defined(HAVE_MUTEX_HYBRID) || \
+ (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS))
+ /*
+ * For hybrid and test-and-set shared latches it is a counter:
+ * 0 means it is free,
+ * -1 is exclusively locked,
+ * > 0 is the number of shared readers.
+ * Pthreads shared latches use pthread_rwlock instead.
+ */
+ tsl_t tas;
+ db_atomic_t sharecount;
+#elif !defined(MUTEX_FIELDS)
+ /*
+ * This is the Test and Set flag for exclusive latches (mutexes):
+ * there is a free value (often 0, 1, or -1) and a set value.
+ */
+ tsl_t tas;
+#endif
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+ volatile u_int32_t wait; /* Count of waiters. */
+#endif
+ pid_t pid; /* Process owning mutex */
+ db_threadid_t tid; /* Thread owning mutex */
+
+ db_mutex_t mutex_next_link; /* Linked list of free mutexes. */
+
+#ifdef HAVE_STATISTICS
+ int alloc_id; /* Allocation ID. */
+
+ u_int32_t mutex_set_wait; /* Granted after wait. */
+ u_int32_t mutex_set_nowait; /* Granted without waiting. */
+#ifdef HAVE_SHARED_LATCHES
+ u_int32_t mutex_set_rd_wait; /* Granted shared lock after wait. */
+ u_int32_t mutex_set_rd_nowait; /* Granted shared lock w/out waiting. */
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+ u_int32_t hybrid_wait;
+ u_int32_t hybrid_wakeup; /* for counting spurious wakeups */
+#endif
+#endif
+
+ /*
+ * A subset of the flag arguments for __mutex_alloc().
+ *
+	 * Flags should be an unsigned integer even if the possible flag
+	 * values don't require it: fetching a single byte is expensive on
+	 * some machines, and the mutex structure is an MP hot spot.
+ */
+ volatile u_int32_t flags; /* MUTEX_XXX */
+};
+#endif
+
+/* Macro to get a reference to a specific mutex. */
+#define MUTEXP_SET(env, indx) \
+ (F_ISSET(env, ENV_PRIVATE) ? (DB_MUTEX *) indx : \
+ (DB_MUTEX *)((u_int8_t *)env->mutex_handle->mutex_array + \
+ (indx) * \
+ ((DB_MUTEXREGION *)env->mutex_handle->reginfo.primary)->mutex_size))
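+
+/*
+ * Illustrative example: in a shared environment with a mutex_size of, say,
+ * 64 bytes, MUTEXP_SET(env, 7) resolves to mutex_array + 7 * 64; a private
+ * (heap) environment instead stores the DB_MUTEX address itself in 'indx'.
+ */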
+
+/*
+ * Check that a particular mutex is exclusively held at least by someone, not
+ * necessarily the current thread.
+ */
+#ifdef HAVE_MUTEX_SUPPORT
+#define MUTEX_IS_OWNED(env, mutex) \
+ (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
+ F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
+ F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED))
+#else
+#define MUTEX_IS_OWNED(env, mutex) 0
+#endif
+
+#if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \
+ (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS))
+#define MUTEXP_IS_BUSY(mutexp) \
+ (F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+ (atomic_read(&(mutexp)->sharecount) != 0) : \
+ F_ISSET(mutexp, DB_MUTEX_LOCKED))
+#define MUTEXP_BUSY_FIELD(mutexp) \
+ (F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+ (atomic_read(&(mutexp)->sharecount)) : (mutexp)->flags)
+#else
+/* Pthread_rwlocks don't have a low-cost 'is it being shared?' predicate. */
+#define MUTEXP_IS_BUSY(mutexp) (F_ISSET((mutexp), DB_MUTEX_LOCKED))
+#define MUTEXP_BUSY_FIELD(mutexp) ((mutexp)->flags)
+#endif
+
+#define MUTEX_IS_BUSY(env, mutex) \
+ (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
+ F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
+ MUTEXP_IS_BUSY(MUTEXP_SET(env, mutex)))
+
+#define MUTEX_REQUIRED(env, mutex) \
+ DB_ASSERT(env, MUTEX_IS_OWNED(env, mutex))
+
+#define MUTEX_REQUIRED_READ(env, mutex) \
+ DB_ASSERT(env, MUTEX_IS_OWNED(env, mutex) || MUTEX_IS_BUSY(env, mutex))
+
+/*
+ * Test and set (and thus hybrid) shared latches use compare & exchange
+ * to acquire; the others the mutex-setting primitive defined above.
+ */
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * This is the value of the 'sharecount' of an exclusively held tas latch.
+ * The particular value is not special; it is just unlikely to be caused
+ * by releasing or acquiring a shared latch too many times.
+ */
+#define MUTEX_SHARE_ISEXCLUSIVE (-1024)
+
+/*
+ * Get an exclusive lock on a possibly sharable latch. We use the native
+ * MUTEX_SET() operation for non-sharable latches; it usually is faster.
+ */
+#define MUTEXP_ACQUIRE(mutexp) \
+ (F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+ atomic_compare_exchange(env, \
+ &(mutexp)->sharecount, 0, MUTEX_SHARE_ISEXCLUSIVE) : \
+ MUTEX_SET(&(mutexp)->tas))
+#else
+#define MUTEXP_ACQUIRE(mutexp) MUTEX_SET(&(mutexp)->tas)
+#endif
+
+#ifndef MEMBAR_ENTER
+#define MEMBAR_ENTER()
+#define MEMBAR_EXIT()
+#endif
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_MUTEX_INT_H_ */
diff --git a/src/dbinc/os.h b/src/dbinc/os.h
new file mode 100644
index 00000000..2515e6ee
--- /dev/null
+++ b/src/dbinc/os.h
@@ -0,0 +1,178 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_OS_H_
+#define _DB_OS_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Number of times to retry system calls that return EINTR or EBUSY. */
+#define DB_RETRY 100
+
+#ifdef __TANDEM
+/*
+ * OSS Tandem problem: fsync can return a Guardian file system error of 70,
+ * which has no symbolic name in OSS. HP says to retry the fsync. [#12957]
+ */
+#define RETRY_CHK(op, ret) do { \
+ int __retries, __t_ret; \
+ for ((ret) = 0, __retries = DB_RETRY;;) { \
+ if ((op) == 0) \
+ break; \
+ (ret) = __os_get_syserr(); \
+ if (((__t_ret = __os_posix_err(ret)) == EAGAIN || \
+ __t_ret == EBUSY || __t_ret == EINTR || \
+ __t_ret == EIO || __t_ret == 70) && --__retries > 0)\
+ continue; \
+ break; \
+ } \
+} while (0)
+#else
+#define RETRY_CHK(op, ret) do { \
+ int __retries, __t_ret; \
+ for ((ret) = 0, __retries = DB_RETRY;;) { \
+ if ((op) == 0) \
+ break; \
+ (ret) = __os_get_syserr(); \
+ if (((__t_ret = __os_posix_err(ret)) == EAGAIN || \
+ __t_ret == EBUSY || __t_ret == EINTR || \
+ __t_ret == EIO) && --__retries > 0) \
+ continue; \
+ break; \
+ } \
+} while (0)
+#endif
+
+#define RETRY_CHK_EINTR_ONLY(op, ret) do { \
+ int __retries; \
+ for ((ret) = 0, __retries = DB_RETRY;;) { \
+ if ((op) == 0) \
+ break; \
+ (ret) = __os_get_syserr(); \
+ if (__os_posix_err(ret) == EINTR && --__retries > 0) \
+ continue; \
+ break; \
+ } \
+} while (0)
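+
+/*
+ * A typical call site (illustrative sketch): wrap the raw system call in
+ * RETRY_CHK and test the captured errno-style result, e.g.
+ *
+ *	RETRY_CHK((close(fhp->fd)), ret);
+ *	if (ret != 0)
+ *		goto err;
+ *
+ * The 'op' expression must evaluate to 0 on success, non-zero on failure.
+ */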
+
+/*
+ * Flags understood by __os_open.
+ */
+#define DB_OSO_ABSMODE 0x0001 /* Absolute mode specified. */
+#define DB_OSO_CREATE 0x0002 /* POSIX: O_CREAT */
+#define DB_OSO_DIRECT 0x0004 /* Don't buffer the file in the OS. */
+#define DB_OSO_DSYNC 0x0008 /* POSIX: O_DSYNC. */
+#define DB_OSO_EXCL 0x0010 /* POSIX: O_EXCL */
+#define DB_OSO_RDONLY 0x0020 /* POSIX: O_RDONLY */
+#define DB_OSO_REGION 0x0040 /* Opening a region file. */
+#define DB_OSO_SEQ 0x0080 /* Expected sequential access. */
+#define DB_OSO_TEMP 0x0100 /* Remove after last close. */
+#define DB_OSO_TRUNC 0x0200 /* POSIX: O_TRUNC */
+
+/*
+ * File modes.
+ */
+#define DB_MODE_400 (S_IRUSR)
+#define DB_MODE_600 (S_IRUSR|S_IWUSR)
+#define DB_MODE_660 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP)
+#define DB_MODE_666 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
+#define DB_MODE_700 (S_IRUSR|S_IWUSR|S_IXUSR)
+
+/*
+ * We group certain seek/write calls into a single function so that we
+ * can use pread(2)/pwrite(2) where they're available.
+ */
+#define DB_IO_READ 1
+#define DB_IO_WRITE 2
+
+/*
+ * Make a last "panic" check. Imagine a thread of control running in Berkeley
+ * DB, going to sleep. Another thread of control decides to run recovery
+ * because the environment is broken. The first thing recovery does is panic
+ * the existing environment, but we only check the panic flag when crossing the
+ * public API. If the sleeping thread wakes up and writes something, we could
+ * have two threads of control writing the log files at the same time. So,
+ * before reading or writing, make a last panic check. Obviously, there's still
+ * a window, but it's very, very small.
+ */
+#define LAST_PANIC_CHECK_BEFORE_IO(env) \
+ PANIC_CHECK(env); \
+ if (env != NULL && \
+ F_ISSET((env)->dbenv, DB_ENV_NOFLUSH)) \
+		return (0)
+
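+/*
+ * Illustrative use (sketch): read/write paths invoke the check immediately
+ * before issuing the I/O, e.g.
+ *
+ *	LAST_PANIC_CHECK_BEFORE_IO(env);
+ *	nw = write(fhp->fd, buf, len);
+ */
+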
+/* DB filehandle. */
+struct __fh_t {
+ /*
+ * Linked list of DB_FH's, linked from the DB_ENV, used to keep track
+ * of all open file handles for resource cleanup.
+ */
+ TAILQ_ENTRY(__fh_t) q;
+
+ /*
+ * The file-handle mutex is only used to protect the handle/fd
+	 * across seek and read/write pairs; it does not protect the
+	 * reference count or any other fields in the structure.
+ */
+ db_mutex_t mtx_fh; /* Mutex to lock. */
+
+ int ref; /* Reference count. */
+
+#if defined(DB_WIN32)
+ HANDLE handle; /* Windows/32 file handle. */
+ HANDLE trunc_handle; /* Handle for truncate calls. */
+#endif
+ int fd; /* POSIX file descriptor. */
+
+ char *name; /* File name at open. */
+
+ /*
+ * Last seek statistics, used for zero-filling on filesystems
+ * that don't support it directly.
+ */
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+ off_t offset;
+
+#ifdef HAVE_STATISTICS
+ u_int32_t seek_count; /* I/O statistics */
+ u_int32_t read_count;
+ u_int32_t write_count;
+#endif
+
+#define DB_FH_ENVLINK 0x01 /* We're linked on the DB_ENV. */
+#define DB_FH_NOSYNC 0x02 /* Handle doesn't need to be sync'd. */
+#define DB_FH_OPENED 0x04 /* Handle is valid. */
+#define DB_FH_UNLINK 0x08 /* Unlink on close */
+#define DB_FH_REGION 0x10 /* Opened to contain a region */
+ u_int8_t flags;
+};
+
+/* Standard buffer size for ctime/ctime_r function calls. */
+#define CTIME_BUFLEN 26
+
+/*
+ * VxWorks requires we cast (const char *) variables to (char *) in order to
+ * pass them to system calls like stat, read and write.
+ */
+#ifdef HAVE_VXWORKS
+#define CHAR_STAR_CAST (char *)
+#define VOID_STAR_CAST (void *)
+#else
+#define CHAR_STAR_CAST
+#define VOID_STAR_CAST
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/os_ext.h"
+#endif /* !_DB_OS_H_ */
diff --git a/src/dbinc/partition.h b/src/dbinc/partition.h
new file mode 100644
index 00000000..09e42573
--- /dev/null
+++ b/src/dbinc/partition.h
@@ -0,0 +1,57 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * $Id$
+ */
+#ifndef _DB_PART_H_
+#define _DB_PART_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct __db_partition {
+ u_int32_t nparts; /* number of partitions. */
+ DBT *keys; /* array of range keys. */
+ void *data; /* the partition info. */
+ const char **dirs; /* locations for partitions. */
+ DB **handles; /* array of partition handles. */
+ u_int32_t (*callback) (DB *, DBT *);
+#define PART_CALLBACK 0x01
+#define PART_RANGE 0x02
+ u_int32_t flags;
+} DB_PARTITION;
+
+/*
+ * Internal part of a partitioned cursor.
+ */
+typedef struct __part_internal {
+ __DBC_INTERNAL
+ u_int32_t part_id;
+ DBC *sub_cursor;
+} PART_CURSOR;
+
+#ifdef HAVE_PARTITION
+#define PART_NAME "__dbp.%s.%03d"
+#define PART_LEN (strlen("__dbp..")+3)
+#define PART_PREFIX "__dbp."
+#define IS_PARTITION_DB_FILE(name) (strncmp(name, PART_PREFIX, \
+ sizeof(PART_PREFIX) - 1) == 0)
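+
+/*
+ * Illustrative sketch: partition 3 of a database named "mydb" lives in the
+ * file produced by
+ *
+ *	snprintf(buf, sizeof(buf), PART_NAME, "mydb", 3);
+ *
+ * namely "__dbp.mydb.003".
+ */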
+
+#define DB_IS_PARTITIONED(dbp) \
+ (dbp->p_internal != NULL && \
+ ((DB_PARTITION *)dbp->p_internal)->handles != NULL)
+
+#define DBC_PART_REFRESH(dbc) (F_SET(dbc, DBC_PARTITIONED))
+#else
+#define DBC_PART_REFRESH(dbc)
+#define DB_IS_PARTITIONED(dbp) (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/src/dbinc/perfmon.h b/src/dbinc/perfmon.h
new file mode 100644
index 00000000..c3b9b9fa
--- /dev/null
+++ b/src/dbinc/perfmon.h
@@ -0,0 +1,103 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_PERFMON_H_
+#define _DB_PERFMON_H_
+
+/*******************************************************
+ * Oracle Berkeley DB Performance Event Monitoring
+ *
+ * Some events inside of Oracle Berkeley DB can be 'published'
+ * to the operating environment's performance tracing system
+ * as they occur. Current support includes
+ * --enable-dtrace
+ * Solaris
+ * Linux (via SystemTap's dtrace wrappers)
+ * Darwin (Mac OS X)
+ * QNX(?)
+ *
+ ******************************************************/
+
+/*
+ * The performance monitoring system can display many of the statistics which
+ * are obtainable through the {DB,DB_ENV}->xxx_stat() functions. By default
+ * they are excluded. They can be enabled with --enable-perfmon-statistics.
+ */
+#ifdef HAVE_PERFMON_STATISTICS
+#define STAT_PERFMON1(env, cat, id, a1) PERFMON1(env, cat, id, (a1))
+#define STAT_PERFMON2(env, cat, id, a1, a2) \
+ PERFMON2(env, cat, id, (a1), (a2))
+#define STAT_PERFMON3(env, cat, id, a1, a2, a3) \
+ PERFMON3(env, cat, id, (a1), (a2), (a3))
+#else
+#define STAT_PERFMON1(env, cat, id, a1) NOP_STATEMENT
+#define STAT_PERFMON2(env, cat, id, a1, a2) NOP_STATEMENT
+#define STAT_PERFMON3(env, cat, id, a1, a2, a3) NOP_STATEMENT
+#endif
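+
+/*
+ * Illustrative sketch (hypothetical category/event names): a statistics
+ * update site can publish its new value with, e.g.,
+ *
+ *	STAT_PERFMON1(env, mpool, miss, ++mp->stat.st_cache_miss);
+ *
+ * which compiles to NOP_STATEMENT unless --enable-perfmon-statistics was
+ * configured.
+ */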
+
+#if defined(HAVE_PERFMON) && defined(HAVE_STATISTICS)
+/*
+ * The DTrace macros which are generated at configure time in db_provider.h can
+ * have full function signatures. These declarations are needed for compilation
+ * when DTrace support is enabled. It is "too early" in the include sequence
+ * to include the header files which define these structs.
+ */
+struct _db_page;
+struct __bh;
+struct __db_dbt;
+struct __sh_dbt;
+struct __db_mutex_t;
+
+#if defined(HAVE_DTRACE)
+/*
+ * Solaris 10, Darwin/Mac OS X starting in 10.6 (Snow Leopard), Linux with
+ * the DTrace-compatible version of SystemTap, possibly QNX.
+ */
+#include "db_provider.h"
+
+#define PERFMON0(env, cat, id) bdb_##cat##_##id()
+#define PERFMON1(env, cat, id, a1) bdb_##cat##_##id(a1)
+#define PERFMON2(env, cat, id, a1, a2) \
+ bdb_##cat##_##id((a1), (a2))
+#define PERFMON3(env, cat, id, a1, a2, a3) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3)); \
+ } while (0)
+#define PERFMON4(env, cat, id, a1, a2, a3, a4) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3), (a4)); \
+ } while (0)
+#define PERFMON5(env, cat, id, a1, a2, a3, a4, a5) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3), (a4), (a5)); \
+ } while (0)
+#define PERFMON6(env, cat, id, a1, a2, a3, a4, a5, a6) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3), (a4), (a5), (a6)); \
+ } while (0)
+#define PERFMON_ENABLED(env, cat, id) bdb_##cat##_##id##_enabled()
+#endif
+
+#else
+/* Without HAVE_PERFMON or HAVE_STATISTICS these macros map to null bodies. */
+#define PERFMON0(env, cat, id) NOP_STATEMENT
+#define PERFMON1(env, cat, id, a1) NOP_STATEMENT
+#define PERFMON2(env, cat, id, a1, a2) NOP_STATEMENT
+#define PERFMON3(env, cat, id, a1, a2, a3) NOP_STATEMENT
+#define PERFMON4(env, cat, id, a1, a2, a3, a4) NOP_STATEMENT
+#define PERFMON5(env, cat, id, a1, a2, a3, a4, a5) NOP_STATEMENT
+#define PERFMON6(env, cat, id, a1, a2, a3, a4, a5, a6) NOP_STATEMENT
+#define PERFMON_ENABLED(env, cat, id) FALSE
+#endif
+
+#endif
diff --git a/src/dbinc/qam.h b/src/dbinc/qam.h
new file mode 100644
index 00000000..657c11e2
--- /dev/null
+++ b/src/dbinc/qam.h
@@ -0,0 +1,203 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_QAM_H_
+#define _DB_QAM_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * QAM data elements: a status field and the data.
+ */
+typedef struct _qamdata {
+ u_int8_t flags; /* 00: delete bit. */
+#define QAM_VALID 0x01
+#define QAM_SET 0x02
+ u_int8_t data[1]; /* Record. */
+} QAMDATA;
+
+struct __queue; typedef struct __queue QUEUE;
+struct __qcursor; typedef struct __qcursor QUEUE_CURSOR;
+
+struct __qcursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* Queue private part */
+
+ /* Per-thread information: queue private. */
+ db_recno_t recno; /* Current record number. */
+
+ u_int32_t flags;
+};
+
+typedef struct __mpfarray {
+ u_int32_t n_extent; /* Number of extents in table. */
+ u_int32_t low_extent; /* First extent open. */
+ u_int32_t hi_extent; /* Last extent open. */
+ struct __qmpf {
+ int pinref;
+ DB_MPOOLFILE *mpf;
+ } *mpfarray; /* Array of open extents. */
+} MPFARRAY;
+
+/*
+ * The in-memory, per-tree queue data structure.
+ */
+struct __queue {
+ db_pgno_t q_meta; /* Database meta-data page. */
+ db_pgno_t q_root; /* Database root page. */
+
+ int re_pad; /* Fixed-length padding byte. */
+ u_int32_t re_len; /* Length for fixed-length records. */
+ u_int32_t rec_page; /* records per page */
+ u_int32_t page_ext; /* Pages per extent */
+ MPFARRAY array1, array2; /* File arrays. */
+
+ /* Extent file configuration: */
+ DBT pgcookie; /* Initialized pgcookie. */
+ DB_PGINFO pginfo; /* Initialized pginfo struct. */
+
+ char *path; /* Space allocated to file pathname. */
+ char *name; /* The name of the file. */
+ char *dir; /* The dir of the file. */
+ int mode; /* Mode to open extents. */
+};
+
+/* Format for queue extent names. */
+#define QUEUE_EXTENT "%s%c__dbq.%s.%d"
+#define QUEUE_EXTENT_HEAD "__dbq.%s."
+#define QUEUE_EXTENT_PREFIX "__dbq."
+
+typedef struct __qam_filelist {
+ DB_MPOOLFILE *mpf;
+ u_int32_t id;
+} QUEUE_FILELIST;
+
+/*
+ * Calculate the page number of a recno.
+ *
+ * Number of records per page =
+ * Divide the available space on the page by the record len + header.
+ *
+ * Page number for record =
+ * divide the physical record number by the records per page
+ * add the root page number
+ * For now the root page will always be 1, but we might want to change
+ * that in the future (e.g., multiple fixed-length queues per file).
+ *
+ * Index of record on page =
+ * physical record number, less the logical pno times records/page
+ */
+#define CALC_QAM_RECNO_PER_PAGE(dbp) \
+ (((dbp)->pgsize - QPAGE_SZ(dbp)) / \
+ (u_int32_t)DB_ALIGN((uintmax_t)SSZA(QAMDATA, data) + \
+ ((QUEUE *)(dbp)->q_internal)->re_len, sizeof(u_int32_t)))
+
+#define QAM_RECNO_PER_PAGE(dbp) (((QUEUE*)(dbp)->q_internal)->rec_page)
+
+#define QAM_RECNO_PAGE(dbp, recno) \
+ (((QUEUE *)(dbp)->q_internal)->q_root \
+ + (((recno) - 1) / QAM_RECNO_PER_PAGE(dbp)))
+
+#define QAM_PAGE_EXTENT(dbp, pgno) \
+ (((pgno) - 1) / ((QUEUE *)(dbp)->q_internal)->page_ext)
+
+#define QAM_RECNO_EXTENT(dbp, recno) \
+ QAM_PAGE_EXTENT(dbp, QAM_RECNO_PAGE(dbp, recno))
+
+#define QAM_RECNO_INDEX(dbp, pgno, recno) \
+ (u_int32_t)(((recno) - 1) - (QAM_RECNO_PER_PAGE(dbp) \
+	* ((pgno) - ((QUEUE *)(dbp)->q_internal)->q_root)))
+
+#define QAM_GET_RECORD(dbp, page, index) \
+ ((QAMDATA *)((u_int8_t *)(page) + (QPAGE_SZ(dbp) + \
+ (DB_ALIGN((uintmax_t)SSZA(QAMDATA, data) + \
+	((QUEUE *)(dbp)->q_internal)->re_len, sizeof(u_int32_t)) * (index)))))
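+
+/*
+ * Worked example (illustrative, assuming a 4096-byte page, a QPAGE_SZ(dbp)
+ * of 28 bytes and an re_len of 100): each record occupies
+ * DB_ALIGN(SSZA(QAMDATA, data) + 100, 4) = 104 bytes, so
+ * (4096 - 28) / 104 = 39 records fit per page; recno 100 then maps to page
+ * q_root + (99 / 39) = q_root + 2, at index 99 - 2 * 39 = 21 on that page.
+ */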
+
+#define QAM_OUTSIDE_QUEUE(meta, recno) \
+ (((meta)->cur_recno >= (meta)->first_recno ? \
+ ((recno) < (meta)->first_recno || \
+ (recno) > (meta)->cur_recno) : \
+ ((recno) > (meta)->cur_recno && \
+ (recno) < (meta)->first_recno)))
+
+#define QAM_AFTER_CURRENT(meta, recno) \
+ ((recno) == (meta)->cur_recno || \
+ (QAM_OUTSIDE_QUEUE(meta, recno) && \
+ ((recno) - (meta)->cur_recno) <= ((meta)->first_recno - (recno))))
+
+#define QAM_BEFORE_FIRST(meta, recno) \
+ (QAM_OUTSIDE_QUEUE(meta, recno) && \
+ ((meta)->first_recno - (recno)) < ((recno) - (meta)->cur_recno))
+
+#define QAM_NOT_VALID(meta, recno) \
+ (recno == RECNO_OOB || \
+ QAM_BEFORE_FIRST(meta, recno) || QAM_AFTER_CURRENT(meta, recno))
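+
+/*
+ * Worked example (illustrative): with first_recno = 4294967000 and
+ * cur_recno = 100 the queue has wrapped, so QAM_OUTSIDE_QUEUE() holds only
+ * for recnos strictly between 100 and 4294967000. A recno of 50 is inside
+ * the queue; 200 is outside and, being closer to cur_recno than to
+ * first_recno, satisfies QAM_AFTER_CURRENT().
+ */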
+
+#define QAM_WAKEUP(dbc, ret) do { \
+ if (STD_LOCKING(dbc)) { \
+ dbc->lock.pgno = PGNO_INVALID; \
+ dbc->lock.type = DB_PAGE_LOCK; \
+ ret = __lock_wakeup((dbc)->dbp->env, &(dbc)->lock_dbt); \
+ } else \
+ ret = 0; \
+} while (0)
+
+/* Handle wrap around. */
+#define QAM_INC_RECNO(recno) do { \
+ recno++; \
+} while (recno == RECNO_OOB)
+
+#define QAM_DEC_RECNO(recno) do { \
+ recno--; \
+} while (recno == RECNO_OOB)
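+
+/*
+ * For example (illustrative): incrementing a recno of UINT32_MAX wraps to
+ * RECNO_OOB (0), so the loop runs once more and leaves it at 1; likewise
+ * QAM_DEC_RECNO skips from 1 over 0 down to UINT32_MAX.
+ */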
+
+/*
+ * Log opcodes for the mvptr routine.
+ */
+#define QAM_SETFIRST 0x01
+#define QAM_SETCUR 0x02
+#define QAM_TRUNCATE 0x04
+
+typedef enum {
+ QAM_PROBE_GET,
+ QAM_PROBE_PUT,
+ QAM_PROBE_DIRTY,
+ QAM_PROBE_MPF
+} qam_probe_mode;
+
+/*
+ * Ops for __qam_nameop.
+ */
+typedef enum {
+ QAM_NAME_DISCARD,
+ QAM_NAME_RENAME,
+ QAM_NAME_REMOVE
+} qam_name_op;
+
+#define __qam_fget(dbc, pgnoaddr, flags, addrp) \
+ __qam_fprobe(dbc, *pgnoaddr, \
+ addrp, QAM_PROBE_GET, DB_PRIORITY_UNCHANGED, flags)
+
+#define __qam_fput(dbc, pgno, addrp, priority) \
+ __qam_fprobe(dbc, pgno, addrp, QAM_PROBE_PUT, priority, 0)
+
+#define __qam_dirty(dbc, pgno, pagep, priority) \
+ __qam_fprobe(dbc, pgno, pagep, QAM_PROBE_DIRTY, priority, 0)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/qam_auto.h"
+#include "dbinc_auto/qam_ext.h"
+#endif /* !_DB_QAM_H_ */
diff --git a/src/dbinc/queue.h b/src/dbinc/queue.h
new file mode 100644
index 00000000..5a62741a
--- /dev/null
+++ b/src/dbinc/queue.h
@@ -0,0 +1,570 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $
+ */
+
+#ifndef _DB_QUEUE_H_
+#define _DB_QUEUE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ
+ * _HEAD + + + +
+ * _HEAD_INITIALIZER + + + +
+ * _ENTRY + + + +
+ * _INIT + + + +
+ * _EMPTY + + + +
+ * _FIRST + + + +
+ * _NEXT + + + +
+ * _PREV - - - +
+ * _LAST - - + +
+ * _FOREACH + + + +
+ * _FOREACH_REVERSE - - - +
+ * _INSERT_HEAD + + + +
+ * _INSERT_BEFORE - + - +
+ * _INSERT_AFTER + + + +
+ * _INSERT_TAIL - - + +
+ * _CONCAT - - + +
+ * _REMOVE_HEAD + - + -
+ * _REMOVE + + + +
+ *
+ */
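+
+/*
+ * Minimal usage sketch (illustrative only, not part of this header):
+ *
+ *	struct entry {
+ *		int value;
+ *		TAILQ_ENTRY(entry) links;
+ *	};
+ *	TAILQ_HEAD(entry_list, entry) head = TAILQ_HEAD_INITIALIZER(head);
+ *
+ *	struct entry *e;
+ *	if ((e = malloc(sizeof(*e))) != NULL) {
+ *		e->value = 42;
+ *		TAILQ_INSERT_TAIL(&head, e, links);
+ *	}
+ *	TAILQ_FOREACH(e, &head, links)
+ *		printf("%d\n", e->value);
+ */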
+
+/*
+ * XXX
+ * We #undef all of the macros because there are incompatible versions of this
+ * file and these macros on various systems. What makes the problem worse is
+ * they are included and/or defined by system include files which we may have
+ * already loaded into Berkeley DB before getting here. For example, FreeBSD's
+ * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines
+ * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
+ * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
+ */
+#undef LIST_EMPTY
+#undef LIST_ENTRY
+#undef LIST_FIRST
+#undef LIST_FOREACH
+#undef LIST_HEAD
+#undef LIST_HEAD_INITIALIZER
+#undef LIST_INIT
+#undef LIST_INSERT_AFTER
+#undef LIST_INSERT_BEFORE
+#undef LIST_INSERT_HEAD
+#undef LIST_NEXT
+#undef LIST_REMOVE
+#undef QMD_TRACE_ELEM
+#undef QMD_TRACE_HEAD
+#undef QUEUE_MACRO_DEBUG
+#undef SLIST_EMPTY
+#undef SLIST_ENTRY
+#undef SLIST_FIRST
+#undef SLIST_FOREACH
+#undef SLIST_FOREACH_PREVPTR
+#undef SLIST_HEAD
+#undef SLIST_HEAD_INITIALIZER
+#undef SLIST_INIT
+#undef SLIST_INSERT_AFTER
+#undef SLIST_INSERT_HEAD
+#undef SLIST_NEXT
+#undef SLIST_REMOVE
+#undef SLIST_REMOVE_HEAD
+#undef STAILQ_CONCAT
+#undef STAILQ_EMPTY
+#undef STAILQ_ENTRY
+#undef STAILQ_FIRST
+#undef STAILQ_FOREACH
+#undef STAILQ_HEAD
+#undef STAILQ_HEAD_INITIALIZER
+#undef STAILQ_INIT
+#undef STAILQ_INSERT_AFTER
+#undef STAILQ_INSERT_HEAD
+#undef STAILQ_INSERT_TAIL
+#undef STAILQ_LAST
+#undef STAILQ_NEXT
+#undef STAILQ_REMOVE
+#undef STAILQ_REMOVE_HEAD
+#undef STAILQ_REMOVE_HEAD_UNTIL
+#undef TAILQ_CONCAT
+#undef TAILQ_EMPTY
+#undef TAILQ_ENTRY
+#undef TAILQ_FIRST
+#undef TAILQ_FOREACH
+#undef TAILQ_FOREACH_REVERSE
+#undef TAILQ_HEAD
+#undef TAILQ_HEAD_INITIALIZER
+#undef TAILQ_INIT
+#undef TAILQ_INSERT_AFTER
+#undef TAILQ_INSERT_BEFORE
+#undef TAILQ_INSERT_HEAD
+#undef TAILQ_INSERT_TAIL
+#undef TAILQ_LAST
+#undef TAILQ_NEXT
+#undef TAILQ_PREV
+#undef TAILQ_REMOVE
+#undef TRACEBUF
+#undef TRASHIT
+
+#define QUEUE_MACRO_DEBUG 0
+#if QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+ char * lastfile;
+ int lastline;
+ char * prevfile;
+ int prevline;
+};
+
+#define TRACEBUF struct qm_trace trace;
+#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
+
+#define QMD_TRACE_HEAD(head) do { \
+ (head)->trace.prevline = (head)->trace.lastline; \
+ (head)->trace.prevfile = (head)->trace.lastfile; \
+ (head)->trace.lastline = __LINE__; \
+ (head)->trace.lastfile = __FILE__; \
+} while (0)
+
+#define QMD_TRACE_ELEM(elem) do { \
+ (elem)->trace.prevline = (elem)->trace.lastline; \
+ (elem)->trace.prevfile = (elem)->trace.lastfile; \
+ (elem)->trace.lastline = __LINE__; \
+ (elem)->trace.lastfile = __FILE__; \
+} while (0)
+
+#else
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define TRACEBUF
+#define TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
+
+/*
+ * Singly-linked List declarations.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+ struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+ struct type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+
+#define SLIST_FIRST(head) ((head)->slh_first)
+
+#define SLIST_FOREACH(var, head, field) \
+ for ((var) = SLIST_FIRST((head)); \
+ (var); \
+ (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
+ for ((varp) = &SLIST_FIRST((head)); \
+ ((var) = *(varp)) != NULL; \
+ (varp) = &SLIST_NEXT((var), field))
+
+#define SLIST_INIT(head) do { \
+ SLIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
+ SLIST_NEXT((slistelm), field) = (elm); \
+} while (0)
+
+#define SLIST_INSERT_HEAD(head, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
+ SLIST_FIRST((head)) = (elm); \
+} while (0)
+
+#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
+
+#define SLIST_REMOVE(head, elm, type, field) do { \
+ if (SLIST_FIRST((head)) == (elm)) { \
+ SLIST_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = SLIST_FIRST((head)); \
+ while (curelm != NULL && \
+ SLIST_NEXT(curelm, field) != (elm)) \
+ curelm = SLIST_NEXT(curelm, field); \
+ if (curelm != NULL) \
+ SLIST_NEXT(curelm, field) = \
+ SLIST_NEXT(SLIST_NEXT(curelm, field), field);\
+ } \
+} while (0)
+
+#define SLIST_REMOVE_HEAD(head, field) do { \
+ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
+} while (0)
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define STAILQ_HEAD(name, type) \
+struct name { \
+ struct type *stqh_first;/* first element */ \
+ struct type **stqh_last;/* addr of last next element */ \
+}
+
+#define STAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).stqh_first }
+
+#define STAILQ_ENTRY(type) \
+struct { \
+ struct type *stqe_next; /* next element */ \
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define STAILQ_CONCAT(head1, head2) do { \
+ if (!STAILQ_EMPTY((head2))) { \
+ *(head1)->stqh_last = (head2)->stqh_first; \
+ (head1)->stqh_last = (head2)->stqh_last; \
+ STAILQ_INIT((head2)); \
+ } \
+} while (0)
+
+#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
+
+#define STAILQ_FIRST(head) ((head)->stqh_first)
+
+#define STAILQ_FOREACH(var, head, field) \
+ for ((var) = STAILQ_FIRST((head)); \
+ (var); \
+ (var) = STAILQ_NEXT((var), field))
+
+#define STAILQ_INIT(head) do { \
+ STAILQ_FIRST((head)) = NULL; \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_NEXT((tqelm), field) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_FIRST((head)) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_TAIL(head, elm, field) do { \
+ STAILQ_NEXT((elm), field) = NULL; \
+ *(head)->stqh_last = (elm); \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+} while (0)
+
+#define STAILQ_LAST(head, type, field) \
+ (STAILQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *) \
+ ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
+
+#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+
+#define STAILQ_REMOVE(head, elm, type, field) do { \
+ if (STAILQ_FIRST((head)) == (elm)) { \
+ STAILQ_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = STAILQ_FIRST((head)); \
+ while (STAILQ_NEXT(curelm, field) != (elm)) \
+ curelm = STAILQ_NEXT(curelm, field); \
+ if ((STAILQ_NEXT(curelm, field) = \
+ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+ } \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD(head, field) do { \
+ if ((STAILQ_FIRST((head)) = \
+ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
+ if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+/*
+ * List declarations.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define LIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define LIST_ENTRY(type) \
+struct { \
+ struct type *le_next; /* next element */ \
+ struct type **le_prev; /* address of previous next element */ \
+}
+
+/*
+ * List functions.
+ */
+
+#define LIST_EMPTY(head) ((head)->lh_first == NULL)
+
+#define LIST_FIRST(head) ((head)->lh_first)
+
+#define LIST_FOREACH(var, head, field) \
+ for ((var) = LIST_FIRST((head)); \
+ (var); \
+ (var) = LIST_NEXT((var), field))
+
+#define LIST_INIT(head) do { \
+ LIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+ LIST_NEXT((listelm), field)->field.le_prev = \
+ &LIST_NEXT((elm), field); \
+ LIST_NEXT((listelm), field) = (elm); \
+ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
+} while (0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.le_prev = (listelm)->field.le_prev; \
+ LIST_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.le_prev = (elm); \
+ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
+} while (0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
+ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+ LIST_FIRST((head)) = (elm); \
+ (elm)->field.le_prev = &LIST_FIRST((head)); \
+} while (0)
+
+#define LIST_NEXT(elm, field) ((elm)->field.le_next)
+
+#define LIST_REMOVE(elm, field) do { \
+ if (LIST_NEXT((elm), field) != NULL) \
+ LIST_NEXT((elm), field)->field.le_prev = \
+ (elm)->field.le_prev; \
+ *(elm)->field.le_prev = LIST_NEXT((elm), field); \
+} while (0)
+
+/*
+ * Tail queue declarations.
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).tqh_first }
+
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+/*
+ * Tail queue functions.
+ */
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+		QMD_TRACE_HEAD(head1); \
+ QMD_TRACE_HEAD(head2); \
+ } \
+} while (0)
+
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+} while (0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&listelm->field); \
+} while (0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&listelm->field); \
+} while (0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ QMD_TRACE_HEAD(head); \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+ TRASHIT((elm)->field.tqe_next); \
+ TRASHIT((elm)->field.tqe_prev); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_QUEUE_H_ */
diff --git a/src/dbinc/region.h b/src/dbinc/region.h
new file mode 100644
index 00000000..ac0ff16f
--- /dev/null
+++ b/src/dbinc/region.h
@@ -0,0 +1,329 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REGION_H_
+#define _DB_REGION_H_
+
+/*
+ * The DB environment consists of some number of "regions", which are described
+ * by the following four structures:
+ *
+ * REGENV -- shared information about the environment
+ * REGENV_REF -- file describing system memory version of REGENV
+ * REGION -- shared information about a single region
+ * REGINFO -- per-process information about a REGION
+ *
+ * There are three types of memory that hold regions:
+ * per-process heap (malloc)
+ * file mapped into memory (mmap, MapViewOfFile)
+ * system memory (shmget, CreateFileMapping)
+ *
+ * By default, regions are created in filesystem-backed shared memory. They
+ * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private
+ * to a process, in heap memory (DB_PRIVATE).
+ *
+ * Regions in the filesystem are named "__db.001", "__db.002" and so on. If
+ * we're not using a private environment allocated in heap, "__db.001" will
+ * always exist, as we use it to synchronize on the regions, whether they are
+ * in filesystem-backed memory or system memory.
+ *
+ * The file "__db.001" contains a REGENV structure pointing to an
+ * array of REGION structures. Each REGION structure describes an
+ * underlying chunk of shared memory.
+ *
+ * __db.001
+ * +---------+
+ * |REGENV |
+ * +---------+
+ * |
+ * \/
+ * +---------+ +----------+
+ * |REGION |-> | __db.001 |
+ * | | +----------+
+ * +---------+ +----------+
+ * |REGION |-> | __db.002 |
+ * | | +----------+
+ * +---------+ +----------+
+ * |REGION |-> | __db.003 |
+ * | | +----------+
+ * +---------+ +----------+
+ * |REGION |-> | __db.004 |
+ * | | +----------+
+ * +---------+
+ *
+ * The tricky part about manipulating the regions is creating or joining the
+ * database environment. We have to be sure only a single thread of control
+ * creates and/or recovers a database environment. All other threads should
+ * then join without seeing inconsistent data.
+ *
+ * We do this in two parts: first, we use the underlying O_EXCL flag to the
+ * open system call to serialize creation of the __db.001 file. The thread
+ * of control creating that file then proceeds to create the remaining
+ * regions in the environment, including the mutex region. Once the mutex
+ * region has been created, the creating thread of control fills in the
+ * __db.001 file's magic number. Other threads of control (the ones that
+ * didn't create the __db.001 file), wait on the initialization of the
+ * __db.001 file's magic number. After it has been initialized, all threads
+ * of control can proceed, using normal shared mutex locking procedures for
+ * exclusion.
+ *
+ * REGIONs are not moved or removed during the life of the environment, and
+ * so processes can have long-lived references to them.
+ *
+ * One of the REGION structures describes the environment region itself.
+ *
+ * The REGION array is not locked in any way. It's an array so we don't have
+ * to manipulate data structures after a crash -- on some systems, we have to
+ * join and clean up the mutex region after application failure. Using an
+ * array means we don't have to worry about broken links or other nastiness
+ * after the failure.
+ *
+ * All requests to create or join a region return a REGINFO structure, which
+ * is held by the caller and used to open and subsequently close the reference
+ * to the region. The REGINFO structure contains the per-process information
+ * that we need to access the region.
+ *
+ * The one remaining complication. If the regions (including the environment
+ * region) live in system memory, and the system memory isn't "named" somehow
+ * in the filesystem name space, we need some way of finding it. We do this
+ * by writing the REGENV_REF structure into the "__db.001" file. When we find
+ * a __db.001 file that is too small to be a real, on-disk environment, we use
+ * the information it contains to redirect to the real "__db.001" file/memory.
+ * This currently only happens when the REGENV file is in shared system memory.
+ *
+ * Although DB does not currently grow regions when they run out of memory, it
+ * would be possible to do so. To grow a region, allocate a new region of the
+ * appropriate size, then copy the old region over it and insert the additional
+ * memory into the already existing shalloc arena. Region users must reset
+ * their base addresses and any local pointers into the memory, of course.
+ * This failed in historic versions of DB because the region mutexes lived in
+ * the mapped memory, and when it was unmapped and remapped (or copied),
+ * threads could lose track of it. Also, some systems didn't support mutex
+ * copying, e.g., from OSF1 V4.0:
+ *
+ * The address of an msemaphore structure may be significant. If the
+ * msemaphore structure contains any value copied from an msemaphore
+ * structure at a different address, the result is undefined.
+ *
+ * All mutexes are now maintained in a separate region which is never unmapped,
+ * so growing regions should be possible.
+ */
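+
+/*
+ * A minimal sketch of the create-or-join step described above (added for
+ * illustration only; the real logic handles many more failure cases):
+ *
+ *	fd = open("__db.001", O_RDWR | O_CREAT | O_EXCL, mode);
+ *	if (fd != -1) {
+ *		... create the remaining regions, then set the magic ...
+ *	} else if (errno == EEXIST) {
+ *		... open the file, wait for a non-zero magic, then join ...
+ *	}
+ */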
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DB_REGION_PREFIX "__db" /* DB file name prefix. */
+#define DB_REGION_FMT "__db.%03d" /* Region file name format. */
+#define DB_REGION_ENV "__db.001" /* Primary environment name. */
+#define IS_DB_FILE(name) (strncmp(name, DB_REGION_PREFIX, \
+ sizeof(DB_REGION_PREFIX) - 1) == 0)
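+/*
+ * Added note: region file names are generated from DB_REGION_FMT, e.g.
+ * snprintf(buf, sizeof(buf), DB_REGION_FMT, 2) yields "__db.002".
+ */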
+
+#define INVALID_REGION_ID 0 /* Out-of-band region ID. */
+#define REGION_ID_ENV 1 /* Primary environment ID. */
+
+typedef enum {
+ INVALID_REGION_TYPE=0, /* Region type. */
+ REGION_TYPE_ENV,
+ REGION_TYPE_LOCK,
+ REGION_TYPE_LOG,
+ REGION_TYPE_MPOOL,
+ REGION_TYPE_MUTEX,
+ REGION_TYPE_TXN } reg_type_t;
+
+#define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or
+ * Win16 segment identifiers. They are
+ * both stored in a "long", and we need
+ * an out-of-band value.
+ */
+/*
+ * Nothing can live at region offset 0, because, in all cases, that's where
+ * we store *something*. Lots of code needs an out-of-band value for region
+ * offsets, so we use 0.
+ */
+#define INVALID_ROFF 0
+
+/* Reference describing system memory version of REGENV. */
+typedef struct __db_reg_env_ref {
+ roff_t size; /* Region size. */
+ roff_t max; /* Region max in bytes. */
+ long segid; /* UNIX shmget ID, VxWorks ID. */
+} REGENV_REF;
+
+/* Per-environment region information. */
+typedef struct __db_reg_env { /* SHARED */
+ /*
+ * !!!
+ * The magic, panic, version, envid and signature fields of the region
+ * are fixed in size, the timestamp field is the first field which is
+ * variable length. These fields must never change in order, to
+ * guarantee we can always read them, no matter what release we have.
+ *
+ * !!!
+ * The magic and panic fields are NOT protected by any mutex, and for
+ * this reason cannot be anything more complicated than zero/non-zero.
+ */
+ u_int32_t magic; /* Valid region magic number. */
+ u_int32_t panic; /* Environment is dead. */
+
+ u_int32_t majver; /* Major DB version number. */
+ u_int32_t minver; /* Minor DB version number. */
+ u_int32_t patchver; /* Patch DB version number. */
+
+ u_int32_t envid; /* Unique environment ID. */
+
+ u_int32_t signature; /* Structure signatures. */
+
+ time_t timestamp; /* Creation time. */
+
+ /*
+ * Flags saved in the init_flags field of the environment, representing
+ * flags to DB_ENV->set_flags and DB_ENV->open that need to be set.
+ */
+ u_int32_t init_flags;
+#define DB_INITENV_CDB 0x0001 /* DB_INIT_CDB */
+#define DB_INITENV_CDB_ALLDB 0x0002 /* DB_INIT_CDB_ALLDB */
+#define DB_INITENV_LOCK 0x0004 /* DB_INIT_LOCK */
+#define DB_INITENV_LOG 0x0008 /* DB_INIT_LOG */
+#define DB_INITENV_MPOOL 0x0010 /* DB_INIT_MPOOL */
+#define DB_INITENV_REP 0x0020 /* DB_INIT_REP */
+#define DB_INITENV_TXN 0x0040 /* DB_INIT_TXN */
+
+ /*
+ * The mtx_regenv mutex protects the environment reference count and
+ * memory allocation from the primary shared region (the crypto, thread
+ * control block and replication implementations allocate memory from
+ * the primary shared region).
+ *
+ * The rest of the fields are initialized at creation time, and don't
+ * need mutex protection. The flags, op_timestamp and rep_timestamp
+ * fields are used by replication only and are protected by the
+ * replication mutex. The rep_timestamp is not protected when it
+ * is used in recovery, as recovery is already single-threaded.
+ */
+ db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */
+ u_int32_t refcnt; /* References to the environment. */
+
+ u_int32_t region_cnt; /* Number of REGIONs. */
+ roff_t region_off; /* Offset of region array */
+ roff_t lt_primary; /* Lock primary. */
+ roff_t lg_primary; /* Log primary. */
+ roff_t tx_primary; /* Txn primary. */
+
+ roff_t cipher_off; /* Offset of cipher area */
+
+ roff_t thread_off; /* Offset of the thread area. */
+
+ roff_t rep_off; /* Offset of the replication area. */
+#define DB_REGENV_REPLOCKED 0x0001 /* Env locked for rep backup. */
+ u_int32_t flags; /* Shared environment flags. */
+#define DB_REGENV_TIMEOUT 30 /* Backup timeout. */
+ time_t op_timestamp; /* Timestamp for operations. */
+ time_t rep_timestamp; /* Timestamp for rep db handles. */
+ u_int32_t reg_panic; /* DB_REGISTER triggered panic */
+ uintmax_t unused; /* The ALLOC_LAYOUT structure follows
+ * the REGENV structure in memory and
+ * contains uintmax_t fields. Force
+ * proper alignment of that structure.
+ */
+} REGENV;
+
+/* Per-region shared region information. */
+typedef struct __db_region { /* SHARED */
+ roff_t size; /* Region size in bytes. */
+ roff_t max; /* Region max in bytes. */
+ long segid; /* UNIX shmget(2), Win16 segment ID. */
+
+ u_int32_t id; /* Region id. */
+ reg_type_t type; /* Region type. */
+
+ roff_t primary; /* Primary data structure offset. */
+ roff_t alloc; /* Region allocation size in bytes. */
+} REGION;
+
+/*
+ * Per-process/per-attachment information about a single region.
+ */
+
+/*
+ * Structure used for tracking allocations in DB_PRIVATE regions.
+ */
+struct __db_region_mem_t; typedef struct __db_region_mem_t REGION_MEM;
+struct __db_region_mem_t {
+ REGION_MEM *next;
+};
+
+struct __db_reginfo_t { /* __env_region_attach IN parameters. */
+ ENV *env; /* Enclosing environment. */
+ reg_type_t type; /* Region type. */
+ u_int32_t id; /* Region id. */
+
+ /* __env_region_attach OUT parameters. */
+ REGION *rp; /* Shared region. */
+
+ char *name; /* Region file name. */
+ DB_FH *fhp; /* Region file handle */
+
+ void *addr; /* Region address. */
+ void *head; /* Head of the allocation struct. */
+ void *primary; /* Primary data structure address. */
+
+ /* Private Memory Tracking. */
+ size_t max_alloc; /* Maximum bytes allocated. */
+ size_t allocated; /* Bytes allocated. */
+ REGION_MEM *mem; /* List of memory to free */
+
+ db_mutex_t mtx_alloc; /* Mutex for region allocation. */
+
+#ifdef DB_WIN32
+ HANDLE wnt_handle; /* Win/NT HANDLE. */
+#endif
+
+#define REGION_CREATE 0x01 /* Caller created region. */
+#define REGION_CREATE_OK 0x02 /* Caller willing to create region. */
+#define REGION_JOIN_OK 0x04 /* Caller is looking for a match. */
+#define REGION_SHARED 0x08 /* Region is shared. */
+#define REGION_TRACKED 0x10 /* Region private memory is tracked. */
+ u_int32_t flags;
+};
+
+/*
+ * R_ADDR Return a per-process address for a shared region offset.
+ * R_OFFSET Return a shared region offset for a per-process address.
+ */
+#define R_ADDR(reginfop, offset) \
+ (F_ISSET((reginfop)->env, ENV_PRIVATE) ? \
+ ROFF_TO_P(offset) : \
+ (void *)((u_int8_t *)((reginfop)->addr) + (offset)))
+#define R_OFFSET(reginfop, p) \
+ (F_ISSET((reginfop)->env, ENV_PRIVATE) ? \
+ P_TO_ROFF(p) : \
+ (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr))
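+
+/*
+ * Illustrative round trip (added note): for any valid offset in a region,
+ * R_OFFSET inverts R_ADDR, e.g.
+ *
+ *	void *p;
+ *	p = R_ADDR(infop, roff);
+ *	DB_ASSERT(env, R_OFFSET(infop, p) == roff);
+ *
+ * where "infop", "roff" and "env" stand for a caller's REGINFO pointer,
+ * region offset and ENV handle.
+ */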
+
+/*
+ * PANIC_ISSET, PANIC_CHECK:
+ * Check to see if the DB environment is dead.
+ */
+#define PANIC_ISSET(env) \
+ ((env) != NULL && (env)->reginfo != NULL && \
+ ((REGENV *)(env)->reginfo->primary)->panic != 0 && \
+ !F_ISSET((env)->dbenv, DB_ENV_NOPANIC))
+
+#define PANIC_CHECK(env) \
+ if (PANIC_ISSET(env)) \
+ return (__env_panic_msg(env));
+
+#define PANIC_CHECK_RET(env, ret) \
+ if (PANIC_ISSET(env)) \
+ ret = (__env_panic_msg(env));
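+
+/*
+ * Typical use (added note): an API entry point can begin with
+ *
+ *	PANIC_CHECK(env);
+ *
+ * so that calls against a dead environment return the panic error
+ * immediately instead of touching shared state.
+ */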
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_REGION_H_ */
diff --git a/src/dbinc/rep.h b/src/dbinc/rep.h
new file mode 100644
index 00000000..75004239
--- /dev/null
+++ b/src/dbinc/rep.h
@@ -0,0 +1,1102 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REP_H_
+#define _DB_REP_H_
+
+#include "dbinc_auto/rep_automsg.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Names of client temp databases.
+ */
+#define REPFILEPREFIX "__db.rep"
+#define REPDBNAME "__db.rep.db"
+#define REPPAGENAME "__db.reppg.db"
+
+/*
+ * Name of replicated system database file, and LSN history subdatabase within
+ * it. If the INMEM config flag is set, we create the database in memory under
+ * the REPLSNHIST name (which is why that name also follows the __db naming
+ * convention).
+ */
+#define REPSYSDBNAME "__db.rep.system"
+#define REPLSNHIST "__db.lsn.history"
+#define REPMEMBERSHIP "__db.membership"
+#define REPSYSDBPGSZ 1024
+#define IS_REP_FILE(name) (strcmp(name, REPSYSDBNAME) == 0)
+
+/* Current version of commit token format, and LSN history database format. */
+#define REP_COMMIT_TOKEN_FMT_VERSION 1
+#define REP_LSN_HISTORY_FMT_VERSION 1
+
+/*
+ * Message types
+ */
+#define REP_INVALID 0 /* Invalid message type. */
+#define REP_ALIVE 1 /* I am alive message. */
+#define REP_ALIVE_REQ 2 /* Request for alive messages. */
+#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */
+#define REP_BULK_LOG 4 /* Bulk transfer of log records. */
+#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */
+#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */
+#define REP_FILE 7 /* Page of a database file. NOTUSED */
+#define REP_FILE_FAIL 8 /* File requested does not exist. */
+#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */
+#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */
+#define REP_LOG 11 /* Log record. */
+#define REP_LOG_MORE 12 /* There are more log records to request. */
+#define REP_LOG_REQ 13 /* Request for a log record. */
+#define REP_MASTER_REQ 14 /* Who is the master? */
+#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */
+#define REP_NEWFILE 16 /* Announce a log file change. */
+#define REP_NEWMASTER 17 /* Announces who the master is. */
+#define REP_NEWSITE 18 /* Announces that a site has heard from a new
+ * site; like NEWCLIENT, but indirect. A
+ * NEWCLIENT message comes directly from the new
+ * client while a NEWSITE comes indirectly from
+ * someone who heard about a NEWSITE.
+ */
+#define REP_PAGE 19 /* Database page. */
+#define REP_PAGE_FAIL 20 /* Requested page does not exist. */
+#define REP_PAGE_MORE 21 /* There are more pages to request. */
+#define REP_PAGE_REQ 22 /* Request for a database page. */
+#define REP_REREQUEST 23 /* Force rerequest. */
+#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/
+#define REP_UPDATE 25 /* Environment hotcopy information. */
+#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */
+#define REP_VERIFY 27 /* A log record for verification. */
+#define REP_VERIFY_FAIL 28 /* The client is outdated. */
+#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */
+#define REP_VOTE1 30 /* Send out your information for an election. */
+#define REP_VOTE2 31 /* Send a "you are master" vote. */
+/*
+ * Maximum message number for conversion tables. Update this
+ * value as the largest message number above increases.
+ * It might make processing messages more straightforward if
+ * the *_MORE and BULK* messages were flags within the regular
+ * message type instead of separate message types themselves.
+ *
+ * !!!
+ * NOTE: When changing messages above, the two tables for upgrade support
+ * need adjusting. They are in rep_util.c.
+ */
+#define REP_MAX_MSG 31
+
+/*
+ * This is the list of client-to-client request messages.
+ * We use this to decide if we're doing client-to-client and
+ * might need to send a rerequest.
+ */
+#define REP_MSG_REQ(rectype) \
+ (rectype == REP_ALL_REQ || \
+ rectype == REP_LOG_REQ || \
+ rectype == REP_PAGE_REQ || \
+ rectype == REP_VERIFY_REQ)
+
+/*
+ * Note that the version information should be at the beginning of the
+ * structure, so that we can rearrange the rest of it while letting the
+ * version checks continue to work. DB_REPVERSION should be revved any time
+ * the rest of the structure changes or when the message numbers change.
+ *
+ * Also define the corresponding log versions that are tied to the
+ * replication/release versions. These are needed only in replication,
+ * which is why they're defined here. db_printlog takes notice as well.
+ */
+#define DB_LOGVERSION_42 8
+#define DB_LOGVERSION_43 10
+#define DB_LOGVERSION_44 11
+#define DB_LOGVERSION_45 12
+#define DB_LOGVERSION_46 13
+#define DB_LOGVERSION_47 14
+#define DB_LOGVERSION_48 15
+#define DB_LOGVERSION_48p2 16
+#define DB_LOGVERSION_50 17
+#define DB_LOGVERSION_51 17
+#define DB_LOGVERSION_52 18
+#define DB_LOGVERSION_53 19
+#define DB_LOGVERSION_MIN DB_LOGVERSION_44
+#define DB_REPVERSION_INVALID 0
+#define DB_REPVERSION_44 3
+#define DB_REPVERSION_45 3
+#define DB_REPVERSION_46 4
+#define DB_REPVERSION_47 5
+#define DB_REPVERSION_48 5
+#define DB_REPVERSION_50 5
+#define DB_REPVERSION_51 5
+#define DB_REPVERSION_52 6
+#define DB_REPVERSION_53 7
+#define DB_REPVERSION DB_REPVERSION_53
+#define DB_REPVERSION_MIN DB_REPVERSION_44
+
+/*
+ * RPRINT - Replication diagnostic output
+ * VPRINT - Replication verbose output (superset of RPRINT).
+ * REP_PRINT_MESSAGE
+ * Macros for verbose replication messages.
+ *
+ * Everything using RPRINT will go to the system diag file (if it
+ * is configured) and also to the user's verbose output if
+ * they have that verbose level configured.
+ * Messages using VPRINT do not ever go to the system diag file,
+ * but will go to the user's verbose output if configured.
+ *
+ * Use VPRINT for anything that might be printed on a standard,
+ * successful transaction. Use RPRINT for error paths, rep
+ * state changes, elections, etc.
+ */
+#define REP_DIAGNAME "__db.rep.diag%02d"
+#define REP_DIAGSIZE MEGABYTE
+#define RPRINT(env, x) do { \
+ if ((env)->dbenv->verbose != 0) \
+ (void)__rep_print_system x; \
+} while (0)
+#define VPRINT(env, x) do { \
+ if ((env)->dbenv->verbose != 0) \
+ (void)__rep_print x; \
+} while (0)
+#define REP_PRINT_MESSAGE(env, eid, rp, str, fl) do { \
+ if ((env)->dbenv->verbose != 0) \
+ __rep_print_message(env, eid, rp, str, fl); \
+} while (0)
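+
+/*
+ * Illustrative call (added note; the verbose category and message are
+ * examples only): the "x" argument is a complete, parenthesized argument
+ * list for the underlying print function, e.g.
+ *
+ *	VPRINT(env, (env, DB_VERB_REP_MSGS, "msg %lu", (u_long)rectype));
+ */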
+
+/*
+ * Election gen file name
+ * The file contains an egen number for an election this client has NOT
+ * participated in. I.e. it is the number of a future election. We
+ * create it when we create the rep region, if it doesn't already exist
+ * and initialize egen to 1. If it does exist, we read it when we create
+ * the rep region. We write it immediately before sending our VOTE1 in
+ * an election. That way, if a client has ever sent a vote for any
+ * election, the file is already going to be updated to reflect a future
+ * election, should it crash.
+ */
+#define REP_EGENNAME "__db.rep.egen"
+#define REP_GENNAME "__db.rep.gen"
+
+/*
+ * Internal init flag file name:
+ * The existence of this file serves as an indication that the client is in the
+ * process of Internal Initialization, in case it crashes before completing.
+ * During internal init the client's partially reconstructed database pages and
+ * logs may be in an inconsistent state, so much so that running recovery must
+ * be avoided. Furthermore, there is no other way to reliably recognize this
+ * condition. Therefore, when we open an environment, and we're just about to
+ * run recovery, we check for this file first. If it exists we must discard all
+ * logs and databases. This avoids the recovery problems, and leads to a fresh
+ * attempt at internal init if the environment becomes a replication client and
+ * finds a master. The list of databases which may need to be removed is stored
+ * in this file.
+ */
+#define REP_INITNAME "__db.rep.init"
+#define REP_INITVERSION_46 1
+#define REP_INITVERSION_47 2
+#define REP_INITVERSION 3
+
+/*
+ * Database types for __rep_client_dbinit
+ */
+typedef enum {
+ REP_DB, /* Log record database. */
+ REP_PG /* Pg database. */
+} repdb_t;
+
+/* Macros to lock/unlock the replication region as a whole. */
+#define REP_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region)
+#define REP_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region)
+
+/*
+ * Macros for manipulating the event synchronization. We use a separate mutex
+ * so that an application's call-back function can be invoked without locking
+ * the whole region.
+ */
+#define REP_EVENT_LOCK(env) \
+ MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event)
+#define REP_EVENT_UNLOCK(env) \
+ MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event)
+
+/*
+ * Synchronization states
+ * Please change __rep_syncstate_to_string (rep_stat.c) to track any changes
+ * made to these states.
+ *
+ * The states are in alphabetical order (except for OFF). The usual
+ * order of progression for a full internal init is:
+ * VERIFY, UPDATE, PAGE, LOG (then back to OFF when we're done).
+ */
+typedef enum {
+ SYNC_OFF, /* No recovery. */
+ SYNC_LOG, /* Recovery - log. */
+ SYNC_PAGE, /* Recovery - pages. */
+ SYNC_UPDATE, /* Recovery - update. */
+ SYNC_VERIFY /* Recovery - verify. */
+} repsync_t;
+
+/*
+ * A record of the contents of the VOTE1 msg we sent out at current egen, in
+ * case we need to send out a duplicate VOTE1 to a late-joining client in a full
+ * election. The nsites, nvotes, and priority fields of the REP struct can't be
+ * used, because those could change. It's only safe to send out a dup if we
+ * send out the exact same info.
+ */
+typedef struct {
+ DB_LSN lsn;
+ u_int32_t nsites;
+ u_int32_t nvotes;
+ u_int32_t priority;
+ u_int32_t tiebreaker;
+ u_int32_t ctlflags;
+ u_int32_t data_gen;
+} VOTE1_CONTENT;
+
+/*
+ * REP --
+ * Shared replication structure.
+ */
+typedef struct __rep { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+ db_mutex_t mtx_clientdb; /* Client database mutex. */
+ db_mutex_t mtx_ckp; /* Checkpoint mutex. */
+ db_mutex_t mtx_diag; /* Diagnostic message mutex. */
+ db_mutex_t mtx_repstart; /* Role change mutex. */
+ int diag_index; /* Diagnostic file index. */
+ off_t diag_off; /* Diagnostic message offset. */
+ roff_t lease_off; /* Offset of the lease table. */
+ roff_t tally_off; /* Offset of the tally region. */
+ roff_t v2tally_off; /* Offset of the vote2 tally region. */
+ int eid; /* Environment id. */
+ int master_id; /* ID of the master site. */
+ u_int32_t version; /* Current replication version. */
+ u_int32_t egen; /* Replication election generation. */
+ u_int32_t spent_egen; /* Egen satisfied by rep_elect call. */
+ u_int32_t gen; /* Replication generation number. */
+ u_int32_t mgen; /* Master gen seen by client. */
+ u_int32_t asites; /* Space allocated for sites. */
+ u_int32_t nsites; /* Number of sites in group. */
+ u_int32_t nvotes; /* Number of votes needed. */
+ u_int32_t priority; /* My priority in an election. */
+ u_int32_t config_nsites;
+
+ db_timeout_t elect_timeout; /* Normal/full election timeouts. */
+ db_timeout_t full_elect_timeout;
+
+ db_timeout_t chkpt_delay; /* Master checkpoint delay. */
+
+#define REP_DEFAULT_THROTTLE (10 * MEGABYTE) /* Default value is < 1Gig. */
+ u_int32_t gbytes; /* Limit on data sent in single... */
+ u_int32_t bytes; /* __rep_process_message call. */
+#define DB_REP_REQUEST_GAP 40000 /* 40 msecs */
+#define DB_REP_MAX_GAP 1280000 /* 1.28 seconds */
+ db_timespec request_gap; /* Minimum time to wait before we
+ * request a missing log record. */
+ db_timespec max_gap; /* Maximum time to wait before
+ * requesting a missing log record. */
+ /* Status change information */
+ u_int32_t apply_th; /* Number of callers in rep_apply. */
+ u_int32_t arch_th; /* Number of callers in log_archive. */
+ u_int32_t elect_th; /* Elect threads in lock-out. */
+ u_int32_t msg_th; /* Number of callers in rep_proc_msg.*/
+ u_int32_t handle_cnt; /* Count of handles in library. */
+ u_int32_t op_cnt; /* Multi-step operation count.*/
+ DB_LSN ckp_lsn; /* LSN for syncing a checkpoint. */
+ DB_LSN max_prep_lsn; /* Max LSN of txn_prepare record. */
+
+ /*
+ * Event notification synchronization: the mtx_event and the associated
+ * fields it protects govern event notification to the
+ * application. They form a guarantee that no matter how crazy the
+ * thread scheduling gets, the application sees a sensible, orderly
+ * progression of events.
+ */
+ db_mutex_t mtx_event; /* Serializes event notification. */
+ /*
+ * Latest generation whose NEWMASTER event the application has been
+ * notified of. Also serves to force STARTUPDONE to occur after
+ * NEWMASTER.
+ */
+ u_int32_t newmaster_event_gen;
+ /*
+ * Latest local victory of an election that the application has been
+ * notified of, expressed as the election generation number. This
+ * ensures we notify the application exactly once when it wins an
+ * election.
+ */
+ u_int32_t notified_egen;
+
+ /* Internal init information. */
+ u_int32_t nfiles; /* Number of files we have info on. */
+ u_int32_t curfile; /* Cur file we're getting (0-based). */
+ roff_t originfo_off; /* Offset of original file info. */
+ u_int32_t infolen; /* Remaining length file info buffer. */
+ u_int32_t originfolen; /* Original length file info buffer. */
+ u_int32_t infoversion; /* Original file info version. */
+ DB_LSN first_lsn; /* Earliest LSN we need. */
+ u_int32_t first_vers; /* Log version of first log file. */
+ DB_LSN last_lsn; /* Latest LSN we need. */
+ /* These are protected by mtx_clientdb. */
+ db_timespec last_pg_ts; /* Last page stored timestamp. */
+ db_pgno_t ready_pg; /* Next pg expected. */
+ db_pgno_t waiting_pg; /* First pg after gap. */
+ db_pgno_t max_wait_pg; /* Maximum pg requested. */
+ u_int32_t npages; /* Num of pages rcvd for this file. */
+ roff_t curinfo_off; /* Offset of current file info. */
+ /* Always access with GET_CURINFO(). */
+
+ /* Vote tallying information. */
+ u_int32_t sites; /* Sites heard from. */
+ int winner; /* Current winner EID. */
+ u_int32_t w_priority; /* Winner priority. */
+ u_int32_t w_gen; /* Winner generation. */
+ u_int32_t w_datagen; /* Winner data generation. */
+ DB_LSN w_lsn; /* Winner LSN. */
+ u_int32_t w_tiebreaker; /* Winner tiebreaking value. */
+ u_int32_t votes; /* Number of votes for this site. */
+
+ VOTE1_CONTENT vote1; /* Valid until rep->egen changes. */
+
+ db_timespec etime; /* Election start timestamp. */
+ int full_elect; /* Is current election a "full" one? */
+
+ /* Leases. */
+ db_timeout_t lease_timeout; /* Lease timeout. */
+ db_timespec lease_duration; /* Lease timeout with clock skew. */
+ u_int32_t clock_skew; /* Clock skew. */
+ u_int32_t clock_base; /* Clock scale factor base. */
+ db_timespec grant_expire; /* Local grant expiration time. */
+
+ /* Cached LSN history, matching current gen. */
+ DB_LSN gen_base_lsn; /* Base LSN of current generation. */
+ u_int32_t master_envid; /* Current master's "unique" env ID. */
+
+ SH_TAILQ_HEAD(__wait) waiters; /* List of threads in txn_applied(). */
+ SH_TAILQ_HEAD(__wfree) free_waiters;/* Free list of waiter structs. */
+
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * Replication Framework (repmgr) shared config information.
+ */
+ db_mutex_t mtx_repmgr; /* Region mutex. */
+ roff_t siteinfo_off; /* Offset of site array region. */
+ u_int site_cnt; /* Array slots in use. */
+ u_int site_max; /* Total array slots allocated. */
+ int self_eid; /* Where to find the local site. */
+ u_int siteinfo_seq; /* Number of updates to this info. */
+ u_int32_t min_log_file; /* Earliest log needed by repgroup. */
+
+ pid_t listener;
+
+ int perm_policy;
+ db_timeout_t ack_timeout;
+ db_timeout_t election_retry_wait;
+ db_timeout_t connection_retry_wait;
+ db_timeout_t heartbeat_frequency; /* Max period between msgs. */
+ db_timeout_t heartbeat_monitor_timeout;
+#endif /* HAVE_REPLICATION_THREADS */
+
+ /* Statistics. */
+ DB_REP_STAT stat;
+#if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS)
+ DB_REPMGR_STAT mstat;
+#endif
+
+ /*
+ * Please change __rep_print_all (rep_stat.c) to track any changes made
+ * to all these flag families below.
+ */
+ /* Configuration. */
+#define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */
+#define REP_C_AUTOINIT 0x00002 /* Auto initialization. */
+#define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */
+#define REP_C_BULK 0x00008 /* Bulk transfer. */
+#define REP_C_DELAYCLIENT 0x00010 /* Delay client sync-up. */
+#define REP_C_ELECTIONS 0x00020 /* Repmgr to use elections. */
+#define REP_C_INMEM 0x00040 /* In-memory replication. */
+#define REP_C_LEASE 0x00080 /* Leases configured. */
+#define REP_C_NOWAIT 0x00100 /* Immediate error return. */
+ u_int32_t config; /* Configuration flags. */
+
+ /* Election. */
+#define REP_E_PHASE0 0x00000001 /* In phase 0 of election. */
+#define REP_E_PHASE1 0x00000002 /* In phase 1 of election. */
+#define REP_E_PHASE2 0x00000004 /* In phase 2 of election. */
+#define REP_E_TALLY 0x00000008 /* Tallied vote before elect. */
+ u_int32_t elect_flags; /* Election flags. */
+
+ /* Lockout. */
+#define REP_LOCKOUT_API 0x00000001 /* BDB API - handle_cnt. */
+#define REP_LOCKOUT_APPLY 0x00000002 /* apply msgs - apply_th. */
+#define REP_LOCKOUT_ARCHIVE 0x00000004 /* log_archive. */
+#define REP_LOCKOUT_MSG 0x00000008 /* Message process - msg_th. */
+#define REP_LOCKOUT_OP 0x00000010 /* BDB ops txn,curs - op_cnt. */
+ u_int32_t lockout_flags; /* Lockout flags. */
+
+ /* See above for enumerated sync states. */
+ repsync_t sync_state; /* Recovery/synchronization flags. */
+
+ /*
+ * When adding a new flag value, consider whether it should be
+ * cleared in rep_start() when starting as a master or a client.
+ */
+#define REP_F_ABBREVIATED 0x00000001 /* Recover NIMDB pages only. */
+#define REP_F_APP_BASEAPI 0x00000002 /* Base API application. */
+#define REP_F_APP_REPMGR 0x00000004 /* repmgr application. */
+#define REP_F_CLIENT 0x00000008 /* Client replica. */
+#define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */
+#define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */
+#define REP_F_INUPDREQ 0x00000040 /* Thread in rep_update_req. */
+#define REP_F_LEASE_EXPIRED 0x00000080 /* Leases guaranteed expired. */
+#define REP_F_MASTER 0x00000100 /* Master replica. */
+#define REP_F_MASTERELECT 0x00000200 /* Master elect. */
+#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */
+#define REP_F_NIMDBS_LOADED 0x00000800 /* NIMDBs are materialized. */
+#define REP_F_SKIPPED_APPLY 0x00001000 /* Skipped applying a record. */
+#define REP_F_START_CALLED 0x00002000 /* Rep_start called. */
+#define REP_F_SYS_DB_OP 0x00004000 /* Operation in progress. */
+ u_int32_t flags;
+} REP;
+
+/* Information about a thread waiting in txn_applied(). */
+typedef enum {
+ AWAIT_GEN, /* Client's gen is behind token gen. */
+ AWAIT_HISTORY, /* Haven't received master's LSN db update. */
+ AWAIT_LSN, /* Awaiting replication of user txn. */
+ AWAIT_NIMDB, /* LSN db missing: maybe it's INMEM. */
+ LOCKOUT /* Thread awoken due to pending lockout. */
+} rep_waitreason_t;
+
+struct rep_waitgoal {
+ rep_waitreason_t why;
+ union {
+ DB_LSN lsn; /* For AWAIT_LSN and AWAIT_HISTORY. */
+ u_int32_t gen; /* AWAIT_GEN */
+ } u;
+};
+
+struct __rep_waiter {
+ db_mutex_t mtx_repwait; /* Self-blocking mutex. */
+ struct rep_waitgoal goal;
+ SH_TAILQ_ENTRY links; /* On either free or waiting list. */
+
+#define REP_F_PENDING_LOCKOUT 0x00000001
+#define REP_F_WOKEN 0x00000002
+ u_int32_t flags;
+};
+
+/*
+ * Macros to check and clear the BDB lockouts. Currently they are
+ * locked out/set individually because they pertain to different pieces of
+ * the BDB API; otherwise they are always checked and cleared together.
+ */
+#define ISSET_LOCKOUT_BDB(R) \
+ (FLD_ISSET((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP)))
+
+#define CLR_LOCKOUT_BDB(R) \
+ (FLD_CLR((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP)))
+
+/*
+ * Macros to check or clear any/all recovery-related settings, i.e.,
+ * REP_LOCKOUT_{API|OP} and the SYNC_* sync state. This must change if the values
+ * of the flags change. NOTE: We do not include REP_LOCKOUT_MSG in
+ * this mask because it is used frequently in non-recovery related
+ * areas and we want to manipulate it separately (see especially
+ * in __rep_new_master).
+ */
+#define CLR_RECOVERY_SETTINGS(R) \
+do { \
+ (R)->sync_state = SYNC_OFF; \
+ CLR_LOCKOUT_BDB(R); \
+} while (0)
+
+#define IS_REP_RECOVERING(R) \
+ ((R)->sync_state != SYNC_OFF || ISSET_LOCKOUT_BDB(R))
+
+/*
+ * REP_E_PHASE0 is not a *real* election phase. It is used for
+ * master leases, allowing the client to find the master or to
+ * expire its lease. However, REP_E_PHASE0 is cleared by __rep_elect_done.
+ */
+#define IN_ELECTION(R) \
+ FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2)
+#define IN_ELECTION_TALLY(R) \
+ FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2 | REP_E_TALLY)
+#define ELECTION_MAJORITY(n) (((n) / 2) + 1)
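+/* Added note: e.g., ELECTION_MAJORITY(4) == 3 and ELECTION_MAJORITY(5) == 3. */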
+
+#define IN_INTERNAL_INIT(R) \
+ ((R)->sync_state == SYNC_LOG || (R)->sync_state == SYNC_PAGE)
+
+#define IS_REP_MASTER(env) \
+ (REP_ON(env) && \
+ F_ISSET(((env)->rep_handle->region), REP_F_MASTER))
+
+#define IS_REP_CLIENT(env) \
+ (REP_ON(env) && \
+ F_ISSET(((env)->rep_handle->region), REP_F_CLIENT))
+
+#define IS_REP_STARTED(env) \
+ (REP_ON(env) && \
+ F_ISSET(((env)->rep_handle->region), REP_F_START_CALLED))
+
+#define IS_USING_LEASES(env) \
+ (REP_ON(env) && \
+ FLD_ISSET(((env)->rep_handle->region)->config, REP_C_LEASE))
+
+#define IS_CLIENT_PGRECOVER(env) \
+ (IS_REP_CLIENT(env) && \
+ (((env)->rep_handle->region)->sync_state == SYNC_PAGE))
+
+/*
+ * Macros to figure out if we need to do replication pre/post-amble processing.
+ * Skip for specific DB handles owned by the replication layer, either because
+ * replication is running recovery or because it's a handle entirely owned by
+ * the replication code (replication opens its own databases to track state).
+ */
+#define REP_FLAGS_SET(env) \
+ ((env)->rep_handle->region->flags != 0 || \
+ (env)->rep_handle->region->elect_flags != 0 || \
+ (env)->rep_handle->region->lockout_flags != 0)
+
+#define IS_ENV_REPLICATED(env) \
+ (REP_ON(env) && REP_FLAGS_SET(env))
+
+/*
+ * Update the temporary log archive block timer.
+ */
+#define MASTER_UPDATE(env, renv) do { \
+ REP_SYSTEM_LOCK(env); \
+ F_SET((renv), DB_REGENV_REPLOCKED); \
+ (void)time(&(renv)->op_timestamp); \
+ REP_SYSTEM_UNLOCK(env); \
+} while (0)
+
+/*
+ * Macro to set a new generation number. Cached values from the LSN history
+ * database are associated with the current gen, so when the gen changes we must
+ * invalidate the cache. Use this macro for all gen changes, to avoid
+ * forgetting to do so. This macro should be used while holding the rep system
+ * mutex (unless we know we're single-threaded for some other reason, like at
+ * region create time).
+ */
+#define SET_GEN(g) do { \
+ rep->gen = (g); \
+ ZERO_LSN(rep->gen_base_lsn); \
+} while (0)
+
+/*
+ * Gap processing flags. These provide control over the basic
+ * gap processing algorithm for some special cases.
+ */
+#define REP_GAP_FORCE 0x001 /* Force a request for a gap. */
+#define REP_GAP_REREQUEST 0x002 /* Gap request is a forced rerequest. */
+ /* REREQUEST is a superset of FORCE. */
+
+/*
+ * Flags indicating what kind of record we want to back up to, in the log.
+ */
+#define REP_REC_COMMIT 0x001 /* Most recent commit record. */
+#define REP_REC_PERM 0x002 /* Most recent perm record. */
+ /* PERM is a superset of COMMIT. */
+
+/*
+ * Basic pre/post-amble processing.
+ */
+#define REPLICATION_WRAP(env, func_call, checklock, ret) do { \
+ int __rep_check, __t_ret; \
+ __rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; \
+ (ret) = __rep_check ? __env_rep_enter(env, checklock) : 0; \
+ if ((ret) == 0) { \
+ (ret) = func_call; \
+ if (__rep_check && (__t_ret = \
+ __env_db_rep_exit(env)) != 0 && (ret) == 0) \
+ (ret) = __t_ret; \
+ } \
+} while (0)
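+
+/*
+ * Illustrative call (added note; "__foo" and its arguments are hypothetical):
+ * a public API method wraps its internal worker as
+ *
+ *	REPLICATION_WRAP(env, (__foo(dbp, flags)), 0, ret);
+ *
+ * so that replication entry/exit bookkeeping brackets the real work and any
+ * exit error is folded into "ret".
+ */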
+
+/*
+ * Macro to safely access curinfo and its internal DBT pointers from
+ * any process. This should always be used to access curinfo. If
+ * the internal DBT pointers are to be used, mtx_clientdb must be held
+ * between the time of this call and the use of the pointers.
+ *
+ * The current file information (curinfo) is stored in shared region
+ * memory and accessed via an offset. It contains DBTs that themselves
+ * point to allocated data. __rep_nextfile() manages this information in a
+ * single chunk of shared memory.
+ *
+ * If different processes access curinfo, they may have different shared
+ * region addresses. This means that curinfo and its pointers to DBT data
+ * must be recalculated for each process starting with the offset.
+ */
+#define GET_CURINFO(rep, infop, curinfo) \
+do { \
+ curinfo = R_ADDR(infop, rep->curinfo_off); \
+ if ((curinfo)->uid.size > 0) \
+ (curinfo)->uid.data = R_ADDR(infop, \
+ rep->curinfo_off + sizeof(__rep_fileinfo_args)); \
+ else \
+ (curinfo)->uid.data = NULL; \
+ if ((curinfo)->info.size > 0) \
+ (curinfo)->info.data = R_ADDR(infop, rep->curinfo_off + \
+ sizeof(__rep_fileinfo_args) + (curinfo)->uid.size); \
+ else \
+ (curinfo)->info.data = NULL; \
+ if ((curinfo)->dir.size > 0) \
+ (curinfo)->dir.data = R_ADDR(infop, rep->curinfo_off + \
+ sizeof(__rep_fileinfo_args) + (curinfo)->uid.size + \
+ (curinfo)->info.size); \
+ else \
+ (curinfo)->dir.data = NULL; \
+} while (0)
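+
+/*
+ * Illustrative use (added note): callers recompute the per-process addresses
+ * from the shared offset each time, holding mtx_clientdb while the DBT
+ * pointers are in use:
+ *
+ *	__rep_fileinfo_args *curinfo;
+ *
+ *	MUTEX_LOCK(env, rep->mtx_clientdb);
+ *	GET_CURINFO(rep, infop, curinfo);
+ *	... read curinfo->uid / curinfo->info / curinfo->dir ...
+ *	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ */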
+
+/*
+ * Per-process replication structure.
+ *
+ * There are 2 mutexes used in the Base replication API. (See LOCK_MUTEX in
+ * repmgr.h for a discussion of repmgr.)
+ * 1. mtx_region - This protects the fields of the rep region above.
+ * 2. mtx_clientdb - This protects the per-process flags, and bookkeeping
+ * database and all of the components that maintain it. Those
+ * components include the following fields in the log region (see log.h):
+ * a. ready_lsn
+ * b. waiting_lsn
+ * c. verify_lsn
+ * d. wait_recs
+ * e. rcvd_recs
+ * f. max_wait_lsn
+ * These fields in the log region are NOT protected by the log region lock at
+ * all.
+ *
+ * Note that the per-process flags should truly be protected by a special
+ * per-process thread mutex, but it is currently set in so isolated a manner
+ * that it didn't make sense to do so, and in most cases we're already holding
+ * the mtx_clientdb anyway.
+ *
+ * The lock ordering protocol is that mtx_clientdb must be acquired first,
+ * and then either REP->mtx_region or LOG->mtx_region may be acquired if
+ * necessary.
+ *
+ * Note that the appropriate mutex is needed any time one or more related
+ * values are read or written that could possibly use more than one atomic
+ * machine instruction. A single 32-bit integer value is safe without a
+ * mutex, but most other types of value should use a mutex.
+ *
+ * Any use of a mutex must be inside a matched pair of ENV_ENTER() and
+ * ENV_LEAVE() macros. This ensures that if a thread dies while holding
+ * a lock (i.e. a mutex), recovery can clean it up so that it does not
+ * indefinitely block other threads.
+ */
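+/*
+ * Lock-ordering sketch (added note), per the protocol above:
+ *
+ *	MUTEX_LOCK(env, rep->mtx_clientdb);	(first)
+ *	REP_SYSTEM_LOCK(env);			(then REP->mtx_region)
+ *	...
+ *	REP_SYSTEM_UNLOCK(env);
+ *	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ */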
+struct __db_rep {
+ /*
+ * Shared configuration information -- copied to and maintained in the
+ * shared region as soon as the shared region is created.
+ */
+ int eid; /* Environment ID. */
+
+ u_int32_t gbytes; /* Limit on data sent in single... */
+ u_int32_t bytes; /* __rep_process_message call. */
+
+ db_timespec request_gap; /* Minimum time to wait before we
+ * request a missing log record. */
+ db_timespec max_gap; /* Maximum time to wait before
+ * requesting a missing log record. */
+
+ u_int32_t clock_skew; /* Clock skew factor. */
+ u_int32_t clock_base; /* Clock skew base. */
+ u_int32_t config; /* Configuration flags. */
+ u_int32_t config_nsites;
+
+ db_timeout_t elect_timeout; /* Normal/full election timeouts. */
+ db_timeout_t full_elect_timeout;
+
+ db_timeout_t chkpt_delay; /* Master checkpoint delay. */
+
+ u_int32_t my_priority;
+ db_timeout_t lease_timeout; /* Master leases. */
+ /*
+ * End of shared configuration information.
+ */
+ int (*send) /* Send function. */
+ __P((DB_ENV *, const DBT *, const DBT *,
+ const DB_LSN *, int, u_int32_t));
+
+ DB *rep_db; /* Bookkeeping database. */
+ DB *lsn_db; /* (Replicated) LSN history database. */
+
+ REP *region; /* In memory structure. */
+ u_int8_t *bulk; /* Shared memory bulk area. */
+
+#define DBREP_DIAG_FILES 2
+ DB_FH *diagfile[DBREP_DIAG_FILES]; /* Diag files fhp. */
+ off_t diag_off; /* Current diag file offset. */
+
+ /* These are protected by mtx_clientdb. */
+ DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */
+ DB *file_dbp; /* This file's page info. */
+ DBC *queue_dbc; /* Dbc for a queue file. */
+
+ /*
+ * Please change __rep_print_all (rep_stat.c) to track any changes made
+ * to these flags.
+ */
+#define DBREP_APP_BASEAPI 0x0001 /* Base API application. */
+#define DBREP_APP_REPMGR 0x0002 /* repmgr application. */
+#define DBREP_OPENFILES 0x0004 /* This handle has opened files. */
+ u_int32_t flags; /* per-process flags. */
+
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * Replication Framework (repmgr) per-process information.
+ */
+ u_int nthreads; /* Msg processing threads. */
+ u_int athreads; /* Space allocated for msg threads. */
+ u_int non_rep_th; /* Threads in GMDB or channel msgs. */
+ u_int aelect_threads; /* Space allocated for elect threads. */
+ u_int32_t init_policy;
+ int perm_policy;
+ DB_LSN perm_lsn; /* Last perm LSN we've announced. */
+ db_timeout_t ack_timeout;
+ db_timeout_t election_retry_wait;
+ db_timeout_t connection_retry_wait;
+ db_timeout_t heartbeat_frequency; /* Max period between msgs. */
+ db_timeout_t heartbeat_monitor_timeout;
+
+ /* Thread synchronization. */
+ REPMGR_RUNNABLE *selector, **messengers, **elect_threads;
+ REPMGR_RUNNABLE *preferred_elect_thr;
+ db_timespec repstart_time;
+ mgr_mutex_t *mutex;
+ cond_var_t check_election, gmdb_idle, msg_avail;
+ waiter_t ack_waiters; /* For threads awaiting PERM acks. */
+#ifdef DB_WIN32
+ HANDLE signaler;
+#else
+ int read_pipe, write_pipe;
+#endif
+
+ /* Operational stuff. */
+ REPMGR_SITE *sites; /* Array of known sites. */
+ u_int site_cnt; /* Array slots in use. */
+ u_int site_max; /* Total array slots allocated. */
+ int self_eid; /* Where to find the local site. */
+ u_int siteinfo_seq; /* Last known update to this list. */
+
+ /*
+ * The connections list contains only those connections not actively
+ * associated with a known site (see repmgr.h).
+ */
+ CONNECTION_LIST connections;
+ RETRY_Q_HEADER retries; /* Sites needing connection retry. */
+ struct {
+ int size;
+ STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header;
+ } input_queue;
+
+ socket_t listen_fd;
+ db_timespec last_bcast; /* Time of last broadcast msg. */
+
+ /*
+ * Status of repmgr. It is ready when repmgr is not yet started. It
+ * is running after repmgr is (re)started. It is stopped if the env
+ * of the running repmgr is closed, or the site is removed.
+ */
+ enum { ready, running, stopped } repmgr_status;
+ int new_connection; /* Since last master seek attempt. */
+ int takeover_pending; /* We've been elected master. */
+ int gmdb_busy;
+ int client_intent; /* Will relinquish master role. */
+ int gmdb_dirty;
+ int have_gmdb;
+ int seen_repmsg;
+
+ /*
+ * Flag to show what kind of transaction is currently in progress.
+ * Primary means we're doing the first (critical) phase of a membership
+ * DB update, where we care about perm failures. In the secondary phase
+ * we don't care. Usually the value is "none", when normal user
+ * transactions are happening. We need to use this global flag because
+ * we don't have a proper, direct channel to communicate information
+ * between the originator of a transaction and the replication send()
+ * function that has to wait for acks and decide what to do about them.
+ */
+ enum { none, gmdb_primary, gmdb_secondary } active_gmdb_update;
+ int limbo_resolution_needed;
+
+ /*
+ * GMDB update sequence count. On creation we write version 1; so, once
+ * repmgr has started and tried to read, a 0 here can be taken to mean
+ * that the DB doesn't exist yet.
+ */
+ u_int32_t membership_version;
+ u_int32_t member_version_gen;
+
+ /* LSN of GMDB txn that got a perm failure. */
+ DB_LSN limbo_failure;
+ /* EID whose membership status is therefore unresolved */
+ int limbo_victim;
+ /* LSN of a later txn that achieves perm success. */
+ DB_LSN durable_lsn;
+ DB *gmdb; /* Membership database handle. */
+ /*
+ * Membership list restored from init file after crash during internal init.
+ */
+ u_int8_t *restored_list;
+ size_t restored_list_length;
+
+ /* Application's message dispatch call-back function. */
+ void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *,
+ DBT *, u_int32_t, u_int32_t));
+#endif /* HAVE_REPLICATION_THREADS */
+};
+
+/*
+ * Determine whether application is repmgr or base replication API. If
+ * repmgr was configured, base the test on internal replication flags for
+ * APP_REPMGR and APP_BASEAPI. These flags get set by the appropriate parts
+ * of the various replication APIs.
+ */
+#ifdef HAVE_REPLICATION_THREADS
+/*
+ * Application type is set to be repmgr when:
+ * 1. A local site is defined.
+ * 2. A remote site is defined.
+ * 3. An acknowledgement policy is configured.
+ * 4. A repmgr flag is configured.
+ * 5. A timeout value is configured for one of the repmgr timeouts.
+ */
+#define APP_IS_REPMGR(env) \
+ (REP_ON(env) ? \
+ F_ISSET((env)->rep_handle->region, REP_F_APP_REPMGR) : \
+ F_ISSET((env)->rep_handle, DBREP_APP_REPMGR))
+
+/*
+ * Application type is set to be base replication API when:
+ * 1. Transport send function is defined and is not the repmgr send
+ * function.
+ */
+#define APP_IS_BASEAPI(env) \
+ (REP_ON(env) ? \
+ F_ISSET((env)->rep_handle->region, REP_F_APP_BASEAPI) : \
+ F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI))
+
+/*
+ * Set application type. These macros do extra checking to guarantee that
+ * only one application type is ever set.
+ */
+#define APP_SET_REPMGR(env) do { \
+ if (REP_ON(env)) { \
+ ENV_ENTER(env, ip); \
+ REP_SYSTEM_LOCK(env); \
+ if (!F_ISSET((env)->rep_handle->region, \
+ REP_F_APP_BASEAPI)) \
+ F_SET((env)->rep_handle->region, \
+ REP_F_APP_REPMGR); \
+ REP_SYSTEM_UNLOCK(env); \
+ ENV_LEAVE(env, ip); \
+ } else if (!F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI)) \
+ F_SET((env)->rep_handle, DBREP_APP_REPMGR); \
+} while (0)
+#define APP_SET_BASEAPI(env) do { \
+ if (REP_ON(env)) { \
+ ENV_ENTER(env, ip); \
+ REP_SYSTEM_LOCK(env); \
+ if (!F_ISSET((env)->rep_handle->region, \
+ REP_F_APP_REPMGR)) \
+ F_SET((env)->rep_handle->region, \
+ REP_F_APP_BASEAPI); \
+ REP_SYSTEM_UNLOCK(env); \
+ ENV_LEAVE(env, ip); \
+ } else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \
+ F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \
+} while (0)
+
+#else
+/*
+ * We did not configure repmgr, application must be base replication API.
+ * The APP_SET_* macros are noops in this case, but they must be defined
+ * with a null body to avoid compiler warnings on some platforms.
+ */
+#define APP_IS_REPMGR(env) 0
+#define APP_SET_REPMGR(env) do { \
+ ; \
+} while (0)
+#define APP_IS_BASEAPI(env) 1
+#define APP_SET_BASEAPI(env) do { \
+ ; \
+} while (0)
+#endif /* HAVE_REPLICATION_THREADS */
+
+/*
+ * Control structure flags for replication communication infrastructure.
+ */
+/*
+ * Define old DB_LOG_ values that we must support here. For reasons of
+ * compatibility with old versions, these values must be reserved explicitly in
+ * the list of flag values (below)
+ */
+#define DB_LOG_PERM_42_44 0x20
+#define DB_LOG_RESEND_42_44 0x40
+#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */
+
+#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */
+#define REPCTL_FLUSH 0x02 /* Record should be flushed. */
+#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */
+#define REPCTL_INIT 0x08 /* Internal init message. */
+#define REPCTL_LEASE 0x10 /* Lease-related message. */
+ /*
+ * Skip over reserved values 0x20
+ * and 0x40, as explained above.
+ */
+#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */
+#define REPCTL_PERM DB_LOG_PERM_42_44
+#define REPCTL_RESEND DB_LOG_RESEND_42_44
+
+/*
+ * File info flags for internal init. The per-database (i.e., file) flag
+ * represents the on-disk format of the file, and is conveyed from the master to
+ * the initializing client in the UPDATE message, so that the client can know
+ * how to create the file. The per-page flag is conveyed along with each PAGE
+ * message, describing the format of the page image being transmitted; it is of
+ * course set by the site serving the PAGE_REQ. The serving site gets the page
+ * image from its own mpool, and thus the page is in the native format of the
+ * serving site. This format may be different (i.e., opposite) from the on-disk
+ * format, and in fact can vary per-page, since with client-to-client sync it is
+ * possible for various different sites to serve the various PAGE_REQ requests.
+ */
+#define REPINFO_DB_LITTLEENDIAN 0x0001 /* File is little-endian lorder. */
+#define REPINFO_PG_LITTLEENDIAN 0x0002 /* Page is little-endian lorder. */
+
+/*
+ * Control message format for the 4.6 release. The db_timespec is
+ * not a portable structure. Therefore, in 4.6, replication among
+ * mixed OSs such as Linux and Windows, which have different time_t
+ * sizes, does not work.
+ */
+typedef struct {
+ u_int32_t rep_version; /* Replication version number. */
+ u_int32_t log_version; /* Log version number. */
+
+ DB_LSN lsn; /* Log sequence number. */
+ u_int32_t rectype; /* Message type. */
+ u_int32_t gen; /* Generation number. */
+ db_timespec msg_time; /* Timestamp seconds for leases. */
+ u_int32_t flags; /* log_put flag value. */
+} REP_46_CONTROL;
+
+/*
+ * Control message format for 4.5 release and earlier.
+ */
+typedef struct {
+ u_int32_t rep_version; /* Replication version number. */
+ u_int32_t log_version; /* Log version number. */
+
+ DB_LSN lsn; /* Log sequence number. */
+ u_int32_t rectype; /* Message type. */
+ u_int32_t gen; /* Generation number. */
+ u_int32_t flags; /* log_put flag value. */
+} REP_OLD_CONTROL;
+
+#define LEASE_REFRESH_MIN 30 /* Minimum number of refresh retries. */
+#define LEASE_REFRESH_USEC 50000 /* Microseconds between refresh tries. */
+
+/* Master granted lease information. */
+typedef struct __rep_lease_entry {
+ int eid; /* EID of client grantor. */
+ db_timespec start_time; /* Start time clients echo back. */
+ db_timespec end_time; /* Master lease expiration time. */
+ DB_LSN lease_lsn; /* Durable LSN lease applies to. */
+} REP_LEASE_ENTRY;
+
+/*
+ * Old vote info where some fields were not fixed size.
+ */
+typedef struct {
+ u_int32_t egen; /* Election generation. */
+ int nsites; /* Number of sites I've been in
+ * communication with. */
+ int nvotes; /* Number of votes needed to win. */
+ int priority; /* My site's priority. */
+ u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */
+} REP_OLD_VOTE_INFO;
+
+typedef struct {
+ u_int32_t egen; /* Voter's election generation. */
+ int eid; /* Voter's ID. */
+} REP_VTALLY;
+
+/*
+ * The REP_THROTTLE_ONLY flag is used to do throttle processing only.
+ * If set, it will only allow sending the REP_*_MORE message, but not
+ * the normal, non-throttled message. It is used to support throttling
+ * with bulk transfer.
+ */
+/* Flags for __rep_send_throttle. */
+#define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */
+
+/* Throttled message processing information. */
+typedef struct {
+ DB_LSN lsn; /* LSN of this record. */
+ DBT *data_dbt; /* DBT of this record. */
+ u_int32_t gbytes; /* This call's max gbytes sent. */
+ u_int32_t bytes; /* This call's max bytes sent. */
+ u_int32_t type; /* Record type. */
+} REP_THROTTLE;
+
+/* Bulk processing information. */
+/*
+ * !!!
+ * We use a roff_t for the offset. We'd really like to use a ptrdiff_t
+ * since that really is what it is. But ptrdiff_t is not portable and
+ * doesn't exist everywhere.
+ */
+typedef struct {
+ u_int8_t *addr; /* Address of bulk buffer. */
+ roff_t *offp; /* Ptr to current offset into buffer. */
+ u_int32_t len; /* Bulk buffer length. */
+ u_int32_t type; /* Item type in buffer (log, page). */
+ DB_LSN lsn; /* First LSN in buffer. */
+ int eid; /* ID of potential recipients. */
+#define BULK_XMIT 0x001 /* Buffer in transit. */
+ u_int32_t *flagsp; /* Buffer flags. */
+} REP_BULK;
+
+/*
+ * This structure takes care of representing a transaction.
+ * It holds all the records, sorted by page number so that
+ * we can obtain locks and apply updates in a deadlock free
+ * order.
+ */
+typedef struct {
+ u_int nlsns;
+ u_int nalloc;
+ DB_LSN *array;
+} LSN_COLLECTION;
+
+/*
+ * This is used by the page-prep routines to do the lock_vec call to
+ * apply the updates for a single transaction or a collection of
+ * transactions.
+ */
+typedef struct {
+ int n;
+ DB_LOCKREQ *reqs;
+ DBT *objs;
+} linfo_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/rep_ext.h"
+#endif /* !_DB_REP_H_ */
diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h
new file mode 100644
index 00000000..d8fd199c
--- /dev/null
+++ b/src/dbinc/repmgr.h
@@ -0,0 +1,843 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REPMGR_H_
+#define _DB_REPMGR_H_
+
+#include "dbinc_auto/repmgr_automsg.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Replication Manager message format types. These few format codes identify
+ * enough information to describe, at the lowest level, how a message should be
+ * read from the wire, including how much memory should be allocated to hold the
+ * result. (Often we want to allocate more than just enough to hold the
+ * received bytes, if we know that we will need more during processing.)
+ *
+ * These values are transmitted between sites, even sites running differing BDB
+ * versions. Therefore, once assigned, the values are permanently "frozen".
+ *
+ * For example, in repmgr wire protocol version 1 the highest assigned message
+ * type value was 3, for REPMGR_REP_MESSAGE. Wire protocol version 2 added the
+ * HEARTBEAT message type (4).
+ *
+ * New message types added in later versions always get new (higher) values. We
+ * still list them in alphabetical order, for ease of reference. But this
+ * generally does not correspond to numerical order.
+ */
+#define REPMGR_APP_MESSAGE 5 /* Msg sent from app. on DB_CHANNEL. */
+#define REPMGR_APP_RESPONSE 6 /* Response to a channel request. */
+#define REPMGR_HANDSHAKE 2 /* Connection establishment sequence. */
+#define REPMGR_HEARTBEAT 4 /* Monitor connection health. */
+#define REPMGR_OWN_MSG 8 /* Repmgr's own messages, to peers. */
+#define REPMGR_PERMLSN 1 /* My perm LSN. */
+#define REPMGR_REP_MESSAGE 3 /* Normal replication message. */
+#define REPMGR_RESP_ERROR 7 /* Sys-gen'd error resp to request. */
+
+/*
+ * Largest known message type code known in each protocol version we support.
+ * In protocol version one there were only three message types: 1, 2, and 3; so
+ * 3 was the max. In protocol version 2 we introduced heartbeats, type 4.
+ * (Protocol version 3 did not introduce any new message types.) In version 4
+ * we introduced a few more new message types, the largest of which had value 7.
+ */
+#define REPMGR_MAX_V1_MSG_TYPE 3
+#define REPMGR_MAX_V2_MSG_TYPE 4
+#define REPMGR_MAX_V3_MSG_TYPE 4
+#define REPMGR_MAX_V4_MSG_TYPE 8
+#define HEARTBEAT_MIN_VERSION 2
+#define CHANNEL_MIN_VERSION 4
+#define CONN_COLLISION_VERSION 4
+#define GM_MIN_VERSION 4
+#define OWN_MIN_VERSION 4
+
+/* The range of protocol versions we're willing to support. */
+#define DB_REPMGR_VERSION 4
+#define DB_REPMGR_MIN_VERSION 1
+
+/*
+ * For messages with the "REPMGR_OWN_MSG" format code, a message type (see
+ * REPMGR_OWN_MSG_TYPE, below) is included in the header. While at the lowest
+ * level, the format codes identify only enough to read and allocate memory, at
+ * the next higher level the following message type codes identify the content
+ * of the message: how to unmarshal and dispatch it.
+ *
+ * Like the message format types, these message type values should be
+ * permanently frozen.
+ */
+#define REPMGR_CONNECT_REJECT 1
+#define REPMGR_GM_FAILURE 2
+#define REPMGR_GM_FORWARD 3
+#define REPMGR_JOIN_REQUEST 4
+#define REPMGR_JOIN_SUCCESS 5
+#define REPMGR_PARM_REFRESH 6
+#define REPMGR_REJOIN 7
+#define REPMGR_REMOVE_REQUEST 8
+#define REPMGR_REMOVE_SUCCESS 9
+#define REPMGR_RESOLVE_LIMBO 10
+#define REPMGR_SHARING 11
+
+struct __repmgr_connection;
+ typedef struct __repmgr_connection REPMGR_CONNECTION;
+struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE;
+struct __queued_output; typedef struct __queued_output QUEUED_OUTPUT;
+struct __repmgr_response; typedef struct __repmgr_response REPMGR_RESPONSE;
+struct __repmgr_retry; typedef struct __repmgr_retry REPMGR_RETRY;
+struct __repmgr_runnable; typedef struct __repmgr_runnable REPMGR_RUNNABLE;
+struct __repmgr_site; typedef struct __repmgr_site REPMGR_SITE;
+struct __cond_waiters_table;
+ typedef struct __cond_waiters_table COND_WAITERS_TABLE;
+
+/* Current Group Membership DB format ID. */
+#define REPMGR_GMDB_FMT_VERSION 1
+
+#ifdef DB_WIN32
+typedef SOCKET socket_t;
+typedef HANDLE thread_id_t;
+typedef HANDLE mgr_mutex_t;
+typedef HANDLE cond_var_t;
+
+typedef COND_WAITERS_TABLE *waiter_t;
+typedef WSABUF db_iovec_t;
+#else
+typedef int socket_t;
+typedef pthread_t thread_id_t;
+typedef pthread_mutex_t mgr_mutex_t;
+typedef pthread_cond_t cond_var_t;
+typedef pthread_cond_t waiter_t;
+typedef struct iovec db_iovec_t;
+#endif
+
+/*
+ * The (arbitrary) maximum number of outgoing messages we're willing to hold, on
+ * a queue per connection, waiting for TCP buffer space to become available in
+ * the kernel. Rather than exceeding this limit, we simply discard additional
+ * messages (since this is always allowed by the replication protocol).
+ * As a special dispensation, if a message is destined for a specific remote
+ * site (i.e., it's not a broadcast), then we first try blocking the sending
+ * thread, waiting for space to become available (though we only wait a limited
+ * time). This lets us handle the immediate flood of (a potentially large
+ * number of) outgoing messages that replication generates, in a tight loop,
+ * when handling PAGE_REQ, LOG_REQ and ALL_REQ requests.
+ */
+#define OUT_QUEUE_LIMIT 10
+
+/*
+ * The system value is available from sysconf(_SC_HOST_NAME_MAX).
+ * Historically, the maximum host name length was 256.
+ */
+#ifndef MAXHOSTNAMELEN
+#define MAXHOSTNAMELEN 256
+#endif
+
+/* A buffer big enough for the string "site host.domain.com:65535". */
+#define MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
+typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
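+
+/*
+ * Illustrative sketch (not from the original source): the buffer is sized so
+ * that rendering a host:port pair cannot overflow; the site pointer and field
+ * access shown are hypothetical.
+ *
+ *	SITE_STRING_BUFFER buffer;
+ *
+ *	(void)snprintf(buffer, sizeof(buffer), "site %s:%u",
+ *	    site->net_addr.host, (u_int)site->net_addr.port);
+ */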
+
+#define MAX_MSG_BUF (__REPMGR_MAXMSG_SIZE + MAXHOSTNAMELEN + 1)
+
+/* Default timeout values, in microseconds. */
+#define DB_REPMGR_DEFAULT_ACK_TIMEOUT (1 * US_PER_SEC)
+#define DB_REPMGR_DEFAULT_CONNECTION_RETRY (30 * US_PER_SEC)
+#define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC)
+#define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC)
+
+typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
+typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
+typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;
+
+/* Information about threads managed by Replication Framework. */
+struct __repmgr_runnable {
+ ENV *env;
+ thread_id_t thread_id;
+ void *(*run) __P((void *));
+ int finished; /* Boolean: thread is exiting, may be joined. */
+ int quit_requested; /* Boolean: thread has been asked to quit. */
+#ifdef DB_WIN32
+ HANDLE quit_event;
+#endif
+ union {
+
+/*
+ * Options governing requested behavior of election thread.
+ */
+#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */
+#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */
+#define ELECT_F_IMMED 0x04 /* Start with immediate election. */
+#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */
+#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. */
+ u_int32_t flags;
+
+ int eid; /* For Connector thread. */
+
+ /*
+ * Args for other thread types can be added here in the future
+ * as needed.
+ */
+ } args;
+};
+
+/*
+ * Information about pending connection establishment retry operations.
+ *
+ * We keep these in order by time. This works, under the assumption that the
+ * DB_REP_CONNECTION_RETRY timeout never changes once we get going (though that
+ * assumption is of course wrong, so this needs to be fixed).
+ *
+ * Usually, we put things onto the tail end of the list. But when we add a new
+ * site while threads are running, we trigger its first connection attempt by
+ * scheduling a retry for "0" microseconds from now, putting its retry element
+ * at the head of the list instead.
+ *
+ * TODO: I think this can be fixed by defining "time" to be the time the element
+ * was added (with some convention like "0" meaning immediate), rather than the
+ * deadline time.
+ */
+struct __repmgr_retry {
+ TAILQ_ENTRY(__repmgr_retry) entries;
+ int eid;
+ db_timespec time;
+};
+
+/*
+ * We use scatter/gather I/O for both reading and writing. Repmgr messages
+ * (including rep messages) use 3 segments: envelope, control and rec.
+ * Application messages can have any number of segments (the number they
+ * specify, plus 1 for our envelope). REPMGR_IOVECS_ALLOC_SZ should (only) be
+ * used when n > 3.
+ */
+#define REPMGR_IOVECS_ALLOC_SZ(n) \
+ (sizeof(REPMGR_IOVECS) + ((n) - MIN_IOVEC) * sizeof(db_iovec_t))
+typedef struct {
+ /*
+ * Index of the first iovec to be used. Initially of course this is
+ * zero. But as we progress through partial I/O transfers, it ends up
+ * pointing to the first iovec to be used on the next operation.
+ */
+ int offset;
+
+ /*
+ * Total number of pieces defined for this message; equal to the number
+ * of times add_buffer and/or add_dbt were called to populate it. We do
+ * *NOT* revise this as we go along. So subsequent I/O operations must
+ * use count-offset to get the number of active vector pieces still
+ * remaining.
+ */
+ int count;
+
+ /*
+ * Total number of bytes accounted for in all the pieces of this
+ * message. We do *NOT* revise this as we go along.
+ */
+ size_t total_bytes;
+
+#define MIN_IOVEC 3
+ db_iovec_t vectors[MIN_IOVEC]; /* Variable length array. */
+} REPMGR_IOVECS;
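+
+/*
+ * Illustrative sketch (not from the original source): allocating the
+ * variable-length struct for n > MIN_IOVEC segments and resuming after a
+ * partial transfer.  Error handling is elided, and the segment-filling
+ * calls are only named, not shown.
+ *
+ *	REPMGR_IOVECS *v;
+ *
+ *	__os_malloc(env, REPMGR_IOVECS_ALLOC_SZ(n), &v);
+ *	v->offset = 0;
+ *	v->count = 0;
+ *	v->total_bytes = 0;
+ *	... add_buffer()/add_dbt() calls fill vectors[] and bump count ...
+ *
+ *	After a partial transfer, advance v->offset past the fully
+ *	transferred vectors and adjust iov_base/iov_len of the first
+ *	partially transferred one; the next operation then starts at
+ *	&v->vectors[v->offset] with (v->count - v->offset) pieces.
+ */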
+
+typedef struct {
+ size_t length; /* number of bytes in data */
+ int ref_count; /* # of sites' send queues pointing to us */
+ u_int8_t data[1]; /* variable size data area */
+} REPMGR_FLAT;
+
+struct __queued_output {
+ STAILQ_ENTRY(__queued_output) entries;
+ REPMGR_FLAT *msg;
+ size_t offset;
+};
+
+/*
+ * The following is for input. Once we know the sizes of the pieces of an
+ * incoming message, we can create this struct (and also the data areas for the
+ * pieces themselves, in the same memory allocation). This is also the struct
+ * in which the message lives while it's waiting to be processed by message
+ * threads.
+ */
+typedef struct __repmgr_message {
+ STAILQ_ENTRY(__repmgr_message) entries;
+ __repmgr_msg_hdr_args msg_hdr;
+ union {
+ struct {
+ int originating_eid;
+ DBT control, rec;
+ } repmsg;
+ struct {
+ REPMGR_CONNECTION *conn;
+ DBT request;
+ } gmdb_msg;
+ struct {
+ /*
+ * Connection from which the message arrived; NULL if
+ * generated on the local site.
+ */
+ REPMGR_CONNECTION *conn;
+
+ DBT buf; /* for reading */
+ DBT segments[1]; /* expanded in msg th. before callbk */
+ } appmsg;
+ } v; /* Variants */
+} REPMGR_MESSAGE;
+
+typedef enum {
+ SIZES_PHASE,
+ DATA_PHASE
+} phase_t;
+
+typedef enum {
+ APP_CONNECTION,
+ REP_CONNECTION,
+ UNKNOWN_CONN_TYPE
+} conn_type_t;
+
+struct __repmgr_connection {
+ TAILQ_ENTRY(__repmgr_connection) entries;
+
+ socket_t fd;
+#ifdef DB_WIN32
+ WSAEVENT event_object;
+#endif
+
+ /*
+ * Number of other structures referring to this conn struct. This
+ * ref_count must be reduced to zero before this conn struct can be
+ * destroyed. Referents include:
+ *
+ * - the select() loop, which owns the right to do all reading, as well
+ * as the exclusive right to eventually close the socket
+ *
+ * - a "channel" that owns this APP_CONNECTION (on the originating side)
+ *
+ * - a message received on this APP_CONNECTION, queued for processing
+ *
+ * - any writer blocked on waiting for the outbound queue to drain
+ */
+ u_int32_t ref_count;
+
+ conn_type_t type;
+ u_int32_t version; /* Wire protocol version on this connection. */
+ /* (0 means not yet determined.) */
+
+/*
+ * When we make an outgoing connection, it starts in CONNECTED state. When we
+ * get the response to our version negotiation, we move to READY.
+ * For incoming connections that we accept, we start in NEGOTIATE, then to
+ * PARAMETERS, and then to READY.
+ * CONGESTED is a hierarchical substate of READY: it's just like READY, with
+ * the additional wrinkle that we don't bother waiting for the outgoing queue to
+ * drain in certain circumstances.
+ */
+#define CONN_CONGESTED 1 /* Long-lived full outgoing queue. */
+#define CONN_CONNECTED 2 /* Awaiting reply to our version negotiation. */
+#define CONN_DEFUNCT 3 /* Basically dead, awaiting clean-up. */
+#define CONN_NEGOTIATE 4 /* Awaiting version proposal. */
+#define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */
+#define CONN_READY 6 /* Everything's fine. */
+ int state;
+
+ /*
+ * Input: while we're reading a message, we keep track of what phase
+ * we're in. In both phases, we use a REPMGR_IOVECS to keep track of
+ * our progress within the phase. Depending upon the message type, we
+ * end up with either a rep_message (which is a wrapper for the control
+ * and rec DBTs), or a single generic DBT.
+ * Any time we're in DATA_PHASE, it means we have already received
+ * the message header (consisting of msg_type and 2 sizes), and
+ * therefore we have allocated buffer space to read the data. (This is
+ * important for resource clean-up.)
+ */
+ phase_t reading_phase;
+ REPMGR_IOVECS iovecs;
+
+ u_int8_t msg_type;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];
+
+ union {
+ REPMGR_MESSAGE *rep_message;
+ struct {
+ DBT cntrl, rec;
+ } repmgr_msg;
+ } input;
+
+ /*
+	 * Output: usually we simply write messages in line, in the
+ * send() function's thread. But if TCP doesn't have enough network
+ * buffer space for us when we first try it, we instead allocate some
+ * memory, and copy the message, and then send it as space becomes
+ * available in our main select() thread. In some cases, if the queue
+ * gets too long we wait until it's drained, and then append to it.
+ * This condition variable's associated mutex is the normal per-repmgr
+ * db_rep->mutex, because that mutex is always held anyway whenever the
+ * output queue is consulted.
+ */
+ OUT_Q_HEADER outbound_queue;
+ int out_queue_length;
+ cond_var_t drained;
+
+ /* =-=-=-=-= app-channel stuff =-=-=-=-= */
+ waiter_t response_waiters;
+
+ /*
+ * Array of info about pending responses to requests. This info is here
+ * (rather than on the stack of the thread calling send_request())
+ * because it provides an easy way to allocate available numbers for
+ * message tags, and also so that we can easily find the right info when
+ * we get the tag back in the msg header of the response.
+ */
+ REPMGR_RESPONSE *responses;
+ u_int32_t aresp; /* Array size. */
+	u_int32_t cur_resp;	/* Index of response currently being read. */
+
+ /* =-=-=-=-= for normal repmgr connections =-=-=-=-= */
+ /*
+ * Generally on a REP_CONNECTION type, we have an associated EID (which
+ * is an index into the sites array, by the way). When we initiate the
+ * connection ("outgoing"), we know from the start what the EID is; the
+ * connection struct is linked from the site struct. On the other hand,
+ * when we receive an incoming connection, we don't know at first what
+ * site it may be associated with (or even whether it's an
+ * APP_CONNECTION or REP_CONNECTION, for that matter). During that
+ * initial uncertain time, the eid is -1. Also, when a connection
+ * becomes defunct, but the conn struct hasn't yet been destroyed, the
+ * eid also becomes -1.
+ *
+ * The eid should be -1 if and only if the connection is on the orphans
+ * list.
+ */
+ int eid;
+
+};
+
+#define IS_READY_STATE(s) ((s) == CONN_READY || (s) == CONN_CONGESTED)
+
+#ifdef HAVE_GETADDRINFO
+typedef struct addrinfo ADDRINFO;
+typedef struct sockaddr_storage ACCEPT_ADDR;
+#else
+typedef struct sockaddr_in ACCEPT_ADDR;
+/*
+ * Some Windows platforms have getaddrinfo (Windows XP), some don't. We don't
+ * support conditional compilation in our Windows build, so we always use our
+ * own getaddrinfo implementation. Rename everything so that we don't collide
+ * with the system libraries.
+ */
+#undef AI_PASSIVE
+#define AI_PASSIVE 0x01
+#undef AI_CANONNAME
+#define AI_CANONNAME 0x02
+#undef AI_NUMERICHOST
+#define AI_NUMERICHOST 0x04
+
+typedef struct __addrinfo {
+ int ai_flags; /* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
+ int ai_family; /* PF_xxx */
+ int ai_socktype; /* SOCK_xxx */
+ int ai_protocol; /* 0 or IPPROTO_xxx for IPv4 and IPv6 */
+ size_t ai_addrlen; /* length of ai_addr */
+ char *ai_canonname; /* canonical name for nodename */
+ struct sockaddr *ai_addr; /* binary address */
+ struct __addrinfo *ai_next; /* next structure in linked list */
+} ADDRINFO;
+#endif /* HAVE_GETADDRINFO */
+
+/*
+ * Unprocessed network address configuration.
+ */
+typedef struct {
+ roff_t host; /* Separately allocated copy of string. */
+ u_int16_t port; /* Stored in plain old host-byte-order. */
+} SITEADDR;
+
+/*
+ * Site information, as stored in shared region.
+ */
+typedef struct {
+ SITEADDR addr; /* Unprocessed network address of site. */
+ u_int32_t config; /* Configuration flags: peer, helper, etc. */
+ u_int32_t status; /* Group membership status. */
+} SITEINFO;
+
+/*
+ * A site address, as stored locally.
+ */
+typedef struct {
+ char *host; /* Separately allocated copy of string. */
+ u_int16_t port; /* Stored in plain old host-byte-order. */
+} repmgr_netaddr_t;
+
+/*
+ * We store site structs in a dynamically allocated, growable array, indexed by
+ * EID. We allocate EID numbers for all sites simply according to their
+ * index within this array.
+ */
+#define SITE_FROM_EID(eid) (&db_rep->sites[eid])
+#define EID_FROM_SITE(s) ((int)((s) - (&db_rep->sites[0])))
+#define IS_VALID_EID(e) ((e) >= 0)
+#define IS_KNOWN_REMOTE_SITE(e) ((e) >= 0 && ((e) != db_rep->self_eid) && \
+ (((u_int)(e)) < db_rep->site_cnt))
+#define FOR_EACH_REMOTE_SITE_INDEX(i) \
+ for ((i) = (db_rep->self_eid == 0 ? 1 : 0); \
+ ((u_int)i) < db_rep->site_cnt; \
+ (int)(++(i)) == db_rep->self_eid ? ++(i) : i)
+
+struct __repmgr_site {
+ repmgr_netaddr_t net_addr;
+
+ /*
+ * Group membership status: a copy of the status from the membership
+ * database, or the out-of-band value 0, meaning that it doesn't exist.
+ * We keep track of a "non-existent" site because the associated
+ * host/port network address is promised to be associated with the
+ * locally known EID for the life of the environment.
+ */
+ u_int32_t membership; /* Status flags from GMDB. */
+ u_int32_t config; /* Flags from site->set_config() */
+
+ /*
+ * Everything below here is applicable only to remote sites.
+ */
+ DB_LSN max_ack; /* Best ack we've heard from this site. */
+ int ack_policy; /* Or 0 if unknown. */
+ u_int16_t alignment; /* Requirements for app channel msgs. */
+ db_timespec last_rcvd_timestamp;
+
+	/* Contents depend on state. */
+ struct {
+ struct { /* when CONNECTED */
+ /*
+ * The only time we ever have two connections is in case
+ * of a "collision" on the "server" side. In that case,
+ * the incoming connection either will be closed
+ * promptly by the remote "client", or it is a half-open
+ * connection due to the remote client system having
+ * crashed and rebooted, in which case KEEPALIVE will
+ * eventually clear it.
+ */
+ REPMGR_CONNECTION *in; /* incoming connection */
+ REPMGR_CONNECTION *out; /* outgoing connection */
+ } conn;
+ REPMGR_RETRY *retry; /* when PAUSING */
+ /* Unused when CONNECTING. */
+ } ref;
+
+ /*
+ * Subordinate connections (connections from subordinate processes at a
+ * multi-process site). Note that the SITE_CONNECTED state, and all the
+ * ref.retry stuff above is irrelevant to subordinate connections. If a
+ * connection is on this list, it exists; and we never bother trying to
+ * reconnect lost connections (indeed we can't, for these are always
+ * incoming-only).
+ */
+ CONNECTION_LIST sub_conns;
+ REPMGR_RUNNABLE *connector; /* Thread to open a connection. */
+
+#define SITE_CONNECTED 1 /* We have a (main) connection. */
+#define SITE_CONNECTING 2 /* Trying to establish (main) connection. */
+#define SITE_IDLE 3 /* Doing nothing. */
+#define SITE_PAUSING 4 /* Waiting until time to retry connecting. */
+ int state;
+
+#define SITE_HAS_PRIO 0x01 /* Set if "electable" flag bit is valid. */
+#define SITE_ELECTABLE 0x02
+#define SITE_TOUCHED 0x04 /* Seen GMDB record during present scan. */
+ u_int32_t flags;
+};
+
+/*
+ * Flag values for the public DB_SITE handle.
+ */
+#define DB_SITE_PREOPEN 0x01 /* Provisional EID; may change at env open. */
+
+struct __repmgr_response {
+ DBT dbt;
+ int ret;
+
+#define RESP_COMPLETE 0x01
+#define RESP_DUMMY_BUF 0x02
+#define RESP_IN_USE 0x04
+#define RESP_READING 0x08
+#define RESP_THREAD_WAITING 0x10
+ u_int32_t flags;
+};
+
+/*
+ * Private structure for managing comms "channels." This is separate from
+ * DB_CHANNEL so as to avoid dragging other private structures (e.g.,
+ * REPMGR_CONNECTION) into db.h, similar to the relationship between DB_ENV and
+ * ENV.
+ */
+struct __channel {
+ DB_CHANNEL *db_channel;
+ ENV *env;
+
+ union {
+ /* For simple, specific-EID channels. */
+ REPMGR_CONNECTION *conn;
+
+ /* For EID_MASTER or EID_BROADCAST channels. */
+ struct {
+ mgr_mutex_t *mutex; /* For connection establishment. */
+ REPMGR_CONNECTION **array;
+ u_int32_t cnt;
+ } conns;
+ } c;
+ REPMGR_MESSAGE *msg; /* Incoming channel only; NULL otherwise. */
+ int responded; /* Boolean flag. */
+ __repmgr_msg_metadata_args *meta;
+
+ /* Used only in send-to-self request case. */
+ struct __repmgr_response response;
+};
+
+/*
+ * Repmgr keeps track of references to connection information (instances
+ * of struct __repmgr_connection). There are three kinds of places
+ * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
+ * (3) db_rep->connections.
+ *
+ * 1. SITE->ref.conn points to our connection with the main process running
+ * at the given site, if such a connection exists. We may have initiated
+ * the connection to the site ourselves, or we may have received it as an
+ * incoming connection. Once it is established there is very little
+ * difference between those two cases.
+ *
+ * 2. SITE->sub_conns is a list of connections we have with subordinate
+ * processes running at the given site. There can be any number of these
+ * connections, one per subordinate process. Note that these connections
+ * are always incoming: there's no way for us to initiate this kind of
+ * connection because subordinate processes do not "listen".
+ *
+ * 3. The db_rep->connections list contains the references to any
+ * connections that are not actively associated with any site (we
+ *    sometimes call these "orphans"). There are two times when this can
+ *    happen:
+ *
+ * a) When we accept an incoming connection, we don't know what site it
+ * comes from until we read the initial handshake message.
+ *
+ * b) When an error occurs on a connection, we first mark it as DEFUNCT
+ * and stop using it. Then, at a later, well-defined time, we close
+ * the connection's file descriptor and get rid of the connection
+ * struct.
+ *
+ * In light of the above, we can see that the following describes the
+ * rules for how connections may be moved among these three kinds of
+ * "places":
+ *
+ * - when we initiate an outgoing connection, we of course know what site
+ *   it's destined for, and so we immediately put the pointer to
+ * the connection struct into SITE->ref.conn
+ *
+ * - when we accept an incoming connection, we don't immediately know
+ * whom it's from, so we have to put it on the orphans list
+ * (db_rep->connections).
+ *
+ * - (incoming, cont.) But as soon as we complete the initial "handshake"
+ * message exchange, we will know which site it's from and whether it's
+ * a subordinate or main connection. At that point we remove it from
+ * db_rep->connections and either point to it by SITE->ref.conn, or add
+ * it to the SITE->sub_conns list.
+ *
+ * - (for any active connection) when an error occurs, we move the
+ * connection to the orphans list until we have a chance to close it.
+ */
+
+/*
+ * Repmgr message formats.
+ *
+ * Declarative definitions of current message formats appear in repmgr.msg.
+ * (The s_message/gen_msg.awk utility generates C code.) In general, we send
+ * the buffers marshaled from those structure formats in the "control" portion
+ * of a message.
+ *
+ * Each message is prefaced by a 9-byte message header (as described in
+ * repmgr_net.c). Different message types use the two available 32-bit integers
+ * in different ways, as codified here:
+ */
+#define REPMGR_HDR1(hdr) ((hdr).word1)
+#define REPMGR_HDR2(hdr) ((hdr).word2)
+
+/* REPMGR_APP_MESSAGE */
+#define APP_MSG_BUFFER_SIZE REPMGR_HDR1
+#define APP_MSG_SEGMENT_COUNT REPMGR_HDR2
+
+/* REPMGR_REP_MESSAGE and the other traditional repmgr message types. */
+#define REP_MSG_CONTROL_SIZE REPMGR_HDR1
+#define REP_MSG_REC_SIZE REPMGR_HDR2
+
+/* REPMGR_APP_RESPONSE */
+#define APP_RESP_BUFFER_SIZE REPMGR_HDR1
+#define APP_RESP_TAG REPMGR_HDR2
+
+/* REPMGR_RESP_ERROR. Note that a zero-length message body is implied. */
+#define RESP_ERROR_CODE REPMGR_HDR1
+#define RESP_ERROR_TAG REPMGR_HDR2
+
+/* REPMGR_OWN_MSG */
+#define REPMGR_OWN_BUF_SIZE REPMGR_HDR1
+#define REPMGR_OWN_MSG_TYPE REPMGR_HDR2
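+
+/*
+ * Illustrative sketch (not from the original source): once the 9-byte header
+ * of an incoming REPMGR_APP_MESSAGE has been unmarshaled into a header struct,
+ * the aliases above name its two words; variable names are hypothetical.
+ *
+ *	__repmgr_msg_hdr_args msg_hdr;
+ *	u_int32_t buf_size, nsegs;
+ *
+ *	... unmarshal the header bytes into msg_hdr ...
+ *	buf_size = APP_MSG_BUFFER_SIZE(msg_hdr);
+ *	nsegs = APP_MSG_SEGMENT_COUNT(msg_hdr);
+ */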
+
+/*
+ * Flags for the handshake message. As with repmgr message types, these values
+ * are transmitted between sites, and must therefore be "frozen" permanently.
+ * Names are alphabetized here for easy reference, but values reflect historical
+ * usage.
+ */
+#define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */
+#define ELECTABLE_SITE 0x04
+#define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */
+
+/*
+ * Flags for application-message meta-data.
+ */
+#define REPMGR_MULTI_RESP 0x01
+#define REPMGR_REQUEST_MSG_TYPE 0x02
+#define REPMGR_RESPONSE_LIMIT 0x04
+
+/*
+ * Legacy V1 handshake message format. For compatibility, we send this as part
+ * of version negotiation upon connection establishment.
+ */
+typedef struct {
+ u_int32_t version;
+ u_int16_t port;
+ u_int32_t priority;
+} DB_REPMGR_V1_HANDSHAKE;
+
+/*
+ * Storage formats.
+ *
+ * As with message formats, stored formats are defined in repmgr.msg.
+ */
+/*
+ * Flags for the Group Membership data portion of a record. Like message type
+ * codes, these values are frozen across releases, in order to avoid pointless
+ * churn.
+ */
+#define SITE_ADDING 0x01
+#define SITE_DELETING 0x02
+#define SITE_PRESENT 0x04
+
+/*
+ * Message types whose processing could take a long time. We're careful to
+ * avoid using up all our message processing threads on these message types, so
+ * that we don't starve out the more important rep messages.
+ */
+#define IS_DEFERRABLE(t) ((t) == REPMGR_OWN_MSG || (t) == REPMGR_APP_MESSAGE)
+/*
+ * When using leases there are times when a thread processing a message
+ * must block, waiting for leases to be refreshed. But refreshing the
+ * leases requires another thread to accept the lease grant messages.
+ */
+#define RESERVED_MSG_TH(env) (IS_USING_LEASES(env) ? 2 : 1)
+
+#define IS_SUBORDINATE(db_rep) (db_rep->listen_fd == INVALID_SOCKET)
+
+#define IS_PEER_POLICY(p) ((p) == DB_REPMGR_ACKS_ALL_PEERS || \
+ (p) == DB_REPMGR_ACKS_QUORUM || \
+ (p) == DB_REPMGR_ACKS_ONE_PEER)
+
+/*
+ * Most of the code in repmgr runs while holding repmgr's main mutex, which
+ * resides in db_rep->mutex. This mutex is owned by a single repmgr process,
+ * and serializes access to the (large) critical sections among threads in the
+ * process. Unlike many other mutexes in DB, it is specifically coded as either
+ * a POSIX threads mutex or a Win32 mutex. Note that although it's a large
+ * fraction of the code, it's a tiny fraction of the time: repmgr spends most of
+ * its time in a call to select(), and a bit in calls into the Base
+ * replication API. All of those release the mutex.
+ * Access to repmgr's shared list of site addresses is protected by
+ * another mutex: mtx_repmgr. And, when changing space allocation for that site
+ * list we conform to the convention of acquiring renv->mtx_regenv. These are
+ * of course acquired less frequently.
+ * When it's necessary to acquire more than one of these mutexes, the
+ * ordering priority (or "lock ordering protocol") is:
+ * db_rep->mutex (first)
+ * mtx_repmgr (briefly)
+ * mtx_regenv (last, and most briefly)
+ *
+ * There are also mutexes for app message "channels". Each channel has a mutex,
+ * which is used to serialize any connection re-establishment that may become
+ * necessary during its lifetime (such as when a master changes). This never
+ * happens on a simple, specific-EID channel, but in other cases multiple app
+ * threads could be making send_xxx() calls concurrently, and it would not do to
+ * have two of them try to re-connect concurrently.
+ * When re-establishing a connection, the channel lock is held while
+ * grabbing first the mtx_repmgr, and then the db_rep mutex (but not both
+ * together). I.e., we have:
+ * channel->mutex (first)
+ * [mtx_repmgr (very briefly)] and then [db_rep->mutex (very briefly)]
+ */
+
+#define LOCK_MUTEX(m) do { \
+ if (__repmgr_lock_mutex(m) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+
+#define UNLOCK_MUTEX(m) do { \
+ if (__repmgr_unlock_mutex(m) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
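+
+/*
+ * Illustrative sketch (not from the original source): a thread that needs
+ * both the main repmgr mutex and the site-list mutex follows the lock
+ * ordering protocol described above, releasing in reverse order.  The
+ * rep->mtx_repmgr handle shown is an assumed spelling of the mtx_repmgr
+ * mentioned in the text.
+ *
+ *	LOCK_MUTEX(db_rep->mutex);
+ *	MUTEX_LOCK(env, rep->mtx_repmgr);
+ *	... consult the shared list of site addresses ...
+ *	MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ *	UNLOCK_MUTEX(db_rep->mutex);
+ */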
+
+/* POSIX/Win32 socket (and other) portability. */
+#ifdef DB_WIN32
+#define WOULDBLOCK WSAEWOULDBLOCK
+#undef DB_REPMGR_EAGAIN
+
+#define net_errno WSAGetLastError()
+typedef int socklen_t;
+typedef char * sockopt_t;
+#define sendsocket(s, buf, len, flags) send((s), (buf), (int)(len), (flags))
+
+#define iov_len len
+#define iov_base buf
+
+typedef DWORD threadsync_timeout_t;
+
+#define REPMGR_INITED(db_rep) (db_rep->signaler != NULL)
+#else
+
+#define INVALID_SOCKET -1
+#define SOCKET_ERROR -1
+#define WOULDBLOCK EWOULDBLOCK
+#define DB_REPMGR_EAGAIN EAGAIN
+
+#define net_errno errno
+typedef void * sockopt_t;
+
+#define sendsocket(s, buf, len, flags) send((s), (buf), (len), (flags))
+#define closesocket(fd) close(fd)
+
+typedef struct timespec threadsync_timeout_t;
+
+#define REPMGR_INITED(db_rep) (db_rep->read_pipe >= 0)
+#endif
+
+#define SELECTOR_RUNNING(db_rep) ((db_rep)->selector != NULL)
+
+/*
+ * Generic definition of some action to be performed on each connection, in the
+ * form of a call-back function.
+ */
+typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *));
+
+/*
+ * Generic predicate to test a condition that a thread is waiting for.
+ */
+typedef int (*PREDICATE) __P((ENV *, void *));
+
+#include "dbinc_auto/repmgr_ext.h"
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_REPMGR_H_ */
diff --git a/src/dbinc/shqueue.h b/src/dbinc/shqueue.h
new file mode 100644
index 00000000..22464462
--- /dev/null
+++ b/src/dbinc/shqueue.h
@@ -0,0 +1,410 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_SHQUEUE_H_
+#define _DB_SHQUEUE_H_
+
+/*
+ * This file defines three types of data structures: chains, lists and
+ * tail queues, similar to those in the include file <sys/queue.h>.
+ *
+ * The difference is that this set of macros can be used for structures that
+ * reside in shared memory that may be mapped at different addresses in each
+ * process. In most cases, the macros for shared structures exactly mirror
+ * the normal macros, although the macro calls require an additional type
+ * parameter, which the standard macros need only in HEAD and ENTRY.
+ *
+ * Since we use relative offsets of type ssize_t rather than pointers, 0
+ * (aka NULL) is a valid offset and cannot be used to indicate the end
+ * of a list. Therefore, we use -1 to indicate end of list.
+ *
+ * The macros ending in "P" return pointers without checking for end or
+ * beginning of lists, the others check for end of list and evaluate to
+ * either a pointer or NULL.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define SH_PTR_TO_OFF(src, dest) \
+ ((db_ssize_t)(((u_int8_t *)(dest)) - ((u_int8_t *)(src))))
+
+#define SH_OFF_TO_PTR(base, off, type) \
+ ((type *) (((u_int8_t *)(base)) + (db_ssize_t) (off)))
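+
+/*
+ * Illustrative sketch (hypothetical type, not from the original source):
+ * because offsets are stored relative to the referencing structure, the
+ * same linkage is valid in every process, whatever address the shared
+ * region is mapped at.
+ *
+ *	struct myelem {
+ *		SH_CHAIN_ENTRY field;
+ *	} *a, *b, *b2;
+ *
+ *	a->field.sce_next = SH_PTR_TO_OFF(a, b);
+ *	b2 = SH_OFF_TO_PTR(a, a->field.sce_next, struct myelem);
+ *	... b2 == b holds no matter where the region is mapped ...
+ */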
+
+/*
+ * Shared memory chain definitions.
+ */
+#define SH_CHAIN_ENTRY \
+struct { \
+ db_ssize_t sce_next; /* relative offset to next element */ \
+ db_ssize_t sce_prev; /* relative offset of prev element */ \
+}
+
+#define SH_CHAIN_INIT(elm, field) \
+ (elm)->field.sce_next = (elm)->field.sce_prev = -1
+
+#define SH_CHAIN_HASNEXT(elm, field) ((elm)->field.sce_next != -1)
+#define SH_CHAIN_NEXTP(elm, field, type) \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.sce_next))
+#define SH_CHAIN_NEXT(elm, field, type) (SH_CHAIN_HASNEXT(elm, field) ? \
+ SH_CHAIN_NEXTP(elm, field, type) : (struct type *)NULL)
+
+#define SH_CHAIN_HASPREV(elm, field) ((elm)->field.sce_prev != -1)
+#define SH_CHAIN_PREVP(elm, field, type) \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.sce_prev))
+#define SH_CHAIN_PREV(elm, field, type) (SH_CHAIN_HASPREV(elm, field) ? \
+ SH_CHAIN_PREVP(elm, field, type) : (struct type *)NULL)
+
+#define SH_CHAIN_SINGLETON(elm, field) \
+ (!(SH_CHAIN_HASNEXT(elm, field) || SH_CHAIN_HASPREV(elm, field)))
+
+#define SH_CHAIN_INSERT_AFTER(listelm, elm, field, type) do { \
+ struct type *__next = SH_CHAIN_NEXT(listelm, field, type); \
+ if (__next != NULL) { \
+ (elm)->field.sce_next = SH_PTR_TO_OFF(elm, __next); \
+ __next->field.sce_prev = SH_PTR_TO_OFF(__next, elm); \
+ } else \
+ (elm)->field.sce_next = -1; \
+ (elm)->field.sce_prev = SH_PTR_TO_OFF(elm, listelm); \
+ (listelm)->field.sce_next = SH_PTR_TO_OFF(listelm, elm); \
+} while (0)
+
+#define SH_CHAIN_INSERT_BEFORE(listelm, elm, field, type) do { \
+ struct type *__prev = SH_CHAIN_PREV(listelm, field, type); \
+ if (__prev != NULL) { \
+ (elm)->field.sce_prev = SH_PTR_TO_OFF(elm, __prev); \
+ __prev->field.sce_next = SH_PTR_TO_OFF(__prev, elm); \
+ } else \
+ (elm)->field.sce_prev = -1; \
+ (elm)->field.sce_next = SH_PTR_TO_OFF(elm, listelm); \
+ (listelm)->field.sce_prev = SH_PTR_TO_OFF(listelm, elm); \
+} while (0)
+
+#define SH_CHAIN_REMOVE(elm, field, type) do { \
+ struct type *__prev = SH_CHAIN_PREV(elm, field, type); \
+ struct type *__next = SH_CHAIN_NEXT(elm, field, type); \
+ if (__next != NULL) \
+ __next->field.sce_prev = (__prev == NULL) ? -1 : \
+ SH_PTR_TO_OFF(__next, __prev); \
+ if (__prev != NULL) \
+ __prev->field.sce_next = (__next == NULL) ? -1 : \
+ SH_PTR_TO_OFF(__prev, __next); \
+ SH_CHAIN_INIT(elm, field); \
+} while (0)
+
+/*
+ * Shared memory list definitions.
+ */
+#define SH_LIST_HEAD(name) \
+struct name { \
+ db_ssize_t slh_first; /* first element */ \
+}
+
+#define SH_LIST_HEAD_INITIALIZER(head) \
+ { -1 }
+
+#define SH_LIST_ENTRY \
+struct { \
+ db_ssize_t sle_next; /* relative offset to next element */ \
+ db_ssize_t sle_prev; /* relative offset of prev element */ \
+}
+
+/*
+ * Shared memory list functions.
+ */
+#define SH_LIST_EMPTY(head) \
+ ((head)->slh_first == -1)
+
+#define SH_LIST_FIRSTP(head, type) \
+ ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first))
+
+#define SH_LIST_FIRST(head, type) \
+ (SH_LIST_EMPTY(head) ? NULL : \
+ ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first)))
+
+#define SH_LIST_NEXTP(elm, field, type) \
+ ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next))
+
+#define SH_LIST_NEXT(elm, field, type) \
+ ((elm)->field.sle_next == -1 ? NULL : \
+ ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next)))
+
+ /*
+ * __SH_LIST_PREV_OFF is private API. It calculates the address of
+ * the elm->field.sle_next member of a SH_LIST structure. All offsets
+ * between elements are relative to that point in SH_LIST structures.
+ */
+#define __SH_LIST_PREV_OFF(elm, field) \
+ ((db_ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.sle_prev))
+
+#define SH_LIST_PREV(elm, field, type) \
+ (struct type *)((db_ssize_t)(elm) - (*__SH_LIST_PREV_OFF(elm, field)))
+
+#define SH_LIST_FOREACH(var, head, field, type) \
+ for ((var) = SH_LIST_FIRST((head), type); \
+ (var) != NULL; \
+ (var) = SH_LIST_NEXT((var), field, type))
+
+/*
+ * Given a correct A.next in a list [A, B]: B.prev = SH_LIST_NEXT_TO_PREV(A).
+ * The prev value is always the offset from an element to its preceding
+ * element's next location, not to the beginning of the structure. To get
+ * to the beginning of an element structure in memory, given an element,
+ * do the following:
+ *	A = B - (B.prev + (&B.next - B))
+ * SH_LIST_NEXT_TO_PREV takes the element's next pointer and calculates what
+ * the corresponding prev value should be: the negation of next plus the
+ * offset of the next field within the structure.
+ */
+#define SH_LIST_NEXT_TO_PREV(elm, field) \
+ (((elm)->field.sle_next == -1 ? 0 : -(elm)->field.sle_next) + \
+ SH_PTR_TO_OFF(elm, &(elm)->field.sle_next))
+
+#define SH_LIST_INIT(head) (head)->slh_first = -1
+
+#define SH_LIST_INSERT_BEFORE(head, listelm, elm, field, type) do { \
+ if (listelm == SH_LIST_FIRST(head, type)) { \
+ SH_LIST_INSERT_HEAD(head, elm, field, type); \
+ } else { \
+ (elm)->field.sle_next = SH_PTR_TO_OFF(elm, listelm); \
+ (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV( \
+ SH_LIST_PREV((listelm), field, type), field) + \
+ (elm)->field.sle_next; \
+ (SH_LIST_PREV(listelm, field, type))->field.sle_next = \
+ (SH_PTR_TO_OFF((SH_LIST_PREV(listelm, field, \
+ type)), elm)); \
+ (listelm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(elm, field); \
+ } \
+} while (0)
+
+#define SH_LIST_INSERT_AFTER(listelm, elm, field, type) do { \
+ if ((listelm)->field.sle_next != -1) { \
+ (elm)->field.sle_next = SH_PTR_TO_OFF(elm, \
+ SH_LIST_NEXTP(listelm, field, type)); \
+ SH_LIST_NEXTP(listelm, field, type)->field.sle_prev = \
+ SH_LIST_NEXT_TO_PREV(elm, field); \
+ } else \
+ (elm)->field.sle_next = -1; \
+ (listelm)->field.sle_next = SH_PTR_TO_OFF(listelm, elm); \
+ (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(listelm, field); \
+} while (0)
+
+#define SH_LIST_INSERT_HEAD(head, elm, field, type) do { \
+ if ((head)->slh_first != -1) { \
+ (elm)->field.sle_next = \
+ (head)->slh_first - SH_PTR_TO_OFF(head, elm); \
+ SH_LIST_FIRSTP(head, type)->field.sle_prev = \
+ SH_LIST_NEXT_TO_PREV(elm, field); \
+ } else \
+ (elm)->field.sle_next = -1; \
+ (head)->slh_first = SH_PTR_TO_OFF(head, elm); \
+ (elm)->field.sle_prev = SH_PTR_TO_OFF(elm, &(head)->slh_first); \
+} while (0)
+
+#define SH_LIST_REMOVE(elm, field, type) do { \
+ if ((elm)->field.sle_next != -1) { \
+ SH_LIST_NEXTP(elm, field, type)->field.sle_prev = \
+ (elm)->field.sle_prev - (elm)->field.sle_next; \
+ *__SH_LIST_PREV_OFF(elm, field) += (elm)->field.sle_next;\
+ } else \
+ *__SH_LIST_PREV_OFF(elm, field) = -1; \
+} while (0)
+
+#define SH_LIST_REMOVE_HEAD(head, field, type) do { \
+ if (!SH_LIST_EMPTY(head)) { \
+ SH_LIST_REMOVE(SH_LIST_FIRSTP(head, type), field, type);\
+ } \
+} while (0)
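+
+/*
+ * Illustrative sketch (hypothetical types, not from the original source):
+ * declaring and walking a shared-memory list mirrors <sys/queue.h>, except
+ * that each call also names the element type.  Both the head and the
+ * elements must live in the same shared region for offsets to make sense.
+ *
+ *	struct myelem {
+ *		SH_LIST_ENTRY links;
+ *		int value;
+ *	};
+ *	SH_LIST_HEAD(myhead);
+ *	struct myhead *head;
+ *	struct myelem *ep;
+ *
+ *	SH_LIST_INIT(head);
+ *	SH_LIST_INSERT_HEAD(head, ep, links, myelem);
+ *	SH_LIST_FOREACH(ep, head, links, myelem)
+ *		... use ep->value ...
+ */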
+
+/*
+ * Shared memory tail queue definitions.
+ */
+#define SH_TAILQ_HEAD(name) \
+struct name { \
+ db_ssize_t stqh_first; /* relative offset of first element */ \
+ db_ssize_t stqh_last; /* relative offset of last's next */ \
+}
+
+#define SH_TAILQ_HEAD_INITIALIZER(head) \
+ { -1, 0 }
+
+#define SH_TAILQ_ENTRY \
+struct { \
+ db_ssize_t stqe_next; /* relative offset of next element */ \
+ db_ssize_t stqe_prev; /* relative offset of prev's next */ \
+}
+
+/*
+ * Shared memory tail queue functions.
+ */
+
+#define SH_TAILQ_EMPTY(head) \
+ ((head)->stqh_first == -1)
+
+#define SH_TAILQ_FIRSTP(head, type) \
+ ((struct type *)((u_int8_t *)(head) + (head)->stqh_first))
+
+#define SH_TAILQ_FIRST(head, type) \
+ (SH_TAILQ_EMPTY(head) ? NULL : SH_TAILQ_FIRSTP(head, type))
+
+#define SH_TAILQ_NEXTP(elm, field, type) \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next))
+
+#define SH_TAILQ_NEXT(elm, field, type) \
+ ((elm)->field.stqe_next == -1 ? NULL : \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next)))
+
+ /*
+ * __SH_TAILQ_PREV_OFF is private API. It calculates the address of
+ * the elm->field.stqe_next member of a SH_TAILQ structure. All
+ * offsets between elements are relative to that point in SH_TAILQ
+ * structures.
+ */
+#define __SH_TAILQ_PREV_OFF(elm, field) \
+ ((db_ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.stqe_prev))
+
+#define SH_TAILQ_PREVP(elm, field, type) \
+ (struct type *)((db_ssize_t)elm - (*__SH_TAILQ_PREV_OFF(elm, field)))
+
+#define SH_TAILQ_PREV(head, elm, field, type) \
+ (((elm) == SH_TAILQ_FIRST(head, type)) ? NULL : \
+ (struct type *)((db_ssize_t)elm - (*__SH_TAILQ_PREV_OFF(elm, field))))
+
+ /*
+ * __SH_TAILQ_LAST_OFF is private API. It calculates the address of
+ * the stqe_next member of a SH_TAILQ structure in the last element
+ * of this list. All offsets between elements are relative to that
+ * point in SH_TAILQ structures.
+ */
+#define __SH_TAILQ_LAST_OFF(head) \
+ ((db_ssize_t *)(((u_int8_t *)(head)) + (head)->stqh_last))
+
+#define SH_TAILQ_LASTP(head, field, type) \
+ ((struct type *)((db_ssize_t)(head) + \
+ ((db_ssize_t)((head)->stqh_last) - \
+ ((db_ssize_t)SH_PTR_TO_OFF(SH_TAILQ_FIRST(head, type), \
+ &(SH_TAILQ_FIRSTP(head, type)->field.stqe_next))))))
+
+#define SH_TAILQ_LAST(head, field, type) \
+ (SH_TAILQ_EMPTY(head) ? NULL : SH_TAILQ_LASTP(head, field, type))
+
+/*
+ * Given a correct A.next in a list [A, B]: B.prev = SH_TAILQ_NEXT_TO_PREV(A).
+ * The prev value is always the offset from an element to its preceding
+ * element's next location, not to the beginning of the structure. To get
+ * to the beginning of an element structure in memory, given an element,
+ * do the following:
+ *	A = B - (B.prev + (&B.next - B))
+ */
+#define SH_TAILQ_NEXT_TO_PREV(elm, field) \
+ (((elm)->field.stqe_next == -1 ? 0 : \
+ (-(elm)->field.stqe_next) + \
+ SH_PTR_TO_OFF(elm, &(elm)->field.stqe_next)))
+
+#define SH_TAILQ_FOREACH(var, head, field, type) \
+ for ((var) = SH_TAILQ_FIRST((head), type); \
+ (var) != NULL; \
+ (var) = SH_TAILQ_NEXT((var), field, type))
+
+#define SH_TAILQ_FOREACH_REVERSE(var, head, field, type) \
+ for ((var) = SH_TAILQ_LAST((head), field, type); \
+ (var) != NULL; \
+ (var) = SH_TAILQ_PREV((head), (var), field, type))
+
+#define SH_TAILQ_INIT(head) { \
+ (head)->stqh_first = -1; \
+ (head)->stqh_last = SH_PTR_TO_OFF(head, &(head)->stqh_first); \
+}
+
+#define SH_TAILQ_INSERT_HEAD(head, elm, field, type) do { \
+ if ((head)->stqh_first != -1) { \
+ (elm)->field.stqe_next = \
+ (head)->stqh_first - SH_PTR_TO_OFF(head, elm); \
+ SH_TAILQ_FIRSTP(head, type)->field.stqe_prev = \
+ SH_TAILQ_NEXT_TO_PREV(elm, field); \
+ } else { \
+ (head)->stqh_last = \
+ SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \
+ (elm)->field.stqe_next = -1; \
+ } \
+ (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \
+ (elm)->field.stqe_prev = \
+ SH_PTR_TO_OFF(elm, &(head)->stqh_first); \
+} while (0)
+
+#define SH_TAILQ_INSERT_TAIL(head, elm, field) do { \
+ (elm)->field.stqe_next = -1; \
+ (elm)->field.stqe_prev = \
+ -SH_PTR_TO_OFF(head, elm) + (head)->stqh_last; \
+ if ((head)->stqh_last == \
+ SH_PTR_TO_OFF((head), &(head)->stqh_first)) \
+ (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \
+ else \
+ *__SH_TAILQ_LAST_OFF(head) = -(head)->stqh_last + \
+ SH_PTR_TO_OFF((elm), &(elm)->field.stqe_next) + \
+ SH_PTR_TO_OFF(head, elm); \
+ (head)->stqh_last = \
+ SH_PTR_TO_OFF(head, &((elm)->field.stqe_next)); \
+} while (0)
+
+#define SH_TAILQ_INSERT_BEFORE(head, listelm, elm, field, type) do { \
+ if (listelm == SH_TAILQ_FIRST(head, type)) { \
+ SH_TAILQ_INSERT_HEAD(head, elm, field, type); \
+ } else { \
+ (elm)->field.stqe_next = SH_PTR_TO_OFF(elm, listelm); \
+ (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV( \
+ SH_TAILQ_PREVP((listelm), field, type), field) + \
+ (elm)->field.stqe_next; \
+ (SH_TAILQ_PREVP(listelm, field, type))->field.stqe_next =\
+ (SH_PTR_TO_OFF((SH_TAILQ_PREVP(listelm, field, type)), \
+ elm)); \
+ (listelm)->field.stqe_prev = \
+ SH_TAILQ_NEXT_TO_PREV(elm, field); \
+ } \
+} while (0)
+
+#define SH_TAILQ_INSERT_AFTER(head, listelm, elm, field, type) do { \
+ if ((listelm)->field.stqe_next != -1) { \
+ (elm)->field.stqe_next = (listelm)->field.stqe_next - \
+ SH_PTR_TO_OFF(listelm, elm); \
+ SH_TAILQ_NEXTP(listelm, field, type)->field.stqe_prev = \
+ SH_TAILQ_NEXT_TO_PREV(elm, field); \
+ } else { \
+ (elm)->field.stqe_next = -1; \
+ (head)->stqh_last = \
+ SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \
+ } \
+ (listelm)->field.stqe_next = SH_PTR_TO_OFF(listelm, elm); \
+ (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV(listelm, field); \
+} while (0)
+
+#define SH_TAILQ_REMOVE(head, elm, field, type) do { \
+ if ((elm)->field.stqe_next != -1) { \
+ SH_TAILQ_NEXTP(elm, field, type)->field.stqe_prev = \
+ (elm)->field.stqe_prev + \
+ SH_PTR_TO_OFF(SH_TAILQ_NEXTP(elm, \
+ field, type), elm); \
+ *__SH_TAILQ_PREV_OFF(elm, field) += (elm)->field.stqe_next;\
+ } else { \
+ (head)->stqh_last = (elm)->field.stqe_prev + \
+ SH_PTR_TO_OFF(head, elm); \
+ *__SH_TAILQ_PREV_OFF(elm, field) = -1; \
+ } \
+} while (0)
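+
+/*
+ * Illustrative sketch (hypothetical types, not from the original source):
+ * tail queues additionally support appending at the tail and traversal in
+ * reverse order.
+ *
+ *	struct myelem {
+ *		SH_TAILQ_ENTRY links;
+ *		int value;
+ *	};
+ *	SH_TAILQ_HEAD(myqueue);
+ *	struct myqueue *hp;
+ *	struct myelem *ep;
+ *
+ *	SH_TAILQ_INIT(hp);
+ *	SH_TAILQ_INSERT_TAIL(hp, ep, links);
+ *	SH_TAILQ_FOREACH_REVERSE(ep, hp, links, myelem)
+ *		... use ep->value ...
+ */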
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_SHQUEUE_H_ */
diff --git a/src/dbinc/tcl_db.h b/src/dbinc/tcl_db.h
new file mode 100644
index 00000000..4c56164f
--- /dev/null
+++ b/src/dbinc/tcl_db.h
@@ -0,0 +1,316 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_TCL_DB_H_
+#define _DB_TCL_DB_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define MSG_SIZE 100 /* Message size */
+
+enum INFOTYPE {
+ I_AUX, I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN};
+
+#define MAX_ID 8 /* Maximum number of sub-id's we need */
+#define DBTCL_PREP 64 /* Size of txn_recover preplist */
+
+#define DBTCL_DBM 1
+#define DBTCL_NDBM 2
+
+#define DBTCL_GETCLOCK 0
+#define DBTCL_GETLIMIT 1
+#define DBTCL_GETREQ 2
+
+#define DBTCL_MUT_ALIGN 0
+#define DBTCL_MUT_INCR 1
+#define DBTCL_MUT_INIT 2
+#define DBTCL_MUT_MAX 3
+#define DBTCL_MUT_TAS 4
+
+/*
+ * Data structure to record information about events that have occurred. Tcl
+ * command "env event_info" can retrieve the information. For now, we record
+ * only one occurrence per event type; "env event_info -clear" can be used to
+ * reset the info.
+ *
+ * Besides the bit flag that records the fact that an event type occurred, some
+ * event types have associated "info" and we record that here too. When new
+ * event types are invented that have associated info, we should add a field
+ * here to record that info as well, so that it can be returned to the script
+ * with the "env event_info" results.
+ */
+typedef struct dbtcl_event_info {
+ u_int32_t events; /* Bit flag on for each event fired. */
+ int panic_error;
+ int newmaster_eid;
+ int added_eid;
+ int removed_eid;
+ pid_t attached_process;
+ int connected_eid;
+ DB_REPMGR_CONN_ERR conn_broken_info;
+ DB_REPMGR_CONN_ERR conn_failed_try_info;
+ DB_LSN sync_point;
+} DBTCL_EVENT_INFO;
+
+/*
+ * Why use a home grown package over the Tcl_Hash functions?
+ *
+ * We could have implemented the stuff below without maintaining our
+ * own list manipulation, efficiently hashing it with the available
+ * Tcl functions (Tcl_CreateHashEntry, Tcl_GetHashValue, etc). I chose
+ * not to do so for these reasons:
+ *
+ * We still need the information below. Using the hashing only relieves
+ * us of needing the next/prev pointers. We still need the structure
+ * itself because we need more than one value associated with a widget.
+ * We need to keep track of parent pointers for sub-widgets (like cursors)
+ * so we can correctly close. We need to keep track of individual widget's
+ * id counters for any sub-widgets they may have. We need to be able to
+ * associate the name/client data outside the scope of the widget.
+ *
+ * So, is it better to use the hashing rather than
+ * the linear list we have now? I decided against it for the simple reason
+ * that to access the structure would require two calls. The first is
+ * Tcl_FindHashEntry(table, key) and then, once we have the entry, we'd
+ * have to do Tcl_GetHashValue(entry) to get the pointer of the structure.
+ *
+ * I believe the number of simultaneous DB widgets in existence at one time
+ * is not going to be that large (no more than several dozen), so linearly
+ * searching the list is not going to impact performance in a noticeable
+ * way. Should performance be impacted by the size of the info list, then
+ * perhaps it is time to revisit this decision.
+ */
+typedef struct dbtcl_info {
+ LIST_ENTRY(dbtcl_info) entries;
+ Tcl_Interp *i_interp;
+ char *i_name;
+ enum INFOTYPE i_type;
+ union infop {
+ DB *dbp;
+ DBC *dbcp;
+ DB_ENV *envp;
+ DB_LOCK *lock;
+ DB_LOGC *logc;
+ DB_MPOOLFILE *mp;
+ DB_TXN *txnp;
+ void *anyp;
+ } un;
+ union data {
+ int anydata;
+ db_pgno_t pgno; /* For I_MP. */
+ u_int32_t lockid; /* For I_LOCK. */
+ DBTCL_EVENT_INFO *event_info; /* For I_ENV. */
+ DB_TXN_TOKEN *commit_token; /* For I_TXN. */
+ } und;
+ union data2 {
+ int anydata;
+ int pagesz; /* For I_MP. */
+ DB_COMPACT *c_data; /* For I_DB. */
+ db_mutex_t mutex; /* Protects event_info (I_ENV). */
+ } und2;
+ DBT i_lockobj;
+ FILE *i_err;
+ char *i_errpfx;
+ FILE *i_msg;
+
+ /* Callbacks--Tcl_Objs containing proc names */
+ Tcl_Obj *i_compare;
+ Tcl_Obj *i_dupcompare;
+ Tcl_Obj *i_foreign_call;
+ Tcl_Obj *i_hashproc;
+ Tcl_Obj *i_isalive;
+ Tcl_Obj *i_part_callback;
+ Tcl_Obj *i_rep_send;
+ Tcl_Obj *i_second_call;
+
+ /* Environment ID for the i_rep_send callback. */
+ Tcl_Obj *i_rep_eid;
+
+ struct dbtcl_info *i_parent;
+ int i_otherid[MAX_ID];
+
+ /* Heap dbs have an associated recno db, and secondary db. */
+ DB *hrdbp;
+ DB *hsdbp;
+} DBTCL_INFO;
+
+#define i_anyp un.anyp
+#define i_dbp un.dbp
+#define i_dbcp un.dbcp
+#define i_envp un.envp
+#define i_lock un.lock
+#define i_logc un.logc
+#define i_mp un.mp
+#define i_pagep un.anyp
+#define i_txnp un.txnp
+
+#define i_data und.anydata
+#define i_pgno und.pgno
+#define i_locker und.lockid
+#define i_event_info und.event_info
+#define i_commit_token und.commit_token
+#define i_data2 und2.anydata
+#define i_pgsz und2.pagesz
+#define i_cdata und2.c_data
+#define i_mutex und2.mutex
+
+#define i_envtxnid i_otherid[0]
+#define i_envmpid i_otherid[1]
+#define i_envlockid i_otherid[2]
+#define i_envlogcid i_otherid[3]
+
+#define i_mppgid i_otherid[0]
+
+#define i_dbdbcid i_otherid[0]
+
+extern int __debug_on, __debug_print, __debug_stop, __debug_test;
+
+typedef struct dbtcl_global {
+ LIST_HEAD(infohead, dbtcl_info) g_infohead;
+} DBTCL_GLOBAL;
+#define __db_infohead __dbtcl_global.g_infohead
+
+extern DBTCL_GLOBAL __dbtcl_global;
+
+/*
+ * Tcl_NewStringObj takes an "int" length argument, when the typical use is to
+ * call it with a size_t length (for example, returned by strlen). Tcl is in
+ * the wrong, but that doesn't help us much -- cast the argument.
+ */
+#define NewStringObj(a, b) \
+ Tcl_NewStringObj((a), (int)(b))
+
+#define NAME_TO_DB(name) (DB *)_NameToPtr((name))
+#define NAME_TO_DBC(name) (DBC *)_NameToPtr((name))
+#define NAME_TO_ENV(name) (DB_ENV *)_NameToPtr((name))
+#define NAME_TO_LOCK(name) (DB_LOCK *)_NameToPtr((name))
+#define NAME_TO_MP(name) (DB_MPOOLFILE *)_NameToPtr((name))
+#define NAME_TO_TXN(name) (DB_TXN *)_NameToPtr((name))
+#define NAME_TO_SEQUENCE(name) (DB_SEQUENCE *)_NameToPtr((name))
+
+/*
+ * MAKE_STAT_LIST appends a {name value} pair to a result list that MUST be
+ * called 'res' and is a Tcl_Obj * in the local function. This macro also
+ * assumes a label "error" to go to in the event of a Tcl error. For stat
+ * functions this will typically go before the "free" function to free the
+ * stat structure returned by DB.
+ */
+#define MAKE_STAT_LIST(s, v) do { \
+ result = _SetListElemInt(interp, res, (s), (long)(v)); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+#define MAKE_WSTAT_LIST(s, v) do { \
+ result = _SetListElemWideInt(interp, res, (s), (int64_t)(v)); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
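+
+/*
+ * Illustrative sketch (not from the original source): a typical stat
+ * function provides the 'res' object, a 'result' variable and an "error"
+ * label that the macros above rely on; 'sp' is a hypothetical stat struct.
+ *
+ *	Tcl_Obj *res;
+ *	int result;
+ *
+ *	res = Tcl_NewObj();
+ *	MAKE_STAT_LIST("Number of aborts", sp->st_naborts);
+ *	Tcl_SetObjResult(interp, res);
+ * error:
+ *	__os_ufree(env, sp);
+ */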
+
+/*
+ * MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local
+ * function. This macro also assumes a label "error" to go to
+ * in the event of a Tcl error. For stat functions this will
+ * typically go before the "free" function to free the stat structure
+ * returned by DB.
+ */
+#define MAKE_STAT_LSN(s, lsn) do { \
+ myobjc = 2; \
+ myobjv[0] = Tcl_NewLongObj((long)(lsn)->file); \
+ myobjv[1] = Tcl_NewLongObj((long)(lsn)->offset); \
+ lsnlist = Tcl_NewListObj(myobjc, myobjv); \
+ myobjc = 2; \
+ myobjv[0] = Tcl_NewStringObj((s), (int)strlen(s)); \
+ myobjv[1] = lsnlist; \
+ thislist = Tcl_NewListObj(myobjc, myobjv); \
+ result = Tcl_ListObjAppendElement(interp, res, thislist); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+/*
+ * MAKE_STAT_STRLIST appends a {name string} pair to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local
+ * function. This macro also assumes a label "error" to go to
+ * in the event of a Tcl error. For stat functions this will
+ * typically go before the "free" function to free the stat structure
+ * returned by DB.
+ */
+#define MAKE_STAT_STRLIST(s,s1) do { \
+ result = _SetListElem(interp, res, (s), (u_int32_t)strlen(s), \
+ (s1), (u_int32_t)strlen(s1)); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+/*
+ * MAKE_SITE_LIST appends an {eid host port status peer} tuple to a result
+ * list that MUST be called 'res' and is a Tcl_Obj * in the local function.
+ * This macro also assumes a label "error" to go to in the event of a Tcl
+ * error.
+ */
+#define MAKE_SITE_LIST(e, h, p, s, pr) do { \
+ myobjc = 5; \
+ myobjv[0] = Tcl_NewIntObj(e); \
+ myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \
+ myobjv[2] = Tcl_NewIntObj((int)p); \
+ myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \
+ myobjv[4] = Tcl_NewStringObj((pr), (int)strlen(pr)); \
+ thislist = Tcl_NewListObj(myobjc, myobjv); \
+ result = Tcl_ListObjAppendElement(interp, res, thislist); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+/*
+ * FLAG_CHECK checks that the given flag is not set yet.
+ * If it is, it sets up an error message.
+ */
+#define FLAG_CHECK(flag) do { \
+ if ((flag) != 0) { \
+ Tcl_SetResult(interp, \
+ " Only 1 policy can be specified.\n", \
+ TCL_STATIC); \
+ result = TCL_ERROR; \
+ break; \
+ } \
+} while (0)
+
+/*
+ * FLAG_CHECK2 checks that the given flag is not set yet or is
+ * only set to the given allowed value.
+ * If any other bits are set, it sets up an error message.
+ */
+#define FLAG_CHECK2(flag, val) do { \
+ if (((flag) & ~(val)) != 0) { \
+ Tcl_SetResult(interp, \
+ " Only 1 policy can be specified.\n", \
+ TCL_STATIC); \
+ result = TCL_ERROR; \
+ break; \
+ } \
+} while (0)
+
+/*
+ * IS_HELP checks whether the arg we bombed on is -?, which is a help option.
+ * If it is, we return TCL_OK (but leave the result set to whatever
+ * Tcl_GetIndexFromObj says, which lists all the valid options). Otherwise
+ * return TCL_ERROR.
+ */
+#define IS_HELP(s) \
+ (strcmp(Tcl_GetStringFromObj(s,NULL), "-?") == 0) ? TCL_OK : TCL_ERROR
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/tcl_ext.h"
+#endif /* !_DB_TCL_DB_H_ */
diff --git a/src/dbinc/txn.h b/src/dbinc/txn.h
new file mode 100644
index 00000000..7cbae263
--- /dev/null
+++ b/src/dbinc/txn.h
@@ -0,0 +1,288 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_TXN_H_
+#define _DB_TXN_H_
+
+#include "dbinc/xa.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Operation parameters to the delayed commit processing code. */
+typedef enum {
+ TXN_CLOSE, /* Close a DB handle whose close had failed. */
+ TXN_REMOVE, /* Remove a file. */
+ TXN_TRADE, /* Trade lockers. */
+ TXN_TRADED, /* Already traded; downgrade lock. */
+ TXN_XTRADE /* Trade lockers on exclusive db handle. */
+} TXN_EVENT_T;
+
+struct __db_txnregion; typedef struct __db_txnregion DB_TXNREGION;
+struct __db_txn_stat_int;
+typedef struct __db_txn_stat_int DB_TXN_STAT_INT;
+struct __txn_logrec; typedef struct __txn_logrec DB_TXNLOGREC;
+
+/*
+ * !!!
+ * TXN_MINIMUM = (DB_LOCK_MAXID + 1) but this makes compilers complain.
+ */
+#define TXN_MINIMUM 0x80000000
+#define TXN_MAXIMUM 0xffffffff /* Maximum txn ID. */
+#define TXN_INVALID 0 /* Invalid transaction ID. */
+
+#define DEF_MAX_TXNS 100 /* Default max transactions. */
+#define TXN_NSLOTS 4 /* Initial slots to hold DB refs */
+
+#define TXN_PRIORITY_DEFAULT DB_LOCK_DEFPRIORITY
+
+/*
+ * This structure must contain the same fields as the __db_txn_stat struct
+ * except for any pointer fields that are filled in only when the struct is
+ * being populated for output through the API.
+ */
+DB_ALIGN8 struct __db_txn_stat_int { /* SHARED */
+ u_int32_t st_nrestores; /* number of restored transactions
+ after recovery. */
+#ifndef __TEST_DB_NO_STATISTICS
+ DB_LSN st_last_ckp; /* lsn of the last checkpoint */
+ time_t st_time_ckp; /* time of last checkpoint */
+ u_int32_t st_last_txnid; /* last transaction id given out */
+ u_int32_t st_inittxns; /* initial txns allocated */
+ u_int32_t st_maxtxns; /* maximum txns possible */
+ uintmax_t st_naborts; /* number of aborted transactions */
+ uintmax_t st_nbegins; /* number of begun transactions */
+ uintmax_t st_ncommits; /* number of committed transactions */
+ u_int32_t st_nactive; /* number of active transactions */
+ u_int32_t st_nsnapshot; /* number of snapshot transactions */
+ u_int32_t st_maxnactive; /* maximum active transactions */
+ u_int32_t st_maxnsnapshot; /* maximum snapshot transactions */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ roff_t st_regsize; /* Region size. */
+#endif
+};
+
+/*
+ * Internal data maintained in shared memory for each transaction.
+ */
+typedef struct __txn_detail {
+	u_int32_t txnid; /* Current transaction ID;
+	 also used to link the free list. */
+ pid_t pid; /* Process owning txn */
+ db_threadid_t tid; /* Thread owning txn */
+
+ DB_LSN last_lsn; /* Last LSN written for this txn. */
+ DB_LSN begin_lsn; /* LSN of begin record. */
+ roff_t parent; /* Offset of transaction's parent. */
+ roff_t name; /* Offset of txn name. */
+
+ u_int32_t nlog_dbs; /* Number of databases used. */
+ u_int32_t nlog_slots; /* Number of allocated slots. */
+ roff_t log_dbs; /* Databases used. */
+
+ DB_LSN read_lsn; /* Read LSN for MVCC. */
+ DB_LSN visible_lsn; /* LSN at which this transaction's
+ changes are visible. */
+ db_mutex_t mvcc_mtx; /* Version mutex. */
+ u_int32_t mvcc_ref; /* Number of buffers created by this
+ transaction still in cache. */
+
+ u_int32_t priority; /* Deadlock resolution priority. */
+
+ SH_TAILQ_HEAD(__tdkids) kids; /* Linked list of child txn detail. */
+ SH_TAILQ_ENTRY klinks;
+
+	/* TXN_{ABORTED, COMMITTED, PREPARED, RUNNING} */
+ u_int32_t status; /* status of the transaction */
+
+#define TXN_DTL_COLLECTED 0x01 /* collected during txn_recover */
+#define TXN_DTL_RESTORED 0x02 /* prepared txn restored */
+#define TXN_DTL_INMEMORY 0x04 /* uses in memory logs */
+#define TXN_DTL_SNAPSHOT 0x08 /* On the list of snapshot txns. */
+#define TXN_DTL_NOWAIT 0x10 /* Don't block on locks. */
+ u_int32_t flags;
+
+ SH_TAILQ_ENTRY links; /* active/free/snapshot list */
+
+ u_int32_t xa_ref; /* XA: reference count; number
+ of DB_TXNs reffing this struct */
+ /* TXN_XA_{ACTIVE, DEADLOCKED, IDLE, PREPARED, ROLLEDBACK} */
+ u_int32_t xa_br_status; /* status of XA branch */
+ u_int8_t gid[DB_GID_SIZE]; /* global transaction id */
+ u_int32_t bqual; /* bqual_length from XID */
+ u_int32_t gtrid; /* gtrid_length from XID */
+ int32_t format; /* XA format */
+ roff_t slots[TXN_NSLOTS]; /* Initial DB slot allocation. */
+} TXN_DETAIL;
+
+/*
+ * DB_TXNMGR --
+ * The transaction manager encapsulates the transaction system.
+ */
+struct __db_txnmgr {
+ /*
+ * These fields need to be protected for multi-threaded support.
+ *
+ * Lock list of active transactions (including the content of each
+ * TXN_DETAIL structure on the list).
+ */
+ db_mutex_t mutex;
+ /* List of active transactions. */
+ TAILQ_HEAD(_chain, __db_txn) txn_chain;
+
+ u_int32_t n_discards; /* Number of txns discarded. */
+
+ /* These fields are never updated after creation, so not protected. */
+ ENV *env; /* Environment. */
+ REGINFO reginfo; /* Region information. */
+};
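+
+/*
+ * Editorial sketch (not part of the original header): walking the
+ * active-transaction chain is done under the manager mutex; the variable
+ * names below are illustrative.
+ *
+ *	DB_TXN *txn;
+ *
+ *	MUTEX_LOCK(env, mgr->mutex);
+ *	TAILQ_FOREACH(txn, &mgr->txn_chain, links)
+ *		... examine or adjust the transaction ...;
+ *	MUTEX_UNLOCK(env, mgr->mutex);
+ */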
+
+/* Macros to lock/unlock the transaction region as a whole. */
+#define TXN_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((DB_TXNREGION *) \
+ (env)->tx_handle->reginfo.primary)->mtx_region)
+#define TXN_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((DB_TXNREGION *) \
+ (env)->tx_handle->reginfo.primary)->mtx_region)
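+
+/*
+ * Editorial note: an illustrative use of the macros above -- code that
+ * reads or updates shared DB_TXNREGION fields as a group brackets the
+ * access with the region lock, e.g.:
+ *
+ *	TXN_SYSTEM_LOCK(env);
+ *	region->curtxns++;
+ *	TXN_SYSTEM_UNLOCK(env);
+ *
+ * where "region" is the DB_TXNREGION primary pointer named in the macros.
+ */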
+
+/*
+ * DB_TXNREGION --
+ * The primary transaction data structure in the shared memory region.
+ */
+struct __db_txnregion { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ u_int32_t inittxns; /* initial number of active TXNs */
+ u_int32_t curtxns; /* current number of active TXNs */
+ u_int32_t maxtxns; /* maximum number of active TXNs */
+ u_int32_t last_txnid; /* last transaction id given out */
+ u_int32_t cur_maxid; /* current max unused id. */
+
+ db_mutex_t mtx_ckp; /* Single thread checkpoints. */
+ DB_LSN last_ckp; /* lsn of the last checkpoint */
+ time_t time_ckp; /* time of last checkpoint */
+
+ DB_TXN_STAT_INT stat; /* Statistics for txns. */
+
+ u_int32_t n_bulk_txn; /* Num. bulk txns in progress. */
+ u_int32_t n_hotbackup; /* Num. of outstanding backup notices. */
+
+#define TXN_IN_RECOVERY 0x01 /* environment is being recovered */
+ u_int32_t flags;
+ /* active TXN list */
+ SH_TAILQ_HEAD(__active) active_txn;
+ SH_TAILQ_HEAD(__mvcc) mvcc_txn;
+};
+
+/*
+ * DB_COMMIT_INFO --
+ * Meta-data uniquely describing a transaction commit across a replication
+ * group.
+ */
+struct __db_commit_info {
+ u_int32_t version; /* Stored format version. */
+ u_int32_t gen; /* Replication master generation. */
+ u_int32_t envid; /* Unique env ID of master. */
+ DB_LSN lsn; /* LSN of commit log record. */
+};
+
+/*
+ * DB_TXNLOGREC --
+ * An in-memory, linked-list copy of a log record.
+ */
+struct __txn_logrec {
+ STAILQ_ENTRY(__txn_logrec) links;/* Linked list. */
+
+ u_int8_t data[1]; /* Log record. */
+};
+
+/*
+ * Log record types. Note that these are *not* alphabetical. This is
+ * intentional so that we don't change the meaning of values between
+ * software upgrades.
+ *
+ * EXPECTED, UNEXPECTED, IGNORE, and OK are used in the txnlist functions.
+ * Here is an explanation of how the statuses are used.
+ *
+ * TXN_OK
+ * BEGIN records for transactions found on the txnlist during
+ * OPENFILES (BEGIN records are those with a prev_lsn of 0,0)
+ *
+ * TXN_COMMIT
+ * Transaction committed and should be rolled forward.
+ *
+ * TXN_ABORT
+ * This transaction's changes must be undone. Either there was
+ * never a prepare or commit record for this transaction OR there
+ * was a commit, but we are recovering to a timestamp or particular
+ * LSN and that point is before this transaction's commit.
+ *
+ * TXN_PREPARE
+ * Prepare record, but no commit record is in the log.
+ *
+ * TXN_IGNORE
+ * Generic meaning is that this transaction should not be
+ * processed during later recovery passes. We use it in a
+ * number of different ways:
+ *
+ * 1. We never saw its BEGIN record. Therefore, the logs have
+ * been reclaimed and we *know* that this transaction doesn't
+ * need to be aborted, because in order for it to be
+ * reclaimed, there must have been a subsequent checkpoint
+ * (and any dirty pages for this transaction made it to
+ * disk).
+ *
+ * 2. This is a child transaction that created a database.
+ * For some reason, we don't want to recreate that database
+ * (i.e., it already exists, or some database created
+ * after it already exists).
+ *
+ * 3. During the recovery open of subdatabases, if the master
+ * check fails, we mark the create of the subdb in the nested
+ * transaction TXN_IGNORE.
+ *
+ * 4. During a remove, the file with the name being removed isn't
+ * the file for which we are recovering a remove.
+ *
+ * TXN_EXPECTED
+ * After a successful open during recovery, we update the
+ * transaction's status to TXN_EXPECTED. The open was done
+ * in the parent, but in the open log record, we record the
+ * child transaction's ID if we also did a create. When there
+ * is a valid ID in that field, we use it and mark the child's
+ * status as TXN_EXPECTED (indicating that we don't need to redo
+ * a create for this file).
+ *
+ * When recovering a remove, if we don't find or can't open
+ * the file, the child (which does the remove) gets marked
+ * EXPECTED (indicating that we don't need to redo the remove).
+ *
+ * TXN_UNEXPECTED
+ * During recovery, we attempted an open that should have succeeded
+ * and we got ENOENT, so, as in the EXPECTED case, we record
+ * the UNEXPECTED status in the child so that we do redo the
+ * creating/deleting operation.
+ *
+ */
+#define TXN_OK 0
+#define TXN_COMMIT 1
+#define TXN_PREPARE 2
+#define TXN_ABORT 3
+#define TXN_IGNORE 4
+#define TXN_EXPECTED 5
+#define TXN_UNEXPECTED 6
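+
+/*
+ * Editorial sketch (not part of the original header): recovery-pass code
+ * dispatches on these statuses along the following lines; the shape is
+ * illustrative only.
+ *
+ *	switch (status) {
+ *	case TXN_COMMIT:
+ *		... roll the record forward ...
+ *		break;
+ *	case TXN_ABORT:
+ *		... undo the record ...
+ *		break;
+ *	case TXN_IGNORE:
+ *	case TXN_EXPECTED:
+ *		... skip the record ...
+ *		break;
+ *	default:
+ *		... e.g., TXN_UNEXPECTED: redo the operation ...
+ *		break;
+ *	}
+ */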
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/txn_auto.h"
+#include "dbinc_auto/txn_ext.h"
+#endif /* !_DB_TXN_H_ */
diff --git a/src/dbinc/win_db.h b/src/dbinc/win_db.h
new file mode 100644
index 00000000..ba57cd1f
--- /dev/null
+++ b/src/dbinc/win_db.h
@@ -0,0 +1,148 @@
+/*-
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * The following provides the information necessary to build Berkeley
+ * DB on native Windows and other Windows environments such as MinGW.
+ */
+
+/*
+ * Berkeley DB requires at least Windows 2000, tell Visual Studio of the
+ * requirement.
+ */
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0500
+#endif
+
+#ifndef DB_WINCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/timeb.h>
+
+#include <direct.h>
+#include <fcntl.h>
+#include <io.h>
+#include <limits.h>
+#include <memory.h>
+#include <process.h>
+#include <signal.h>
+#endif /* DB_WINCE */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tchar.h>
+#include <time.h>
+
+/*
+ * To build Tcl interface libraries, the include path must be configured to
+ * use the directory containing <tcl.h>, usually the include directory in
+ * the Tcl distribution.
+ */
+#ifdef DB_TCL_SUPPORT
+#include <tcl.h>
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winsock2.h>
+#ifndef DB_WINCE
+#include <WinIoCtl.h>
+#endif
+
+#ifdef HAVE_GETADDRINFO
+/*
+ * Need explicit includes for IPv6 support on Windows. Both are necessary to
+ * ensure that pre-WinXP versions have an implementation of the getaddrinfo API.
+ */
+#include <ws2tcpip.h>
+#include <wspiapi.h>
+#endif
+
+/*
+ * Microsoft's C runtime library has fsync, getcwd, getpid, snprintf and
+ * vsnprintf, but under different names.
+ */
+#define fsync _commit
+
+#ifndef DB_WINCE
+#define getcwd(buf, size) _getcwd(buf, size)
+#endif
+#define getpid GetCurrentProcessId
+#define snprintf _snprintf
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define vsnprintf _vsnprintf
+
+#define h_errno WSAGetLastError()
+
+/*
+ * Win32 does not have getopt.
+ *
+ * The externs are here, instead of using db_config.h and clib_port.h, because
+ * that approach changes function names to BDB specific names, and the example
+ * programs use getopt and can't use BDB specific names.
+ */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+extern int getopt(int, char * const *, const char *);
+#if defined(__cplusplus)
+}
+#endif
+
+/*
+ * Microsoft's compiler _doesn't_ define __STDC__ unless you invoke it with
+ * arguments turning OFF all vendor extensions. Even more unfortunately, if
+ * we do that, it fails to parse windows.h. So, we define __STDC__ here,
+ * after windows.h comes in. Note: the compiler knows we've defined it, and
+ * starts enforcing strict ANSI compliance from this point on.
+ */
+#ifndef __STDC__
+#define __STDC__ 1
+#endif
+
+#ifdef _UNICODE
+#define TO_TSTRING(dbenv, s, ts, ret) do { \
+ int __len = (int)strlen(s) + 1; \
+ ts = NULL; \
+ if ((ret = __os_malloc((dbenv), \
+ __len * sizeof(_TCHAR), &(ts))) == 0 && \
+ MultiByteToWideChar(CP_UTF8, 0, \
+ (s), -1, (ts), __len) == 0) \
+ ret = __os_posix_err(__os_get_syserr()); \
+ } while (0)
+
+#define FROM_TSTRING(dbenv, ts, s, ret) do { \
+ int __len = WideCharToMultiByte(CP_UTF8, 0, ts, -1, \
+ NULL, 0, NULL, NULL); \
+ s = NULL; \
+ if ((ret = __os_malloc((dbenv), __len, &(s))) == 0 && \
+ WideCharToMultiByte(CP_UTF8, 0, \
+ (ts), -1, (s), __len, NULL, NULL) == 0) \
+ ret = __os_posix_err(__os_get_syserr()); \
+ } while (0)
+
+#define FREE_STRING(dbenv, s) do { \
+ if ((s) != NULL) { \
+ __os_free((dbenv), (s)); \
+ (s) = NULL; \
+ } \
+ } while (0)
+
+#else
+#define TO_TSTRING(dbenv, s, ts, ret) (ret) = 0, (ts) = (_TCHAR *)(s)
+#define FROM_TSTRING(dbenv, ts, s, ret) (ret) = 0, (s) = (char *)(ts)
+#define FREE_STRING(dbenv, ts)
+#endif
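+
+/*
+ * Editorial sketch: typical use of the conversion macros above when
+ * passing a UTF-8 string to a TCHAR-based Win32 API; names other than
+ * the macros themselves are hypothetical.
+ *
+ *	_TCHAR *tpath;
+ *	int ret;
+ *
+ *	TO_TSTRING(dbenv, path, tpath, ret);
+ *	if (ret == 0) {
+ *		hfile = CreateFile(tpath, ...);
+ *		FREE_STRING(dbenv, tpath);
+ *	}
+ */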
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE ((HANDLE)-1)
+#endif
+
+#ifndef INVALID_FILE_ATTRIBUTES
+#define INVALID_FILE_ATTRIBUTES ((DWORD)-1)
+#endif
+
+#ifndef INVALID_SET_FILE_POINTER
+#define INVALID_SET_FILE_POINTER ((DWORD)-1)
+#endif
diff --git a/src/dbinc/xa.h b/src/dbinc/xa.h
new file mode 100644
index 00000000..7283c1ea
--- /dev/null
+++ b/src/dbinc/xa.h
@@ -0,0 +1,183 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef _DB_XA_H_
+#define _DB_XA_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#define XIDDATASIZE 128 /* size in bytes */
+#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */
+
+struct xid_t {
+ long formatID; /* format identifier */
+ long gtrid_length; /* value from 1 through 64 */
+ long bqual_length; /* value from 1 through 64 */
+ char data[XIDDATASIZE];
+};
+typedef struct xid_t XID;
+/*
+ * A value of -1 in formatID means that the XID is null.
+ */
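+
+/*
+ * Editorial example (an illustration, not from the XA specification text
+ * above): a null XID can therefore be built as
+ *
+ *	XID xid;
+ *
+ *	memset(&xid, 0, sizeof(xid));
+ *	xid.formatID = -1;
+ */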
+
+/*
+ * Declarations of routines by which RMs call TMs:
+ */
+extern int ax_reg __P((int, XID *, long));
+extern int ax_unreg __P((int, long));
+
+/*
+ * XA Switch Data Structure
+ */
+#define RMNAMESZ 32 /* length of resource manager name, */
+ /* including the null terminator */
+#define MAXINFOSIZE 256 /* maximum size in bytes of xa_info */
+ /* strings, including the null
+ terminator */
+struct xa_switch_t {
+ char name[RMNAMESZ]; /* name of resource manager */
+ long flags; /* resource manager specific options */
+ long version; /* must be 0 */
+ int (*xa_open_entry) /* xa_open function pointer */
+ __P((char *, int, long));
+ int (*xa_close_entry) /* xa_close function pointer */
+ __P((char *, int, long));
+ int (*xa_start_entry) /* xa_start function pointer */
+ __P((XID *, int, long));
+ int (*xa_end_entry) /* xa_end function pointer */
+ __P((XID *, int, long));
+ int (*xa_rollback_entry) /* xa_rollback function pointer */
+ __P((XID *, int, long));
+ int (*xa_prepare_entry) /* xa_prepare function pointer */
+ __P((XID *, int, long));
+ int (*xa_commit_entry) /* xa_commit function pointer */
+ __P((XID *, int, long));
+ int (*xa_recover_entry) /* xa_recover function pointer */
+ __P((XID *, long, int, long));
+ int (*xa_forget_entry) /* xa_forget function pointer */
+ __P((XID *, int, long));
+ int (*xa_complete_entry) /* xa_complete function pointer */
+ __P((int *, int *, int, long));
+};
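+
+/*
+ * Editorial sketch: a resource manager typically exports one statically
+ * initialized switch; the entry-point names here are hypothetical.
+ *
+ *	const struct xa_switch_t my_rm_switch = {
+ *		"MyRM", TMNOFLAGS, 0,
+ *		my_xa_open, my_xa_close, my_xa_start, my_xa_end,
+ *		my_xa_rollback, my_xa_prepare, my_xa_commit,
+ *		my_xa_recover, my_xa_forget, my_xa_complete
+ *	};
+ */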
+
+/*
+ * Flag definitions for the RM switch
+ */
+#define TMNOFLAGS 0x00000000L /* no resource manager features
+ selected */
+#define TMREGISTER 0x00000001L /* resource manager dynamically
+ registers */
+#define TMNOMIGRATE 0x00000002L /* resource manager does not support
+ association migration */
+#define TMUSEASYNC 0x00000004L /* resource manager supports
+ asynchronous operations */
+/*
+ * Flag definitions for xa_ and ax_ routines
+ */
+/* use TMNOFLAGS, defined above, when not specifying other flags */
+#define TMASYNC 0x80000000L /* perform routine asynchronously */
+#define TMONEPHASE 0x40000000L /* caller is using one-phase commit
+ optimisation */
+#define TMFAIL 0x20000000L /* dissociates caller and marks
+ transaction branch rollback-only */
+#define TMNOWAIT 0x10000000L /* return if blocking condition
+ exists */
+#define TMRESUME 0x08000000L /* caller is resuming association with
+ suspended transaction branch */
+#define TMSUCCESS 0x04000000L /* dissociate caller from transaction
+ branch */
+#define TMSUSPEND 0x02000000L /* caller is suspending, not ending,
+ association */
+#define TMSTARTRSCAN 0x01000000L /* start a recovery scan */
+#define TMENDRSCAN 0x00800000L /* end a recovery scan */
+#define TMMULTIPLE 0x00400000L /* wait for any asynchronous
+ operation */
+#define TMJOIN 0x00200000L /* caller is joining existing
+ transaction branch */
+#define TMMIGRATE 0x00100000L /* caller intends to perform
+ migration */
+
+/*
+ * ax_() return codes (transaction manager reports to resource manager)
+ */
+#define TM_JOIN 2 /* caller is joining existing
+ transaction branch */
+#define TM_RESUME 1 /* caller is resuming association with
+ suspended transaction branch */
+#define TM_OK 0 /* normal execution */
+#define TMER_TMERR -1 /* an error occurred in the transaction
+ manager */
+#define TMER_INVAL -2 /* invalid arguments were given */
+#define TMER_PROTO -3 /* routine invoked in an improper
+ context */
+
+/*
+ * xa_() return codes (resource manager reports to transaction manager)
+ */
+#define XA_RBBASE 100 /* The inclusive lower bound of the
+ rollback codes */
+#define XA_RBROLLBACK XA_RBBASE /* The rollback was caused by an
+ unspecified reason */
+#define XA_RBCOMMFAIL XA_RBBASE+1 /* The rollback was caused by a
+ communication failure */
+#define XA_RBDEADLOCK XA_RBBASE+2 /* A deadlock was detected */
+#define XA_RBINTEGRITY XA_RBBASE+3 /* A condition that violates the
+ integrity of the resources was
+ detected */
+#define XA_RBOTHER XA_RBBASE+4 /* The resource manager rolled back the
+ transaction branch for a reason not
+ on this list */
+#define XA_RBPROTO XA_RBBASE+5 /* A protocol error occurred in the
+ resource manager */
+#define XA_RBTIMEOUT XA_RBBASE+6 /* A transaction branch took too long */
+#define XA_RBTRANSIENT XA_RBBASE+7 /* May retry the transaction branch */
+#define XA_RBEND XA_RBTRANSIENT /* The inclusive upper bound of the
+ rollback codes */
+#define XA_NOMIGRATE 9 /* resumption must occur where
+ suspension occurred */
+#define XA_HEURHAZ 8 /* the transaction branch may have
+ been heuristically completed */
+#define XA_HEURCOM 7 /* the transaction branch has been
+ heuristically committed */
+#define XA_HEURRB 6 /* the transaction branch has been
+ heuristically rolled back */
+#define XA_HEURMIX 5 /* the transaction branch has been
+ heuristically committed and rolled
+ back */
+#define XA_RETRY 4 /* routine returned with no effect and
+ may be re-issued */
+#define XA_RDONLY 3 /* the transaction branch was read-only
+ and has been committed */
+#define XA_OK 0 /* normal execution */
+#define XAER_ASYNC -2 /* asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /* a resource manager error occurred in
+ the transaction branch */
+#define XAER_NOTA -4 /* the XID is not valid */
+#define XAER_INVAL -5 /* invalid arguments were given */
+#define XAER_PROTO -6 /* routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /* resource manager unavailable */
+#define XAER_DUPID -8 /* the XID already exists */
+#define XAER_OUTSIDE -9 /* resource manager doing work outside
+ transaction */
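+
+/*
+ * Editorial note: the rollback codes form a contiguous range, so a return
+ * code can be classified with a simple range check; the macro below is an
+ * illustration, not part of the XA specification.
+ *
+ *	#define XA_IS_ROLLBACK(rc) ((rc) >= XA_RBBASE && (rc) <= XA_RBEND)
+ */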
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_XA_H_ */
diff --git a/src/dbinc_auto/api_flags.in b/src/dbinc_auto/api_flags.in
new file mode 100644
index 00000000..9727ede2
--- /dev/null
+++ b/src/dbinc_auto/api_flags.in
@@ -0,0 +1,228 @@
+/* DO NOT EDIT: automatically built by dist/s_apiflags. */
+#define DB_AGGRESSIVE 0x00000001
+#define DB_ARCH_ABS 0x00000001
+#define DB_ARCH_DATA 0x00000002
+#define DB_ARCH_LOG 0x00000004
+#define DB_ARCH_REMOVE 0x00000008
+#define DB_AUTO_COMMIT 0x00000100
+#define DB_BACKUP_CLEAN 0x00000002
+#define DB_BACKUP_FILES 0x00000008
+#define DB_BACKUP_NO_LOGS 0x00000010
+#define DB_BACKUP_SINGLE_DIR 0x00000020
+#define DB_BACKUP_UPDATE 0x00000040
+#define DB_BOOTSTRAP_HELPER 0x00000001
+#define DB_CDB_ALLDB 0x00000040
+#define DB_CHKSUM 0x00000008
+#define DB_CKP_INTERNAL 0x00000002
+#define DB_CREATE 0x00000001
+#define DB_CURSOR_BULK 0x00000001
+#define DB_CURSOR_TRANSIENT 0x00000008
+#define DB_CXX_NO_EXCEPTIONS 0x00000002
+#define DB_DATABASE_LOCKING 0x00000080
+#define DB_DIRECT 0x00000020
+#define DB_DIRECT_DB 0x00000200
+#define DB_DSYNC_DB 0x00000400
+#define DB_DUP 0x00000010
+#define DB_DUPSORT 0x00000002
+#define DB_DURABLE_UNKNOWN 0x00000040
+#define DB_ENCRYPT 0x00000001
+#define DB_ENCRYPT_AES 0x00000001
+#define DB_EXCL 0x00000004
+#define DB_EXTENT 0x00000100
+#define DB_FAILCHK 0x00000010
+#define DB_FAILCHK_ISALIVE 0x00000040
+#define DB_FAST_STAT 0x00000001
+#define DB_FCNTL_LOCKING 0x00000800
+#define DB_FLUSH 0x00000002
+#define DB_FORCE 0x00000001
+#define DB_FORCESYNC 0x00000001
+#define DB_FOREIGN_ABORT 0x00000001
+#define DB_FOREIGN_CASCADE 0x00000002
+#define DB_FOREIGN_NULLIFY 0x00000004
+#define DB_FREELIST_ONLY 0x00000001
+#define DB_FREE_SPACE 0x00000002
+#define DB_GROUP_CREATOR 0x00000002
+#define DB_HOTBACKUP_IN_PROGRESS 0x00000800
+#define DB_IGNORE_LEASE 0x00001000
+#define DB_IMMUTABLE_KEY 0x00000002
+#define DB_INIT_CDB 0x00000080
+#define DB_INIT_LOCK 0x00000100
+#define DB_INIT_LOG 0x00000200
+#define DB_INIT_MPOOL 0x00000400
+#define DB_INIT_MUTEX 0x00000800
+#define DB_INIT_REP 0x00001000
+#define DB_INIT_TXN 0x00002000
+#define DB_INORDER 0x00000020
+#define DB_INTERNAL_PERSISTENT_DB 0x00001000
+#define DB_INTERNAL_TEMPORARY_DB 0x00002000
+#define DB_JOIN_NOSORT 0x00000001
+#define DB_LEGACY 0x00000004
+#define DB_LOCAL_SITE 0x00000008
+#define DB_LOCKDOWN 0x00004000
+#define DB_LOCK_CHECK 0x00000001
+#define DB_LOCK_IGNORE_REC 0x00000002
+#define DB_LOCK_NOWAIT 0x00000004
+#define DB_LOCK_RECORD 0x00000008
+#define DB_LOCK_SET_TIMEOUT 0x00000010
+#define DB_LOCK_SWITCH 0x00000020
+#define DB_LOCK_UPGRADE 0x00000040
+#define DB_LOG_AUTO_REMOVE 0x00000001
+#define DB_LOG_CHKPNT 0x00000001
+#define DB_LOG_COMMIT 0x00000004
+#define DB_LOG_DIRECT 0x00000002
+#define DB_LOG_DSYNC 0x00000004
+#define DB_LOG_IN_MEMORY 0x00000008
+#define DB_LOG_NOCOPY 0x00000008
+#define DB_LOG_NOT_DURABLE 0x00000010
+#define DB_LOG_NO_DATA 0x00000002
+#define DB_LOG_VERIFY_CAF 0x00000001
+#define DB_LOG_VERIFY_DBFILE 0x00000002
+#define DB_LOG_VERIFY_ERR 0x00000004
+#define DB_LOG_VERIFY_FORWARD 0x00000008
+#define DB_LOG_VERIFY_INTERR 0x00000010
+#define DB_LOG_VERIFY_PARTIAL 0x00000020
+#define DB_LOG_VERIFY_VERBOSE 0x00000040
+#define DB_LOG_VERIFY_WARNING 0x00000080
+#define DB_LOG_WRNOSYNC 0x00000020
+#define DB_LOG_ZERO 0x00000010
+#define DB_MPOOL_CREATE 0x00000001
+#define DB_MPOOL_DIRTY 0x00000002
+#define DB_MPOOL_DISCARD 0x00000001
+#define DB_MPOOL_EDIT 0x00000004
+#define DB_MPOOL_FREE 0x00000008
+#define DB_MPOOL_LAST 0x00000010
+#define DB_MPOOL_NEW 0x00000020
+#define DB_MPOOL_NOFILE 0x00000001
+#define DB_MPOOL_NOLOCK 0x00000004
+#define DB_MPOOL_TRY 0x00000040
+#define DB_MPOOL_UNLINK 0x00000002
+#define DB_MULTIPLE 0x00000800
+#define DB_MULTIPLE_KEY 0x00004000
+#define DB_MULTIVERSION 0x00000008
+#define DB_MUTEX_ALLOCATED 0x00000001
+#define DB_MUTEX_LOCKED 0x00000002
+#define DB_MUTEX_LOGICAL_LOCK 0x00000004
+#define DB_MUTEX_PROCESS_ONLY 0x00000008
+#define DB_MUTEX_SELF_BLOCK 0x00000010
+#define DB_MUTEX_SHARED 0x00000020
+#define DB_NOERROR 0x00004000
+#define DB_NOFLUSH 0x00001000
+#define DB_NOLOCKING 0x00002000
+#define DB_NOMMAP 0x00000010
+#define DB_NOORDERCHK 0x00000002
+#define DB_NOPANIC 0x00004000
+#define DB_NOSYNC 0x00000001
+#define DB_NO_AUTO_COMMIT 0x00008000
+#define DB_NO_CHECKPOINT 0x00008000
+#define DB_ODDFILESIZE 0x00000080
+#define DB_ORDERCHKONLY 0x00000004
+#define DB_OVERWRITE 0x00008000
+#define DB_PANIC_ENVIRONMENT 0x00010000
+#define DB_PRINTABLE 0x00000008
+#define DB_PRIVATE 0x00010000
+#define DB_PR_PAGE 0x00000010
+#define DB_PR_RECOVERYTEST 0x00000020
+#define DB_RDONLY 0x00000400
+#define DB_RDWRMASTER 0x00010000
+#define DB_READ_COMMITTED 0x00000400
+#define DB_READ_UNCOMMITTED 0x00000200
+#define DB_RECNUM 0x00000040
+#define DB_RECOVER 0x00000002
+#define DB_RECOVER_FATAL 0x00020000
+#define DB_REGION_INIT 0x00020000
+#define DB_REGISTER 0x00040000
+#define DB_RENUMBER 0x00000080
+#define DB_REPMGR_CONF_2SITE_STRICT 0x00000001
+#define DB_REPMGR_CONF_ELECTIONS 0x00000002
+#define DB_REPMGR_NEED_RESPONSE 0x00000001
+#define DB_REPMGR_PEER 0x00000010
+#define DB_REP_ANYWHERE 0x00000001
+#define DB_REP_CLIENT 0x00000001
+#define DB_REP_CONF_AUTOINIT 0x00000004
+#define DB_REP_CONF_AUTOROLLBACK 0x00000008
+#define DB_REP_CONF_BULK 0x00000010
+#define DB_REP_CONF_DELAYCLIENT 0x00000020
+#define DB_REP_CONF_INMEM 0x00000040
+#define DB_REP_CONF_LEASE 0x00000080
+#define DB_REP_CONF_NOWAIT 0x00000100
+#define DB_REP_ELECTION 0x00000004
+#define DB_REP_MASTER 0x00000002
+#define DB_REP_NOBUFFER 0x00000002
+#define DB_REP_PERMANENT 0x00000004
+#define DB_REP_REREQUEST 0x00000008
+#define DB_REVSPLITOFF 0x00000100
+#define DB_RMW 0x00002000
+#define DB_SALVAGE 0x00000040
+#define DB_SA_SKIPFIRSTKEY 0x00000080
+#define DB_SA_UNKNOWNKEY 0x00000100
+#define DB_SEQ_DEC 0x00000001
+#define DB_SEQ_INC 0x00000002
+#define DB_SEQ_RANGE_SET 0x00000004
+#define DB_SEQ_WRAP 0x00000008
+#define DB_SEQ_WRAPPED 0x00000010
+#define DB_SET_LOCK_TIMEOUT 0x00000001
+#define DB_SET_REG_TIMEOUT 0x00000004
+#define DB_SET_TXN_NOW 0x00000008
+#define DB_SET_TXN_TIMEOUT 0x00000002
+#define DB_SHALLOW_DUP 0x00000100
+#define DB_SNAPSHOT 0x00000200
+#define DB_STAT_ALL 0x00000004
+#define DB_STAT_ALLOC 0x00000008
+#define DB_STAT_CLEAR 0x00000001
+#define DB_STAT_LOCK_CONF 0x00000010
+#define DB_STAT_LOCK_LOCKERS 0x00000020
+#define DB_STAT_LOCK_OBJECTS 0x00000040
+#define DB_STAT_LOCK_PARAMS 0x00000080
+#define DB_STAT_MEMP_HASH 0x00000010
+#define DB_STAT_MEMP_NOERROR 0x00000020
+#define DB_STAT_SUBSYSTEM 0x00000002
+#define DB_STAT_SUMMARY 0x00000010
+#define DB_ST_DUPOK 0x00000200
+#define DB_ST_DUPSET 0x00000400
+#define DB_ST_DUPSORT 0x00000800
+#define DB_ST_IS_RECNO 0x00001000
+#define DB_ST_OVFL_LEAF 0x00002000
+#define DB_ST_RECNUM 0x00004000
+#define DB_ST_RELEN 0x00008000
+#define DB_ST_TOPLEVEL 0x00010000
+#define DB_SYSTEM_MEM 0x00080000
+#define DB_THREAD 0x00000020
+#define DB_TIME_NOTGRANTED 0x00040000
+#define DB_TRUNCATE 0x00020000
+#define DB_TXN_BULK 0x00000010
+#define DB_TXN_FAMILY 0x00000040
+#define DB_TXN_NOSYNC 0x00000001
+#define DB_TXN_NOT_DURABLE 0x00000004
+#define DB_TXN_NOWAIT 0x00000002
+#define DB_TXN_SNAPSHOT 0x00000004
+#define DB_TXN_SYNC 0x00000008
+#define DB_TXN_WAIT 0x00000080
+#define DB_TXN_WRITE_NOSYNC 0x00000020
+#define DB_UNREF 0x00020000
+#define DB_UPGRADE 0x00000001
+#define DB_USE_ENVIRON 0x00000004
+#define DB_USE_ENVIRON_ROOT 0x00000008
+#define DB_VERB_BACKUP 0x00000001
+#define DB_VERB_DEADLOCK 0x00000002
+#define DB_VERB_FILEOPS 0x00000004
+#define DB_VERB_FILEOPS_ALL 0x00000008
+#define DB_VERB_RECOVERY 0x00000010
+#define DB_VERB_REGISTER 0x00000020
+#define DB_VERB_REPLICATION 0x00000040
+#define DB_VERB_REPMGR_CONNFAIL 0x00000080
+#define DB_VERB_REPMGR_MISC 0x00000100
+#define DB_VERB_REP_ELECT 0x00000200
+#define DB_VERB_REP_LEASE 0x00000400
+#define DB_VERB_REP_MISC 0x00000800
+#define DB_VERB_REP_MSGS 0x00001000
+#define DB_VERB_REP_SYNC 0x00002000
+#define DB_VERB_REP_SYSTEM 0x00004000
+#define DB_VERB_REP_TEST 0x00008000
+#define DB_VERB_WAITSFOR 0x00010000
+#define DB_VERIFY 0x00000002
+#define DB_VERIFY_PARTITION 0x00040000
+#define DB_WRITECURSOR 0x00000010
+#define DB_WRITELOCK 0x00000020
+#define DB_WRITEOPEN 0x00040000
+#define DB_XA_CREATE 0x00000001
+#define DB_YIELDCPU 0x00080000
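+
+/*
+ * Editorial note: these are bit masks, assigned so that the flags accepted
+ * by any one API do not collide; callers OR them together, e.g., an
+ * illustrative DB_ENV->open call:
+ *
+ *	dbenv->open(dbenv, home,
+ *	    DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |
+ *	    DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER, 0);
+ */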
diff --git a/src/dbinc_auto/btree_auto.h b/src/dbinc_auto/btree_auto.h
new file mode 100644
index 00000000..e57551c7
--- /dev/null
+++ b/src/dbinc_auto/btree_auto.h
@@ -0,0 +1,456 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __bam_AUTO_H
+#define __bam_AUTO_H
+#include "dbinc/log.h"
+#define DB___bam_split 62
+typedef struct ___bam_split_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t opflags;
+ db_pgno_t left;
+ DB_LSN llsn;
+ db_pgno_t right;
+ DB_LSN rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ db_pgno_t ppgno;
+ DB_LSN plsn;
+ u_int32_t pindx;
+ DBT pg;
+ DBT pentry;
+ DBT rentry;
+} __bam_split_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_split_desc[];
+static inline int
+__bam_split_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t opflags, db_pgno_t left, DB_LSN * llsn, db_pgno_t right,
+ DB_LSN * rlsn, u_int32_t indx, db_pgno_t npgno, DB_LSN * nlsn, db_pgno_t ppgno,
+ DB_LSN * plsn, u_int32_t pindx, const DBT *pg, const DBT *pentry, const DBT *rentry)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_split, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*llsn) + sizeof(u_int32_t) + sizeof(*rlsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*nlsn) +
+ sizeof(u_int32_t) + sizeof(*plsn) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(pg) + LOG_DBT_SIZE(pentry) + LOG_DBT_SIZE(rentry),
+ __bam_split_desc, opflags, left, llsn, right, rlsn, indx, npgno,
+ nlsn, ppgno, plsn, pindx, pg, pentry, rentry));
+}
+
+static inline int __bam_split_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_split_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_split_desc, sizeof(__bam_split_args), (void**)arg));
+}
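+
+/*
+ * Editorial sketch (not part of the generated header): a recovery function
+ * pairs each *_read helper with its args structure, roughly as follows;
+ * error handling and the REC_* macros of the real recovery code are
+ * omitted, and the variable names are illustrative.
+ *
+ *	__bam_split_args *argp;
+ *	int ret;
+ *
+ *	if ((ret = __bam_split_read(env, &dbp, td, dbtp->data, &argp)) != 0)
+ *		return (ret);
+ *	... redo or undo using argp->left, argp->right, argp->npgno ...
+ *	__os_free(env, argp);
+ */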
+#define DB___bam_split_48 62
+typedef struct ___bam_split_48_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t left;
+ DB_LSN llsn;
+ db_pgno_t right;
+ DB_LSN rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ db_pgno_t ppgno;
+ DB_LSN plsn;
+ u_int32_t pindx;
+ DBT pg;
+ DBT pentry;
+ DBT rentry;
+ u_int32_t opflags;
+} __bam_split_48_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_split_48_desc[];
+static inline int __bam_split_48_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_split_48_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_split_48_desc, sizeof(__bam_split_48_args), (void**)arg));
+}
+#define DB___bam_split_42 62
+typedef struct ___bam_split_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t left;
+ DB_LSN llsn;
+ db_pgno_t right;
+ DB_LSN rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ db_pgno_t root_pgno;
+ DBT pg;
+ u_int32_t opflags;
+} __bam_split_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_split_42_desc[];
+static inline int __bam_split_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_split_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_split_42_desc, sizeof(__bam_split_42_args), (void**)arg));
+}
+#define DB___bam_rsplit 63
+typedef struct ___bam_rsplit_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT pgdbt;
+ db_pgno_t root_pgno;
+ db_pgno_t nrec;
+ DBT rootent;
+ DB_LSN rootlsn;
+} __bam_rsplit_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_rsplit_desc[];
+static inline int
+__bam_rsplit_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *pgdbt, db_pgno_t root_pgno, db_pgno_t nrec,
+ const DBT *rootent, DB_LSN * rootlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_rsplit, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(pgdbt) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(rootent) +
+ sizeof(*rootlsn),
+ __bam_rsplit_desc, pgno, pgdbt, root_pgno, nrec, rootent, rootlsn));
+}
+
+static inline int __bam_rsplit_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_rsplit_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_rsplit_desc, sizeof(__bam_rsplit_args), (void**)arg));
+}
+#define DB___bam_adj 55
+typedef struct ___bam_adj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ u_int32_t indx_copy;
+ u_int32_t is_insert;
+} __bam_adj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_adj_desc[];
+static inline int
+__bam_adj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, u_int32_t indx_copy,
+ u_int32_t is_insert)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_adj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_adj_desc, pgno, lsn, indx, indx_copy, is_insert));
+}
+
+static inline int __bam_adj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_adj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_adj_desc, sizeof(__bam_adj_args), (void**)arg));
+}
+#define DB___bam_cadjust 56
+typedef struct ___bam_cadjust_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ int32_t adjust;
+ u_int32_t opflags;
+} __bam_cadjust_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_cadjust_desc[];
+static inline int
+__bam_cadjust_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, int32_t adjust,
+ u_int32_t opflags)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_cadjust, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_cadjust_desc, pgno, lsn, indx, adjust, opflags));
+}
+
+static inline int __bam_cadjust_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_cadjust_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_cadjust_desc, sizeof(__bam_cadjust_args), (void**)arg));
+}
+#define DB___bam_cdel 57
+typedef struct ___bam_cdel_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+} __bam_cdel_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_cdel_desc[];
+static inline int
+__bam_cdel_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_cdel, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t),
+ __bam_cdel_desc, pgno, lsn, indx));
+}
+
+static inline int __bam_cdel_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_cdel_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_cdel_desc, sizeof(__bam_cdel_args), (void**)arg));
+}
+#define DB___bam_repl 58
+typedef struct ___bam_repl_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ u_int32_t isdeleted;
+ DBT orig;
+ DBT repl;
+ u_int32_t prefix;
+ u_int32_t suffix;
+} __bam_repl_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_repl_desc[];
+static inline int
+__bam_repl_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, u_int32_t isdeleted,
+ const DBT *orig, const DBT *repl, u_int32_t prefix, u_int32_t suffix)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_repl, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(orig) +
+ LOG_DBT_SIZE(repl) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_repl_desc, pgno, lsn, indx, isdeleted, orig, repl, prefix,
+ suffix));
+}
+
+static inline int __bam_repl_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_repl_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_repl_desc, sizeof(__bam_repl_args), (void**)arg));
+}
+#define DB___bam_irep 67
+typedef struct ___bam_irep_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ u_int32_t ptype;
+ DBT hdr;
+ DBT data;
+ DBT old;
+} __bam_irep_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_irep_desc[];
+static inline int
+__bam_irep_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, u_int32_t ptype,
+ const DBT *hdr, const DBT *data, const DBT *old)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_irep, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(data) + LOG_DBT_SIZE(old),
+ __bam_irep_desc, pgno, lsn, indx, ptype, hdr, data, old));
+}
+
+static inline int __bam_irep_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_irep_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_irep_desc, sizeof(__bam_irep_args), (void**)arg));
+}
+#define DB___bam_root 59
+typedef struct ___bam_root_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta_pgno;
+ db_pgno_t root_pgno;
+ DB_LSN meta_lsn;
+} __bam_root_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_root_desc[];
+static inline int
+__bam_root_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t meta_pgno, db_pgno_t root_pgno, DB_LSN * meta_lsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_root, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*meta_lsn),
+ __bam_root_desc, meta_pgno, root_pgno, meta_lsn));
+}
+
+static inline int __bam_root_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_root_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_root_desc, sizeof(__bam_root_args), (void**)arg));
+}
+#define DB___bam_curadj 64
+typedef struct ___bam_curadj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_ca_mode mode;
+ db_pgno_t from_pgno;
+ db_pgno_t to_pgno;
+ db_pgno_t left_pgno;
+ u_int32_t first_indx;
+ u_int32_t from_indx;
+ u_int32_t to_indx;
+} __bam_curadj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_curadj_desc[];
+static inline int
+__bam_curadj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_ca_mode mode, db_pgno_t from_pgno, db_pgno_t to_pgno, db_pgno_t left_pgno,
+ u_int32_t first_indx, u_int32_t from_indx, u_int32_t to_indx)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_curadj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_curadj_desc, mode, from_pgno, to_pgno, left_pgno, first_indx, from_indx, to_indx));
+}
+
+static inline int __bam_curadj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_curadj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_curadj_desc, sizeof(__bam_curadj_args), (void**)arg));
+}
+#define DB___bam_rcuradj 65
+typedef struct ___bam_rcuradj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ ca_recno_arg mode;
+ db_pgno_t root;
+ db_recno_t recno;
+ u_int32_t order;
+} __bam_rcuradj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_rcuradj_desc[];
+static inline int
+__bam_rcuradj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, ca_recno_arg mode, db_pgno_t root, db_recno_t recno, u_int32_t order)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_rcuradj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_rcuradj_desc, mode, root, recno, order));
+}
+
+static inline int __bam_rcuradj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_rcuradj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_rcuradj_desc, sizeof(__bam_rcuradj_args), (void**)arg));
+}
+#define DB___bam_relink_43 147
+typedef struct ___bam_relink_43_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t prev;
+ DB_LSN lsn_prev;
+ db_pgno_t next;
+ DB_LSN lsn_next;
+} __bam_relink_43_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_relink_43_desc[];
+static inline int __bam_relink_43_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_relink_43_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_relink_43_desc, sizeof(__bam_relink_43_args), (void**)arg));
+}
+#define DB___bam_merge_44 148
+typedef struct ___bam_merge_44_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ DBT hdr;
+ DBT data;
+ DBT ind;
+} __bam_merge_44_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_merge_44_desc[];
+static inline int __bam_merge_44_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_merge_44_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_merge_44_desc, sizeof(__bam_merge_44_args), (void**)arg));
+}
+#endif
diff --git a/src/dbinc_auto/btree_ext.h b/src/dbinc_auto/btree_ext.h
new file mode 100644
index 00000000..c90f5b80
--- /dev/null
+++ b/src/dbinc_auto/btree_ext.h
@@ -0,0 +1,147 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _btree_ext_h_
+#define _btree_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __bam_compact_int __P((DBC *, DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
+int __bam_compact_opd __P((DBC *, db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
+int __bam_truncate_ipages __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
+int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
+int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+int __bam_defcompress __P((DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *));
+int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *));
+int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t));
+int __bamc_compress_put __P((DBC *, DBT *, DBT *, u_int32_t));
+int __bamc_compress_del __P((DBC *, u_int32_t));
+int __bamc_compress_bulk_del __P((DBC *, DBT *, u_int32_t));
+int __bamc_compress_count __P((DBC *, db_recno_t *));
+int __bamc_compress_cmp __P((DBC *, DBC *, int *));
+int __bamc_compress_dup __P((DBC *, DBC *, u_int32_t));
+int __bam_compress_salvage __P((DB *, VRFY_DBINFO *, void *, int (*)(void *, const void *), DBT *, DBT *));
+int __bam_compress_count __P((DBC *, u_int32_t *, u_int32_t *));
+int __bam_pgin __P((DB *, db_pgno_t, void *, DBT *));
+int __bam_pgout __P((DB *, db_pgno_t, void *, DBT *));
+int __bam_mswap __P((ENV *, PAGE *));
+int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int, u_int32_t *));
+int __ram_ca_delete __P((DB *, db_pgno_t, u_int32_t *));
+int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+int __bam_ca_dup __P((DBC *, u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+int __bam_ca_undodup __P((DB *, u_int32_t, db_pgno_t, u_int32_t, u_int32_t));
+int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t));
+int __bam_ca_split __P((DBC *, db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+int __bam_ca_undosplit __P((DB *, db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t));
+int __bamc_init __P((DBC *, DBTYPE));
+int __bamc_refresh __P((DBC *));
+int __bamc_cmp __P((DBC *, DBC *, int *));
+int __bamc_count __P((DBC *, db_recno_t *));
+int __bamc_dup __P((DBC *, DBC *, u_int32_t));
+int __bam_bulk_overflow __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *));
+int __bam_bulk_duplicates __P((DBC *, db_pgno_t, u_int8_t *, int32_t *, int32_t **, u_int8_t **, u_int32_t *, int));
+int __bamc_rget __P((DBC *, DBT *));
+int __bam_opd_exists __P((DBC *, db_pgno_t));
+int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
+int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+int __bam_dpages __P((DBC *, int, int));
+int __bam_pupdate __P((DBC *, PAGE *));
+int __bam_db_create __P((DB *));
+int __bam_db_close __P((DB *));
+void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+int __bam_set_flags __P((DB *, u_int32_t *flagsp));
+int __bam_set_bt_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+int __bam_set_bt_compress __P((DB *, int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+int __bam_get_bt_minkey __P((DB *, u_int32_t *));
+void __bam_copy_config __P((DB *, DB*, u_int32_t));
+void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+int __ram_get_re_len __P((DB *, u_int32_t *));
+int __ram_get_re_pad __P((DB *, int *));
+int __bam_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, u_int32_t));
+int __bam_metachk __P((DB *, const char *, BTMETA *));
+int __bam_read_root __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+int __bam_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *, u_int32_t));
+int __bam_ritem_nolog __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *, u_int32_t));
+int __bam_irep __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *));
+int __bam_split_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_48_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rsplit_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_adj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cadjust_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cdel_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_repl_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_irep_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_root_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_curadj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rcuradj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_merge_44_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_relink_43_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+int __bam_truncate __P((DBC *, u_int32_t *));
+int __ram_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, u_int32_t));
+int __ram_append __P((DBC *, DBT *, DBT *));
+int __ramc_del __P((DBC *, u_int32_t));
+int __ramc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+int __ram_ca __P((DBC *, ca_recno_arg, int *));
+int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+int __ram_writeback __P((DB *));
+int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+int __bam_adjust __P((DBC *, int32_t));
+int __bam_nrecs __P((DBC *, db_recno_t *));
+db_recno_t __bam_total __P((DB *, PAGE *));
+int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
+int __bam_search __P((DBC *, db_pgno_t, const DBT *, u_int32_t, int, db_recno_t *, int *));
+int __bam_stkrel __P((DBC *, u_int32_t));
+int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
+int __bam_split __P((DBC *, void *, db_pgno_t *));
+int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *));
+int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+int __bam_pinsert __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
+int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+int __bam_stat __P((DBC *, void *, u_int32_t));
+int __bam_stat_print __P((DBC *, u_int32_t));
+int __bam_stat_callback __P((DBC *, PAGE *, void *, int *));
+void __bam_print_cursor __P((DBC *));
+int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+int __bam_traverse __P((DBC *, db_lockmode_t, db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+int __bam_31_btreemeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __bam_31_lbtree __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, db_pgno_t, u_int32_t));
+int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *, PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
+int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, void *, void *, u_int32_t));
+int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *, void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+int __bam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, PAGE *, void *, int (*)(void *, const void *), DBT *, u_int32_t));
+int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *, DBT *, void *, int (*)(void *, const void *), u_int32_t));
+int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *, u_int32_t, DB *));
+int __bam_init_recover __P((ENV *, DB_DISTAB *));
+int __bam_split_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_48_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rsplit_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_adj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cadjust_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cdel_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_repl_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_irep_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_root_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_curadj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rcuradj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_relink_43_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_merge_44_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_init_print __P((ENV *, DB_DISTAB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_btree_ext_h_ */
diff --git a/src/dbinc_auto/clib_ext.h b/src/dbinc_auto/clib_ext.h
new file mode 100644
index 00000000..c53be48c
--- /dev/null
+++ b/src/dbinc_auto/clib_ext.h
@@ -0,0 +1,113 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _clib_ext_h_
+#define _clib_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HAVE_ATOI
+int atoi __P((const char *));
+#endif
+#ifndef HAVE_ATOL
+long atol __P((const char *));
+#endif
+#ifndef HAVE_BSEARCH
+void *bsearch __P((const void *, const void *, size_t, size_t, int (*)(const void *, const void *)));
+#endif
+#ifndef HAVE_GETCWD
+char *getcwd __P((char *, size_t));
+#endif
+#ifndef HAVE_GETOPT
+int getopt __P((int, char * const *, const char *));
+#endif
+#ifndef HAVE_ISALPHA
+int isalpha __P((int));
+#endif
+#ifndef HAVE_ISDIGIT
+int isdigit __P((int));
+#endif
+#ifndef HAVE_ISPRINT
+int isprint __P((int));
+#endif
+#ifndef HAVE_ISSPACE
+int isspace __P((int));
+#endif
+#ifndef HAVE_MEMCMP
+int memcmp __P((const void *, const void *, size_t));
+#endif
+#ifndef HAVE_MEMCPY
+void *memcpy __P((void *, const void *, size_t));
+#endif
+#ifndef HAVE_MEMMOVE
+void *memmove __P((void *, const void *, size_t));
+#endif
+#ifndef HAVE_PRINTF
+int printf __P((const char *, ...));
+#endif
+#ifndef HAVE_PRINTF
+int fprintf __P((FILE *, const char *, ...));
+#endif
+#ifndef HAVE_PRINTF
+int vfprintf __P((FILE *, const char *, va_list));
+#endif
+#ifndef HAVE_QSORT
+void qsort __P((void *, size_t, size_t, int(*)(const void *, const void *)));
+#endif
+#ifndef HAVE_RAISE
+int raise __P((int));
+#endif
+#ifndef HAVE_RAND
+int rand __P((void));
+void srand __P((unsigned int));
+#endif
+#ifndef HAVE_SNPRINTF
+int snprintf __P((char *, size_t, const char *, ...));
+#endif
+#ifndef HAVE_VSNPRINTF
+int vsnprintf __P((char *, size_t, const char *, va_list));
+#endif
+#ifndef HAVE_STRCASECMP
+int strcasecmp __P((const char *, const char *));
+#endif
+#ifndef HAVE_STRCASECMP
+int strncasecmp __P((const char *, const char *, size_t));
+#endif
+#ifndef HAVE_STRCAT
+char *strcat __P((char *, const char *));
+#endif
+#ifndef HAVE_STRCHR
+char *strchr __P((const char *, int));
+#endif
+#ifndef HAVE_STRDUP
+char *strdup __P((const char *));
+#endif
+#ifndef HAVE_STRERROR
+char *strerror __P((int));
+#endif
+#ifndef HAVE_STRNCAT
+char *strncat __P((char *, const char *, size_t));
+#endif
+#ifndef HAVE_STRNCMP
+int strncmp __P((const char *, const char *, size_t));
+#endif
+#ifndef HAVE_STRRCHR
+char *strrchr __P((const char *, int));
+#endif
+#ifndef HAVE_STRSEP
+char *strsep __P((char **, const char *));
+#endif
+#ifndef HAVE_STRTOL
+long strtol __P((const char *, char **, int));
+#endif
+#ifndef HAVE_STRTOUL
+unsigned long strtoul __P((const char *, char **, int));
+#endif
+#ifndef HAVE_TIME
+time_t time __P((time_t *));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_clib_ext_h_ */
diff --git a/src/dbinc_auto/common_ext.h b/src/dbinc_auto/common_ext.h
new file mode 100644
index 00000000..ac16e9db
--- /dev/null
+++ b/src/dbinc_auto/common_ext.h
@@ -0,0 +1,75 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _common_ext_h_
+#define _common_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __clock_set_expires __P((ENV *, db_timespec *, db_timeout_t));
+int __clock_expired __P((ENV *, db_timespec *, db_timespec *));
+int __crypto_region_init __P((ENV *));
+int __db_isbigendian __P((void));
+int __db_byteorder __P((ENV *, int));
+u_int32_t __db_compress_count_int __P((u_int64_t));
+int __db_compress_int __P((u_int8_t *, u_int64_t));
+u_int32_t __db_decompress_count_int __P((const u_int8_t *));
+int __db_decompress_int __P((const u_int8_t *, u_int64_t *));
+int __db_decompress_int32 __P((const u_int8_t *, u_int32_t *));
+int __db_fchk __P((ENV *, const char *, u_int32_t, u_int32_t));
+int __db_fcchk __P((ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+int __db_ferr __P((const ENV *, const char *, int));
+int __db_fnl __P((const ENV *, const char *));
+int __db_pgerr __P((DB *, db_pgno_t, int));
+int __db_pgfmt __P((ENV *, db_pgno_t));
+#ifdef DIAGNOSTIC
+void __db_assert __P((ENV *, const char *, const char *, int));
+#endif
+int __env_panic_msg __P((ENV *));
+int __env_panic __P((ENV *, int));
+char *__db_unknown_error __P((int));
+void __db_syserr __P((const ENV *, int, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_err __P((const ENV *, int, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_errx __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+void __db_errcall __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+void __db_errfile __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_msgadd_ap __P((ENV *, DB_MSGBUF *, const char *, va_list));
+void __db_msg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+void __db_repmsg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+int __db_unknown_flag __P((ENV *, char *, u_int32_t));
+int __db_unknown_type __P((ENV *, char *, DBTYPE));
+int __db_unknown_path __P((ENV *, char *));
+int __db_check_txn __P((DB *, DB_TXN *, DB_LOCKER *, int));
+int __db_txn_deadlock_err __P((ENV *, DB_TXN *));
+int __db_not_txn_env __P((ENV *));
+int __db_rec_toobig __P((ENV *, u_int32_t, u_int32_t));
+int __db_rec_repl __P((ENV *, u_int32_t, u_int32_t));
+int __dbc_logging __P((DBC *));
+int __db_check_lsn __P((ENV *, DB_LSN *, DB_LSN *));
+int __db_rdonly __P((const ENV *, const char *));
+int __db_space_err __P((const DB *));
+int __db_failed __P((const ENV *, const char *, pid_t, db_threadid_t));
+int __db_getlong __P((DB_ENV *, const char *, char *, long, long, long *));
+int __db_getulong __P((DB_ENV *, const char *, char *, u_long, u_long, u_long *));
+void __db_idspace __P((u_int32_t *, int, u_int32_t *, u_int32_t *));
+u_int32_t __db_log2 __P((u_int32_t));
+u_int32_t __db_tablesize __P((u_int32_t));
+void __db_hashinit __P((void *, u_int32_t));
+int __dbt_usercopy __P((ENV *, DBT *));
+void __dbt_userfree __P((ENV *, DBT *, DBT *, DBT *));
+int __db_mkpath __P((ENV *, const char *));
+u_int32_t __db_openflags __P((int));
+int __db_util_arg __P((char *, char *, int *, char ***));
+int __db_util_cache __P((DB *, u_int32_t *, int *));
+int __db_util_logset __P((const char *, char *));
+void __db_util_siginit __P((void));
+int __db_util_interrupted __P((void));
+void __db_util_sigresend __P((void));
+int __db_zero_fill __P((ENV *, DB_FH *));
+int __db_zero_extend __P((ENV *, DB_FH *, db_pgno_t, db_pgno_t, u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_common_ext_h_ */
diff --git a/src/dbinc_auto/crdel_auto.h b/src/dbinc_auto/crdel_auto.h
new file mode 100644
index 00000000..86a60549
--- /dev/null
+++ b/src/dbinc_auto/crdel_auto.h
@@ -0,0 +1,127 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __crdel_AUTO_H
+#define __crdel_AUTO_H
+#include "dbinc/log.h"
+#define DB___crdel_metasub 142
+typedef struct ___crdel_metasub_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT page;
+ DB_LSN lsn;
+} __crdel_metasub_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_metasub_desc[];
+static inline int
+__crdel_metasub_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *page, DB_LSN * lsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___crdel_metasub, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(page) +
+ sizeof(*lsn),
+ __crdel_metasub_desc, pgno, page, lsn));
+}
+
+static inline int __crdel_metasub_read(ENV *env,
+ DB **dbpp, void *td, void *data, __crdel_metasub_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __crdel_metasub_desc, sizeof(__crdel_metasub_args), (void**)arg));
+}
+#define DB___crdel_inmem_create 138
+typedef struct ___crdel_inmem_create_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DBT name;
+ DBT fid;
+ u_int32_t pgsize;
+} __crdel_inmem_create_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_inmem_create_desc[];
+static inline int
+__crdel_inmem_create_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ int32_t fileid, const DBT *name, const DBT *fid, u_int32_t pgsize)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___crdel_inmem_create, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(fid) +
+ sizeof(u_int32_t),
+ __crdel_inmem_create_desc,
+ fileid, name, fid, pgsize));
+}
+
+static inline int __crdel_inmem_create_read(ENV *env,
+ void *data, __crdel_inmem_create_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __crdel_inmem_create_desc, sizeof(__crdel_inmem_create_args), (void**)arg));
+}
+#define DB___crdel_inmem_rename 139
+typedef struct ___crdel_inmem_rename_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT fid;
+} __crdel_inmem_rename_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_inmem_rename_desc[];
+static inline int
+__crdel_inmem_rename_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *oldname, const DBT *newname, const DBT *fid)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___crdel_inmem_rename, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(oldname) + LOG_DBT_SIZE(newname) + LOG_DBT_SIZE(fid),
+ __crdel_inmem_rename_desc,
+ oldname, newname, fid));
+}
+
+static inline int __crdel_inmem_rename_read(ENV *env,
+ void *data, __crdel_inmem_rename_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __crdel_inmem_rename_desc, sizeof(__crdel_inmem_rename_args), (void**)arg));
+}
+#define DB___crdel_inmem_remove 140
+typedef struct ___crdel_inmem_remove_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT fid;
+} __crdel_inmem_remove_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_inmem_remove_desc[];
+static inline int
+__crdel_inmem_remove_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *fid)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___crdel_inmem_remove, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(fid),
+ __crdel_inmem_remove_desc,
+ name, fid));
+}
+
+static inline int __crdel_inmem_remove_read(ENV *env,
+ void *data, __crdel_inmem_remove_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __crdel_inmem_remove_desc, sizeof(__crdel_inmem_remove_args), (void**)arg));
+}
+#endif
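Each record type in the header above is generated as a matched pair: a _log writer used at run time and a _read decoder used by recovery and print code. A minimal recovery-side sketch, assuming the usual Berkeley DB convention that the decoder allocates the args struct and the caller releases it with __os_free; the surrounding variables (env, dbtp) and the placement are illustrative, not part of the header:

    __crdel_metasub_args *argp;
    DB *file_dbp;
    int ret;

    /* Decode the raw log record into a freshly allocated struct. */
    if ((ret = __crdel_metasub_read(env,
        &file_dbp, NULL, dbtp->data, &argp)) != 0)
            return (ret);
    /* ... redo or undo the metadata write via argp->pgno / argp->page ... */
    __os_free(env, argp);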
diff --git a/src/dbinc_auto/crypto_ext.h b/src/dbinc_auto/crypto_ext.h
new file mode 100644
index 00000000..cd7113d7
--- /dev/null
+++ b/src/dbinc_auto/crypto_ext.h
@@ -0,0 +1,38 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _crypto_ext_h_
+#define _crypto_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __aes_setup __P((ENV *, DB_CIPHER *));
+u_int __aes_adj_size __P((size_t));
+int __aes_close __P((ENV *, void *));
+int __aes_decrypt __P((ENV *, void *, void *, u_int8_t *, size_t));
+int __aes_encrypt __P((ENV *, void *, void *, u_int8_t *, size_t));
+int __aes_init __P((ENV *, DB_CIPHER *));
+int __crypto_env_close __P((ENV *));
+int __crypto_env_refresh __P((ENV *));
+int __crypto_algsetup __P((ENV *, DB_CIPHER *, u_int32_t, int));
+int __crypto_decrypt_meta __P((ENV *, DB *, u_int8_t *, int));
+int __crypto_set_passwd __P((ENV *, ENV *));
+int __db_generate_iv __P((ENV *, u_int32_t *));
+int __db_rijndaelKeySetupEnc __P((u32 *, const u8 *, int));
+int __db_rijndaelKeySetupDec __P((u32 *, const u8 *, int));
+void __db_rijndaelEncrypt __P((u32 *, int, const u8 *, u8 *));
+void __db_rijndaelDecrypt __P((u32 *, int, const u8 *, u8 *));
+void __db_rijndaelEncryptRound __P((const u32 *, int, u8 *, int));
+void __db_rijndaelDecryptRound __P((const u32 *, int, u8 *, int));
+int __db_makeKey __P((keyInstance *, int, int, char *));
+int __db_cipherInit __P((cipherInstance *, int, char *));
+int __db_blockEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *, size_t, u_int8_t *));
+int __db_padEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *, int, u_int8_t *));
+int __db_blockDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *, size_t, u_int8_t *));
+int __db_padDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *, int, u_int8_t *));
+int __db_cipherUpdateRounds __P((cipherInstance *, keyInstance *, u_int8_t *, int, u_int8_t *, int));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_crypto_ext_h_ */
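The prototypes above expose two layers: the __aes_* functions that plug into the DB_CIPHER method table, and the __db_rijndael* / __db_block* functions adapted from the reference Rijndael implementation. A sketch of driving the lower layer directly, assuming the reference API's conventions (key and data lengths in bits, TRUE on success); the constants, buffers, and error choices here are assumptions, and real callers go through the table that __aes_init fills in:

    keyInstance ki;
    cipherInstance ci;

    /* 128-bit key, CBC mode, per the reference rijndael-api-fst. */
    if (__db_makeKey(&ki, DIR_ENCRYPT, 128, keymaterial) != TRUE)
            return (EINVAL);
    if (__db_cipherInit(&ci, MODE_CBC, iv) != TRUE)
            return (EINVAL);
    /* The length argument is in bits in the reference API. */
    (void)__db_blockEncrypt(&ci, &ki, plaintext, nbytes * 8, ciphertext);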
diff --git a/src/dbinc_auto/db_auto.h b/src/dbinc_auto/db_auto.h
new file mode 100644
index 00000000..04e2f465
--- /dev/null
+++ b/src/dbinc_auto/db_auto.h
@@ -0,0 +1,666 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __db_AUTO_H
+#define __db_AUTO_H
+#include "dbinc/log.h"
+#define DB___db_addrem 41
+typedef struct ___db_addrem_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __db_addrem_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_addrem_desc[];
+static inline int
+__db_addrem_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, u_int32_t indx, u_int32_t nbytes,
+ const DBT *hdr, const DBT *dbt, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_addrem, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(dbt) + sizeof(*pagelsn),
+ __db_addrem_desc,
+ opcode, pgno, indx, nbytes, hdr, dbt, pagelsn));
+}
+
+static inline int __db_addrem_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_addrem_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_addrem_desc, sizeof(__db_addrem_args), (void**)arg));
+}
+#define DB___db_addrem_42 41
+typedef struct ___db_addrem_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __db_addrem_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_addrem_42_desc[];
+static inline int __db_addrem_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_addrem_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_addrem_42_desc, sizeof(__db_addrem_42_args), (void**)arg));
+}
+#define DB___db_big 43
+typedef struct ___db_big_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+ DBT dbt;
+ DB_LSN pagelsn;
+ DB_LSN prevlsn;
+ DB_LSN nextlsn;
+} __db_big_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_big_desc[];
+static inline int
+__db_big_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, db_pgno_t prev_pgno, db_pgno_t next_pgno,
+ const DBT *dbt, DB_LSN * pagelsn, DB_LSN * prevlsn, DB_LSN * nextlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_big, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(dbt) +
+ sizeof(*pagelsn) + sizeof(*prevlsn) + sizeof(*nextlsn),
+ __db_big_desc,
+ opcode, pgno, prev_pgno, next_pgno, dbt, pagelsn, prevlsn,
+ nextlsn));
+}
+
+static inline int __db_big_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_big_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_big_desc, sizeof(__db_big_args), (void**)arg));
+}
+#define DB___db_big_42 43
+typedef struct ___db_big_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+ DBT dbt;
+ DB_LSN pagelsn;
+ DB_LSN prevlsn;
+ DB_LSN nextlsn;
+} __db_big_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_big_42_desc[];
+static inline int __db_big_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_big_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_big_42_desc, sizeof(__db_big_42_args), (void**)arg));
+}
+#define DB___db_ovref 44
+typedef struct ___db_ovref_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ int32_t adjust;
+ DB_LSN lsn;
+} __db_ovref_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_ovref_desc[];
+static inline int
+__db_ovref_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, int32_t adjust, DB_LSN * lsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_ovref, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*lsn),
+ __db_ovref_desc, pgno, adjust, lsn));
+}
+
+static inline int __db_ovref_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_ovref_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_ovref_desc, sizeof(__db_ovref_args), (void**)arg));
+}
+#define DB___db_relink_42 45
+typedef struct ___db_relink_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t prev;
+ DB_LSN lsn_prev;
+ db_pgno_t next;
+ DB_LSN lsn_next;
+} __db_relink_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_relink_42_desc[];
+static inline int __db_relink_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_relink_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_relink_42_desc, sizeof(__db_relink_42_args), (void**)arg));
+}
+#define DB___db_debug 47
+typedef struct ___db_debug_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT op;
+ int32_t fileid;
+ DBT key;
+ DBT data;
+ u_int32_t arg_flags;
+} __db_debug_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_debug_desc[];
+static inline int
+__db_debug_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *op, int32_t fileid, const DBT *key, const DBT *data, u_int32_t arg_flags)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___db_debug, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(op) + sizeof(u_int32_t) + LOG_DBT_SIZE(key) +
+ LOG_DBT_SIZE(data) + sizeof(u_int32_t),
+ __db_debug_desc,
+ op, fileid, key, data, arg_flags));
+}
+
+static inline int __db_debug_read(ENV *env,
+ void *data, __db_debug_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __db_debug_desc, sizeof(__db_debug_args), (void**)arg));
+}
+#define DB___db_noop 48
+typedef struct ___db_noop_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN prevlsn;
+} __db_noop_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_noop_desc[];
+static inline int
+__db_noop_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * prevlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_noop, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*prevlsn),
+ __db_noop_desc, pgno, prevlsn));
+}
+
+static inline int __db_noop_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_noop_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_noop_desc, sizeof(__db_noop_args), (void**)arg));
+}
+#define DB___db_pg_alloc_42 49
+typedef struct ___db_pg_alloc_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DB_LSN page_lsn;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t next;
+} __db_pg_alloc_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_alloc_42_desc[];
+static inline int __db_pg_alloc_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_alloc_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_alloc_42_desc, sizeof(__db_pg_alloc_42_args), (void**)arg));
+}
+#define DB___db_pg_alloc 49
+typedef struct ___db_pg_alloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DB_LSN page_lsn;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t next;
+ db_pgno_t last_pgno;
+} __db_pg_alloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_alloc_desc[];
+static inline int
+__db_pg_alloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, db_pgno_t meta_pgno, DB_LSN * page_lsn, db_pgno_t pgno,
+ u_int32_t ptype, db_pgno_t next, db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_alloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(*page_lsn) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __db_pg_alloc_desc, meta_lsn, meta_pgno, page_lsn, pgno, ptype, next, last_pgno));
+}
+
+static inline int __db_pg_alloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_alloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_alloc_desc, sizeof(__db_pg_alloc_args), (void**)arg));
+}
+#define DB___db_pg_free_42 50
+typedef struct ___db_pg_free_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+} __db_pg_free_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_free_42_desc[];
+static inline int __db_pg_free_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_free_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_free_42_desc, sizeof(__db_pg_free_42_args), (void**)arg));
+}
+#define DB___db_pg_free 50
+typedef struct ___db_pg_free_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+ db_pgno_t last_pgno;
+} __db_pg_free_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_free_desc[];
+static inline int
+__db_pg_free_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * meta_lsn, db_pgno_t meta_pgno, const DBT *header,
+ db_pgno_t next, db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_free, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(header) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __db_pg_free_desc, pgno, meta_lsn, meta_pgno, header, next, last_pgno));
+}
+
+static inline int __db_pg_free_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_free_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_free_desc, sizeof(__db_pg_free_args), (void**)arg));
+}
+#define DB___db_cksum 51
+typedef struct ___db_cksum_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+} __db_cksum_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_cksum_desc[];
+static inline int
+__db_cksum_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___db_cksum, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN),
+ __db_cksum_desc));
+}
+
+static inline int __db_cksum_read(ENV *env,
+ void *data, __db_cksum_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __db_cksum_desc, sizeof(__db_cksum_args), (void**)arg));
+}
+#define DB___db_pg_freedata_42 52
+typedef struct ___db_pg_freedata_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+ DBT data;
+} __db_pg_freedata_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_freedata_42_desc[];
+static inline int __db_pg_freedata_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_freedata_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_freedata_42_desc, sizeof(__db_pg_freedata_42_args), (void**)arg));
+}
+#define DB___db_pg_freedata 52
+typedef struct ___db_pg_freedata_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+ db_pgno_t last_pgno;
+ DBT data;
+} __db_pg_freedata_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_freedata_desc[];
+static inline int
+__db_pg_freedata_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * meta_lsn, db_pgno_t meta_pgno, const DBT *header,
+ db_pgno_t next, db_pgno_t last_pgno, const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_freedata, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(header) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(data),
+ __db_pg_freedata_desc, pgno, meta_lsn, meta_pgno, header, next, last_pgno, data));
+}
+
+static inline int __db_pg_freedata_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_freedata_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_freedata_desc, sizeof(__db_pg_freedata_args), (void**)arg));
+}
+#define DB___db_pg_init 60
+typedef struct ___db_pg_init_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT header;
+ DBT data;
+} __db_pg_init_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_init_desc[];
+static inline int
+__db_pg_init_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *header, const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_init, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(header) +
+ LOG_DBT_SIZE(data),
+ __db_pg_init_desc, pgno, header, data));
+}
+
+static inline int __db_pg_init_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_init_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_init_desc, sizeof(__db_pg_init_args), (void**)arg));
+}
+#define DB___db_pg_sort_44 61
+typedef struct ___db_pg_sort_44_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta;
+ DB_LSN meta_lsn;
+ db_pgno_t last_free;
+ DB_LSN last_lsn;
+ db_pgno_t last_pgno;
+ DBT list;
+} __db_pg_sort_44_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_sort_44_desc[];
+static inline int __db_pg_sort_44_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_sort_44_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_sort_44_desc, sizeof(__db_pg_sort_44_args), (void**)arg));
+}
+#define DB___db_pg_trunc 66
+typedef struct ___db_pg_trunc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta;
+ DB_LSN meta_lsn;
+ db_pgno_t last_free;
+ DB_LSN last_lsn;
+ db_pgno_t next_free;
+ db_pgno_t last_pgno;
+ DBT list;
+} __db_pg_trunc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_trunc_desc[];
+static inline int
+__db_pg_trunc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t meta, DB_LSN * meta_lsn, db_pgno_t last_free, DB_LSN * last_lsn,
+ db_pgno_t next_free, db_pgno_t last_pgno, const DBT *list)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_trunc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + sizeof(*last_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(list),
+ __db_pg_trunc_desc, meta, meta_lsn, last_free, last_lsn, next_free, last_pgno, list));
+}
+
+static inline int __db_pg_trunc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_trunc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_trunc_desc, sizeof(__db_pg_trunc_args), (void**)arg));
+}
+#define DB___db_realloc 36
+typedef struct ___db_realloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t prev_pgno;
+ DB_LSN page_lsn;
+ db_pgno_t next_free;
+ u_int32_t ptype;
+ DBT list;
+} __db_realloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_realloc_desc[];
+static inline int
+__db_realloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t prev_pgno, DB_LSN * page_lsn, db_pgno_t next_free, u_int32_t ptype,
+ const DBT *list)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_realloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*page_lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(list),
+ __db_realloc_desc, prev_pgno, page_lsn, next_free, ptype, list));
+}
+
+static inline int __db_realloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_realloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_realloc_desc, sizeof(__db_realloc_args), (void**)arg));
+}
+#define DB___db_relink 147
+typedef struct ___db_relink_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ db_pgno_t new_pgno;
+ db_pgno_t prev_pgno;
+ DB_LSN lsn_prev;
+ db_pgno_t next_pgno;
+ DB_LSN lsn_next;
+} __db_relink_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_relink_desc[];
+static inline int
+__db_relink_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, db_pgno_t new_pgno, db_pgno_t prev_pgno, DB_LSN * lsn_prev,
+ db_pgno_t next_pgno, DB_LSN * lsn_next)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_relink, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(*lsn_prev) + sizeof(u_int32_t) +
+ sizeof(*lsn_next),
+ __db_relink_desc, pgno, new_pgno, prev_pgno, lsn_prev, next_pgno, lsn_next));
+}
+
+static inline int __db_relink_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_relink_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_relink_desc, sizeof(__db_relink_args), (void**)arg));
+}
+#define DB___db_merge 148
+typedef struct ___db_merge_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ DBT hdr;
+ DBT data;
+ int32_t pg_copy;
+} __db_merge_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_merge_desc[];
+static inline int
+__db_merge_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, db_pgno_t npgno, DB_LSN * nlsn,
+ const DBT *hdr, const DBT *data, int32_t pg_copy)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_merge, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(*nlsn) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(data) + sizeof(u_int32_t),
+ __db_merge_desc, pgno, lsn, npgno, nlsn, hdr, data, pg_copy));
+}
+
+static inline int __db_merge_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_merge_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_merge_desc, sizeof(__db_merge_args), (void**)arg));
+}
+#define DB___db_pgno 149
+typedef struct ___db_pgno_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ db_pgno_t opgno;
+ db_pgno_t npgno;
+} __db_pgno_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pgno_desc[];
+static inline int
+__db_pgno_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, db_pgno_t opgno,
+ db_pgno_t npgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pgno, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __db_pgno_desc, pgno, lsn, indx, opgno, npgno));
+}
+
+static inline int __db_pgno_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pgno_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pgno_desc, sizeof(__db_pgno_args), (void**)arg));
+}
+#endif
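Two conventions in the header above are worth noting. First, superseded log formats keep their record number but lose their writer: DB___db_addrem_42 and DB___db_addrem are both 41, so recovery can still decode records written by an older release while only the current layout can be logged. Second, the writers implement write-ahead logging: callers log first, then stamp the page with the returned LSN. A sketch of that call pattern, where dbc, h, ret, and err are hypothetical context from the surrounding operation:

    if (DBC_LOGGING(dbc)) {
            /* Log the reference-count change before touching the page.
             * Passing &LSN(h) for both LSN arguments follows the common
             * idiom: the old page LSN is serialized into the record
             * before the new LSN is stored back through ret_lsnp. */
            if ((ret = __db_ovref_log(dbp, dbc->txn, &LSN(h), 0,
                PGNO(h), 1, &LSN(h))) != 0)
                    goto err;
    } else
            LSN_NOT_LOGGED(LSN(h));
    /* The page's LSN now covers this update; safe to modify the page. */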
diff --git a/src/dbinc_auto/db_ext.h b/src/dbinc_auto/db_ext.h
new file mode 100644
index 00000000..de2a6ce4
--- /dev/null
+++ b/src/dbinc_auto/db_ext.h
@@ -0,0 +1,346 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _db_ext_h_
+#define _db_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __crdel_init_recover __P((ENV *, DB_DISTAB *));
+int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_create_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_rename_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_init_print __P((ENV *, DB_DISTAB *));
+int __crdel_metasub_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_create_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_rename_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_master_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, int, DB **));
+int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *, const char *, DBTYPE, mu_action, const char *, u_int32_t));
+int __env_dbreg_setup __P((DB *, DB_TXN *, const char *, const char *, u_int32_t));
+int __env_setup __P((DB *, DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
+int __env_mpool __P((DB *, const char *, u_int32_t));
+int __db_close __P((DB *, DB_TXN *, u_int32_t));
+int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
+int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
+int __db_walk_cursors __P((DB *, DBC *, int (*) __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *)), u_int32_t *, db_pgno_t, u_int32_t, void *));
+int __db_backup_name __P((ENV *, const char *, DB_TXN *, char **));
+#ifdef CONFIG_TEST
+int __db_testcopy __P((ENV *, DB *, const char *));
+#endif
+int __db_testdocopy __P((ENV *, const char *));
+int __db_cursor_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
+int __db_put __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_del __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
+int __db_sync __P((DB *));
+int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *, int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+int __db_secondary_close __P((DB *, u_int32_t));
+int __db_associate_foreign __P((DB *, DB *, int (*)(DB *, const DBT *, DBT *, const DBT *, int *), u_int32_t));
+int __db_init_recover __P((ENV *, DB_DISTAB *));
+int __db_addrem_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_addrem_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_ovref_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_debug_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_noop_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_cksum_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_realloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_merge_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pgno_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_init_print __P((ENV *, DB_DISTAB *));
+int __db_dbbackup_pp __P((DB_ENV *, const char *, const char *, u_int32_t));
+int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t));
+int __db_backup __P((DB_ENV *, const char *, u_int32_t));
+int __dbc_close __P((DBC *));
+int __dbc_destroy __P((DBC *));
+int __dbc_cmp __P((DBC *, DBC *, int *));
+int __dbc_count __P((DBC *, db_recno_t *));
+int __dbc_del __P((DBC *, u_int32_t));
+int __dbc_idel __P((DBC *, u_int32_t));
+#ifdef HAVE_COMPRESSION
+int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
+#endif
+int __dbc_dup __P((DBC *, DBC **, u_int32_t));
+int __dbc_idup __P((DBC *, DBC **, u_int32_t));
+int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
+int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+int __db_duperr __P((DB *, u_int32_t));
+int __dbc_cleanup __P((DBC *, DBC *, int));
+int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+int __dbc_del_primary __P((DBC *));
+int __db_s_first __P((DB *, DB **));
+int __db_s_next __P((DB **, DB_TXN *));
+int __db_s_done __P((DB *, DB_TXN *));
+int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
+u_int32_t __db_partsize __P((u_int32_t, DBT *));
+#ifdef DIAGNOSTIC
+void __db_check_skeyset __P((DB *, DBT *));
+#endif
+int __cdsgroup_begin __P((ENV *, DB_TXN **));
+int __cdsgroup_begin_pp __P((DB_ENV *, DB_TXN **));
+int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int));
+int __db_truncate_overflow __P((DBC *, db_pgno_t, PAGE **, DB_COMPACT *));
+int __db_truncate_root __P((DBC *, PAGE *, u_int32_t, db_pgno_t *, u_int32_t));
+int __db_find_free __P((DBC *, u_int32_t, u_int32_t, db_pgno_t, db_pgno_t *));
+int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
+int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *));
+int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
+int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
+int __db_encrypt_and_checksum_pg __P((ENV *, DB *, PAGE *));
+void __db_metaswap __P((PAGE *));
+int __db_byteswap __P((DB *, db_pgno_t, PAGE *, size_t, int));
+int __db_pageswap __P((ENV *, DB *, void *, size_t, DBT *, int));
+void __db_recordswap __P((u_int32_t, u_int32_t, void *, void *, u_int32_t));
+int __db_dispatch __P((ENV *, DB_DISTAB *, DBT *, DB_LSN *, db_recops, void *));
+int __db_add_recovery __P((DB_ENV *, DB_DISTAB *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
+int __db_add_recovery_int __P((ENV *, DB_DISTAB *, int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *, u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
+int __db_txnlist_add __P((ENV *, DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
+int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
+void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
+int __db_txnlist_find __P((ENV *, DB_TXNHEAD *, u_int32_t, u_int32_t *));
+int __db_txnlist_update __P((ENV *, DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
+int __db_txnlist_gen __P((ENV *, DB_TXNHEAD *, int, u_int32_t, u_int32_t));
+int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+int __db_txnlist_lsnget __P((ENV *, DB_TXNHEAD *, DB_LSN *, u_int32_t));
+int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+void __db_txnlist_print __P((DB_TXNHEAD *));
+int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+int __db_pitem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __db_pitem __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __db_associate_pp __P((DB *, DB_TXN *, DB *, int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+int __db_close_pp __P((DB *, u_int32_t));
+int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t));
+int __db_cursor __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t));
+int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t));
+int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t));
+int __db_fd_pp __P((DB *, int *));
+int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_get __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t));
+int __db_key_range_pp __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+int __db_open_pp __P((DB *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int));
+int __db_pget_pp __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+int __db_pget __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_compact_pp __P((DB *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+int __db_associate_foreign_pp __P((DB *, DB *, int (*)(DB *, const DBT *, DBT *, const DBT *, int *), u_int32_t));
+int __db_sync_pp __P((DB *, u_int32_t));
+int __dbc_close_pp __P((DBC *));
+int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t));
+int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t));
+int __dbc_del_pp __P((DBC *, u_int32_t));
+int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t));
+int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+int __db_secondary_close_pp __P((DB *, u_int32_t));
+int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **));
+int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int));
+int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
+int __db_join_close __P((DBC *));
+int __db_secondary_corrupt __P((DB *));
+int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+int __db_free __P((DBC *, PAGE *, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+void __db_freelist_pos __P((db_pgno_t, db_pgno_t *, u_int32_t, u_int32_t *));
+#endif
+void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+int __db_pg_truncate __P((DBC *, DB_TXN *, db_pglist_t *, DB_COMPACT *, u_int32_t *, db_pgno_t , db_pgno_t *, DB_LSN *, int));
+#endif
+#ifdef HAVE_FTRUNCATE
+int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *, db_pgno_t *));
+#endif
+int __db_lprint __P((DBC *));
+int __db_lget __P((DBC *, int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+#ifdef DIAGNOSTIC
+int __db_haslock __P((ENV *, DB_LOCKER *, DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
+#endif
+#ifdef DIAGNOSTIC
+int __db_has_pagelock __P((ENV *, DB_LOCKER *, DB_MPOOLFILE *, PAGE *, db_lockmode_t));
+#endif
+int __db_lput __P((DBC *, DB_LOCK *));
+int __db_create_internal __P((DB **, ENV *, u_int32_t));
+int __dbh_am_chk __P((DB *, u_int32_t));
+int __db_get_flags __P((DB *, u_int32_t *));
+int __db_set_flags __P((DB *, u_int32_t));
+int __db_get_lorder __P((DB *, int *));
+int __db_set_lorder __P((DB *, int));
+int __db_set_pagesize __P((DB *, u_int32_t));
+int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
+int __db_get_open_flags __P((DB *, u_int32_t *));
+int __db_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __db_init_subdb __P((DB *, DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
+int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
+int __db_meta_setup __P((ENV *, DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
+int __db_reopen __P((DBC *));
+int __db_goff __P((DBC *, DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
+int __db_ovref __P((DBC *, db_pgno_t));
+int __db_doff __P((DBC *, db_pgno_t));
+int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
+int __db_coff __P((DBC *, const DBT *, const DBT *, int (*)(DB *, const DBT *, const DBT *), int *));
+int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __db_vrfy_ovfl_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
+int __db_safe_goff __P((DB *, VRFY_DBINFO *, db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
+void __db_loadme __P((void));
+int __db_dumptree __P((DB *, DB_TXN *, char *, char *, db_pgno_t, db_pgno_t));
+const FN * __db_get_flags_fn __P((void));
+int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
+int __db_prpage __P((DB *, PAGE *, u_int32_t));
+const char * __db_lockmode_to_string __P((db_lockmode_t));
+int __db_dumptree __P((DB *, DB_TXN *, char *, char *, db_pgno_t, db_pgno_t));
+const FN * __db_get_flags_fn __P((void));
+int __db_prpage_int __P((ENV *, DB_MSGBUF *, DB *, char *, PAGE *, u_int32_t, u_int8_t *, u_int32_t));
+void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
+void __db_prflags __P((ENV *, DB_MSGBUF *, u_int32_t, const FN *, const char *, const char *));
+int __db_name_to_val __P((FN const *, char *));
+const char *__db_pagetype_to_string __P((u_int32_t));
+int __db_dump_pp __P((DB *, const char *, int (*)(void *, const void *), void *, int, int));
+int __db_dump __P((DB *, const char *, int (*)(void *, const void *), void *, int, int));
+int __db_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int));
+int __db_prheader __P((DB *, const char *, int, int, void *, int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
+int __db_prfooter __P((void *, int (*)(void *, const void *)));
+int __db_pr_callback __P((void *, const void *));
+const char * __db_dbtype_to_string __P((DBTYPE));
+int __db_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_addrem_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_ovref_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_debug_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_noop_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_cksum_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_init_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_trunc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_realloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_sort_44_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_merge_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pgno_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+void __db_pglist_swap __P((u_int32_t, void *));
+void __db_pglist_print __P((ENV *, DB_MSGBUF *, DBT *));
+int __db_traverse_big __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *));
+int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
+int __env_dbremove_pp __P((DB_ENV *, DB_TXN *, const char *, const char *, u_int32_t));
+int __db_remove_pp __P((DB *, const char *, const char *, u_int32_t));
+int __db_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __db_remove_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
+int __env_dbrename_pp __P((DB_ENV *, DB_TXN *, const char *, const char *, const char *, u_int32_t));
+int __db_rename_pp __P((DB *, const char *, const char *, const char *, u_int32_t));
+int __db_rename_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *, u_int32_t));
+int __db_ret __P((DBC *, PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
+int __db_retcopy __P((ENV *, DBT *, void *, u_int32_t, void **, u_int32_t *));
+int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+int __env_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
+int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *));
+int __db_compare_both __P((DB *, const DBT *, const DBT *, const DBT *, const DBT *));
+int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t));
+int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t));
+int __db_stat_print_pp __P((DB *, u_int32_t));
+int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t));
+int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t *));
+int __db_upgrade_pp __P((DB *, const char *, u_int32_t));
+int __db_upgrade __P((DB *, const char *, u_int32_t));
+int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *));
+int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
+int __db_verify_pp __P((DB *, const char *, const char *, FILE *, u_int32_t));
+int __db_verify_internal __P((DB *, const char *, const char *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_verify __P((DB *, DB_THREAD_INFO *, const char *, const char *, void *, int (*)(void *, const void *), void *, void *, u_int32_t));
+int __db_vrfy_common __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __db_vrfy_datapage __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __db_vrfy_meta __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t));
+void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_vrfy_inpitem __P((DB *, PAGE *, db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
+int __db_vrfy_duptype __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t, DBT *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_vrfy_dbinfo_create __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **));
+int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *));
+int __db_vrfy_getpageinfo __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **));
+int __db_vrfy_putpageinfo __P((ENV *, VRFY_DBINFO *, VRFY_PAGEINFO *));
+int __db_vrfy_pgset __P((ENV *, DB_THREAD_INFO *, u_int32_t, DB **));
+int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t, int *));
+int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t));
+int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *));
+int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **));
+int __db_vrfy_childput __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *));
+int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **));
+int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **));
+int __db_vrfy_ccclose __P((DBC *));
+int __db_salvage_init __P((VRFY_DBINFO *));
+int __db_salvage_destroy __P((VRFY_DBINFO *));
+int __db_salvage_getnext __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int));
+int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t));
+int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t));
+int __db_salvage_markneeded __P((VRFY_DBINFO *, db_pgno_t, u_int32_t));
+int __db_vrfy_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int, VRFY_DBINFO *));
+int __partition_init __P((DB *, u_int32_t));
+int __partition_set __P((DB *, u_int32_t, DBT *, u_int32_t (*callback)(DB *, DBT *key)));
+int __partition_set_dirs __P((DB *, const char **));
+int __partition_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
+int __partition_get_callback __P((DB *, u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+int __partition_get_dirs __P((DB *, const char ***));
+int __partc_init __P((DBC *));
+int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
+int __partition_close __P((DB *, DB_TXN *, u_int32_t));
+int __partition_sync __P((DB *));
+int __partition_stat __P((DBC *, void *, u_int32_t));
+int __part_truncate __P((DBC *, u_int32_t *));
+int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
+int __part_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+int __part_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __part_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *));
+int __part_verify __P((DB *, VRFY_DBINFO *, const char *, void *, int (*)(void *, const void *), u_int32_t));
+int __part_testdocopy __P((DB *, const char *));
+int __db_no_partition __P((ENV *));
+int __partition_set __P((DB *, u_int32_t, DBT *, u_int32_t (*callback)(DB *, DBT *key)));
+int __partition_get_callback __P((DB *, u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+int __partition_get_dirs __P((DB *, const char ***));
+int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+int __partition_init __P((DB *, u_int32_t));
+int __part_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+int __partition_set_dirs __P((DB *, const char **));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_db_ext_h_ */
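All of these prototypes are wrapped in __P so a single header serves both ANSI and pre-ANSI compilers. A sketch of the conventional definition (the real one lives in the generated db.h):

    #if defined(__STDC__) || defined(__cplusplus)
    #define __P(protos)     protos          /* ANSI C: keep the prototype. */
    #else
    #define __P(protos)     ()              /* K&R C: drop the arguments. */
    #endif

So, under ANSI C, int __db_sync __P((DB *)); expands to int __db_sync(DB *);, and under K&R C to int __db_sync();.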
diff --git a/src/dbinc_auto/dbreg_auto.h b/src/dbinc_auto/dbreg_auto.h
new file mode 100644
index 00000000..63ad0cd3
--- /dev/null
+++ b/src/dbinc_auto/dbreg_auto.h
@@ -0,0 +1,43 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __dbreg_AUTO_H
+#define __dbreg_AUTO_H
+#include "dbinc/log.h"
+#define DB___dbreg_register 2
+typedef struct ___dbreg_register_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT name;
+ DBT uid;
+ int32_t fileid;
+ DBTYPE ftype;
+ db_pgno_t meta_pgno;
+ u_int32_t id;
+} __dbreg_register_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __dbreg_register_desc[];
+static inline int
+__dbreg_register_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, const DBT *name, const DBT *uid, int32_t fileid, DBTYPE ftype,
+ db_pgno_t meta_pgno, u_int32_t id)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___dbreg_register, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(uid) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __dbreg_register_desc,
+ opcode, name, uid, fileid, ftype, meta_pgno, id));
+}
+
+static inline int __dbreg_register_read(ENV *env,
+ void *data, __dbreg_register_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __dbreg_register_desc, sizeof(__dbreg_register_args), (void**)arg));
+}
+#endif
diff --git a/src/dbinc_auto/dbreg_ext.h b/src/dbinc_auto/dbreg_ext.h
new file mode 100644
index 00000000..0f495c33
--- /dev/null
+++ b/src/dbinc_auto/dbreg_ext.h
@@ -0,0 +1,46 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _dbreg_ext_h_
+#define _dbreg_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __dbreg_setup __P((DB *, const char *, const char *, u_int32_t));
+int __dbreg_teardown __P((DB *));
+int __dbreg_teardown_int __P((ENV *, FNAME *));
+int __dbreg_new_id __P((DB *, DB_TXN *));
+int __dbreg_get_id __P((DB *, DB_TXN *, int32_t *));
+int __dbreg_assign_id __P((DB *, int32_t, int));
+int __dbreg_revoke_id __P((DB *, int, int32_t));
+int __dbreg_revoke_id_int __P((ENV *, FNAME *, int, int, int32_t));
+int __dbreg_close_id __P((DB *, DB_TXN *, u_int32_t));
+int __dbreg_close_id_int __P((ENV *, FNAME *, u_int32_t, int));
+int __dbreg_failchk __P((ENV *));
+int __dbreg_log_close __P((ENV *, FNAME *, DB_TXN *, u_int32_t));
+int __dbreg_log_id __P((DB *, DB_TXN *, int32_t, int));
+int __dbreg_init_recover __P((ENV *, DB_DISTAB *));
+int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_init_print __P((ENV *, DB_DISTAB *));
+int __dbreg_register_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_stat_print __P((ENV *, u_int32_t));
+void __dbreg_print_fname __P((ENV *, FNAME *));
+int __dbreg_add_dbentry __P((ENV *, DB_LOG *, DB *, int32_t));
+int __dbreg_rem_dbentry __P((DB_LOG *, int32_t));
+int __dbreg_log_files __P((ENV *, u_int32_t));
+int __dbreg_log_nofiles __P((ENV *));
+int __dbreg_close_files __P((ENV *, int));
+int __dbreg_close_file __P((ENV *, FNAME *));
+int __dbreg_mark_restored __P((ENV *));
+int __dbreg_invalidate_files __P((ENV *, int));
+int __dbreg_id_to_db __P((ENV *, DB_TXN *, DB **, int32_t, int));
+int __dbreg_id_to_fname __P((DB_LOG *, int32_t, int, FNAME **));
+int __dbreg_fid_to_fname __P((DB_LOG *, u_int8_t *, int, FNAME **));
+int __dbreg_get_name __P((ENV *, u_int8_t *, char **, char **));
+int __dbreg_do_open __P((ENV *, DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, void *, u_int32_t, u_int32_t));
+int __dbreg_lazy_id __P((DB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_dbreg_ext_h_ */
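The dbreg (database registry) layer above is what gives meaning to the int32_t fileid carried by the log records in this patch: a __dbreg_register record binds the id to a physical file, and recovery resolves it back to an open handle. A sketch of that resolution, where env and argp come from a hypothetical surrounding recovery function and the exact meaning of the final flag is an assumption here:

    DB *file_dbp;
    int ret;

    /* Map the log record's file id back to an open DB handle. */
    if ((ret = __dbreg_id_to_db(env,
        argp->txnp, &file_dbp, argp->fileid, 0)) != 0)
            return (ret);   /* The id was never registered. */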
diff --git a/src/dbinc_auto/env_ext.h b/src/dbinc_auto/env_ext.h
new file mode 100644
index 00000000..55dbcba4
--- /dev/null
+++ b/src/dbinc_auto/env_ext.h
@@ -0,0 +1,158 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _env_ext_h_
+#define _env_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __env_alloc_init __P((REGINFO *, size_t));
+size_t __env_alloc_overhead __P((void));
+size_t __env_alloc_size __P((size_t));
+int __env_alloc __P((REGINFO *, size_t, void *));
+void __env_alloc_free __P((REGINFO *, void *));
+int __env_alloc_extend __P((REGINFO *, void *, size_t *));
+int __env_region_extend __P((ENV *, REGINFO *));
+uintmax_t __env_elem_size __P((ENV *, void *));
+void * __env_get_chunk __P((REGINFO *, void **, uintmax_t *));
+void __env_alloc_print __P((REGINFO *, u_int32_t));
+int __env_get_backup_config __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t*));
+int __env_set_backup_config __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t));
+int __env_get_backup_callbacks __P((DB_ENV *, int (**)(DB_ENV *, const char *, const char *, void **), int (**)(DB_ENV *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *), int (**)(DB_ENV *, const char *, void *)));
+int __env_set_backup_callbacks __P((DB_ENV *, int (*)(DB_ENV *, const char *, const char *, void **), int (*)(DB_ENV *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *), int (*)(DB_ENV *, const char *, void *)));
+int __env_read_db_config __P((ENV *));
+int __env_failchk_pp __P((DB_ENV *, u_int32_t));
+int __env_failchk_int __P((DB_ENV *));
+size_t __env_thread_size __P((ENV *, size_t));
+size_t __env_thread_max __P((ENV *));
+int __env_thread_init __P((ENV *, int));
+void __env_thread_destroy __P((ENV *));
+int __env_set_state __P((ENV *, DB_THREAD_INFO **, DB_THREAD_STATE));
+char *__env_thread_id_string __P((DB_ENV *, pid_t, db_threadid_t, char *));
+int __db_file_extend __P((ENV *, DB_FH *, size_t));
+int __db_file_multi_write __P((ENV *, const char *));
+int __db_file_write __P((ENV *, DB_FH *, u_int32_t, u_int32_t, int));
+void __db_env_destroy __P((DB_ENV *));
+int __env_get_alloc __P((DB_ENV *, void *(**)(size_t), void *(**)(void *, size_t), void (**)(void *)));
+int __env_set_alloc __P((DB_ENV *, void *(*)(size_t), void *(*)(void *, size_t), void (*)(void *)));
+int __env_get_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t *));
+int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
+int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t));
+int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *));
+int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t));
+void __env_map_flags __P((const FLAG_MAP *, u_int, u_int32_t *, u_int32_t *));
+void __env_fetch_flags __P((const FLAG_MAP *, u_int, u_int32_t *, u_int32_t *));
+int __env_set_flags __P((DB_ENV *, u_int32_t, int));
+int __env_set_backup __P((ENV *, int));
+int __env_set_data_dir __P((DB_ENV *, const char *));
+int __env_add_data_dir __P((DB_ENV *, const char *));
+int __env_set_create_dir __P((DB_ENV *, const char *));
+int __env_set_metadata_dir __P((DB_ENV *, const char *));
+int __env_set_data_len __P((DB_ENV *, u_int32_t));
+int __env_set_intermediate_dir_mode __P((DB_ENV *, const char *));
+void __env_get_errcall __P((DB_ENV *, void (**)(const DB_ENV *, const char *, const char *)));
+void __env_set_errcall __P((DB_ENV *, void (*)(const DB_ENV *, const char *, const char *)));
+void __env_get_errfile __P((DB_ENV *, FILE **));
+void __env_set_errfile __P((DB_ENV *, FILE *));
+void __env_get_errpfx __P((DB_ENV *, const char **));
+void __env_set_errpfx __P((DB_ENV *, const char *));
+int __env_set_thread_count __P((DB_ENV *, u_int32_t));
+void __env_get_msgcall __P((DB_ENV *, void (**)(const DB_ENV *, const char *)));
+void __env_set_msgcall __P((DB_ENV *, void (*)(const DB_ENV *, const char *)));
+void __env_get_msgfile __P((DB_ENV *, FILE **));
+void __env_set_msgfile __P((DB_ENV *, FILE *));
+int __env_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int)));
+int __env_set_shm_key __P((DB_ENV *, long));
+int __env_set_tmp_dir __P((DB_ENV *, const char *));
+int __env_set_verbose __P((DB_ENV *, u_int32_t, int));
+int __db_mi_env __P((ENV *, const char *));
+int __db_mi_open __P((ENV *, const char *, int));
+int __env_not_config __P((ENV *, char *, u_int32_t));
+int __env_set_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+int __db_appname __P((ENV *, APPNAME, const char *, const char **, char **));
+int __db_tmp_open __P((ENV *, u_int32_t, DB_FH **));
+int __env_open_pp __P((DB_ENV *, const char *, u_int32_t, int));
+int __env_open __P((DB_ENV *, const char *, u_int32_t, int));
+int __env_remove __P((DB_ENV *, const char *, u_int32_t));
+int __env_config __P((DB_ENV *, const char *, u_int32_t *, int));
+int __env_close_pp __P((DB_ENV *, u_int32_t));
+int __env_close __P((DB_ENV *, u_int32_t));
+int __env_refresh __P((DB_ENV *, u_int32_t, int));
+int __env_get_open_flags __P((DB_ENV *, u_int32_t *));
+int __env_attach_regions __P((DB_ENV *, u_int32_t, u_int32_t, int));
+int __db_apprec __P((ENV *, DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t));
+int __env_openfiles __P((ENV *, DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int));
+int __env_init_rec __P((ENV *, u_int32_t));
+int __env_attach __P((ENV *, u_int32_t *, int, int));
+int __env_turn_on __P((ENV *));
+int __env_turn_off __P((ENV *, u_int32_t));
+void __env_panic_set __P((ENV *, int));
+int __env_ref_increment __P((ENV *));
+int __env_ref_decrement __P((ENV *));
+int __env_ref_get __P((DB_ENV *, u_int32_t *));
+int __env_detach __P((ENV *, int));
+int __env_remove_env __P((ENV *));
+int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t));
+int __env_region_share __P((ENV *, REGINFO *));
+int __env_region_detach __P((ENV *, REGINFO *, int));
+int __envreg_register __P((ENV *, int *, u_int32_t));
+int __envreg_unregister __P((ENV *, int));
+int __envreg_xunlock __P((ENV *));
+int __envreg_isalive __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+u_int32_t __env_struct_sig __P((void));
+int __env_stat_print_pp __P((DB_ENV *, u_int32_t));
+void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t));
+void __db_print_fileid __P((ENV *, u_int8_t *, const char *));
+void __db_dl __P((ENV *, const char *, u_long));
+void __db_dl_pct __P((ENV *, const char *, u_long, int, const char *));
+void __db_dlbytes __P((ENV *, const char *, u_long, u_long, u_long));
+void __db_print_reginfo __P((ENV *, REGINFO *, const char *, u_int32_t));
+int __db_stat_not_built __P((ENV *));
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_close __P((ENV *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_set_ack_policy __P((DB_ENV *, int));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_set_msg_dispatch __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_env_ext_h_ */
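
[Editor's sketch] The block of __repmgr_* prototypes above is declared only when
HAVE_REPLICATION_THREADS is not defined: in that configuration the replication
manager entry points still have to exist, so the build links stub
implementations that fail gracefully. A minimal sketch of what such a stub
could look like -- the helper name, message text, and file placement are
assumptions here, not taken from this patch:

	/*
	 * Hypothetical stub compiled when the replication manager is
	 * not built in; the real stubs live elsewhere in the tree.
	 */
	static int
	__repmgr_stub_notsup(ENV *env)
	{
		__db_errx(env,
	    "library build did not include replication manager support");
		return (DB_OPNOTSUP);
	}
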
diff --git a/src/dbinc_auto/ext_185_def.in b/src/dbinc_auto/ext_185_def.in
new file mode 100644
index 00000000..8da68a8d
--- /dev/null
+++ b/src/dbinc_auto/ext_185_def.in
@@ -0,0 +1,12 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_185_DEF_IN_
+#define _DB_EXT_185_DEF_IN_
+
+#ifdef _DB185_INT_H_
+#define __db185_open __db185_open@DB_VERSION_UNIQUE_NAME@
+#else
+#define __db185_open __db185_open@DB_VERSION_UNIQUE_NAME@
+#endif
+
+#endif /* !_DB_EXT_185_DEF_IN_ */
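
[Editor's sketch] The .in suffix marks this as a configure template:
@DB_VERSION_UNIQUE_NAME@ is substituted at build time -- normally with nothing,
or with a version suffix when the tree is configured with --with-uniquename,
which lets two library versions coexist in one address space. Both arms of the
#ifdef here produce the same definition; the conditional only mirrors the
structure of the matching prototype file below, where the two arms do differ.
Illustrative only, with an assumed suffix:

	/*
	 * If configure substitutes, say, "_5003" (the exact value is an
	 * assumption) for @DB_VERSION_UNIQUE_NAME@, the definition above
	 * becomes:
	 */
	#define __db185_open __db185_open_5003
	/* ...so callers of __db185_open() bind to the suffixed symbol. */
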
diff --git a/src/dbinc_auto/ext_185_prot.in b/src/dbinc_auto/ext_185_prot.in
new file mode 100644
index 00000000..dfd8d3d4
--- /dev/null
+++ b/src/dbinc_auto/ext_185_prot.in
@@ -0,0 +1,19 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_185_PROT_IN_
+#define _DB_EXT_185_PROT_IN_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifdef _DB185_INT_H_
+DB185 *__db185_open __P((const char *, int, int, DBTYPE, const void *));
+#else
+DB *__db185_open __P((const char *, int, int, DBTYPE, const void *));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_EXT_185_PROT_IN_ */
diff --git a/src/dbinc_auto/ext_def.in b/src/dbinc_auto/ext_def.in
new file mode 100644
index 00000000..1a56f192
--- /dev/null
+++ b/src/dbinc_auto/ext_def.in
@@ -0,0 +1,66 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_DEF_IN_
+#define _DB_EXT_DEF_IN_
+
+#define db_copy db_copy@DB_VERSION_UNIQUE_NAME@
+#define db_create db_create@DB_VERSION_UNIQUE_NAME@
+#define db_strerror db_strerror@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_assert db_env_set_func_assert@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_close db_env_set_func_close@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_dirfree db_env_set_func_dirfree@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_dirlist db_env_set_func_dirlist@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_exists db_env_set_func_exists@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_free db_env_set_func_free@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_fsync db_env_set_func_fsync@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_ftruncate db_env_set_func_ftruncate@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_ioinfo db_env_set_func_ioinfo@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_malloc db_env_set_func_malloc@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_file_map db_env_set_func_file_map@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_region_map db_env_set_func_region_map@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_pread db_env_set_func_pread@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_pwrite db_env_set_func_pwrite@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_open db_env_set_func_open@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_read db_env_set_func_read@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_realloc db_env_set_func_realloc@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_rename db_env_set_func_rename@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_seek db_env_set_func_seek@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_unlink db_env_set_func_unlink@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_write db_env_set_func_write@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_yield db_env_set_func_yield@DB_VERSION_UNIQUE_NAME@
+#define db_env_create db_env_create@DB_VERSION_UNIQUE_NAME@
+#define db_version db_version@DB_VERSION_UNIQUE_NAME@
+#define db_full_version db_full_version@DB_VERSION_UNIQUE_NAME@
+#define log_compare log_compare@DB_VERSION_UNIQUE_NAME@
+#if defined(DB_WIN32) && !defined(DB_WINCE)
+#define db_env_set_win_security db_env_set_win_security@DB_VERSION_UNIQUE_NAME@
+#endif
+#define db_sequence_create db_sequence_create@DB_VERSION_UNIQUE_NAME@
+#if DB_DBM_HSEARCH != 0
+#define __db_ndbm_clearerr __db_ndbm_clearerr@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_close __db_ndbm_close@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_delete __db_ndbm_delete@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_dirfno __db_ndbm_dirfno@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_error __db_ndbm_error@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_fetch __db_ndbm_fetch@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_firstkey __db_ndbm_firstkey@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_nextkey __db_ndbm_nextkey@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_open __db_ndbm_open@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_pagfno __db_ndbm_pagfno@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_rdonly __db_ndbm_rdonly@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_store __db_ndbm_store@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_close __db_dbm_close@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_delete __db_dbm_delete@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_fetch __db_dbm_fetch@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_firstkey __db_dbm_firstkey@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_init __db_dbm_init@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_nextkey __db_dbm_nextkey@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_store __db_dbm_store@DB_VERSION_UNIQUE_NAME@
+#endif
+#if DB_DBM_HSEARCH != 0
+#define __db_hcreate __db_hcreate@DB_VERSION_UNIQUE_NAME@
+#define __db_hsearch __db_hsearch@DB_VERSION_UNIQUE_NAME@
+#define __db_hdestroy __db_hdestroy@DB_VERSION_UNIQUE_NAME@
+#endif
+
+#endif /* !_DB_EXT_DEF_IN_ */
diff --git a/src/dbinc_auto/ext_prot.in b/src/dbinc_auto/ext_prot.in
new file mode 100644
index 00000000..371e5a3e
--- /dev/null
+++ b/src/dbinc_auto/ext_prot.in
@@ -0,0 +1,73 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_PROT_IN_
+#define _DB_EXT_PROT_IN_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int db_copy __P((DB_ENV *, const char *, const char *, const char *));
+int db_create __P((DB **, DB_ENV *, u_int32_t));
+char *db_strerror __P((int));
+int db_env_set_func_assert __P((void (*)(const char *, const char *, int)));
+int db_env_set_func_close __P((int (*)(int)));
+int db_env_set_func_dirfree __P((void (*)(char **, int)));
+int db_env_set_func_dirlist __P((int (*)(const char *, char ***, int *)));
+int db_env_set_func_exists __P((int (*)(const char *, int *)));
+int db_env_set_func_free __P((void (*)(void *)));
+int db_env_set_func_fsync __P((int (*)(int)));
+int db_env_set_func_ftruncate __P((int (*)(int, off_t)));
+int db_env_set_func_ioinfo __P((int (*)(const char *, int, u_int32_t *, u_int32_t *, u_int32_t *)));
+int db_env_set_func_malloc __P((void *(*)(size_t)));
+int db_env_set_func_file_map __P((int (*)(DB_ENV *, char *, size_t, int, void **), int (*)(DB_ENV *, void *)));
+int db_env_set_func_region_map __P((int (*)(DB_ENV *, char *, size_t, int *, void **), int (*)(DB_ENV *, void *)));
+int db_env_set_func_pread __P((ssize_t (*)(int, void *, size_t, off_t)));
+int db_env_set_func_pwrite __P((ssize_t (*)(int, const void *, size_t, off_t)));
+int db_env_set_func_open __P((int (*)(const char *, int, ...)));
+int db_env_set_func_read __P((ssize_t (*)(int, void *, size_t)));
+int db_env_set_func_realloc __P((void *(*)(void *, size_t)));
+int db_env_set_func_rename __P((int (*)(const char *, const char *)));
+int db_env_set_func_seek __P((int (*)(int, off_t, int)));
+int db_env_set_func_unlink __P((int (*)(const char *)));
+int db_env_set_func_write __P((ssize_t (*)(int, const void *, size_t)));
+int db_env_set_func_yield __P((int (*)(u_long, u_long)));
+int db_env_create __P((DB_ENV **, u_int32_t));
+char *db_version __P((int *, int *, int *));
+char *db_full_version __P((int *, int *, int *, int *, int *));
+int log_compare __P((const DB_LSN *, const DB_LSN *));
+#if defined(DB_WIN32) && !defined(DB_WINCE)
+int db_env_set_win_security __P((SECURITY_ATTRIBUTES *sa));
+#endif
+int db_sequence_create __P((DB_SEQUENCE **, DB *, u_int32_t));
+#if DB_DBM_HSEARCH != 0
+int __db_ndbm_clearerr __P((DBM *));
+void __db_ndbm_close __P((DBM *));
+int __db_ndbm_delete __P((DBM *, datum));
+int __db_ndbm_dirfno __P((DBM *));
+int __db_ndbm_error __P((DBM *));
+datum __db_ndbm_fetch __P((DBM *, datum));
+datum __db_ndbm_firstkey __P((DBM *));
+datum __db_ndbm_nextkey __P((DBM *));
+DBM *__db_ndbm_open __P((const char *, int, int));
+int __db_ndbm_pagfno __P((DBM *));
+int __db_ndbm_rdonly __P((DBM *));
+int __db_ndbm_store __P((DBM *, datum, datum, int));
+int __db_dbm_close __P((void));
+int __db_dbm_delete __P((datum));
+datum __db_dbm_fetch __P((datum));
+datum __db_dbm_firstkey __P((void));
+int __db_dbm_init __P((char *));
+datum __db_dbm_nextkey __P((datum));
+int __db_dbm_store __P((datum, datum));
+#endif
+#if DB_DBM_HSEARCH != 0
+int __db_hcreate __P((size_t));
+ENTRY *__db_hsearch __P((ENTRY, ACTION));
+void __db_hdestroy __P((void));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_EXT_PROT_IN_ */
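
[Editor's sketch] The db_env_set_func_* prototypes above form the library's
process-wide OS-call override table; each takes a function pointer matching the
libc call it replaces and is meant to be set before any environment is created.
A minimal sketch using only the prototypes above (my_malloc/my_free and the
wrapper name are illustrative):

	#include <stdlib.h>
	#include "db.h"

	/* Forward all of the library's allocations to a custom allocator. */
	static void *my_malloc(size_t len) { return (malloc(len)); }
	static void my_free(void *p) { free(p); }

	int
	install_custom_allocator(void)
	{
		int ret;

		if ((ret = db_env_set_func_malloc(my_malloc)) != 0)
			return (ret);
		return (db_env_set_func_free(my_free));
	}
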
diff --git a/src/dbinc_auto/fileops_auto.h b/src/dbinc_auto/fileops_auto.h
new file mode 100644
index 00000000..59385c88
--- /dev/null
+++ b/src/dbinc_auto/fileops_auto.h
@@ -0,0 +1,262 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __fop_AUTO_H
+#define __fop_AUTO_H
+#include "dbinc/log.h"
+#define DB___fop_create_42 143
+typedef struct ___fop_create_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t mode;
+} __fop_create_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_create_42_desc[];
+static inline int __fop_create_42_read(ENV *env,
+ void *data, __fop_create_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_create_42_desc, sizeof(__fop_create_42_args), (void**)arg));
+}
+#define DB___fop_create 143
+typedef struct ___fop_create_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t mode;
+} __fop_create_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_create_desc[];
+static inline int
+__fop_create_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *dirname, u_int32_t appname, u_int32_t mode)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_create, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(dirname) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __fop_create_desc,
+ name, dirname, appname, mode));
+}
+
+static inline int __fop_create_read(ENV *env,
+ void *data, __fop_create_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_create_desc, sizeof(__fop_create_args), (void**)arg));
+}
+#define DB___fop_remove 144
+typedef struct ___fop_remove_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT fid;
+ u_int32_t appname;
+} __fop_remove_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_remove_desc[];
+static inline int
+__fop_remove_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *fid, u_int32_t appname)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_remove, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(fid) + sizeof(u_int32_t),
+ __fop_remove_desc,
+ name, fid, appname));
+}
+
+static inline int __fop_remove_read(ENV *env,
+ void *data, __fop_remove_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_remove_desc, sizeof(__fop_remove_args), (void**)arg));
+}
+#define DB___fop_write_42 145
+typedef struct ___fop_write_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t offset;
+ DBT page;
+ u_int32_t flag;
+} __fop_write_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_42_desc[];
+static inline int __fop_write_42_read(ENV *env,
+ void *data, __fop_write_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_42_desc, sizeof(__fop_write_42_args), (void**)arg));
+}
+#define DB___fop_write 145
+typedef struct ___fop_write_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t offset;
+ DBT page;
+ u_int32_t flag;
+} __fop_write_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_desc[];
+static inline int
+__fop_write_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *dirname, u_int32_t appname, u_int32_t pgsize, db_pgno_t pageno,
+ u_int32_t offset, const DBT *page, u_int32_t flag)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_write, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(dirname) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(page) + sizeof(u_int32_t),
+ __fop_write_desc,
+ name, dirname, appname, pgsize, pageno, offset, page, flag));
+}
+
+static inline int __fop_write_read(ENV *env,
+ void *data, __fop_write_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_desc, sizeof(__fop_write_args), (void**)arg));
+}
+#define DB___fop_rename_42 146
+#define DB___fop_rename_noundo_46 150
+typedef struct ___fop_rename_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT fileid;
+ u_int32_t appname;
+} __fop_rename_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_42_desc[];
+static inline int __fop_rename_42_read(ENV *env,
+ void *data, __fop_rename_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_42_desc, sizeof(__fop_rename_42_args), (void**)arg));
+}
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_noundo_46_desc[];
+static inline int __fop_rename_noundo_46_read(ENV *env,
+ void *data, __fop_rename_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_noundo_46_desc, sizeof(__fop_rename_42_args), (void**)arg));
+}
+#define DB___fop_rename 146
+#define DB___fop_rename_noundo 150
+typedef struct ___fop_rename_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT dirname;
+ DBT fileid;
+ u_int32_t appname;
+} __fop_rename_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_desc[];
+static inline int
+__fop_rename_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *oldname, const DBT *newname, const DBT *dirname, const DBT *fileid, u_int32_t appname)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_rename, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(oldname) + LOG_DBT_SIZE(newname) + LOG_DBT_SIZE(dirname) +
+ LOG_DBT_SIZE(fileid) + sizeof(u_int32_t),
+ __fop_rename_desc,
+ oldname, newname, dirname, fileid, appname));
+}
+
+static inline int __fop_rename_read(ENV *env,
+ void *data, __fop_rename_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_desc, sizeof(__fop_rename_args), (void**)arg));
+}
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_noundo_desc[];
+static inline int
+__fop_rename_noundo_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *oldname, const DBT *newname, const DBT *dirname, const DBT *fileid, u_int32_t appname)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_rename_noundo, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(oldname) + LOG_DBT_SIZE(newname) + LOG_DBT_SIZE(dirname) +
+ LOG_DBT_SIZE(fileid) + sizeof(u_int32_t),
+ __fop_rename_noundo_desc,
+ oldname, newname, dirname, fileid, appname));
+}
+
+static inline int __fop_rename_noundo_read(ENV *env,
+ void *data, __fop_rename_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_noundo_desc, sizeof(__fop_rename_args), (void**)arg));
+}
+#define DB___fop_file_remove 141
+typedef struct ___fop_file_remove_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT real_fid;
+ DBT tmp_fid;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t child;
+} __fop_file_remove_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_file_remove_desc[];
+static inline int
+__fop_file_remove_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *real_fid, const DBT *tmp_fid, const DBT *name, u_int32_t appname, u_int32_t child)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_file_remove, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(real_fid) + LOG_DBT_SIZE(tmp_fid) + LOG_DBT_SIZE(name) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __fop_file_remove_desc,
+ real_fid, tmp_fid, name, appname, child));
+}
+
+static inline int __fop_file_remove_read(ENV *env,
+ void *data, __fop_file_remove_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_file_remove_desc, sizeof(__fop_file_remove_args), (void**)arg));
+}
+#endif
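
[Editor's sketch] Each generated __*_log above computes its record size inline:
fixed-width fields are counted as sizeof(u_int32_t) (or a DB_LSN), and every
variable-length DBT is counted with LOG_DBT_SIZE. That macro lives in
dbinc/log.h, not in this patch; paraphrased from memory, it amounts to a
4-byte length prefix plus the payload:

	/* Paraphrase of dbinc/log.h, not taken from this patch. */
	#define	LOG_DBT_SIZE(dbt)					\
	    (sizeof(u_int32_t) + ((dbt) == NULL ? 0 : (dbt)->size))
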
diff --git a/src/dbinc_auto/fileops_ext.h b/src/dbinc_auto/fileops_ext.h
new file mode 100644
index 00000000..0aa6c1e1
--- /dev/null
+++ b/src/dbinc_auto/fileops_ext.h
@@ -0,0 +1,44 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _fileops_ext_h_
+#define _fileops_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __fop_init_recover __P((ENV *, DB_DISTAB *));
+int __fop_create_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_init_print __P((ENV *, DB_DISTAB *));
+int __fop_create __P((ENV *, DB_TXN *, DB_FH **, const char *, const char **, APPNAME, int, u_int32_t));
+int __fop_remove __P((ENV *, DB_TXN *, u_int8_t *, const char *, const char **, APPNAME, u_int32_t));
+int __fop_write __P((ENV *, DB_TXN *, const char *, const char *, APPNAME, DB_FH *, u_int32_t, db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
+int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *, const char **, u_int8_t *, APPNAME, int, u_int32_t));
+int __fop_create_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_noundo_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_noundo_46_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_lock_handle __P((ENV *, DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
+int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip, DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
+int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, int, u_int32_t));
+int __fop_remove_setup __P((DB *, DB_TXN *, const char *, u_int32_t));
+int __fop_read_meta __P((ENV *, const char *, u_int8_t *, size_t, DB_FH *, int, size_t *));
+int __fop_dummy __P((DB *, DB_TXN *, const char *, const char *));
+int __fop_dbrename __P((DB *, const char *, const char *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_fileops_ext_h_ */
diff --git a/src/dbinc_auto/hash_auto.h b/src/dbinc_auto/hash_auto.h
new file mode 100644
index 00000000..c1dcae91
--- /dev/null
+++ b/src/dbinc_auto/hash_auto.h
@@ -0,0 +1,484 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __ham_AUTO_H
+#define __ham_AUTO_H
+#ifdef HAVE_HASH
+#include "dbinc/log.h"
+#define DB___ham_insdel 21
+typedef struct ___ham_insdel_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ u_int32_t keytype;
+ DBT key;
+ u_int32_t datatype;
+ DBT data;
+} __ham_insdel_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_insdel_desc[];
+static inline int
+__ham_insdel_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, u_int32_t ndx, DB_LSN * pagelsn,
+ u_int32_t keytype, const DBT *key, u_int32_t datatype, const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_insdel, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(*pagelsn) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(key) + sizeof(u_int32_t) + LOG_DBT_SIZE(data),
+ __ham_insdel_desc,
+ opcode, pgno, ndx, pagelsn, keytype, key, datatype,
+ data));
+}
+
+static inline int __ham_insdel_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_insdel_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_insdel_desc, sizeof(__ham_insdel_args), (void**)arg));
+}
+#define DB___ham_insdel_42 21
+typedef struct ___ham_insdel_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ DBT key;
+ DBT data;
+} __ham_insdel_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_insdel_42_desc[];
+static inline int __ham_insdel_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_insdel_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_insdel_42_desc, sizeof(__ham_insdel_42_args), (void**)arg));
+}
+#define DB___ham_newpage 22
+typedef struct ___ham_newpage_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t prev_pgno;
+ DB_LSN prevlsn;
+ db_pgno_t new_pgno;
+ DB_LSN pagelsn;
+ db_pgno_t next_pgno;
+ DB_LSN nextlsn;
+} __ham_newpage_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_newpage_desc[];
+static inline int
+__ham_newpage_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t prev_pgno, DB_LSN * prevlsn, db_pgno_t new_pgno,
+ DB_LSN * pagelsn, db_pgno_t next_pgno, DB_LSN * nextlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_newpage, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*prevlsn) + sizeof(u_int32_t) + sizeof(*pagelsn) +
+ sizeof(u_int32_t) + sizeof(*nextlsn),
+ __ham_newpage_desc,
+ opcode, prev_pgno, prevlsn, new_pgno, pagelsn, next_pgno, nextlsn));
+}
+
+static inline int __ham_newpage_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_newpage_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_newpage_desc, sizeof(__ham_newpage_args), (void**)arg));
+}
+#define DB___ham_splitdata 24
+typedef struct ___ham_splitdata_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t opcode;
+ db_pgno_t pgno;
+ DBT pageimage;
+ DB_LSN pagelsn;
+} __ham_splitdata_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_splitdata_desc[];
+static inline int
+__ham_splitdata_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t opcode, db_pgno_t pgno, const DBT *pageimage, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_splitdata, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(pageimage) + sizeof(*pagelsn),
+ __ham_splitdata_desc, opcode, pgno, pageimage, pagelsn));
+}
+
+static inline int __ham_splitdata_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_splitdata_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_splitdata_desc, sizeof(__ham_splitdata_args), (void**)arg));
+}
+#define DB___ham_replace 25
+typedef struct ___ham_replace_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ int32_t off;
+ u_int32_t oldtype;
+ DBT olditem;
+ u_int32_t newtype;
+ DBT newitem;
+} __ham_replace_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_replace_desc[];
+static inline int
+__ham_replace_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, u_int32_t ndx, DB_LSN * pagelsn, int32_t off,
+ u_int32_t oldtype, const DBT *olditem, u_int32_t newtype, const DBT *newitem)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_replace, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*pagelsn) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(olditem) + sizeof(u_int32_t) + LOG_DBT_SIZE(newitem),
+ __ham_replace_desc, pgno, ndx, pagelsn, off, oldtype, olditem, newtype,
+ newitem));
+}
+
+static inline int __ham_replace_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_replace_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_replace_desc, sizeof(__ham_replace_args), (void**)arg));
+}
+#define DB___ham_replace_42 25
+typedef struct ___ham_replace_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ int32_t off;
+ DBT olditem;
+ DBT newitem;
+ u_int32_t makedup;
+} __ham_replace_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_replace_42_desc[];
+static inline int __ham_replace_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_replace_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_replace_42_desc, sizeof(__ham_replace_42_args), (void**)arg));
+}
+#define DB___ham_copypage 28
+typedef struct ___ham_copypage_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN pagelsn;
+ db_pgno_t next_pgno;
+ DB_LSN nextlsn;
+ db_pgno_t nnext_pgno;
+ DB_LSN nnextlsn;
+ DBT page;
+} __ham_copypage_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_copypage_desc[];
+static inline int
+__ham_copypage_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * pagelsn, db_pgno_t next_pgno, DB_LSN * nextlsn,
+ db_pgno_t nnext_pgno, DB_LSN * nnextlsn, const DBT *page)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_copypage, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*pagelsn) +
+ sizeof(u_int32_t) + sizeof(*nextlsn) + sizeof(u_int32_t) +
+ sizeof(*nnextlsn) + LOG_DBT_SIZE(page),
+ __ham_copypage_desc, pgno, pagelsn, next_pgno, nextlsn, nnext_pgno, nnextlsn, page));
+}
+
+static inline int __ham_copypage_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_copypage_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_copypage_desc, sizeof(__ham_copypage_args), (void**)arg));
+}
+#define DB___ham_metagroup_42 29
+typedef struct ___ham_metagroup_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t bucket;
+ db_pgno_t mmpgno;
+ DB_LSN mmetalsn;
+ db_pgno_t mpgno;
+ DB_LSN metalsn;
+ db_pgno_t pgno;
+ DB_LSN pagelsn;
+ u_int32_t newalloc;
+} __ham_metagroup_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_metagroup_42_desc[];
+static inline int __ham_metagroup_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_metagroup_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_metagroup_42_desc, sizeof(__ham_metagroup_42_args), (void**)arg));
+}
+#define DB___ham_metagroup 29
+typedef struct ___ham_metagroup_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t bucket;
+ db_pgno_t mmpgno;
+ DB_LSN mmetalsn;
+ db_pgno_t mpgno;
+ DB_LSN metalsn;
+ db_pgno_t pgno;
+ DB_LSN pagelsn;
+ u_int32_t newalloc;
+ db_pgno_t last_pgno;
+} __ham_metagroup_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_metagroup_desc[];
+static inline int
+__ham_metagroup_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t bucket, db_pgno_t mmpgno, DB_LSN * mmetalsn, db_pgno_t mpgno,
+ DB_LSN * metalsn, db_pgno_t pgno, DB_LSN * pagelsn, u_int32_t newalloc, db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_metagroup, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*mmetalsn) + sizeof(u_int32_t) + sizeof(*metalsn) +
+ sizeof(u_int32_t) + sizeof(*pagelsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __ham_metagroup_desc, bucket, mmpgno, mmetalsn, mpgno, metalsn, pgno, pagelsn,
+ newalloc, last_pgno));
+}
+
+static inline int __ham_metagroup_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_metagroup_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_metagroup_desc, sizeof(__ham_metagroup_args), (void**)arg));
+}
+#define DB___ham_groupalloc_42 32
+typedef struct ___ham_groupalloc_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t start_pgno;
+ u_int32_t num;
+ db_pgno_t free;
+} __ham_groupalloc_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_groupalloc_42_desc[];
+static inline int __ham_groupalloc_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_groupalloc_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_groupalloc_42_desc, sizeof(__ham_groupalloc_42_args), (void**)arg));
+}
+#define DB___ham_groupalloc 32
+typedef struct ___ham_groupalloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t start_pgno;
+ u_int32_t num;
+ db_pgno_t unused;
+ db_pgno_t last_pgno;
+} __ham_groupalloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_groupalloc_desc[];
+static inline int
+__ham_groupalloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, db_pgno_t start_pgno, u_int32_t num, db_pgno_t unused,
+ db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_groupalloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_groupalloc_desc, meta_lsn, start_pgno, num, unused, last_pgno));
+}
+
+static inline int __ham_groupalloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_groupalloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_groupalloc_desc, sizeof(__ham_groupalloc_args), (void**)arg));
+}
+#define DB___ham_changeslot 35
+typedef struct ___ham_changeslot_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ u_int32_t slot;
+ db_pgno_t old;
+ db_pgno_t new;
+} __ham_changeslot_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_changeslot_desc[];
+static inline int
+__ham_changeslot_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, u_int32_t slot, db_pgno_t old, db_pgno_t new)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_changeslot, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_changeslot_desc, meta_lsn, slot, old, new));
+}
+
+static inline int __ham_changeslot_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_changeslot_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_changeslot_desc, sizeof(__ham_changeslot_args), (void**)arg));
+}
+#define DB___ham_contract 37
+typedef struct ___ham_contract_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta;
+ DB_LSN meta_lsn;
+ u_int32_t bucket;
+ db_pgno_t pgno;
+} __ham_contract_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_contract_desc[];
+static inline int
+__ham_contract_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t meta, DB_LSN * meta_lsn, u_int32_t bucket, db_pgno_t pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_contract, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_contract_desc, meta, meta_lsn, bucket, pgno));
+}
+
+static inline int __ham_contract_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_contract_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_contract_desc, sizeof(__ham_contract_args), (void**)arg));
+}
+#define DB___ham_curadj 33
+typedef struct ___ham_curadj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t len;
+ u_int32_t dup_off;
+ int add;
+ int is_dup;
+ u_int32_t order;
+} __ham_curadj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_curadj_desc[];
+static inline int
+__ham_curadj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, u_int32_t indx, u_int32_t len, u_int32_t dup_off,
+ int add, int is_dup, u_int32_t order)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_curadj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_curadj_desc, pgno, indx, len, dup_off, add, is_dup, order));
+}
+
+static inline int __ham_curadj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_curadj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_curadj_desc, sizeof(__ham_curadj_args), (void**)arg));
+}
+#define DB___ham_chgpg 34
+typedef struct ___ham_chgpg_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_ham_mode mode;
+ db_pgno_t old_pgno;
+ db_pgno_t new_pgno;
+ u_int32_t old_indx;
+ u_int32_t new_indx;
+} __ham_chgpg_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_chgpg_desc[];
+static inline int
+__ham_chgpg_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_ham_mode mode, db_pgno_t old_pgno, db_pgno_t new_pgno, u_int32_t old_indx,
+ u_int32_t new_indx)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_chgpg, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_chgpg_desc, mode, old_pgno, new_pgno, old_indx, new_indx));
+}
+
+static inline int __ham_chgpg_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_chgpg_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_chgpg_desc, sizeof(__ham_chgpg_args), (void**)arg));
+}
+#endif /* HAVE_HASH */
+#endif
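
[Editor's sketch] The hash readers differ from the fileops ones above by the
extra DB **dbpp and void *td parameters, which give __log_read_record what it
needs to resolve the logged fileid back to an open DB handle during recovery.
A hypothetical consumer -- the real recovery functions are in
src/hash/hash_rec.c, and the names below are illustrative:

	static int
	example_insdel_recover(ENV *env, DBT *rec, void *td)
	{
		__ham_insdel_args *argp;
		DB *file_dbp;
		int ret;

		/* The reader allocates *argp; the caller must free it. */
		if ((ret = __ham_insdel_read(env,
		    &file_dbp, td, rec->data, &argp)) != 0)
			return (ret);
		/* argp->opcode, argp->pgno and argp->key/data drive redo/undo. */
		__os_free(env, argp);
		return (0);
	}
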
diff --git a/src/dbinc_auto/hash_ext.h b/src/dbinc_auto/hash_ext.h
new file mode 100644
index 00000000..e83fe817
--- /dev/null
+++ b/src/dbinc_auto/hash_ext.h
@@ -0,0 +1,129 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _hash_ext_h_
+#define _hash_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __ham_quick_delete __P((DBC *));
+int __hamc_init __P((DBC *));
+int __hamc_count __P((DBC *, db_recno_t *));
+int __hamc_cmp __P((DBC *, DBC *, int *));
+int __hamc_dup __P((DBC *, DBC *));
+int __ham_contract_table __P((DBC *, DB_COMPACT *));
+u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, u_int32_t));
+int __ham_overwrite __P((DBC *, DBT *, u_int32_t));
+int __ham_lookup __P((DBC *, const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
+int __ham_init_dbt __P((ENV *, DBT *, u_int32_t, void **, u_int32_t *));
+int __hamc_update __P((DBC *, u_int32_t, db_ham_curadj, int));
+int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***));
+int __ham_init_recover __P((ENV *, DB_DISTAB *));
+int __ham_insdel_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_insdel_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_newpage_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_splitdata_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_copypage_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_changeslot_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_contract_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_curadj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_chgpg_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_init_print __P((ENV *, DB_DISTAB *));
+int __ham_compact_int __P((DBC *, DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
+int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
+int __ham_compact_hash __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+int __ham_pgin __P((DB *, db_pgno_t, void *, DBT *));
+int __ham_pgout __P((DB *, db_pgno_t, void *, DBT *));
+int __ham_mswap __P((ENV *, void *));
+int __ham_add_dup __P((DBC *, DBT *, u_int32_t, db_pgno_t *));
+int __ham_dup_convert __P((DBC *));
+int __ham_make_dup __P((ENV *, const DBT *, DBT *d, void **, u_int32_t *));
+void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *, u_int32_t));
+u_int32_t __ham_func2 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_func3 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_func5 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_test __P((DB *, const void *, u_int32_t));
+int __ham_get_meta __P((DBC *));
+int __ham_release_meta __P((DBC *));
+int __ham_dirty_meta __P((DBC *, u_int32_t));
+int __ham_return_meta __P((DBC *, u_int32_t, DBMETA **));
+int __ham_db_create __P((DB *));
+int __ham_db_close __P((DB *));
+int __ham_get_h_ffactor __P((DB *, u_int32_t *));
+int __ham_set_h_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+int __ham_get_h_nelem __P((DB *, u_int32_t *));
+void __ham_copy_config __P((DB *, DB*, u_int32_t));
+int __ham_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char * name, db_pgno_t, u_int32_t));
+int __ham_metachk __P((DB *, const char *, HMETA *));
+int __ham_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __ham_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+int __ham_item __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_reset __P((DBC *));
+int __ham_item_init __P((DBC *));
+int __ham_item_last __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_first __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_prev __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_next __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_insertpair __P((DBC *, PAGE *p, db_indx_t *indxp, const DBT *, const DBT *, u_int32_t, u_int32_t));
+int __ham_getindex __P((DBC *, PAGE *, const DBT *, u_int32_t, int *, db_indx_t *));
+int __ham_verify_sorted_page __P((DBC *, PAGE *));
+int __ham_sort_page_cursor __P((DBC *, PAGE *));
+int __ham_sort_page __P((DBC *, PAGE **, PAGE *));
+int __ham_del_pair __P((DBC *, int, PAGE *));
+int __ham_replpair __P((DBC *, DBT *, u_int32_t));
+void __ham_onpage_replace __P((DB *, PAGE *, u_int32_t, int32_t, u_int32_t, int, DBT *));
+int __ham_merge_pages __P((DBC *, u_int32_t, u_int32_t, DB_COMPACT *));
+int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
+int __ham_add_el __P((DBC *, const DBT *, const DBT *, u_int32_t));
+int __ham_copypair __P((DBC *, PAGE *, u_int32_t, PAGE *, db_indx_t *, int));
+int __ham_add_ovflpage __P((DBC *, PAGE **));
+int __ham_get_cpage __P((DBC *, db_lockmode_t));
+int __ham_next_cpage __P((DBC *, db_pgno_t));
+int __ham_lock_bucket __P((DBC *, db_lockmode_t));
+void __ham_dpair __P((DB *, PAGE *, u_int32_t));
+int __ham_insdel_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_insdel_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_newpage_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_splitdata_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_copypage_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_contract_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_changeslot_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_curadj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_chgpg_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *txn, u_int32_t));
+int __ham_truncate __P((DBC *, u_int32_t *));
+int __ham_stat __P((DBC *, void *, u_int32_t));
+int __ham_stat_print __P((DBC *, u_int32_t));
+void __ham_print_cursor __P((DBC *));
+int __ham_traverse __P((DBC *, db_lockmode_t, int (*)(DBC *, PAGE *, void *, int *), void *, int));
+int __db_no_hash_am __P((ENV *));
+int __ham_30_hashmeta __P((DB *, char *, u_int8_t *));
+int __ham_30_sizefix __P((DB *, DB_FH *, char *, u_int8_t *));
+int __ham_31_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_31_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_46_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_46_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *, db_pgno_t, u_int32_t));
+int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+int __ham_vrfy_hashing __P((DBC *, u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t, u_int32_t (*) __P((DB *, const void *, u_int32_t))));
+int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t, DB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_hash_ext_h_ */
diff --git a/src/dbinc_auto/heap_auto.h b/src/dbinc_auto/heap_auto.h
new file mode 100644
index 00000000..bf288627
--- /dev/null
+++ b/src/dbinc_auto/heap_auto.h
@@ -0,0 +1,146 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __heap_AUTO_H
+#define __heap_AUTO_H
+#ifdef HAVE_HEAP
+#include "dbinc/log.h"
+#define DB___heap_addrem 151
+typedef struct ___heap_addrem_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __heap_addrem_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_addrem_desc[];
+static inline int
+__heap_addrem_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, u_int32_t indx, u_int32_t nbytes,
+ const DBT *hdr, const DBT *dbt, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_addrem, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(dbt) + sizeof(*pagelsn),
+ __heap_addrem_desc,
+ opcode, pgno, indx, nbytes, hdr, dbt, pagelsn));
+}
+
+static inline int __heap_addrem_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_addrem_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_addrem_desc, sizeof(__heap_addrem_args), (void**)arg));
+}
+#define DB___heap_pg_alloc 152
+typedef struct ___heap_pg_alloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t last_pgno;
+} __heap_pg_alloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_pg_alloc_desc[];
+static inline int
+__heap_pg_alloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, db_pgno_t meta_pgno, db_pgno_t pgno, u_int32_t ptype,
+ db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_pg_alloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __heap_pg_alloc_desc, meta_lsn, meta_pgno, pgno, ptype, last_pgno));
+}
+
+static inline int __heap_pg_alloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_pg_alloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_pg_alloc_desc, sizeof(__heap_pg_alloc_args), (void**)arg));
+}
+#define DB___heap_trunc_meta 153
+typedef struct ___heap_trunc_meta_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t last_pgno;
+ u_int32_t key_count;
+ u_int32_t record_count;
+ u_int32_t curregion;
+ u_int32_t nregions;
+ DB_LSN pagelsn;
+} __heap_trunc_meta_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_trunc_meta_desc[];
+static inline int
+__heap_trunc_meta_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, u_int32_t last_pgno, u_int32_t key_count, u_int32_t record_count,
+ u_int32_t curregion, u_int32_t nregions, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_trunc_meta, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(*pagelsn),
+ __heap_trunc_meta_desc, pgno, last_pgno, key_count, record_count, curregion, nregions, pagelsn));
+}
+
+static inline int __heap_trunc_meta_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_trunc_meta_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_trunc_meta_desc, sizeof(__heap_trunc_meta_args), (void**)arg));
+}
+#define DB___heap_trunc_page 154
+typedef struct ___heap_trunc_page_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT old_data;
+ u_int32_t is_region;
+ DB_LSN pagelsn;
+} __heap_trunc_page_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_trunc_page_desc[];
+static inline int
+__heap_trunc_page_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *old_data, u_int32_t is_region, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_trunc_page, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(old_data) +
+ sizeof(u_int32_t) + sizeof(*pagelsn),
+ __heap_trunc_page_desc, pgno, old_data, is_region, pagelsn));
+}
+
+static inline int __heap_trunc_page_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_trunc_page_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_trunc_page_desc, sizeof(__heap_trunc_page_args), (void**)arg));
+}
+#endif /* HAVE_HEAP */
+#endif
diff --git a/src/dbinc_auto/heap_ext.h b/src/dbinc_auto/heap_ext.h
new file mode 100644
index 00000000..8bc24b61
--- /dev/null
+++ b/src/dbinc_auto/heap_ext.h
@@ -0,0 +1,58 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _heap_ext_h_
+#define _heap_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __heapc_init __P((DBC *));
+int __heap_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+int __heap_append __P((DBC *, DBT *, DBT *));
+int __heap_pitem __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __heapc_dup __P((DBC *, DBC *));
+int __heapc_gsplit __P((DBC *, DBT *, void **, u_int32_t *));
+int __heapc_refresh __P((DBC *));
+int __heap_init_recover __P((ENV *, DB_DISTAB *));
+int __heap_addrem_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_meta_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_page_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_init_print __P((ENV *, DB_DISTAB *));
+int __heap_backup __P((DB_ENV *, DB *, DB_THREAD_INFO *, DB_FH *, void *, u_int32_t));
+int __heap_pgin __P((DB *, db_pgno_t, void *, DBT *));
+int __heap_pgout __P((DB *, db_pgno_t, void *, DBT *));
+int __heap_mswap __P((ENV *, PAGE *));
+int __heap_db_create __P((DB *));
+int __heap_db_close __P((DB *));
+int __heap_get_heapsize __P((DB *, u_int32_t *, u_int32_t *));
+int __heap_get_heap_regionsize __P((DB *, u_int32_t *));
+int __heap_set_heapsize __P((DB *, u_int32_t, u_int32_t, u_int32_t));
+int __heap_set_heap_regionsize __P((DB *, u_int32_t));
+int __heap_exist __P((void));
+int __heap_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, u_int32_t));
+int __heap_metachk __P((DB *, const char *, HEAPMETA *));
+int __heap_read_meta __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+int __heap_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __heap_create_region __P((DBC *, db_pgno_t));
+int __heap_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_pg_alloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_meta_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_page_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_truncate __P((DBC *, u_int32_t *));
+int __heap_stat __P((DBC *, void *, u_int32_t));
+int __heap_stat_print __P((DBC *, u_int32_t));
+void __heap_print_cursor __P((DBC *));
+int __heap_stat_callback __P((DBC *, PAGE *, void *, int *));
+int __heap_traverse __P((DBC *, int (*)(DBC *, PAGE *, void *, int *), void *));
+int __db_no_heap_am __P((ENV *));
+int __heap_vrfy_meta __P((DB *, VRFY_DBINFO *, HEAPMETA *, db_pgno_t, u_int32_t));
+int __heap_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __heap_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+int __heap_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __heap_meta2pgset __P((DB *, VRFY_DBINFO *, HEAPMETA *, DB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_heap_ext_h_ */
diff --git a/src/dbinc_auto/hmac_ext.h b/src/dbinc_auto/hmac_ext.h
new file mode 100644
index 00000000..c1371014
--- /dev/null
+++ b/src/dbinc_auto/hmac_ext.h
@@ -0,0 +1,20 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _hmac_ext_h_
+#define _hmac_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __db_chksum __P((void *, u_int8_t *, size_t, u_int8_t *, u_int8_t *));
+void __db_derive_mac __P((u_int8_t *, size_t, u_int8_t *));
+int __db_check_chksum __P((ENV *, void *, DB_CIPHER *, u_int8_t *, void *, size_t, int));
+void __db_SHA1Transform __P((u_int32_t *, unsigned char *));
+void __db_SHA1Init __P((SHA1_CTX *));
+void __db_SHA1Update __P((SHA1_CTX *, unsigned char *, size_t));
+void __db_SHA1Final __P((unsigned char *, SHA1_CTX *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_hmac_ext_h_ */
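
[Editor's sketch] The last four prototypes are the library's private SHA-1, and
chaining them follows the usual Init/Update/Final pattern. A minimal sketch
using only the signatures above (SHA1_CTX comes from the internal headers,
presumably dbinc/hmac.h; the 20-byte digest size is standard SHA-1):

	static void
	example_sha1(unsigned char *buf, size_t len, unsigned char digest[20])
	{
		SHA1_CTX ctx;

		__db_SHA1Init(&ctx);
		__db_SHA1Update(&ctx, buf, len);
		__db_SHA1Final(digest, &ctx);
	}
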
diff --git a/src/dbinc_auto/int_def.in b/src/dbinc_auto/int_def.in
new file mode 100644
index 00000000..dce2831c
--- /dev/null
+++ b/src/dbinc_auto/int_def.in
@@ -0,0 +1,2265 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_INT_DEF_IN_
+#define _DB_INT_DEF_IN_
+
+#define __crdel_metasub_desc __crdel_metasub_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_desc __crdel_inmem_create_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_desc __crdel_inmem_rename_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_desc __crdel_inmem_remove_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_init_recover __crdel_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_metasub_print __crdel_metasub_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_print __crdel_inmem_create_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_print __crdel_inmem_rename_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_print __crdel_inmem_remove_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_init_print __crdel_init_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_metasub_recover __crdel_metasub_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_recover __crdel_inmem_create_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_recover __crdel_inmem_rename_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_recover __crdel_inmem_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_master_open __db_master_open@DB_VERSION_UNIQUE_NAME@
+#define __db_master_update __db_master_update@DB_VERSION_UNIQUE_NAME@
+#define __env_dbreg_setup __env_dbreg_setup@DB_VERSION_UNIQUE_NAME@
+#define __env_setup __env_setup@DB_VERSION_UNIQUE_NAME@
+#define __env_mpool __env_mpool@DB_VERSION_UNIQUE_NAME@
+#define __db_close __db_close@DB_VERSION_UNIQUE_NAME@
+#define __db_refresh __db_refresh@DB_VERSION_UNIQUE_NAME@
+#define __db_log_page __db_log_page@DB_VERSION_UNIQUE_NAME@
+#define __db_walk_cursors __db_walk_cursors@DB_VERSION_UNIQUE_NAME@
+#define __db_backup_name __db_backup_name@DB_VERSION_UNIQUE_NAME@
+#ifdef CONFIG_TEST
+#define __db_testcopy __db_testcopy@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_testdocopy __db_testdocopy@DB_VERSION_UNIQUE_NAME@
+#define __db_cursor_int __db_cursor_int@DB_VERSION_UNIQUE_NAME@
+#define __db_put __db_put@DB_VERSION_UNIQUE_NAME@
+#define __db_del __db_del@DB_VERSION_UNIQUE_NAME@
+#define __db_sync __db_sync@DB_VERSION_UNIQUE_NAME@
+#define __db_associate __db_associate@DB_VERSION_UNIQUE_NAME@
+#define __db_secondary_close __db_secondary_close@DB_VERSION_UNIQUE_NAME@
+#define __db_associate_foreign __db_associate_foreign@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_desc __db_addrem_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_42_desc __db_addrem_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_big_desc __db_big_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_big_42_desc __db_big_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_desc __db_ovref_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_desc __db_relink_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_desc __db_debug_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_desc __db_noop_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_desc __db_pg_alloc_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_desc __db_pg_alloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_desc __db_pg_free_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_desc __db_pg_free_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_desc __db_cksum_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_desc __db_pg_freedata_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_desc __db_pg_freedata_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_desc __db_pg_init_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_desc __db_pg_sort_44_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_desc __db_pg_trunc_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_desc __db_realloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_desc __db_relink_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_desc __db_merge_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_desc __db_pgno_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_init_recover __db_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_print __db_addrem_print@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_42_print __db_addrem_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_big_print __db_big_print@DB_VERSION_UNIQUE_NAME@
+#define __db_big_42_print __db_big_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_print __db_ovref_print@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_print __db_relink_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_print __db_debug_print@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_print __db_noop_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_print __db_pg_alloc_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_print __db_pg_alloc_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_print __db_pg_free_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_print __db_pg_free_print@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_print __db_cksum_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_print __db_pg_freedata_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_print __db_pg_freedata_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_print __db_pg_init_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_print __db_pg_sort_44_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_print __db_pg_trunc_print@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_print __db_realloc_print@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_print __db_relink_print@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_print __db_merge_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_print __db_pgno_print@DB_VERSION_UNIQUE_NAME@
+#define __db_init_print __db_init_print@DB_VERSION_UNIQUE_NAME@
+#define __db_dbbackup_pp __db_dbbackup_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_dbbackup __db_dbbackup@DB_VERSION_UNIQUE_NAME@
+#define __db_backup __db_backup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_close __dbc_close@DB_VERSION_UNIQUE_NAME@
+#define __dbc_destroy __dbc_destroy@DB_VERSION_UNIQUE_NAME@
+#define __dbc_cmp __dbc_cmp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_count __dbc_count@DB_VERSION_UNIQUE_NAME@
+#define __dbc_del __dbc_del@DB_VERSION_UNIQUE_NAME@
+#define __dbc_idel __dbc_idel@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_COMPRESSION
+#define __dbc_bulk_del __dbc_bulk_del@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __dbc_dup __dbc_dup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_idup __dbc_idup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_newopd __dbc_newopd@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get __dbc_get@DB_VERSION_UNIQUE_NAME@
+#define __dbc_iget __dbc_iget@DB_VERSION_UNIQUE_NAME@
+#define __dbc_put __dbc_put@DB_VERSION_UNIQUE_NAME@
+#define __dbc_iput __dbc_iput@DB_VERSION_UNIQUE_NAME@
+#define __db_duperr __db_duperr@DB_VERSION_UNIQUE_NAME@
+#define __dbc_cleanup __dbc_cleanup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_secondary_get_pp __dbc_secondary_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_pget __dbc_pget@DB_VERSION_UNIQUE_NAME@
+#define __dbc_del_primary __dbc_del_primary@DB_VERSION_UNIQUE_NAME@
+#define __db_s_first __db_s_first@DB_VERSION_UNIQUE_NAME@
+#define __db_s_next __db_s_next@DB_VERSION_UNIQUE_NAME@
+#define __db_s_done __db_s_done@DB_VERSION_UNIQUE_NAME@
+#define __db_buildpartial __db_buildpartial@DB_VERSION_UNIQUE_NAME@
+#define __db_partsize __db_partsize@DB_VERSION_UNIQUE_NAME@
+#ifdef DIAGNOSTIC
+#define __db_check_skeyset __db_check_skeyset@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __cdsgroup_begin __cdsgroup_begin@DB_VERSION_UNIQUE_NAME@
+#define __cdsgroup_begin_pp __cdsgroup_begin_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_compact_int __db_compact_int@DB_VERSION_UNIQUE_NAME@
+#define __db_exchange_page __db_exchange_page@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_overflow __db_truncate_overflow@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_root __db_truncate_root@DB_VERSION_UNIQUE_NAME@
+#define __db_find_free __db_find_free@DB_VERSION_UNIQUE_NAME@
+#define __db_relink __db_relink@DB_VERSION_UNIQUE_NAME@
+#define __db_move_metadata __db_move_metadata@DB_VERSION_UNIQUE_NAME@
+#define __db_pgin __db_pgin@DB_VERSION_UNIQUE_NAME@
+#define __db_pgout __db_pgout@DB_VERSION_UNIQUE_NAME@
+#define __db_decrypt_pg __db_decrypt_pg@DB_VERSION_UNIQUE_NAME@
+#define __db_encrypt_and_checksum_pg __db_encrypt_and_checksum_pg@DB_VERSION_UNIQUE_NAME@
+#define __db_metaswap __db_metaswap@DB_VERSION_UNIQUE_NAME@
+#define __db_byteswap __db_byteswap@DB_VERSION_UNIQUE_NAME@
+#define __db_pageswap __db_pageswap@DB_VERSION_UNIQUE_NAME@
+#define __db_recordswap __db_recordswap@DB_VERSION_UNIQUE_NAME@
+#define __db_dispatch __db_dispatch@DB_VERSION_UNIQUE_NAME@
+#define __db_add_recovery __db_add_recovery@DB_VERSION_UNIQUE_NAME@
+#define __db_add_recovery_int __db_add_recovery_int@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_init __db_txnlist_init@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_add __db_txnlist_add@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_remove __db_txnlist_remove@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_ckp __db_txnlist_ckp@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_end __db_txnlist_end@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_find __db_txnlist_find@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_update __db_txnlist_update@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_gen __db_txnlist_gen@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_lsnadd __db_txnlist_lsnadd@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_lsnget __db_txnlist_lsnget@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_lsninit __db_txnlist_lsninit@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_print __db_txnlist_print@DB_VERSION_UNIQUE_NAME@
+#define __db_ditem_nolog __db_ditem_nolog@DB_VERSION_UNIQUE_NAME@
+#define __db_ditem __db_ditem@DB_VERSION_UNIQUE_NAME@
+#define __db_pitem_nolog __db_pitem_nolog@DB_VERSION_UNIQUE_NAME@
+#define __db_pitem __db_pitem@DB_VERSION_UNIQUE_NAME@
+#define __db_associate_pp __db_associate_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_close_pp __db_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_cursor_pp __db_cursor_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_cursor __db_cursor@DB_VERSION_UNIQUE_NAME@
+#define __db_del_pp __db_del_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_exists __db_exists@DB_VERSION_UNIQUE_NAME@
+#define __db_fd_pp __db_fd_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_get_pp __db_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_get __db_get@DB_VERSION_UNIQUE_NAME@
+#define __db_join_pp __db_join_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_key_range_pp __db_key_range_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_open_pp __db_open_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_pget_pp __db_pget_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_pget __db_pget@DB_VERSION_UNIQUE_NAME@
+#define __db_put_pp __db_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_compact_pp __db_compact_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_associate_foreign_pp __db_associate_foreign_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_sync_pp __db_sync_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_close_pp __dbc_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_cmp_pp __dbc_cmp_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_count_pp __dbc_count_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_del_pp __dbc_del_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_dup_pp __dbc_dup_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get_pp __dbc_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get_arg __dbc_get_arg@DB_VERSION_UNIQUE_NAME@
+#define __db_secondary_close_pp __db_secondary_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_pget_pp __dbc_pget_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_put_pp __dbc_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_txn_auto_init __db_txn_auto_init@DB_VERSION_UNIQUE_NAME@
+#define __db_txn_auto_resolve __db_txn_auto_resolve@DB_VERSION_UNIQUE_NAME@
+#define __db_join __db_join@DB_VERSION_UNIQUE_NAME@
+#define __db_join_close __db_join_close@DB_VERSION_UNIQUE_NAME@
+#define __db_secondary_corrupt __db_secondary_corrupt@DB_VERSION_UNIQUE_NAME@
+#define __db_new __db_new@DB_VERSION_UNIQUE_NAME@
+#define __db_free __db_free@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_FTRUNCATE
+#define __db_freelist_pos __db_freelist_pos@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_freelist_sort __db_freelist_sort@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_FTRUNCATE
+#define __db_pg_truncate __db_pg_truncate@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_FTRUNCATE
+#define __db_free_truncate __db_free_truncate@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_lprint __db_lprint@DB_VERSION_UNIQUE_NAME@
+#define __db_lget __db_lget@DB_VERSION_UNIQUE_NAME@
+#ifdef DIAGNOSTIC
+#define __db_haslock __db_haslock@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef DIAGNOSTIC
+#define __db_has_pagelock __db_has_pagelock@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_lput __db_lput@DB_VERSION_UNIQUE_NAME@
+#define __db_create_internal __db_create_internal@DB_VERSION_UNIQUE_NAME@
+#define __dbh_am_chk __dbh_am_chk@DB_VERSION_UNIQUE_NAME@
+#define __db_get_flags __db_get_flags@DB_VERSION_UNIQUE_NAME@
+#define __db_set_flags __db_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __db_get_lorder __db_get_lorder@DB_VERSION_UNIQUE_NAME@
+#define __db_set_lorder __db_set_lorder@DB_VERSION_UNIQUE_NAME@
+#define __db_set_pagesize __db_set_pagesize@DB_VERSION_UNIQUE_NAME@
+#define __db_open __db_open@DB_VERSION_UNIQUE_NAME@
+#define __db_get_open_flags __db_get_open_flags@DB_VERSION_UNIQUE_NAME@
+#define __db_new_file __db_new_file@DB_VERSION_UNIQUE_NAME@
+#define __db_init_subdb __db_init_subdb@DB_VERSION_UNIQUE_NAME@
+#define __db_chk_meta __db_chk_meta@DB_VERSION_UNIQUE_NAME@
+#define __db_meta_setup __db_meta_setup@DB_VERSION_UNIQUE_NAME@
+#define __db_reopen __db_reopen@DB_VERSION_UNIQUE_NAME@
+#define __db_goff __db_goff@DB_VERSION_UNIQUE_NAME@
+#define __db_poff __db_poff@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref __db_ovref@DB_VERSION_UNIQUE_NAME@
+#define __db_doff __db_doff@DB_VERSION_UNIQUE_NAME@
+#define __db_moff __db_moff@DB_VERSION_UNIQUE_NAME@
+#define __db_coff __db_coff@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_overflow __db_vrfy_overflow@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ovfl_structure __db_vrfy_ovfl_structure@DB_VERSION_UNIQUE_NAME@
+#define __db_safe_goff __db_safe_goff@DB_VERSION_UNIQUE_NAME@
+#define __db_loadme __db_loadme@DB_VERSION_UNIQUE_NAME@
+#define __db_dumptree __db_dumptree@DB_VERSION_UNIQUE_NAME@
+#define __db_get_flags_fn __db_get_flags_fn@DB_VERSION_UNIQUE_NAME@
+#define __db_prnpage __db_prnpage@DB_VERSION_UNIQUE_NAME@
+#define __db_prpage __db_prpage@DB_VERSION_UNIQUE_NAME@
+#define __db_lockmode_to_string __db_lockmode_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_prpage_int __db_prpage_int@DB_VERSION_UNIQUE_NAME@
+#define __db_prbytes __db_prbytes@DB_VERSION_UNIQUE_NAME@
+#define __db_prflags __db_prflags@DB_VERSION_UNIQUE_NAME@
+#define __db_name_to_val __db_name_to_val@DB_VERSION_UNIQUE_NAME@
+#define __db_pagetype_to_string __db_pagetype_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_dump_pp __db_dump_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_dump __db_dump@DB_VERSION_UNIQUE_NAME@
+#define __db_prdbt __db_prdbt@DB_VERSION_UNIQUE_NAME@
+#define __db_prheader __db_prheader@DB_VERSION_UNIQUE_NAME@
+#define __db_prfooter __db_prfooter@DB_VERSION_UNIQUE_NAME@
+#define __db_pr_callback __db_pr_callback@DB_VERSION_UNIQUE_NAME@
+#define __db_dbtype_to_string __db_dbtype_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_recover __db_addrem_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_42_recover __db_addrem_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_big_recover __db_big_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_big_42_recover __db_big_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_recover __db_ovref_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_recover __db_debug_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_recover __db_noop_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_recover __db_pg_alloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_recover __db_pg_free_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_recover __db_pg_freedata_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_recover __db_cksum_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_recover __db_pg_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_recover __db_pg_trunc_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_recover __db_realloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_recover __db_pg_sort_44_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_recover __db_pg_alloc_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_recover __db_pg_free_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_recover __db_pg_freedata_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_recover __db_relink_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_recover __db_relink_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_recover __db_merge_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_recover __db_pgno_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pglist_swap __db_pglist_swap@DB_VERSION_UNIQUE_NAME@
+#define __db_pglist_print __db_pglist_print@DB_VERSION_UNIQUE_NAME@
+#define __db_traverse_big __db_traverse_big@DB_VERSION_UNIQUE_NAME@
+#define __db_reclaim_callback __db_reclaim_callback@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_callback __db_truncate_callback@DB_VERSION_UNIQUE_NAME@
+#define __env_dbremove_pp __env_dbremove_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_remove_pp __db_remove_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_remove __db_remove@DB_VERSION_UNIQUE_NAME@
+#define __db_remove_int __db_remove_int@DB_VERSION_UNIQUE_NAME@
+#define __db_inmem_remove __db_inmem_remove@DB_VERSION_UNIQUE_NAME@
+#define __env_dbrename_pp __env_dbrename_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_rename_pp __db_rename_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_rename_int __db_rename_int@DB_VERSION_UNIQUE_NAME@
+#define __db_ret __db_ret@DB_VERSION_UNIQUE_NAME@
+#define __db_retcopy __db_retcopy@DB_VERSION_UNIQUE_NAME@
+#define __env_fileid_reset_pp __env_fileid_reset_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_fileid_reset __env_fileid_reset@DB_VERSION_UNIQUE_NAME@
+#define __env_lsn_reset_pp __env_lsn_reset_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_lsn_reset __db_lsn_reset@DB_VERSION_UNIQUE_NAME@
+#define __db_compare_both __db_compare_both@DB_VERSION_UNIQUE_NAME@
+#define __db_sort_multiple __db_sort_multiple@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_pp __db_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_print_pp __db_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_print __db_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_pp __db_truncate_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate __db_truncate@DB_VERSION_UNIQUE_NAME@
+#define __db_upgrade_pp __db_upgrade_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_upgrade __db_upgrade@DB_VERSION_UNIQUE_NAME@
+#define __db_lastpgno __db_lastpgno@DB_VERSION_UNIQUE_NAME@
+#define __db_31_offdup __db_31_offdup@DB_VERSION_UNIQUE_NAME@
+#define __db_verify_pp __db_verify_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_verify_internal __db_verify_internal@DB_VERSION_UNIQUE_NAME@
+#define __db_verify __db_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_common __db_vrfy_common@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_datapage __db_vrfy_datapage@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_meta __db_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_struct_feedback __db_vrfy_struct_feedback@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_pg __db_salvage_pg@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_leaf __db_salvage_leaf@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_inpitem __db_vrfy_inpitem@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_duptype __db_vrfy_duptype@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_duptree __db_salvage_duptree@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_dbinfo_create __db_vrfy_dbinfo_create@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_dbinfo_destroy __db_vrfy_dbinfo_destroy@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_getpageinfo __db_vrfy_getpageinfo@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_putpageinfo __db_vrfy_putpageinfo@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset __db_vrfy_pgset@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset_get __db_vrfy_pgset_get@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset_inc __db_vrfy_pgset_inc@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset_next __db_vrfy_pgset_next@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_childcursor __db_vrfy_childcursor@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_childput __db_vrfy_childput@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ccset __db_vrfy_ccset@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ccnext __db_vrfy_ccnext@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ccclose __db_vrfy_ccclose@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_init __db_salvage_init@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_destroy __db_salvage_destroy@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_getnext __db_salvage_getnext@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_isdone __db_salvage_isdone@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_markdone __db_salvage_markdone@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_markneeded __db_salvage_markneeded@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_prdbt __db_vrfy_prdbt@DB_VERSION_UNIQUE_NAME@
+#define __partition_init __partition_init@DB_VERSION_UNIQUE_NAME@
+#define __partition_set __partition_set@DB_VERSION_UNIQUE_NAME@
+#define __partition_set_dirs __partition_set_dirs@DB_VERSION_UNIQUE_NAME@
+#define __partition_open __partition_open@DB_VERSION_UNIQUE_NAME@
+#define __partition_get_callback __partition_get_callback@DB_VERSION_UNIQUE_NAME@
+#define __partition_get_keys __partition_get_keys@DB_VERSION_UNIQUE_NAME@
+#define __partition_get_dirs __partition_get_dirs@DB_VERSION_UNIQUE_NAME@
+#define __partc_init __partc_init@DB_VERSION_UNIQUE_NAME@
+#define __partc_get __partc_get@DB_VERSION_UNIQUE_NAME@
+#define __partition_close __partition_close@DB_VERSION_UNIQUE_NAME@
+#define __partition_sync __partition_sync@DB_VERSION_UNIQUE_NAME@
+#define __partition_stat __partition_stat@DB_VERSION_UNIQUE_NAME@
+#define __part_truncate __part_truncate@DB_VERSION_UNIQUE_NAME@
+#define __part_compact __part_compact@DB_VERSION_UNIQUE_NAME@
+#define __part_lsn_reset __part_lsn_reset@DB_VERSION_UNIQUE_NAME@
+#define __part_fileid_reset __part_fileid_reset@DB_VERSION_UNIQUE_NAME@
+#define __part_key_range __part_key_range@DB_VERSION_UNIQUE_NAME@
+#define __part_remove __part_remove@DB_VERSION_UNIQUE_NAME@
+#define __part_rename __part_rename@DB_VERSION_UNIQUE_NAME@
+#define __part_verify __part_verify@DB_VERSION_UNIQUE_NAME@
+#define __part_testdocopy __part_testdocopy@DB_VERSION_UNIQUE_NAME@
+#define __db_no_partition __db_no_partition@DB_VERSION_UNIQUE_NAME@
+#define __bam_compact_int __bam_compact_int@DB_VERSION_UNIQUE_NAME@
+#define __bam_compact_opd __bam_compact_opd@DB_VERSION_UNIQUE_NAME@
+#define __bam_truncate_ipages __bam_truncate_ipages@DB_VERSION_UNIQUE_NAME@
+#define __bam_cmp __bam_cmp@DB_VERSION_UNIQUE_NAME@
+#define __bam_defcmp __bam_defcmp@DB_VERSION_UNIQUE_NAME@
+#define __bam_defpfx __bam_defpfx@DB_VERSION_UNIQUE_NAME@
+#define __bam_compress_dupcmp __bam_compress_dupcmp@DB_VERSION_UNIQUE_NAME@
+#define __bam_defcompress __bam_defcompress@DB_VERSION_UNIQUE_NAME@
+#define __bam_defdecompress __bam_defdecompress@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_get __bamc_compress_get@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_put __bamc_compress_put@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_del __bamc_compress_del@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_bulk_del __bamc_compress_bulk_del@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_count __bamc_compress_count@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_cmp __bamc_compress_cmp@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_dup __bamc_compress_dup@DB_VERSION_UNIQUE_NAME@
+#define __bam_compress_salvage __bam_compress_salvage@DB_VERSION_UNIQUE_NAME@
+#define __bam_compress_count __bam_compress_count@DB_VERSION_UNIQUE_NAME@
+#define __bam_pgin __bam_pgin@DB_VERSION_UNIQUE_NAME@
+#define __bam_pgout __bam_pgout@DB_VERSION_UNIQUE_NAME@
+#define __bam_mswap __bam_mswap@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_delete __bam_ca_delete@DB_VERSION_UNIQUE_NAME@
+#define __ram_ca_delete __ram_ca_delete@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_di __bam_ca_di@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_dup __bam_ca_dup@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_undodup __bam_ca_undodup@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_rsplit __bam_ca_rsplit@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_split __bam_ca_split@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_undosplit __bam_ca_undosplit@DB_VERSION_UNIQUE_NAME@
+#define __bamc_init __bamc_init@DB_VERSION_UNIQUE_NAME@
+#define __bamc_refresh __bamc_refresh@DB_VERSION_UNIQUE_NAME@
+#define __bamc_cmp __bamc_cmp@DB_VERSION_UNIQUE_NAME@
+#define __bamc_count __bamc_count@DB_VERSION_UNIQUE_NAME@
+#define __bamc_dup __bamc_dup@DB_VERSION_UNIQUE_NAME@
+#define __bam_bulk_overflow __bam_bulk_overflow@DB_VERSION_UNIQUE_NAME@
+#define __bam_bulk_duplicates __bam_bulk_duplicates@DB_VERSION_UNIQUE_NAME@
+#define __bamc_rget __bamc_rget@DB_VERSION_UNIQUE_NAME@
+#define __bam_opd_exists __bam_opd_exists@DB_VERSION_UNIQUE_NAME@
+#define __bam_ditem __bam_ditem@DB_VERSION_UNIQUE_NAME@
+#define __bam_adjindx __bam_adjindx@DB_VERSION_UNIQUE_NAME@
+#define __bam_dpages __bam_dpages@DB_VERSION_UNIQUE_NAME@
+#define __bam_pupdate __bam_pupdate@DB_VERSION_UNIQUE_NAME@
+#define __bam_db_create __bam_db_create@DB_VERSION_UNIQUE_NAME@
+#define __bam_db_close __bam_db_close@DB_VERSION_UNIQUE_NAME@
+#define __bam_map_flags __bam_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __bam_set_flags __bam_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __bam_set_bt_compare __bam_set_bt_compare@DB_VERSION_UNIQUE_NAME@
+#define __bam_set_bt_compress __bam_set_bt_compress@DB_VERSION_UNIQUE_NAME@
+#define __bam_get_bt_minkey __bam_get_bt_minkey@DB_VERSION_UNIQUE_NAME@
+#define __bam_copy_config __bam_copy_config@DB_VERSION_UNIQUE_NAME@
+#define __ram_map_flags __ram_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __ram_set_flags __ram_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __ram_get_re_len __ram_get_re_len@DB_VERSION_UNIQUE_NAME@
+#define __ram_get_re_pad __ram_get_re_pad@DB_VERSION_UNIQUE_NAME@
+#define __bam_open __bam_open@DB_VERSION_UNIQUE_NAME@
+#define __bam_metachk __bam_metachk@DB_VERSION_UNIQUE_NAME@
+#define __bam_read_root __bam_read_root@DB_VERSION_UNIQUE_NAME@
+#define __bam_new_file __bam_new_file@DB_VERSION_UNIQUE_NAME@
+#define __bam_new_subdb __bam_new_subdb@DB_VERSION_UNIQUE_NAME@
+#define __bam_iitem __bam_iitem@DB_VERSION_UNIQUE_NAME@
+#define __bam_ritem __bam_ritem@DB_VERSION_UNIQUE_NAME@
+#define __bam_ritem_nolog __bam_ritem_nolog@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep __bam_irep@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_recover __bam_split_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_48_recover __bam_split_48_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_recover __bam_split_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_recover __bam_rsplit_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_recover __bam_adj_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_recover __bam_cadjust_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_recover __bam_cdel_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_recover __bam_repl_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_recover __bam_irep_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_recover __bam_root_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_recover __bam_curadj_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_recover __bam_rcuradj_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_recover __bam_merge_44_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_recover __bam_relink_43_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_reclaim __bam_reclaim@DB_VERSION_UNIQUE_NAME@
+#define __bam_truncate __bam_truncate@DB_VERSION_UNIQUE_NAME@
+#define __ram_open __ram_open@DB_VERSION_UNIQUE_NAME@
+#define __ram_append __ram_append@DB_VERSION_UNIQUE_NAME@
+#define __ramc_del __ramc_del@DB_VERSION_UNIQUE_NAME@
+#define __ramc_get __ramc_get@DB_VERSION_UNIQUE_NAME@
+#define __ramc_put __ramc_put@DB_VERSION_UNIQUE_NAME@
+#define __ram_ca __ram_ca@DB_VERSION_UNIQUE_NAME@
+#define __ram_getno __ram_getno@DB_VERSION_UNIQUE_NAME@
+#define __ram_writeback __ram_writeback@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsearch __bam_rsearch@DB_VERSION_UNIQUE_NAME@
+#define __bam_adjust __bam_adjust@DB_VERSION_UNIQUE_NAME@
+#define __bam_nrecs __bam_nrecs@DB_VERSION_UNIQUE_NAME@
+#define __bam_total __bam_total@DB_VERSION_UNIQUE_NAME@
+#define __bam_get_root __bam_get_root@DB_VERSION_UNIQUE_NAME@
+#define __bam_search __bam_search@DB_VERSION_UNIQUE_NAME@
+#define __bam_stkrel __bam_stkrel@DB_VERSION_UNIQUE_NAME@
+#define __bam_stkgrow __bam_stkgrow@DB_VERSION_UNIQUE_NAME@
+#define __bam_split __bam_split@DB_VERSION_UNIQUE_NAME@
+#define __bam_broot __bam_broot@DB_VERSION_UNIQUE_NAME@
+#define __ram_root __ram_root@DB_VERSION_UNIQUE_NAME@
+#define __bam_pinsert __bam_pinsert@DB_VERSION_UNIQUE_NAME@
+#define __bam_copy __bam_copy@DB_VERSION_UNIQUE_NAME@
+#define __bam_stat __bam_stat@DB_VERSION_UNIQUE_NAME@
+#define __bam_stat_print __bam_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_stat_callback __bam_stat_callback@DB_VERSION_UNIQUE_NAME@
+#define __bam_print_cursor __bam_print_cursor@DB_VERSION_UNIQUE_NAME@
+#define __bam_key_range __bam_key_range@DB_VERSION_UNIQUE_NAME@
+#define __bam_traverse __bam_traverse@DB_VERSION_UNIQUE_NAME@
+#define __bam_30_btreemeta __bam_30_btreemeta@DB_VERSION_UNIQUE_NAME@
+#define __bam_31_btreemeta __bam_31_btreemeta@DB_VERSION_UNIQUE_NAME@
+#define __bam_31_lbtree __bam_31_lbtree@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_meta __bam_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __ram_vrfy_leaf __ram_vrfy_leaf@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy __bam_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_itemorder __bam_vrfy_itemorder@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_structure __bam_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_subtree __bam_vrfy_subtree@DB_VERSION_UNIQUE_NAME@
+#define __bam_salvage __bam_salvage@DB_VERSION_UNIQUE_NAME@
+#define __bam_salvage_walkdupint __bam_salvage_walkdupint@DB_VERSION_UNIQUE_NAME@
+#define __bam_meta2pgset __bam_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_desc __bam_split_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_48_desc __bam_split_48_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_desc __bam_split_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_desc __bam_rsplit_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_desc __bam_adj_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_desc __bam_cadjust_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_desc __bam_cdel_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_desc __bam_repl_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_desc __bam_irep_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_desc __bam_root_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_desc __bam_curadj_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_desc __bam_rcuradj_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_desc __bam_relink_43_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_desc __bam_merge_44_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_init_recover __bam_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_print __bam_split_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_48_print __bam_split_48_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_print __bam_split_42_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_print __bam_rsplit_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_print __bam_adj_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_print __bam_cadjust_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_print __bam_cdel_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_print __bam_repl_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_print __bam_irep_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_print __bam_root_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_print __bam_curadj_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_print __bam_rcuradj_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_print __bam_relink_43_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_print __bam_merge_44_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_init_print __bam_init_print@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_ATOI
+#define atoi atoi@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ATOL
+#define atol atol@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_BSEARCH
+#define bsearch bsearch@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_GETCWD
+#define getcwd getcwd@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_GETOPT
+#define getopt getopt@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISALPHA
+#define isalpha isalpha@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISDIGIT
+#define isdigit isdigit@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISPRINT
+#define isprint isprint@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISSPACE
+#define isspace isspace@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_MEMCMP
+#define memcmp memcmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_MEMCPY
+#define memcpy memcpy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_MEMMOVE
+#define memmove memmove@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_PRINTF
+#define printf printf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_PRINTF
+#define fprintf fprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_PRINTF
+#define vfprintf vfprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_QSORT
+#define qsort qsort@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_RAISE
+#define raise raise@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_RAND
+#define rand rand@DB_VERSION_UNIQUE_NAME@
+#define srand srand@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_SNPRINTF
+#define snprintf snprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_VSNPRINTF
+#define vsnprintf vsnprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCASECMP
+#define strcasecmp strcasecmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCASECMP
+#define strncasecmp strncasecmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCAT
+#define strcat strcat@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCHR
+#define strchr strchr@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRDUP
+#define strdup strdup@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRERROR
+#define strerror strerror@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRNCAT
+#define strncat strncat@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRNCMP
+#define strncmp strncmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRRCHR
+#define strrchr strrchr@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRSEP
+#define strsep strsep@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRTOL
+#define strtol strtol@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRTOUL
+#define strtoul strtoul@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_TIME
+#define time time@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __clock_set_expires __clock_set_expires@DB_VERSION_UNIQUE_NAME@
+#define __clock_expired __clock_expired@DB_VERSION_UNIQUE_NAME@
+#define __crypto_region_init __crypto_region_init@DB_VERSION_UNIQUE_NAME@
+#define __db_isbigendian __db_isbigendian@DB_VERSION_UNIQUE_NAME@
+#define __db_byteorder __db_byteorder@DB_VERSION_UNIQUE_NAME@
+#define __db_compress_count_int __db_compress_count_int@DB_VERSION_UNIQUE_NAME@
+#define __db_compress_int __db_compress_int@DB_VERSION_UNIQUE_NAME@
+#define __db_decompress_count_int __db_decompress_count_int@DB_VERSION_UNIQUE_NAME@
+#define __db_decompress_int __db_decompress_int@DB_VERSION_UNIQUE_NAME@
+#define __db_decompress_int32 __db_decompress_int32@DB_VERSION_UNIQUE_NAME@
+#define __db_fchk __db_fchk@DB_VERSION_UNIQUE_NAME@
+#define __db_fcchk __db_fcchk@DB_VERSION_UNIQUE_NAME@
+#define __db_ferr __db_ferr@DB_VERSION_UNIQUE_NAME@
+#define __db_fnl __db_fnl@DB_VERSION_UNIQUE_NAME@
+#define __db_pgerr __db_pgerr@DB_VERSION_UNIQUE_NAME@
+#define __db_pgfmt __db_pgfmt@DB_VERSION_UNIQUE_NAME@
+#ifdef DIAGNOSTIC
+#define __db_assert __db_assert@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __env_panic_msg __env_panic_msg@DB_VERSION_UNIQUE_NAME@
+#define __env_panic __env_panic@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_error __db_unknown_error@DB_VERSION_UNIQUE_NAME@
+#define __db_syserr __db_syserr@DB_VERSION_UNIQUE_NAME@
+#define __db_err __db_err@DB_VERSION_UNIQUE_NAME@
+#define __db_errx __db_errx@DB_VERSION_UNIQUE_NAME@
+#define __db_errcall __db_errcall@DB_VERSION_UNIQUE_NAME@
+#define __db_errfile __db_errfile@DB_VERSION_UNIQUE_NAME@
+#define __db_msgadd __db_msgadd@DB_VERSION_UNIQUE_NAME@
+#define __db_msgadd_ap __db_msgadd_ap@DB_VERSION_UNIQUE_NAME@
+#define __db_msg __db_msg@DB_VERSION_UNIQUE_NAME@
+#define __db_repmsg __db_repmsg@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_flag __db_unknown_flag@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_type __db_unknown_type@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_path __db_unknown_path@DB_VERSION_UNIQUE_NAME@
+#define __db_check_txn __db_check_txn@DB_VERSION_UNIQUE_NAME@
+#define __db_txn_deadlock_err __db_txn_deadlock_err@DB_VERSION_UNIQUE_NAME@
+#define __db_not_txn_env __db_not_txn_env@DB_VERSION_UNIQUE_NAME@
+#define __db_rec_toobig __db_rec_toobig@DB_VERSION_UNIQUE_NAME@
+#define __db_rec_repl __db_rec_repl@DB_VERSION_UNIQUE_NAME@
+#define __dbc_logging __dbc_logging@DB_VERSION_UNIQUE_NAME@
+#define __db_check_lsn __db_check_lsn@DB_VERSION_UNIQUE_NAME@
+#define __db_rdonly __db_rdonly@DB_VERSION_UNIQUE_NAME@
+#define __db_space_err __db_space_err@DB_VERSION_UNIQUE_NAME@
+#define __db_failed __db_failed@DB_VERSION_UNIQUE_NAME@
+#define __db_getlong __db_getlong@DB_VERSION_UNIQUE_NAME@
+#define __db_getulong __db_getulong@DB_VERSION_UNIQUE_NAME@
+#define __db_idspace __db_idspace@DB_VERSION_UNIQUE_NAME@
+#define __db_log2 __db_log2@DB_VERSION_UNIQUE_NAME@
+#define __db_tablesize __db_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __db_hashinit __db_hashinit@DB_VERSION_UNIQUE_NAME@
+#define __dbt_usercopy __dbt_usercopy@DB_VERSION_UNIQUE_NAME@
+#define __dbt_userfree __dbt_userfree@DB_VERSION_UNIQUE_NAME@
+#define __db_mkpath __db_mkpath@DB_VERSION_UNIQUE_NAME@
+#define __db_openflags __db_openflags@DB_VERSION_UNIQUE_NAME@
+#define __db_util_arg __db_util_arg@DB_VERSION_UNIQUE_NAME@
+#define __db_util_cache __db_util_cache@DB_VERSION_UNIQUE_NAME@
+#define __db_util_logset __db_util_logset@DB_VERSION_UNIQUE_NAME@
+#define __db_util_siginit __db_util_siginit@DB_VERSION_UNIQUE_NAME@
+#define __db_util_interrupted __db_util_interrupted@DB_VERSION_UNIQUE_NAME@
+#define __db_util_sigresend __db_util_sigresend@DB_VERSION_UNIQUE_NAME@
+#define __db_zero_fill __db_zero_fill@DB_VERSION_UNIQUE_NAME@
+#define __db_zero_extend __db_zero_extend@DB_VERSION_UNIQUE_NAME@
+#define __aes_setup __aes_setup@DB_VERSION_UNIQUE_NAME@
+#define __aes_adj_size __aes_adj_size@DB_VERSION_UNIQUE_NAME@
+#define __aes_close __aes_close@DB_VERSION_UNIQUE_NAME@
+#define __aes_decrypt __aes_decrypt@DB_VERSION_UNIQUE_NAME@
+#define __aes_encrypt __aes_encrypt@DB_VERSION_UNIQUE_NAME@
+#define __aes_init __aes_init@DB_VERSION_UNIQUE_NAME@
+#define __crypto_env_close __crypto_env_close@DB_VERSION_UNIQUE_NAME@
+#define __crypto_env_refresh __crypto_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __crypto_algsetup __crypto_algsetup@DB_VERSION_UNIQUE_NAME@
+#define __crypto_decrypt_meta __crypto_decrypt_meta@DB_VERSION_UNIQUE_NAME@
+#define __crypto_set_passwd __crypto_set_passwd@DB_VERSION_UNIQUE_NAME@
+#define __db_generate_iv __db_generate_iv@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelKeySetupEnc __db_rijndaelKeySetupEnc@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelKeySetupDec __db_rijndaelKeySetupDec@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelEncrypt __db_rijndaelEncrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelDecrypt __db_rijndaelDecrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelEncryptRound __db_rijndaelEncryptRound@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelDecryptRound __db_rijndaelDecryptRound@DB_VERSION_UNIQUE_NAME@
+#define __db_makeKey __db_makeKey@DB_VERSION_UNIQUE_NAME@
+#define __db_cipherInit __db_cipherInit@DB_VERSION_UNIQUE_NAME@
+#define __db_blockEncrypt __db_blockEncrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_padEncrypt __db_padEncrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_blockDecrypt __db_blockDecrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_padDecrypt __db_padDecrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_cipherUpdateRounds __db_cipherUpdateRounds@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_setup __dbreg_setup@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_teardown __dbreg_teardown@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_teardown_int __dbreg_teardown_int@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_new_id __dbreg_new_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_get_id __dbreg_get_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_assign_id __dbreg_assign_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_revoke_id __dbreg_revoke_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_revoke_id_int __dbreg_revoke_id_int@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_id __dbreg_close_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_id_int __dbreg_close_id_int@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_failchk __dbreg_failchk@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_close __dbreg_log_close@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_id __dbreg_log_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_desc __dbreg_register_desc@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_init_recover __dbreg_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_print __dbreg_register_print@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_init_print __dbreg_init_print@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_recover __dbreg_register_recover@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_stat_print __dbreg_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_print_fname __dbreg_print_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_add_dbentry __dbreg_add_dbentry@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_rem_dbentry __dbreg_rem_dbentry@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_files __dbreg_log_files@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_nofiles __dbreg_log_nofiles@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_files __dbreg_close_files@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_file __dbreg_close_file@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_mark_restored __dbreg_mark_restored@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_invalidate_files __dbreg_invalidate_files@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_id_to_db __dbreg_id_to_db@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_id_to_fname __dbreg_id_to_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_fid_to_fname __dbreg_fid_to_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_get_name __dbreg_get_name@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_do_open __dbreg_do_open@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_lazy_id __dbreg_lazy_id@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_init __env_alloc_init@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_overhead __env_alloc_overhead@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_size __env_alloc_size@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc __env_alloc@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_free __env_alloc_free@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_extend __env_alloc_extend@DB_VERSION_UNIQUE_NAME@
+#define __env_region_extend __env_region_extend@DB_VERSION_UNIQUE_NAME@
+#define __env_elem_size __env_elem_size@DB_VERSION_UNIQUE_NAME@
+#define __env_get_chunk __env_get_chunk@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_print __env_alloc_print@DB_VERSION_UNIQUE_NAME@
+#define __env_get_backup_config __env_get_backup_config@DB_VERSION_UNIQUE_NAME@
+#define __env_set_backup_config __env_set_backup_config@DB_VERSION_UNIQUE_NAME@
+#define __env_get_backup_callbacks __env_get_backup_callbacks@DB_VERSION_UNIQUE_NAME@
+#define __env_set_backup_callbacks __env_set_backup_callbacks@DB_VERSION_UNIQUE_NAME@
+#define __env_read_db_config __env_read_db_config@DB_VERSION_UNIQUE_NAME@
+#define __env_failchk_pp __env_failchk_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_failchk_int __env_failchk_int@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_size __env_thread_size@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_max __env_thread_max@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_init __env_thread_init@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_destroy __env_thread_destroy@DB_VERSION_UNIQUE_NAME@
+#define __env_set_state __env_set_state@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_id_string __env_thread_id_string@DB_VERSION_UNIQUE_NAME@
+#define __db_file_extend __db_file_extend@DB_VERSION_UNIQUE_NAME@
+#define __db_file_multi_write __db_file_multi_write@DB_VERSION_UNIQUE_NAME@
+#define __db_file_write __db_file_write@DB_VERSION_UNIQUE_NAME@
+#define __db_env_destroy __db_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __env_get_alloc __env_get_alloc@DB_VERSION_UNIQUE_NAME@
+#define __env_set_alloc __env_set_alloc@DB_VERSION_UNIQUE_NAME@
+#define __env_get_memory_init __env_get_memory_init@DB_VERSION_UNIQUE_NAME@
+#define __env_set_memory_init __env_set_memory_init@DB_VERSION_UNIQUE_NAME@
+#define __env_get_memory_max __env_get_memory_max@DB_VERSION_UNIQUE_NAME@
+#define __env_set_memory_max __env_set_memory_max@DB_VERSION_UNIQUE_NAME@
+#define __env_get_encrypt_flags __env_get_encrypt_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_set_encrypt __env_set_encrypt@DB_VERSION_UNIQUE_NAME@
+#define __env_map_flags __env_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_fetch_flags __env_fetch_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_set_flags __env_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_set_backup __env_set_backup@DB_VERSION_UNIQUE_NAME@
+#define __env_set_data_dir __env_set_data_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_add_data_dir __env_add_data_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_create_dir __env_set_create_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_metadata_dir __env_set_metadata_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_data_len __env_set_data_len@DB_VERSION_UNIQUE_NAME@
+#define __env_set_intermediate_dir_mode __env_set_intermediate_dir_mode@DB_VERSION_UNIQUE_NAME@
+#define __env_get_errcall __env_get_errcall@DB_VERSION_UNIQUE_NAME@
+#define __env_set_errcall __env_set_errcall@DB_VERSION_UNIQUE_NAME@
+#define __env_get_errfile __env_get_errfile@DB_VERSION_UNIQUE_NAME@
+#define __env_set_errfile __env_set_errfile@DB_VERSION_UNIQUE_NAME@
+#define __env_get_errpfx __env_get_errpfx@DB_VERSION_UNIQUE_NAME@
+#define __env_set_errpfx __env_set_errpfx@DB_VERSION_UNIQUE_NAME@
+#define __env_set_thread_count __env_set_thread_count@DB_VERSION_UNIQUE_NAME@
+#define __env_get_msgcall __env_get_msgcall@DB_VERSION_UNIQUE_NAME@
+#define __env_set_msgcall __env_set_msgcall@DB_VERSION_UNIQUE_NAME@
+#define __env_get_msgfile __env_get_msgfile@DB_VERSION_UNIQUE_NAME@
+#define __env_set_msgfile __env_set_msgfile@DB_VERSION_UNIQUE_NAME@
+#define __env_set_paniccall __env_set_paniccall@DB_VERSION_UNIQUE_NAME@
+#define __env_set_shm_key __env_set_shm_key@DB_VERSION_UNIQUE_NAME@
+#define __env_set_tmp_dir __env_set_tmp_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_verbose __env_set_verbose@DB_VERSION_UNIQUE_NAME@
+#define __db_mi_env __db_mi_env@DB_VERSION_UNIQUE_NAME@
+#define __db_mi_open __db_mi_open@DB_VERSION_UNIQUE_NAME@
+#define __env_not_config __env_not_config@DB_VERSION_UNIQUE_NAME@
+#define __env_set_timeout __env_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __db_appname __db_appname@DB_VERSION_UNIQUE_NAME@
+#define __db_tmp_open __db_tmp_open@DB_VERSION_UNIQUE_NAME@
+#define __env_open_pp __env_open_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_open __env_open@DB_VERSION_UNIQUE_NAME@
+#define __env_remove __env_remove@DB_VERSION_UNIQUE_NAME@
+#define __env_config __env_config@DB_VERSION_UNIQUE_NAME@
+#define __env_close_pp __env_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_close __env_close@DB_VERSION_UNIQUE_NAME@
+#define __env_refresh __env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __env_get_open_flags __env_get_open_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_attach_regions __env_attach_regions@DB_VERSION_UNIQUE_NAME@
+#define __db_apprec __db_apprec@DB_VERSION_UNIQUE_NAME@
+#define __env_openfiles __env_openfiles@DB_VERSION_UNIQUE_NAME@
+#define __env_init_rec __env_init_rec@DB_VERSION_UNIQUE_NAME@
+#define __env_attach __env_attach@DB_VERSION_UNIQUE_NAME@
+#define __env_turn_on __env_turn_on@DB_VERSION_UNIQUE_NAME@
+#define __env_turn_off __env_turn_off@DB_VERSION_UNIQUE_NAME@
+#define __env_panic_set __env_panic_set@DB_VERSION_UNIQUE_NAME@
+#define __env_ref_increment __env_ref_increment@DB_VERSION_UNIQUE_NAME@
+#define __env_ref_decrement __env_ref_decrement@DB_VERSION_UNIQUE_NAME@
+#define __env_ref_get __env_ref_get@DB_VERSION_UNIQUE_NAME@
+#define __env_detach __env_detach@DB_VERSION_UNIQUE_NAME@
+#define __env_remove_env __env_remove_env@DB_VERSION_UNIQUE_NAME@
+#define __env_region_attach __env_region_attach@DB_VERSION_UNIQUE_NAME@
+#define __env_region_share __env_region_share@DB_VERSION_UNIQUE_NAME@
+#define __env_region_detach __env_region_detach@DB_VERSION_UNIQUE_NAME@
+#define __envreg_register __envreg_register@DB_VERSION_UNIQUE_NAME@
+#define __envreg_unregister __envreg_unregister@DB_VERSION_UNIQUE_NAME@
+#define __envreg_xunlock __envreg_xunlock@DB_VERSION_UNIQUE_NAME@
+#define __envreg_isalive __envreg_isalive@DB_VERSION_UNIQUE_NAME@
+#define __env_struct_sig __env_struct_sig@DB_VERSION_UNIQUE_NAME@
+#define __env_stat_print_pp __env_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_print_fh __db_print_fh@DB_VERSION_UNIQUE_NAME@
+#define __db_print_fileid __db_print_fileid@DB_VERSION_UNIQUE_NAME@
+#define __db_dl __db_dl@DB_VERSION_UNIQUE_NAME@
+#define __db_dl_pct __db_dl_pct@DB_VERSION_UNIQUE_NAME@
+#define __db_dlbytes __db_dlbytes@DB_VERSION_UNIQUE_NAME@
+#define __db_print_reginfo __db_print_reginfo@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_not_built __db_stat_not_built@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_by_eid __repmgr_site_by_eid@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_handle_event __repmgr_handle_event@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_channel __repmgr_channel@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_msg_dispatch __repmgr_set_msg_dispatch@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __fop_create_42_desc __fop_create_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_desc __fop_create_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_desc __fop_remove_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_desc __fop_write_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_desc __fop_write_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_desc __fop_rename_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_46_desc __fop_rename_noundo_46_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_desc __fop_rename_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_desc __fop_rename_noundo_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_desc __fop_file_remove_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_init_recover __fop_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_42_print __fop_create_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_print __fop_create_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_print __fop_remove_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_print __fop_write_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_print __fop_write_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_print __fop_rename_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_print __fop_rename_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_print __fop_file_remove_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_init_print __fop_init_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_create __fop_create@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove __fop_remove@DB_VERSION_UNIQUE_NAME@
+#define __fop_write __fop_write@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename __fop_rename@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_recover __fop_create_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_42_recover __fop_create_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_recover __fop_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_recover __fop_write_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_recover __fop_write_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_recover __fop_rename_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_recover __fop_rename_noundo_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_recover __fop_rename_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_46_recover __fop_rename_noundo_46_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_recover __fop_file_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_lock_handle __fop_lock_handle@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_setup __fop_file_setup@DB_VERSION_UNIQUE_NAME@
+#define __fop_subdb_setup __fop_subdb_setup@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_setup __fop_remove_setup@DB_VERSION_UNIQUE_NAME@
+#define __fop_read_meta __fop_read_meta@DB_VERSION_UNIQUE_NAME@
+#define __fop_dummy __fop_dummy@DB_VERSION_UNIQUE_NAME@
+#define __fop_dbrename __fop_dbrename@DB_VERSION_UNIQUE_NAME@
+#define __ham_quick_delete __ham_quick_delete@DB_VERSION_UNIQUE_NAME@
+#define __hamc_init __hamc_init@DB_VERSION_UNIQUE_NAME@
+#define __hamc_count __hamc_count@DB_VERSION_UNIQUE_NAME@
+#define __hamc_cmp __hamc_cmp@DB_VERSION_UNIQUE_NAME@
+#define __hamc_dup __hamc_dup@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_table __ham_contract_table@DB_VERSION_UNIQUE_NAME@
+#define __ham_call_hash __ham_call_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_overwrite __ham_overwrite@DB_VERSION_UNIQUE_NAME@
+#define __ham_lookup __ham_lookup@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_dbt __ham_init_dbt@DB_VERSION_UNIQUE_NAME@
+#define __hamc_update __hamc_update@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_clist __ham_get_clist@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_desc __ham_insdel_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_42_desc __ham_insdel_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_desc __ham_newpage_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_desc __ham_splitdata_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_desc __ham_replace_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_42_desc __ham_replace_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_desc __ham_copypage_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_desc __ham_metagroup_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_desc __ham_metagroup_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_desc __ham_groupalloc_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_desc __ham_groupalloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_desc __ham_changeslot_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_desc __ham_contract_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_desc __ham_curadj_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_desc __ham_chgpg_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_recover __ham_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_print __ham_insdel_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_42_print __ham_insdel_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_print __ham_newpage_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_print __ham_splitdata_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_print __ham_replace_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_42_print __ham_replace_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_print __ham_copypage_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_print __ham_metagroup_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_print __ham_metagroup_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_print __ham_groupalloc_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_print __ham_groupalloc_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_print __ham_changeslot_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_print __ham_contract_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_print __ham_curadj_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_print __ham_chgpg_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_print __ham_init_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_compact_int __ham_compact_int@DB_VERSION_UNIQUE_NAME@
+#define __ham_compact_bucket __ham_compact_bucket@DB_VERSION_UNIQUE_NAME@
+#define __ham_compact_hash __ham_compact_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_pgin __ham_pgin@DB_VERSION_UNIQUE_NAME@
+#define __ham_pgout __ham_pgout@DB_VERSION_UNIQUE_NAME@
+#define __ham_mswap __ham_mswap@DB_VERSION_UNIQUE_NAME@
+#define __ham_add_dup __ham_add_dup@DB_VERSION_UNIQUE_NAME@
+#define __ham_dup_convert __ham_dup_convert@DB_VERSION_UNIQUE_NAME@
+#define __ham_make_dup __ham_make_dup@DB_VERSION_UNIQUE_NAME@
+#define __ham_dsearch __ham_dsearch@DB_VERSION_UNIQUE_NAME@
+#define __ham_func2 __ham_func2@DB_VERSION_UNIQUE_NAME@
+#define __ham_func3 __ham_func3@DB_VERSION_UNIQUE_NAME@
+#define __ham_func4 __ham_func4@DB_VERSION_UNIQUE_NAME@
+#define __ham_func5 __ham_func5@DB_VERSION_UNIQUE_NAME@
+#define __ham_test __ham_test@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_meta __ham_get_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_release_meta __ham_release_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_dirty_meta __ham_dirty_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_return_meta __ham_return_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_db_create __ham_db_create@DB_VERSION_UNIQUE_NAME@
+#define __ham_db_close __ham_db_close@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_h_ffactor __ham_get_h_ffactor@DB_VERSION_UNIQUE_NAME@
+#define __ham_set_h_compare __ham_set_h_compare@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_h_nelem __ham_get_h_nelem@DB_VERSION_UNIQUE_NAME@
+#define __ham_copy_config __ham_copy_config@DB_VERSION_UNIQUE_NAME@
+#define __ham_open __ham_open@DB_VERSION_UNIQUE_NAME@
+#define __ham_metachk __ham_metachk@DB_VERSION_UNIQUE_NAME@
+#define __ham_new_file __ham_new_file@DB_VERSION_UNIQUE_NAME@
+#define __ham_new_subdb __ham_new_subdb@DB_VERSION_UNIQUE_NAME@
+#define __ham_item __ham_item@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_reset __ham_item_reset@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_init __ham_item_init@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_last __ham_item_last@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_first __ham_item_first@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_prev __ham_item_prev@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_next __ham_item_next@DB_VERSION_UNIQUE_NAME@
+#define __ham_insertpair __ham_insertpair@DB_VERSION_UNIQUE_NAME@
+#define __ham_getindex __ham_getindex@DB_VERSION_UNIQUE_NAME@
+#define __ham_verify_sorted_page __ham_verify_sorted_page@DB_VERSION_UNIQUE_NAME@
+#define __ham_sort_page_cursor __ham_sort_page_cursor@DB_VERSION_UNIQUE_NAME@
+#define __ham_sort_page __ham_sort_page@DB_VERSION_UNIQUE_NAME@
+#define __ham_del_pair __ham_del_pair@DB_VERSION_UNIQUE_NAME@
+#define __ham_replpair __ham_replpair@DB_VERSION_UNIQUE_NAME@
+#define __ham_onpage_replace __ham_onpage_replace@DB_VERSION_UNIQUE_NAME@
+#define __ham_merge_pages __ham_merge_pages@DB_VERSION_UNIQUE_NAME@
+#define __ham_split_page __ham_split_page@DB_VERSION_UNIQUE_NAME@
+#define __ham_add_el __ham_add_el@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypair __ham_copypair@DB_VERSION_UNIQUE_NAME@
+#define __ham_add_ovflpage __ham_add_ovflpage@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_cpage __ham_get_cpage@DB_VERSION_UNIQUE_NAME@
+#define __ham_next_cpage __ham_next_cpage@DB_VERSION_UNIQUE_NAME@
+#define __ham_lock_bucket __ham_lock_bucket@DB_VERSION_UNIQUE_NAME@
+#define __ham_dpair __ham_dpair@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_recover __ham_insdel_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_42_recover __ham_insdel_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_recover __ham_newpage_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_recover __ham_replace_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_42_recover __ham_replace_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_recover __ham_splitdata_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_recover __ham_copypage_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_recover __ham_metagroup_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_recover __ham_contract_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_recover __ham_groupalloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_recover __ham_changeslot_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_recover __ham_curadj_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_recover __ham_chgpg_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_recover __ham_metagroup_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_recover __ham_groupalloc_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_reclaim __ham_reclaim@DB_VERSION_UNIQUE_NAME@
+#define __ham_truncate __ham_truncate@DB_VERSION_UNIQUE_NAME@
+#define __ham_stat __ham_stat@DB_VERSION_UNIQUE_NAME@
+#define __ham_stat_print __ham_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_print_cursor __ham_print_cursor@DB_VERSION_UNIQUE_NAME@
+#define __ham_traverse __ham_traverse@DB_VERSION_UNIQUE_NAME@
+#define __db_no_hash_am __db_no_hash_am@DB_VERSION_UNIQUE_NAME@
+#define __ham_30_hashmeta __ham_30_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_30_sizefix __ham_30_sizefix@DB_VERSION_UNIQUE_NAME@
+#define __ham_31_hashmeta __ham_31_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_31_hash __ham_31_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_46_hashmeta __ham_46_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_46_hash __ham_46_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy_meta __ham_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy __ham_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy_structure __ham_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy_hashing __ham_vrfy_hashing@DB_VERSION_UNIQUE_NAME@
+#define __ham_salvage __ham_salvage@DB_VERSION_UNIQUE_NAME@
+#define __ham_meta2pgset __ham_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __heapc_init __heapc_init@DB_VERSION_UNIQUE_NAME@
+#define __heap_ditem __heap_ditem@DB_VERSION_UNIQUE_NAME@
+#define __heap_append __heap_append@DB_VERSION_UNIQUE_NAME@
+#define __heap_pitem __heap_pitem@DB_VERSION_UNIQUE_NAME@
+#define __heapc_dup __heapc_dup@DB_VERSION_UNIQUE_NAME@
+#define __heapc_gsplit __heapc_gsplit@DB_VERSION_UNIQUE_NAME@
+#define __heapc_refresh __heapc_refresh@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_desc __heap_addrem_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_desc __heap_pg_alloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_desc __heap_trunc_meta_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_desc __heap_trunc_page_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_init_recover __heap_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_print __heap_addrem_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_print __heap_pg_alloc_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_print __heap_trunc_meta_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_print __heap_trunc_page_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_init_print __heap_init_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_backup __heap_backup@DB_VERSION_UNIQUE_NAME@
+#define __heap_pgin __heap_pgin@DB_VERSION_UNIQUE_NAME@
+#define __heap_pgout __heap_pgout@DB_VERSION_UNIQUE_NAME@
+#define __heap_mswap __heap_mswap@DB_VERSION_UNIQUE_NAME@
+#define __heap_db_create __heap_db_create@DB_VERSION_UNIQUE_NAME@
+#define __heap_db_close __heap_db_close@DB_VERSION_UNIQUE_NAME@
+#define __heap_get_heapsize __heap_get_heapsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_get_heap_regionsize __heap_get_heap_regionsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_set_heapsize __heap_set_heapsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_set_heap_regionsize __heap_set_heap_regionsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_exist __heap_exist@DB_VERSION_UNIQUE_NAME@
+#define __heap_open __heap_open@DB_VERSION_UNIQUE_NAME@
+#define __heap_metachk __heap_metachk@DB_VERSION_UNIQUE_NAME@
+#define __heap_read_meta __heap_read_meta@DB_VERSION_UNIQUE_NAME@
+#define __heap_new_file __heap_new_file@DB_VERSION_UNIQUE_NAME@
+#define __heap_create_region __heap_create_region@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_recover __heap_addrem_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_recover __heap_pg_alloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_recover __heap_trunc_meta_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_recover __heap_trunc_page_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_truncate __heap_truncate@DB_VERSION_UNIQUE_NAME@
+#define __heap_stat __heap_stat@DB_VERSION_UNIQUE_NAME@
+#define __heap_stat_print __heap_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_print_cursor __heap_print_cursor@DB_VERSION_UNIQUE_NAME@
+#define __heap_stat_callback __heap_stat_callback@DB_VERSION_UNIQUE_NAME@
+#define __heap_traverse __heap_traverse@DB_VERSION_UNIQUE_NAME@
+#define __db_no_heap_am __db_no_heap_am@DB_VERSION_UNIQUE_NAME@
+#define __heap_vrfy_meta __heap_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __heap_vrfy __heap_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __heap_vrfy_structure __heap_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __heap_salvage __heap_salvage@DB_VERSION_UNIQUE_NAME@
+#define __heap_meta2pgset __heap_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __db_chksum __db_chksum@DB_VERSION_UNIQUE_NAME@
+#define __db_derive_mac __db_derive_mac@DB_VERSION_UNIQUE_NAME@
+#define __db_check_chksum __db_check_chksum@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Transform __db_SHA1Transform@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Init __db_SHA1Init@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Update __db_SHA1Update@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Final __db_SHA1Final@DB_VERSION_UNIQUE_NAME@
+#define __lock_vec_pp __lock_vec_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_vec __lock_vec@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_pp __lock_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_get __lock_get@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_internal __lock_get_internal@DB_VERSION_UNIQUE_NAME@
+#define __lock_put_pp __lock_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_put __lock_put@DB_VERSION_UNIQUE_NAME@
+#define __lock_downgrade __lock_downgrade@DB_VERSION_UNIQUE_NAME@
+#define __lock_locker_same_family __lock_locker_same_family@DB_VERSION_UNIQUE_NAME@
+#define __lock_wakeup __lock_wakeup@DB_VERSION_UNIQUE_NAME@
+#define __lock_promote __lock_promote@DB_VERSION_UNIQUE_NAME@
+#define __lock_change __lock_change@DB_VERSION_UNIQUE_NAME@
+#define __lock_detect_pp __lock_detect_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_detect __lock_detect@DB_VERSION_UNIQUE_NAME@
+#define __lock_failchk __lock_failchk@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_pp __lock_id_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_id __lock_id@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_thread_id __lock_set_thread_id@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_free_pp __lock_id_free_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_free __lock_id_free@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_set __lock_id_set@DB_VERSION_UNIQUE_NAME@
+#define __lock_getlocker __lock_getlocker@DB_VERSION_UNIQUE_NAME@
+#define __lock_getlocker_int __lock_getlocker_int@DB_VERSION_UNIQUE_NAME@
+#define __lock_addfamilylocker __lock_addfamilylocker@DB_VERSION_UNIQUE_NAME@
+#define __lock_freelocker __lock_freelocker@DB_VERSION_UNIQUE_NAME@
+#define __lock_familyremove __lock_familyremove@DB_VERSION_UNIQUE_NAME@
+#define __lock_fix_list __lock_fix_list@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_list __lock_get_list@DB_VERSION_UNIQUE_NAME@
+#define __lock_list_print __lock_list_print@DB_VERSION_UNIQUE_NAME@
+#define __lock_env_create __lock_env_create@DB_VERSION_UNIQUE_NAME@
+#define __lock_env_destroy __lock_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_conflicts __lock_get_lk_conflicts@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_conflicts __lock_set_lk_conflicts@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_detect __lock_get_lk_detect@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_detect __lock_set_lk_detect@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_max_locks __lock_get_lk_max_locks@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_max_locks __lock_set_lk_max_locks@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_max_lockers __lock_get_lk_max_lockers@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_max_lockers __lock_set_lk_max_lockers@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_max_objects __lock_get_lk_max_objects@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_max_objects __lock_set_lk_max_objects@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_partitions __lock_get_lk_partitions@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_partitions __lock_set_lk_partitions@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_tablesize __lock_get_lk_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_tablesize __lock_set_lk_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_priority __lock_set_lk_priority@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_priority __lock_get_lk_priority@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_env_timeout __lock_get_env_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_env_timeout __lock_set_env_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_open __lock_open@DB_VERSION_UNIQUE_NAME@
+#define __lock_env_refresh __lock_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_mutex_count __lock_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_mutex_max __lock_region_mutex_max@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_max __lock_region_max@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_size __lock_region_size@DB_VERSION_UNIQUE_NAME@
+#define __lock_stat_pp __lock_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_stat_print_pp __lock_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_stat_print __lock_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __lock_printlock __lock_printlock@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_timeout __lock_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_timeout_internal __lock_set_timeout_internal@DB_VERSION_UNIQUE_NAME@
+#define __lock_inherit_timeout __lock_inherit_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_ohash __lock_ohash@DB_VERSION_UNIQUE_NAME@
+#define __lock_lhash __lock_lhash@DB_VERSION_UNIQUE_NAME@
+#define __lock_nomem __lock_nomem@DB_VERSION_UNIQUE_NAME@
+#define __log_open __log_open@DB_VERSION_UNIQUE_NAME@
+#define __log_find __log_find@DB_VERSION_UNIQUE_NAME@
+#define __log_valid __log_valid@DB_VERSION_UNIQUE_NAME@
+#define __log_env_refresh __log_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __log_get_cached_ckp_lsn __log_get_cached_ckp_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_region_mutex_count __log_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __log_region_mutex_max __log_region_mutex_max@DB_VERSION_UNIQUE_NAME@
+#define __log_region_size __log_region_size@DB_VERSION_UNIQUE_NAME@
+#define __log_region_max __log_region_max@DB_VERSION_UNIQUE_NAME@
+#define __log_vtruncate __log_vtruncate@DB_VERSION_UNIQUE_NAME@
+#define __log_is_outdated __log_is_outdated@DB_VERSION_UNIQUE_NAME@
+#define __log_zero __log_zero@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_lsnoff __log_inmem_lsnoff@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_newfile __log_inmem_newfile@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_chkspace __log_inmem_chkspace@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_copyout __log_inmem_copyout@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_copyin __log_inmem_copyin@DB_VERSION_UNIQUE_NAME@
+#define __log_set_version __log_set_version@DB_VERSION_UNIQUE_NAME@
+#define __log_get_oldversion __log_get_oldversion@DB_VERSION_UNIQUE_NAME@
+#define __log_archive_pp __log_archive_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_archive __log_archive@DB_VERSION_UNIQUE_NAME@
+#define __log_get_stable_lsn __log_get_stable_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_autoremove __log_autoremove@DB_VERSION_UNIQUE_NAME@
+#define __log_check_page_lsn __log_check_page_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_printf_capi __log_printf_capi@DB_VERSION_UNIQUE_NAME@
+#define __log_printf_pp __log_printf_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_printf __log_printf@DB_VERSION_UNIQUE_NAME@
+#define __log_cursor_pp __log_cursor_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_cursor __log_cursor@DB_VERSION_UNIQUE_NAME@
+#define __logc_close __logc_close@DB_VERSION_UNIQUE_NAME@
+#define __logc_version __logc_version@DB_VERSION_UNIQUE_NAME@
+#define __logc_get __logc_get@DB_VERSION_UNIQUE_NAME@
+#define __log_hdrswap __log_hdrswap@DB_VERSION_UNIQUE_NAME@
+#define __log_persistswap __log_persistswap@DB_VERSION_UNIQUE_NAME@
+#define __log_read_record_pp __log_read_record_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_read_record __log_read_record@DB_VERSION_UNIQUE_NAME@
+#define __log_env_create __log_env_create@DB_VERSION_UNIQUE_NAME@
+#define __log_env_destroy __log_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_bsize __log_get_lg_bsize@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_bsize __log_set_lg_bsize@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_filemode __log_get_lg_filemode@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_filemode __log_set_lg_filemode@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_max __log_get_lg_max@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_max __log_set_lg_max@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_regionmax __log_get_lg_regionmax@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_regionmax __log_set_lg_regionmax@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_dir __log_get_lg_dir@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_dir __log_set_lg_dir@DB_VERSION_UNIQUE_NAME@
+#define __log_get_flags __log_get_flags@DB_VERSION_UNIQUE_NAME@
+#define __log_set_flags __log_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __log_get_config __log_get_config@DB_VERSION_UNIQUE_NAME@
+#define __log_set_config __log_set_config@DB_VERSION_UNIQUE_NAME@
+#define __log_set_config_int __log_set_config_int@DB_VERSION_UNIQUE_NAME@
+#define __log_check_sizes __log_check_sizes@DB_VERSION_UNIQUE_NAME@
+#define __log_print_record __log_print_record@DB_VERSION_UNIQUE_NAME@
+#define __log_put_pp __log_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_put __log_put@DB_VERSION_UNIQUE_NAME@
+#define __log_current_lsn_int __log_current_lsn_int@DB_VERSION_UNIQUE_NAME@
+#define __log_current_lsn __log_current_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_newfile __log_newfile@DB_VERSION_UNIQUE_NAME@
+#define __log_flush_pp __log_flush_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_flush __log_flush@DB_VERSION_UNIQUE_NAME@
+#define __log_flush_int __log_flush_int@DB_VERSION_UNIQUE_NAME@
+#define __log_file_pp __log_file_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_name __log_name@DB_VERSION_UNIQUE_NAME@
+#define __log_rep_put __log_rep_put@DB_VERSION_UNIQUE_NAME@
+#define __log_put_record_pp __log_put_record_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_put_record __log_put_record@DB_VERSION_UNIQUE_NAME@
+#define __log_stat_pp __log_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_stat_print_pp __log_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_stat_print __log_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __log_verify_pp __log_verify_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_verify __log_verify@DB_VERSION_UNIQUE_NAME@
+#define __log_verify_wrap __log_verify_wrap@DB_VERSION_UNIQUE_NAME@
+#define __crdel_init_verify __crdel_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_init_verify __db_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_init_verify __dbreg_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_init_verify __bam_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_init_verify __fop_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_verify __ham_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_init_verify __heap_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_init_verify __qam_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_init_verify __txn_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_log_verify_global_report __db_log_verify_global_report@DB_VERSION_UNIQUE_NAME@
+#define __crdel_metasub_verify __crdel_metasub_verify@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_verify __crdel_inmem_create_verify@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_verify __crdel_inmem_rename_verify@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_verify __crdel_inmem_remove_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_verify __db_addrem_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_big_verify __db_big_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_verify __db_ovref_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_verify __db_relink_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_verify __db_debug_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_verify __db_noop_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_verify __db_pg_alloc_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_verify __db_pg_alloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_verify __db_pg_free_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_verify __db_pg_free_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_verify __db_cksum_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_verify __db_pg_freedata_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_verify __db_pg_freedata_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_verify __db_pg_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_verify __db_pg_sort_44_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_verify __db_pg_trunc_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_verify __db_realloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_verify __db_relink_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_verify __db_merge_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_verify __db_pgno_verify@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_verify __dbreg_register_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_verify __bam_split_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_verify __bam_split_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_verify __bam_rsplit_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_verify __bam_adj_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_verify __bam_irep_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_verify __bam_cadjust_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_verify __bam_cdel_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_verify __bam_repl_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_verify __bam_root_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_verify __bam_curadj_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_verify __bam_rcuradj_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_verify __bam_relink_43_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_verify __bam_merge_44_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_42_verify __fop_create_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_verify __fop_create_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_verify __fop_remove_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_verify __fop_write_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_verify __fop_write_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_verify __fop_rename_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_verify __fop_rename_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_verify __fop_file_remove_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_verify __ham_insdel_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_verify __ham_newpage_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_verify __ham_splitdata_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_verify __ham_replace_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_verify __ham_copypage_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_verify __ham_metagroup_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_verify __ham_metagroup_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_verify __ham_groupalloc_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_verify __ham_groupalloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_verify __ham_changeslot_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_verify __ham_contract_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_verify __ham_curadj_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_verify __ham_chgpg_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_verify __heap_addrem_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_verify __heap_pg_alloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_verify __heap_trunc_meta_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_verify __heap_trunc_page_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_verify __qam_incfirst_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_verify __qam_mvptr_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_verify __qam_del_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_verify __qam_add_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_verify __qam_delext_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_verify __txn_regop_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_verify __txn_regop_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_verify __txn_ckp_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_verify __txn_ckp_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_verify __txn_child_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_xa_regop_42_verify __txn_xa_regop_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_verify __txn_prepare_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_verify __txn_recycle_verify@DB_VERSION_UNIQUE_NAME@
+#define __create_log_vrfy_info __create_log_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __destroy_log_vrfy_info __destroy_log_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __put_txn_vrfy_info __put_txn_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __get_txn_vrfy_info __get_txn_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __add_recycle_lsn_range __add_recycle_lsn_range@DB_VERSION_UNIQUE_NAME@
+#define __iterate_txninfo __iterate_txninfo@DB_VERSION_UNIQUE_NAME@
+#define __rem_last_recycle_lsn __rem_last_recycle_lsn@DB_VERSION_UNIQUE_NAME@
+#define __add_file_updated __add_file_updated@DB_VERSION_UNIQUE_NAME@
+#define __del_file_updated __del_file_updated@DB_VERSION_UNIQUE_NAME@
+#define __clear_fileups __clear_fileups@DB_VERSION_UNIQUE_NAME@
+#define __free_txninfo_stack __free_txninfo_stack@DB_VERSION_UNIQUE_NAME@
+#define __free_txninfo __free_txninfo@DB_VERSION_UNIQUE_NAME@
+#define __put_filereg_info __put_filereg_info@DB_VERSION_UNIQUE_NAME@
+#define __del_filelife __del_filelife@DB_VERSION_UNIQUE_NAME@
+#define __put_filelife __put_filelife@DB_VERSION_UNIQUE_NAME@
+#define __get_filelife __get_filelife@DB_VERSION_UNIQUE_NAME@
+#define __get_filereg_by_dbregid __get_filereg_by_dbregid@DB_VERSION_UNIQUE_NAME@
+#define __add_dbregid __add_dbregid@DB_VERSION_UNIQUE_NAME@
+#define __get_filereg_info __get_filereg_info@DB_VERSION_UNIQUE_NAME@
+#define __free_filereg_info __free_filereg_info@DB_VERSION_UNIQUE_NAME@
+#define __get_ckp_info __get_ckp_info@DB_VERSION_UNIQUE_NAME@
+#define __get_last_ckp_info __get_last_ckp_info@DB_VERSION_UNIQUE_NAME@
+#define __put_ckp_info __put_ckp_info@DB_VERSION_UNIQUE_NAME@
+#define __get_timestamp_info __get_timestamp_info@DB_VERSION_UNIQUE_NAME@
+#define __get_latest_timestamp_info __get_latest_timestamp_info@DB_VERSION_UNIQUE_NAME@
+#define __put_timestamp_info __put_timestamp_info@DB_VERSION_UNIQUE_NAME@
+#define __find_lsnrg_by_timerg __find_lsnrg_by_timerg@DB_VERSION_UNIQUE_NAME@
+#define __add_txnrange __add_txnrange@DB_VERSION_UNIQUE_NAME@
+#define __get_aborttxn __get_aborttxn@DB_VERSION_UNIQUE_NAME@
+#define __txn_started __txn_started@DB_VERSION_UNIQUE_NAME@
+#define __set_logvrfy_dbfuid __set_logvrfy_dbfuid@DB_VERSION_UNIQUE_NAME@
+#define __add_page_to_txn __add_page_to_txn@DB_VERSION_UNIQUE_NAME@
+#define __del_txn_pages __del_txn_pages@DB_VERSION_UNIQUE_NAME@
+#define __is_ancestor_txn __is_ancestor_txn@DB_VERSION_UNIQUE_NAME@
+#define __return_txn_pages __return_txn_pages@DB_VERSION_UNIQUE_NAME@
+#define __memp_alloc __memp_alloc@DB_VERSION_UNIQUE_NAME@
+#define __memp_free __memp_free@DB_VERSION_UNIQUE_NAME@
+#define __memp_backup_open __memp_backup_open@DB_VERSION_UNIQUE_NAME@
+#define __memp_backup_mpf __memp_backup_mpf@DB_VERSION_UNIQUE_NAME@
+#define __memp_backup_close __memp_backup_close@DB_VERSION_UNIQUE_NAME@
+#define __memp_failchk __memp_failchk@DB_VERSION_UNIQUE_NAME@
+#define __memp_bhwrite __memp_bhwrite@DB_VERSION_UNIQUE_NAME@
+#define __memp_pgread __memp_pgread@DB_VERSION_UNIQUE_NAME@
+#define __memp_pg __memp_pg@DB_VERSION_UNIQUE_NAME@
+#define __memp_bhfree __memp_bhfree@DB_VERSION_UNIQUE_NAME@
+#define __memp_fget_pp __memp_fget_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fget __memp_fget@DB_VERSION_UNIQUE_NAME@
+#define __memp_fcreate_pp __memp_fcreate_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fcreate __memp_fcreate@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_clear_len __memp_set_clear_len@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_fileid __memp_get_fileid@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_fileid __memp_set_fileid@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_flags __memp_get_flags@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_flags __memp_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_ftype __memp_get_ftype@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_ftype __memp_set_ftype@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_lsn_offset __memp_set_lsn_offset@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_pgcookie __memp_get_pgcookie@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_pgcookie __memp_set_pgcookie@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_priority __memp_get_priority@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_last_pgno __memp_get_last_pgno@DB_VERSION_UNIQUE_NAME@
+#define __memp_fn __memp_fn@DB_VERSION_UNIQUE_NAME@
+#define __memp_fns __memp_fns@DB_VERSION_UNIQUE_NAME@
+#define __memp_fopen_pp __memp_fopen_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fopen __memp_fopen@DB_VERSION_UNIQUE_NAME@
+#define __memp_fclose_pp __memp_fclose_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fclose __memp_fclose@DB_VERSION_UNIQUE_NAME@
+#define __memp_mf_discard __memp_mf_discard@DB_VERSION_UNIQUE_NAME@
+#define __memp_inmemlist __memp_inmemlist@DB_VERSION_UNIQUE_NAME@
+#define __memp_fput_pp __memp_fput_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fput __memp_fput@DB_VERSION_UNIQUE_NAME@
+#define __memp_unpin_buffers __memp_unpin_buffers@DB_VERSION_UNIQUE_NAME@
+#define __memp_dirty __memp_dirty@DB_VERSION_UNIQUE_NAME@
+#define __memp_shared __memp_shared@DB_VERSION_UNIQUE_NAME@
+#define __memp_env_create __memp_env_create@DB_VERSION_UNIQUE_NAME@
+#define __memp_env_destroy __memp_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_cachesize __memp_get_cachesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_cachesize __memp_set_cachesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_config __memp_set_config@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_config __memp_get_config@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_max_openfd __memp_get_mp_max_openfd@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_max_openfd __memp_set_mp_max_openfd@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_max_write __memp_get_mp_max_write@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_max_write __memp_set_mp_max_write@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_mmapsize __memp_get_mp_mmapsize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_mmapsize __memp_set_mp_mmapsize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_pagesize __memp_get_mp_pagesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_pagesize __memp_set_mp_pagesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_tablesize __memp_get_mp_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_tablesize __memp_set_mp_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_mtxcount __memp_get_mp_mtxcount@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_mtxcount __memp_set_mp_mtxcount@DB_VERSION_UNIQUE_NAME@
+#define __memp_nameop __memp_nameop@DB_VERSION_UNIQUE_NAME@
+#define __memp_ftruncate __memp_ftruncate@DB_VERSION_UNIQUE_NAME@
+#define __memp_alloc_freelist __memp_alloc_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_free_freelist __memp_free_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_freelist __memp_get_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_extend_freelist __memp_extend_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_last_pgno __memp_set_last_pgno@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_settxn __memp_bh_settxn@DB_VERSION_UNIQUE_NAME@
+#define __memp_skip_curadj __memp_skip_curadj@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_freeze __memp_bh_freeze@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_thaw __memp_bh_thaw@DB_VERSION_UNIQUE_NAME@
+#define __memp_open __memp_open@DB_VERSION_UNIQUE_NAME@
+#define __memp_init __memp_init@DB_VERSION_UNIQUE_NAME@
+#define __memp_max_regions __memp_max_regions@DB_VERSION_UNIQUE_NAME@
+#define __memp_region_mutex_count __memp_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __memp_env_refresh __memp_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __memp_register_pp __memp_register_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_register __memp_register@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_bucket __memp_get_bucket@DB_VERSION_UNIQUE_NAME@
+#define __memp_resize __memp_resize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_cache_max __memp_get_cache_max@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_cache_max __memp_set_cache_max@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_pp __memp_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_print_pp __memp_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_print __memp_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_hash __memp_stat_hash@DB_VERSION_UNIQUE_NAME@
+#define __memp_walk_files __memp_walk_files@DB_VERSION_UNIQUE_NAME@
+#define __memp_discard_all_mpfs __memp_discard_all_mpfs@DB_VERSION_UNIQUE_NAME@
+#define __memp_sync_pp __memp_sync_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_sync __memp_sync@DB_VERSION_UNIQUE_NAME@
+#define __memp_fsync_pp __memp_fsync_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fsync __memp_fsync@DB_VERSION_UNIQUE_NAME@
+#define __mp_xxx_fh __mp_xxx_fh@DB_VERSION_UNIQUE_NAME@
+#define __memp_sync_int __memp_sync_int@DB_VERSION_UNIQUE_NAME@
+#define __memp_mf_sync __memp_mf_sync@DB_VERSION_UNIQUE_NAME@
+#define __memp_trickle_pp __memp_trickle_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_alloc __mutex_alloc@DB_VERSION_UNIQUE_NAME@
+#define __mutex_alloc_int __mutex_alloc_int@DB_VERSION_UNIQUE_NAME@
+#define __mutex_free __mutex_free@DB_VERSION_UNIQUE_NAME@
+#define __mutex_free_int __mutex_free_int@DB_VERSION_UNIQUE_NAME@
+#define __mutex_refresh __mutex_refresh@DB_VERSION_UNIQUE_NAME@
+#define __mut_failchk __mut_failchk@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_init __db_fcntl_mutex_init@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_lock __db_fcntl_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_trylock __db_fcntl_mutex_trylock@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_unlock __db_fcntl_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_destroy __db_fcntl_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __mutex_alloc_pp __mutex_alloc_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_free_pp __mutex_free_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_lock_pp __mutex_lock_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_unlock_pp __mutex_unlock_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_align __mutex_get_align@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_align __mutex_set_align@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_increment __mutex_get_increment@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_increment __mutex_set_increment@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_init __mutex_get_init@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_init __mutex_set_init@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_max __mutex_get_max@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_max __mutex_set_max@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_tas_spins __mutex_get_tas_spins@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_tas_spins __mutex_set_tas_spins@DB_VERSION_UNIQUE_NAME@
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+#define __atomic_inc __atomic_inc@DB_VERSION_UNIQUE_NAME@
+#endif
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+#define __atomic_dec __atomic_dec@DB_VERSION_UNIQUE_NAME@
+#endif
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+#define atomic_compare_exchange atomic_compare_exchange@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_pthread_mutex_init __db_pthread_mutex_init@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_MUTEX_HYBRID
+#define __db_pthread_mutex_lock __db_pthread_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_pthread_mutex_readlock __db_pthread_mutex_readlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+#define __db_hybrid_mutex_suspend __db_hybrid_mutex_suspend@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_pthread_mutex_unlock __db_pthread_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_pthread_mutex_destroy __db_pthread_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __mutex_open __mutex_open@DB_VERSION_UNIQUE_NAME@
+#define __mutex_env_refresh __mutex_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __mutex_resource_return __mutex_resource_return@DB_VERSION_UNIQUE_NAME@
+#define __mutex_stat_pp __mutex_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_stat_print_pp __mutex_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_stat_print __mutex_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __mutex_print_debug_single __mutex_print_debug_single@DB_VERSION_UNIQUE_NAME@
+#define __mutex_print_debug_stats __mutex_print_debug_stats@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_wait_info __mutex_set_wait_info@DB_VERSION_UNIQUE_NAME@
+#define __mutex_clear __mutex_clear@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_init __db_tas_mutex_init@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_lock __db_tas_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_trylock __db_tas_mutex_trylock@DB_VERSION_UNIQUE_NAME@
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_tas_mutex_readlock __db_tas_mutex_readlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_tas_mutex_tryreadlock __db_tas_mutex_tryreadlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_tas_mutex_unlock __db_tas_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_destroy __db_tas_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_init __db_win32_mutex_init@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_lock __db_win32_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_trylock __db_win32_mutex_trylock@DB_VERSION_UNIQUE_NAME@
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_win32_mutex_readlock __db_win32_mutex_readlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_win32_mutex_tryreadlock __db_win32_mutex_tryreadlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_win32_mutex_unlock __db_win32_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_destroy __db_win32_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __os_abort __os_abort@DB_VERSION_UNIQUE_NAME@
+#define __os_abspath __os_abspath@DB_VERSION_UNIQUE_NAME@
+#if defined(HAVE_REPLICATION_THREADS)
+#define __os_getaddrinfo __os_getaddrinfo@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_REPLICATION_THREADS)
+#define __os_freeaddrinfo __os_freeaddrinfo@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __os_umalloc __os_umalloc@DB_VERSION_UNIQUE_NAME@
+#define __os_urealloc __os_urealloc@DB_VERSION_UNIQUE_NAME@
+#define __os_ufree __os_ufree@DB_VERSION_UNIQUE_NAME@
+#define __os_strdup __os_strdup@DB_VERSION_UNIQUE_NAME@
+#define __os_calloc __os_calloc@DB_VERSION_UNIQUE_NAME@
+#define __os_malloc __os_malloc@DB_VERSION_UNIQUE_NAME@
+#define __os_realloc __os_realloc@DB_VERSION_UNIQUE_NAME@
+#define __os_free __os_free@DB_VERSION_UNIQUE_NAME@
+#define __ua_memcpy __ua_memcpy@DB_VERSION_UNIQUE_NAME@
+#define __os_gettime __os_gettime@DB_VERSION_UNIQUE_NAME@
+#define __os_fs_notzero __os_fs_notzero@DB_VERSION_UNIQUE_NAME@
+#define __os_support_direct_io __os_support_direct_io@DB_VERSION_UNIQUE_NAME@
+#define __os_support_db_register __os_support_db_register@DB_VERSION_UNIQUE_NAME@
+#define __os_support_replication __os_support_replication@DB_VERSION_UNIQUE_NAME@
+#define __os_cpu_count __os_cpu_count@DB_VERSION_UNIQUE_NAME@
+#define __os_ctime __os_ctime@DB_VERSION_UNIQUE_NAME@
+#define __os_dirlist __os_dirlist@DB_VERSION_UNIQUE_NAME@
+#define __os_dirfree __os_dirfree@DB_VERSION_UNIQUE_NAME@
+#define __os_get_errno_ret_zero __os_get_errno_ret_zero@DB_VERSION_UNIQUE_NAME@
+#define __os_get_errno __os_get_errno@DB_VERSION_UNIQUE_NAME@
+#define __os_get_neterr __os_get_neterr@DB_VERSION_UNIQUE_NAME@
+#define __os_get_syserr __os_get_syserr@DB_VERSION_UNIQUE_NAME@
+#define __os_set_errno __os_set_errno@DB_VERSION_UNIQUE_NAME@
+#define __os_strerror __os_strerror@DB_VERSION_UNIQUE_NAME@
+#define __os_posix_err __os_posix_err@DB_VERSION_UNIQUE_NAME@
+#define __os_fileid __os_fileid@DB_VERSION_UNIQUE_NAME@
+#define __os_fdlock __os_fdlock@DB_VERSION_UNIQUE_NAME@
+#define __os_fsync __os_fsync@DB_VERSION_UNIQUE_NAME@
+#define __os_getenv __os_getenv@DB_VERSION_UNIQUE_NAME@
+#define __os_openhandle __os_openhandle@DB_VERSION_UNIQUE_NAME@
+#define __os_closehandle __os_closehandle@DB_VERSION_UNIQUE_NAME@
+#define __os_attach __os_attach@DB_VERSION_UNIQUE_NAME@
+#define __os_detach __os_detach@DB_VERSION_UNIQUE_NAME@
+#define __os_mapfile __os_mapfile@DB_VERSION_UNIQUE_NAME@
+#define __os_unmapfile __os_unmapfile@DB_VERSION_UNIQUE_NAME@
+#define __os_mkdir __os_mkdir@DB_VERSION_UNIQUE_NAME@
+#define __os_open __os_open@DB_VERSION_UNIQUE_NAME@
+#define __os_concat_path __os_concat_path@DB_VERSION_UNIQUE_NAME@
+#define __os_id __os_id@DB_VERSION_UNIQUE_NAME@
+#define __os_rename __os_rename@DB_VERSION_UNIQUE_NAME@
+#define __os_isroot __os_isroot@DB_VERSION_UNIQUE_NAME@
+#define __db_rpath __db_rpath@DB_VERSION_UNIQUE_NAME@
+#define __os_io __os_io@DB_VERSION_UNIQUE_NAME@
+#define __os_read __os_read@DB_VERSION_UNIQUE_NAME@
+#define __os_write __os_write@DB_VERSION_UNIQUE_NAME@
+#define __os_physwrite __os_physwrite@DB_VERSION_UNIQUE_NAME@
+#define __os_seek __os_seek@DB_VERSION_UNIQUE_NAME@
+#define __os_stack __os_stack@DB_VERSION_UNIQUE_NAME@
+#define __os_exists __os_exists@DB_VERSION_UNIQUE_NAME@
+#define __os_ioinfo __os_ioinfo@DB_VERSION_UNIQUE_NAME@
+#define __os_tmpdir __os_tmpdir@DB_VERSION_UNIQUE_NAME@
+#define __os_truncate __os_truncate@DB_VERSION_UNIQUE_NAME@
+#define __os_unique_id __os_unique_id@DB_VERSION_UNIQUE_NAME@
+#define __os_unlink __os_unlink@DB_VERSION_UNIQUE_NAME@
+#define __os_yield __os_yield@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_QNX
+#define __os_qnx_region_open __os_qnx_region_open@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __os_is_winnt __os_is_winnt@DB_VERSION_UNIQUE_NAME@
+#define __os_cpu_count __os_cpu_count@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_REPLICATION_THREADS
+#define __os_get_neterr __os_get_neterr@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __qam_position __qam_position@DB_VERSION_UNIQUE_NAME@
+#define __qam_pitem __qam_pitem@DB_VERSION_UNIQUE_NAME@
+#define __qam_append __qam_append@DB_VERSION_UNIQUE_NAME@
+#define __qamc_dup __qamc_dup@DB_VERSION_UNIQUE_NAME@
+#define __qamc_init __qamc_init@DB_VERSION_UNIQUE_NAME@
+#define __qam_truncate __qam_truncate@DB_VERSION_UNIQUE_NAME@
+#define __qam_delete __qam_delete@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_desc __qam_incfirst_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_desc __qam_mvptr_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_desc __qam_del_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_desc __qam_add_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_desc __qam_delext_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_init_recover __qam_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_print __qam_incfirst_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_print __qam_mvptr_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_print __qam_del_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_print __qam_add_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_print __qam_delext_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_init_print __qam_init_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_mswap __qam_mswap@DB_VERSION_UNIQUE_NAME@
+#define __qam_pgin_out __qam_pgin_out@DB_VERSION_UNIQUE_NAME@
+#define __qam_fprobe __qam_fprobe@DB_VERSION_UNIQUE_NAME@
+#define __qam_fclose __qam_fclose@DB_VERSION_UNIQUE_NAME@
+#define __qam_fremove __qam_fremove@DB_VERSION_UNIQUE_NAME@
+#define __qam_sync __qam_sync@DB_VERSION_UNIQUE_NAME@
+#define __qam_gen_filelist __qam_gen_filelist@DB_VERSION_UNIQUE_NAME@
+#define __qam_extent_names __qam_extent_names@DB_VERSION_UNIQUE_NAME@
+#define __qam_exid __qam_exid@DB_VERSION_UNIQUE_NAME@
+#define __qam_nameop __qam_nameop@DB_VERSION_UNIQUE_NAME@
+#define __qam_lsn_reset __qam_lsn_reset@DB_VERSION_UNIQUE_NAME@
+#define __qam_backup_extents __qam_backup_extents@DB_VERSION_UNIQUE_NAME@
+#define __qam_db_create __qam_db_create@DB_VERSION_UNIQUE_NAME@
+#define __qam_db_close __qam_db_close@DB_VERSION_UNIQUE_NAME@
+#define __qam_get_extentsize __qam_get_extentsize@DB_VERSION_UNIQUE_NAME@
+#define __queue_pageinfo __queue_pageinfo@DB_VERSION_UNIQUE_NAME@
+#define __db_prqueue __db_prqueue@DB_VERSION_UNIQUE_NAME@
+#define __qam_remove __qam_remove@DB_VERSION_UNIQUE_NAME@
+#define __qam_rename __qam_rename@DB_VERSION_UNIQUE_NAME@
+#define __qam_map_flags __qam_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __qam_set_flags __qam_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __qam_open __qam_open@DB_VERSION_UNIQUE_NAME@
+#define __qam_set_ext_data __qam_set_ext_data@DB_VERSION_UNIQUE_NAME@
+#define __qam_metachk __qam_metachk@DB_VERSION_UNIQUE_NAME@
+#define __qam_new_file __qam_new_file@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_recover __qam_incfirst_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_recover __qam_mvptr_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_recover __qam_del_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_recover __qam_delext_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_recover __qam_add_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_stat __qam_stat@DB_VERSION_UNIQUE_NAME@
+#define __qam_stat_print __qam_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __db_no_queue_am __db_no_queue_am@DB_VERSION_UNIQUE_NAME@
+#define __qam_31_qammeta __qam_31_qammeta@DB_VERSION_UNIQUE_NAME@
+#define __qam_32_qammeta __qam_32_qammeta@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_meta __qam_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __qam_meta2pgset __qam_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_data __qam_vrfy_data@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_structure __qam_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_walkqueue __qam_vrfy_walkqueue@DB_VERSION_UNIQUE_NAME@
+#define __qam_salvage __qam_salvage@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_marshal __rep_bulk_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_unmarshal __rep_bulk_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_control_marshal __rep_control_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_control_unmarshal __rep_control_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_egen_marshal __rep_egen_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_egen_unmarshal __rep_egen_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_marshal __rep_fileinfo_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_unmarshal __rep_fileinfo_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_v6_marshal __rep_fileinfo_v6_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_v6_unmarshal __rep_fileinfo_v6_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_grant_info_marshal __rep_grant_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_grant_info_unmarshal __rep_grant_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_logreq_marshal __rep_logreq_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_logreq_unmarshal __rep_logreq_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_newfile_marshal __rep_newfile_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_newfile_unmarshal __rep_newfile_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_marshal __rep_update_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_unmarshal __rep_update_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_marshal __rep_vote_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_unmarshal __rep_vote_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_v5_marshal __rep_vote_info_v5_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_v5_unmarshal __rep_vote_info_v5_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_key_marshal __rep_lsn_hist_key_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_key_unmarshal __rep_lsn_hist_key_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_data_marshal __rep_lsn_hist_data_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_data_unmarshal __rep_lsn_hist_data_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_req __rep_update_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_page_req __rep_page_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_setup __rep_update_setup@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_page __rep_bulk_page@DB_VERSION_UNIQUE_NAME@
+#define __rep_page __rep_page@DB_VERSION_UNIQUE_NAME@
+#define __rep_init_cleanup __rep_init_cleanup@DB_VERSION_UNIQUE_NAME@
+#define __rep_pggap_req __rep_pggap_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_finfo_alloc __rep_finfo_alloc@DB_VERSION_UNIQUE_NAME@
+#define __rep_remove_init_file __rep_remove_init_file@DB_VERSION_UNIQUE_NAME@
+#define __rep_reset_init __rep_reset_init@DB_VERSION_UNIQUE_NAME@
+#define __rep_elect_pp __rep_elect_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_elect_int __rep_elect_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote1 __rep_vote1@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote2 __rep_vote2@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_grant __rep_update_grant@DB_VERSION_UNIQUE_NAME@
+#define __rep_islease_granted __rep_islease_granted@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_table_alloc __rep_lease_table_alloc@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_grant __rep_lease_grant@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_check __rep_lease_check@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_refresh __rep_lease_refresh@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_expire __rep_lease_expire@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_waittime __rep_lease_waittime@DB_VERSION_UNIQUE_NAME@
+#define __rep_allreq __rep_allreq@DB_VERSION_UNIQUE_NAME@
+#define __rep_log __rep_log@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_log __rep_bulk_log@DB_VERSION_UNIQUE_NAME@
+#define __rep_logreq __rep_logreq@DB_VERSION_UNIQUE_NAME@
+#define __rep_loggap_req __rep_loggap_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_logready __rep_logready@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_create __rep_env_create@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_destroy __rep_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_config __rep_get_config@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_config __rep_set_config@DB_VERSION_UNIQUE_NAME@
+#define __rep_start_pp __rep_start_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_start_int __rep_start_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_open_sysdb __rep_open_sysdb@DB_VERSION_UNIQUE_NAME@
+#define __rep_client_dbinit __rep_client_dbinit@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_limit __rep_get_limit@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_limit __rep_set_limit@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_nsites_pp __rep_set_nsites_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_nsites_int __rep_set_nsites_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_nsites __rep_get_nsites@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_priority __rep_set_priority@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_priority __rep_get_priority@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_timeout __rep_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_timeout __rep_get_timeout@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_request __rep_get_request@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_request __rep_set_request@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_transport_pp __rep_set_transport_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_transport_int __rep_set_transport_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_clockskew __rep_get_clockskew@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_clockskew __rep_set_clockskew@DB_VERSION_UNIQUE_NAME@
+#define __rep_flush __rep_flush@DB_VERSION_UNIQUE_NAME@
+#define __rep_sync __rep_sync@DB_VERSION_UNIQUE_NAME@
+#define __rep_txn_applied __rep_txn_applied@DB_VERSION_UNIQUE_NAME@
+#define __rep_process_message_pp __rep_process_message_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_process_message_int __rep_process_message_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_apply __rep_apply@DB_VERSION_UNIQUE_NAME@
+#define __rep_process_txn __rep_process_txn@DB_VERSION_UNIQUE_NAME@
+#define __rep_resend_req __rep_resend_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_doreq __rep_check_doreq@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_missing __rep_check_missing@DB_VERSION_UNIQUE_NAME@
+#define __rep_open __rep_open@DB_VERSION_UNIQUE_NAME@
+#define __rep_close_diagfiles __rep_close_diagfiles@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_refresh __rep_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_close __rep_env_close@DB_VERSION_UNIQUE_NAME@
+#define __rep_preclose __rep_preclose@DB_VERSION_UNIQUE_NAME@
+#define __rep_closefiles __rep_closefiles@DB_VERSION_UNIQUE_NAME@
+#define __rep_write_egen __rep_write_egen@DB_VERSION_UNIQUE_NAME@
+#define __rep_write_gen __rep_write_gen@DB_VERSION_UNIQUE_NAME@
+#define __rep_stat_pp __rep_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_stat_print_pp __rep_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_stat_print __rep_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_message __rep_bulk_message@DB_VERSION_UNIQUE_NAME@
+#define __rep_send_bulk __rep_send_bulk@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_alloc __rep_bulk_alloc@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_free __rep_bulk_free@DB_VERSION_UNIQUE_NAME@
+#define __rep_send_message __rep_send_message@DB_VERSION_UNIQUE_NAME@
+#define __rep_new_master __rep_new_master@DB_VERSION_UNIQUE_NAME@
+#define __rep_elect_done __rep_elect_done@DB_VERSION_UNIQUE_NAME@
+#define __env_rep_enter __env_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __env_db_rep_exit __env_db_rep_exit@DB_VERSION_UNIQUE_NAME@
+#define __db_rep_enter __db_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __op_handle_enter __op_handle_enter@DB_VERSION_UNIQUE_NAME@
+#define __op_rep_enter __op_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __op_rep_exit __op_rep_exit@DB_VERSION_UNIQUE_NAME@
+#define __archive_rep_enter __archive_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __archive_rep_exit __archive_rep_exit@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_archive __rep_lockout_archive@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_api __rep_lockout_api@DB_VERSION_UNIQUE_NAME@
+#define __rep_take_apilockout __rep_take_apilockout@DB_VERSION_UNIQUE_NAME@
+#define __rep_clear_apilockout __rep_clear_apilockout@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_apply __rep_lockout_apply@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_msg __rep_lockout_msg@DB_VERSION_UNIQUE_NAME@
+#define __rep_send_throttle __rep_send_throttle@DB_VERSION_UNIQUE_NAME@
+#define __rep_msg_to_old __rep_msg_to_old@DB_VERSION_UNIQUE_NAME@
+#define __rep_msg_from_old __rep_msg_from_old@DB_VERSION_UNIQUE_NAME@
+#define __rep_print_system __rep_print_system@DB_VERSION_UNIQUE_NAME@
+#define __rep_print __rep_print@DB_VERSION_UNIQUE_NAME@
+#define __rep_print_message __rep_print_message@DB_VERSION_UNIQUE_NAME@
+#define __rep_fire_event __rep_fire_event@DB_VERSION_UNIQUE_NAME@
+#define __rep_msg __rep_msg@DB_VERSION_UNIQUE_NAME@
+#define __rep_notify_threads __rep_notify_threads@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_goal __rep_check_goal@DB_VERSION_UNIQUE_NAME@
+#define __rep_log_backup __rep_log_backup@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_maxpermlsn __rep_get_maxpermlsn@DB_VERSION_UNIQUE_NAME@
+#define __rep_is_internal_rep_file __rep_is_internal_rep_file@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_datagen __rep_get_datagen@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify __rep_verify@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify_fail __rep_verify_fail@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify_req __rep_verify_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_dorecovery __rep_dorecovery@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify_match __rep_verify_match@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_desc __repmgr_member_desc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_handshake_marshal __repmgr_handshake_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_handshake_unmarshal __repmgr_handshake_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v3handshake_marshal __repmgr_v3handshake_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v3handshake_unmarshal __repmgr_v3handshake_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v2handshake_marshal __repmgr_v2handshake_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v2handshake_unmarshal __repmgr_v2handshake_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_parm_refresh_marshal __repmgr_parm_refresh_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_parm_refresh_unmarshal __repmgr_parm_refresh_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_permlsn_marshal __repmgr_permlsn_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_permlsn_unmarshal __repmgr_permlsn_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_proposal_marshal __repmgr_version_proposal_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_proposal_unmarshal __repmgr_version_proposal_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_confirmation_marshal __repmgr_version_confirmation_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_confirmation_unmarshal __repmgr_version_confirmation_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_hdr_marshal __repmgr_msg_hdr_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_hdr_unmarshal __repmgr_msg_hdr_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_metadata_marshal __repmgr_msg_metadata_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_metadata_unmarshal __repmgr_msg_metadata_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_key_marshal __repmgr_membership_key_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_key_unmarshal __repmgr_membership_key_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_data_marshal __repmgr_membership_data_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_data_unmarshal __repmgr_membership_data_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_metadata_marshal __repmgr_member_metadata_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_metadata_unmarshal __repmgr_member_metadata_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_gm_fwd_marshal __repmgr_gm_fwd_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_gm_fwd_unmarshal __repmgr_gm_fwd_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membr_vers_marshal __repmgr_membr_vers_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membr_vers_unmarshal __repmgr_membr_vers_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_info_marshal __repmgr_site_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_info_unmarshal __repmgr_site_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connect_reject_marshal __repmgr_connect_reject_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connect_reject_unmarshal __repmgr_connect_reject_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_print __repmgr_member_print@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_print __repmgr_init_print@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_election __repmgr_init_election@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_claim_victory __repmgr_claim_victory@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_turn_on_elections __repmgr_turn_on_elections@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_valid_config __repmgr_valid_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_autostart __repmgr_autostart@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start_selector __repmgr_start_selector@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stop __repmgr_stop@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_create __repmgr_env_create@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_destroy __repmgr_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stop_threads __repmgr_stop_threads@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel __repmgr_channel@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_msg_dispatch __repmgr_set_msg_dispatch@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_msg __repmgr_send_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_request __repmgr_send_request@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_response __repmgr_send_response@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_close __repmgr_channel_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_timeout __repmgr_channel_timeout@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_request_inval __repmgr_send_request_inval@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_close_inval __repmgr_channel_close_inval@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_timeout_inval __repmgr_channel_timeout_inval@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_join_group __repmgr_join_group@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_by_eid __repmgr_site_by_eid@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_site_address __repmgr_get_site_address@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_eid __repmgr_get_eid@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_config __repmgr_get_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_config __repmgr_site_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_close __repmgr_site_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_thread __repmgr_msg_thread@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_err_resp __repmgr_send_err_resp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_handle_event __repmgr_handle_event@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_update_membership __repmgr_update_membership@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_gm_version __repmgr_set_gm_version@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_setup_gmdb_op __repmgr_setup_gmdb_op@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_cleanup_gmdb_op __repmgr_cleanup_gmdb_op@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_hold_master_role __repmgr_hold_master_role@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_rlse_master_role __repmgr_rlse_master_role@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_sites __repmgr_set_sites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connect __repmgr_connect@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send __repmgr_send@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_sync_siteaddr __repmgr_sync_siteaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_broadcast __repmgr_send_broadcast@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_one __repmgr_send_one@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_many __repmgr_send_many@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_own_msg __repmgr_send_own_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_write_iovecs __repmgr_write_iovecs@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bust_connection __repmgr_bust_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_disable_connection __repmgr_disable_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_cleanup_defunct __repmgr_cleanup_defunct@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_close_connection __repmgr_close_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_decr_conn_ref __repmgr_decr_conn_ref@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_conn __repmgr_destroy_conn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_pack_netaddr __repmgr_pack_netaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_getaddr __repmgr_getaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_listen __repmgr_listen@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_net_close __repmgr_net_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_net_destroy __repmgr_net_destroy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_thread_start __repmgr_thread_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_thread_join __repmgr_thread_join@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_nonblock_conn __repmgr_set_nonblock_conn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_nonblocking __repmgr_set_nonblocking@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_wake_waiters __repmgr_wake_waiters@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_await_cond __repmgr_await_cond@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_await_gmdbop __repmgr_await_gmdbop@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_compute_wait_deadline __repmgr_compute_wait_deadline@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_await_drain __repmgr_await_drain@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_alloc_cond __repmgr_alloc_cond@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_free_cond __repmgr_free_cond@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_create_pf __repmgr_env_create_pf@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_create_mutex_pf __repmgr_create_mutex_pf@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_mutex_pf __repmgr_destroy_mutex_pf@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init __repmgr_init@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_deinit __repmgr_deinit@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_waiters __repmgr_init_waiters@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_waiters __repmgr_destroy_waiters@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lock_mutex __repmgr_lock_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_unlock_mutex __repmgr_unlock_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_signal __repmgr_signal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_wake_msngers __repmgr_wake_msngers@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_wake_main_thread __repmgr_wake_main_thread@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_writev __repmgr_writev@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_readv __repmgr_readv@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_select_loop __repmgr_select_loop@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_destroy __repmgr_queue_destroy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_get __repmgr_queue_get@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_put __repmgr_queue_put@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_size __repmgr_queue_size@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_recover __repmgr_member_recover@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_select_thread __repmgr_select_thread@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bow_out __repmgr_bow_out@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_accept __repmgr_accept@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_compute_timeout __repmgr_compute_timeout@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connected_master __repmgr_connected_master@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_check_timeouts __repmgr_check_timeouts@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_first_try_connections __repmgr_first_try_connections@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_v1_handshake __repmgr_send_v1_handshake@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_read_from_site __repmgr_read_from_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_read_conn __repmgr_read_conn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prepare_simple_input __repmgr_prepare_simple_input@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_handshake __repmgr_send_handshake@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_find_version_info __repmgr_find_version_info@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_write_some __repmgr_write_some@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stat_print __repmgr_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_by_eid __repmgr_site_by_eid@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_handle_event __repmgr_handle_event@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_channel __repmgr_channel@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_msg_dispatch __repmgr_set_msg_dispatch@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __repmgr_schedule_connection_attempt __repmgr_schedule_connection_attempt@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_is_server __repmgr_is_server@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_reset_for_reading __repmgr_reset_for_reading@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_new_connection __repmgr_new_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_keepalive __repmgr_set_keepalive@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_new_site __repmgr_new_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_create_mutex __repmgr_create_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_mutex __repmgr_destroy_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_cleanup_netaddr __repmgr_cleanup_netaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_iovec_init __repmgr_iovec_init@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_add_buffer __repmgr_add_buffer@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_add_dbt __repmgr_add_dbt@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_update_consumed __repmgr_update_consumed@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prepare_my_addr __repmgr_prepare_my_addr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_nsites __repmgr_get_nsites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_thread_failure __repmgr_thread_failure@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_format_eid_loc __repmgr_format_eid_loc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_format_site_loc __repmgr_format_site_loc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_format_addr_loc __repmgr_format_addr_loc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_repstart __repmgr_repstart@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_become_master __repmgr_become_master@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_each_connection __repmgr_each_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_open __repmgr_open@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_join __repmgr_join@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_refresh __repmgr_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_share_netaddrs __repmgr_share_netaddrs@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_copy_in_added_sites __repmgr_copy_in_added_sites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_new_sites __repmgr_init_new_sites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_failchk __repmgr_failchk@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_master_is_known __repmgr_master_is_known@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stable_lsn __repmgr_stable_lsn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_sync_msg __repmgr_send_sync_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_marshal_member_list __repmgr_marshal_member_list@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_refresh_membership __repmgr_refresh_membership@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_reload_gmdb __repmgr_reload_gmdb@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_gmdb_version_cmp __repmgr_gmdb_version_cmp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_save __repmgr_init_save@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_restore __repmgr_init_restore@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_defer_op __repmgr_defer_op@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_fire_conn_err_event __repmgr_fire_conn_err_event@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_print_conn_err __repmgr_print_conn_err@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_become_client __repmgr_become_client@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lookup_site __repmgr_lookup_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_find_site __repmgr_find_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_membership __repmgr_set_membership@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bcast_parm_refresh __repmgr_bcast_parm_refresh@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_chg_prio __repmgr_chg_prio@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bcast_own_msg __repmgr_bcast_own_msg@DB_VERSION_UNIQUE_NAME@
+#define __seq_stat __seq_stat@DB_VERSION_UNIQUE_NAME@
+#define __seq_stat_print __seq_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __db_get_seq_flags_fn __db_get_seq_flags_fn@DB_VERSION_UNIQUE_NAME@
+#define bdb_HCommand bdb_HCommand@DB_VERSION_UNIQUE_NAME@
+#if DB_DBM_HSEARCH != 0
+#define bdb_NdbmOpen bdb_NdbmOpen@DB_VERSION_UNIQUE_NAME@
+#endif
+#if DB_DBM_HSEARCH != 0
+#define bdb_DbmCommand bdb_DbmCommand@DB_VERSION_UNIQUE_NAME@
+#endif
+#define ndbm_Cmd ndbm_Cmd@DB_VERSION_UNIQUE_NAME@
+#define _DbInfoDelete _DbInfoDelete@DB_VERSION_UNIQUE_NAME@
+#define db_Cmd db_Cmd@DB_VERSION_UNIQUE_NAME@
+#define tcl_CompactStat tcl_CompactStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_rep_send tcl_rep_send@DB_VERSION_UNIQUE_NAME@
+#define dbc_Cmd dbc_Cmd@DB_VERSION_UNIQUE_NAME@
+#define env_Cmd env_Cmd@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvRemove tcl_EnvRemove@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvClose tcl_EnvClose@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvIdReset tcl_EnvIdReset@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvLsnReset tcl_EnvLsnReset@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvVerbose tcl_EnvVerbose@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvAttr tcl_EnvAttr@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetFlags tcl_EnvSetFlags@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvTest tcl_EnvTest@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvGetEncryptFlags tcl_EnvGetEncryptFlags@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetErrfile tcl_EnvSetErrfile@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetMsgfile tcl_EnvSetMsgfile@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetErrpfx tcl_EnvSetErrpfx@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvStatPrint tcl_EnvStatPrint@DB_VERSION_UNIQUE_NAME@
+#define _NewInfo _NewInfo@DB_VERSION_UNIQUE_NAME@
+#define _NameToPtr _NameToPtr@DB_VERSION_UNIQUE_NAME@
+#define _PtrToInfo _PtrToInfo@DB_VERSION_UNIQUE_NAME@
+#define _NameToInfo _NameToInfo@DB_VERSION_UNIQUE_NAME@
+#define _SetInfoData _SetInfoData@DB_VERSION_UNIQUE_NAME@
+#define _DeleteInfo _DeleteInfo@DB_VERSION_UNIQUE_NAME@
+#define _SetListElem _SetListElem@DB_VERSION_UNIQUE_NAME@
+#define _SetListElemInt _SetListElemInt@DB_VERSION_UNIQUE_NAME@
+#define _SetListElemWideInt _SetListElemWideInt@DB_VERSION_UNIQUE_NAME@
+#define _SetListRecnoElem _SetListRecnoElem@DB_VERSION_UNIQUE_NAME@
+#define _SetListHeapElem _SetListHeapElem@DB_VERSION_UNIQUE_NAME@
+#define _Set3DBTList _Set3DBTList@DB_VERSION_UNIQUE_NAME@
+#define _SetMultiList _SetMultiList@DB_VERSION_UNIQUE_NAME@
+#define _GetGlobPrefix _GetGlobPrefix@DB_VERSION_UNIQUE_NAME@
+#define _ReturnSetup _ReturnSetup@DB_VERSION_UNIQUE_NAME@
+#define _ErrorSetup _ErrorSetup@DB_VERSION_UNIQUE_NAME@
+#define _ErrorFunc _ErrorFunc@DB_VERSION_UNIQUE_NAME@
+#ifdef CONFIG_TEST
+#define _EventFunc _EventFunc@DB_VERSION_UNIQUE_NAME@
+#endif
+#define _GetLsn _GetLsn@DB_VERSION_UNIQUE_NAME@
+#define _GetRid _GetRid@DB_VERSION_UNIQUE_NAME@
+#define _GetUInt32 _GetUInt32@DB_VERSION_UNIQUE_NAME@
+#define _GetFlagsList _GetFlagsList@DB_VERSION_UNIQUE_NAME@
+#define _debug_check _debug_check@DB_VERSION_UNIQUE_NAME@
+#define _CopyObjBytes _CopyObjBytes@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockDetect tcl_LockDetect@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockGet tcl_LockGet@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockStat tcl_LockStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockStatPrint tcl_LockStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockTimeout tcl_LockTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockVec tcl_LockVec@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogArchive tcl_LogArchive@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogCompare tcl_LogCompare@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogFile tcl_LogFile@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogFlush tcl_LogFlush@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogGet tcl_LogGet@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogPut tcl_LogPut@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogStat tcl_LogStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogStatPrint tcl_LogStatPrint@DB_VERSION_UNIQUE_NAME@
+#define logc_Cmd logc_Cmd@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogConfig tcl_LogConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogGetConfig tcl_LogGetConfig@DB_VERSION_UNIQUE_NAME@
+#define _MpInfoDelete _MpInfoDelete@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpSync tcl_MpSync@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpTrickle tcl_MpTrickle@DB_VERSION_UNIQUE_NAME@
+#define tcl_Mp tcl_Mp@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpStat tcl_MpStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpStatPrint tcl_MpStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_Mutex tcl_Mutex@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutFree tcl_MutFree@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutGet tcl_MutGet@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutLock tcl_MutLock@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutSet tcl_MutSet@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutStat tcl_MutStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutStatPrint tcl_MutStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutUnlock tcl_MutUnlock@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepConfig tcl_RepConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetTwo tcl_RepGetTwo@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetConfig tcl_RepGetConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetTimeout tcl_RepGetTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetAckPolicy tcl_RepGetAckPolicy@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetLocalSite tcl_RepGetLocalSite@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepElect tcl_RepElect@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepFlush tcl_RepFlush@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepSync tcl_RepSync@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepLease tcl_RepLease@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepInmemFiles tcl_RepInmemFiles@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepLimit tcl_RepLimit@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepNSites tcl_RepNSites@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepRequest tcl_RepRequest@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepNoarchiveTimeout tcl_RepNoarchiveTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepTransport tcl_RepTransport@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepStart tcl_RepStart@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepProcessMessage tcl_RepProcessMessage@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepStat tcl_RepStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepStatPrint tcl_RepStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgr tcl_RepMgr@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgrSiteList tcl_RepMgrSiteList@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgrStat tcl_RepMgrStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgrStatPrint tcl_RepMgrStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepApplied tcl_RepApplied@DB_VERSION_UNIQUE_NAME@
+#define seq_Cmd seq_Cmd@DB_VERSION_UNIQUE_NAME@
+#define _TxnInfoDelete _TxnInfoDelete@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnCheckpoint tcl_TxnCheckpoint@DB_VERSION_UNIQUE_NAME@
+#define tcl_Txn tcl_Txn@DB_VERSION_UNIQUE_NAME@
+#define tcl_CDSGroup tcl_CDSGroup@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnStat tcl_TxnStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnStatPrint tcl_TxnStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnTimeout tcl_TxnTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnRecover tcl_TxnRecover@DB_VERSION_UNIQUE_NAME@
+#define bdb_RandCommand bdb_RandCommand@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockMutex tcl_LockMutex@DB_VERSION_UNIQUE_NAME@
+#define tcl_UnlockMutex tcl_UnlockMutex@DB_VERSION_UNIQUE_NAME@
+#define __txn_begin_pp __txn_begin_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_begin __txn_begin@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_id __txn_recycle_id@DB_VERSION_UNIQUE_NAME@
+#define __txn_continue __txn_continue@DB_VERSION_UNIQUE_NAME@
+#define __txn_commit __txn_commit@DB_VERSION_UNIQUE_NAME@
+#define __txn_abort __txn_abort@DB_VERSION_UNIQUE_NAME@
+#define __txn_discard_int __txn_discard_int@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare __txn_prepare@DB_VERSION_UNIQUE_NAME@
+#define __txn_id __txn_id@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_name __txn_get_name@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_name __txn_set_name@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_priority __txn_get_priority@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_priority __txn_set_priority@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_timeout __txn_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __txn_activekids __txn_activekids@DB_VERSION_UNIQUE_NAME@
+#define __txn_force_abort __txn_force_abort@DB_VERSION_UNIQUE_NAME@
+#define __txn_preclose __txn_preclose@DB_VERSION_UNIQUE_NAME@
+#define __txn_reset __txn_reset@DB_VERSION_UNIQUE_NAME@
+#define __txn_applied_pp __txn_applied_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_desc __txn_regop_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_desc __txn_regop_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_desc __txn_ckp_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_desc __txn_ckp_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_desc __txn_child_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_xa_regop_42_desc __txn_xa_regop_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_desc __txn_prepare_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_desc __txn_recycle_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_init_recover __txn_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_print __txn_regop_42_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_print __txn_regop_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_print __txn_ckp_42_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_print __txn_ckp_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_print __txn_child_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_xa_regop_42_print __txn_xa_regop_42_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_print __txn_prepare_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_print __txn_recycle_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_init_print __txn_init_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_checkpoint_pp __txn_checkpoint_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_checkpoint __txn_checkpoint@DB_VERSION_UNIQUE_NAME@
+#define __txn_getactive __txn_getactive@DB_VERSION_UNIQUE_NAME@
+#define __txn_getckp __txn_getckp@DB_VERSION_UNIQUE_NAME@
+#define __txn_updateckp __txn_updateckp@DB_VERSION_UNIQUE_NAME@
+#define __txn_failchk __txn_failchk@DB_VERSION_UNIQUE_NAME@
+#define __txn_env_create __txn_env_create@DB_VERSION_UNIQUE_NAME@
+#define __txn_env_destroy __txn_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_tx_max __txn_get_tx_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_tx_max __txn_set_tx_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_tx_timestamp __txn_get_tx_timestamp@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_tx_timestamp __txn_set_tx_timestamp@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_recover __txn_regop_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_recover __txn_prepare_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_recover __txn_ckp_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_recover __txn_child_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_restore_txn __txn_restore_txn@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_recover __txn_recycle_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_recover __txn_regop_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_recover __txn_ckp_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_recover_pp __txn_recover_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_recover __txn_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_prepared __txn_get_prepared@DB_VERSION_UNIQUE_NAME@
+#define __txn_openfiles __txn_openfiles@DB_VERSION_UNIQUE_NAME@
+#define __txn_open __txn_open@DB_VERSION_UNIQUE_NAME@
+#define __txn_findlastckp __txn_findlastckp@DB_VERSION_UNIQUE_NAME@
+#define __txn_env_refresh __txn_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_mutex_count __txn_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_mutex_max __txn_region_mutex_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_size __txn_region_size@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_max __txn_region_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_id_set __txn_id_set@DB_VERSION_UNIQUE_NAME@
+#define __txn_oldest_reader __txn_oldest_reader@DB_VERSION_UNIQUE_NAME@
+#define __txn_add_buffer __txn_add_buffer@DB_VERSION_UNIQUE_NAME@
+#define __txn_remove_buffer __txn_remove_buffer@DB_VERSION_UNIQUE_NAME@
+#define __txn_stat_pp __txn_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_stat_print_pp __txn_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_stat_print __txn_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_closeevent __txn_closeevent@DB_VERSION_UNIQUE_NAME@
+#define __txn_remevent __txn_remevent@DB_VERSION_UNIQUE_NAME@
+#define __txn_remrem __txn_remrem@DB_VERSION_UNIQUE_NAME@
+#define __txn_lockevent __txn_lockevent@DB_VERSION_UNIQUE_NAME@
+#define __txn_remlock __txn_remlock@DB_VERSION_UNIQUE_NAME@
+#define __txn_doevents __txn_doevents@DB_VERSION_UNIQUE_NAME@
+#define __txn_record_fname __txn_record_fname@DB_VERSION_UNIQUE_NAME@
+#define __txn_dref_fname __txn_dref_fname@DB_VERSION_UNIQUE_NAME@
+#define __txn_reset_fe_watermarks __txn_reset_fe_watermarks@DB_VERSION_UNIQUE_NAME@
+#define __txn_remove_fe_watermark __txn_remove_fe_watermark@DB_VERSION_UNIQUE_NAME@
+#define __txn_add_fe_watermark __txn_add_fe_watermark@DB_VERSION_UNIQUE_NAME@
+#define __txn_flush_fe_files __txn_flush_fe_files@DB_VERSION_UNIQUE_NAME@
+#define __txn_pg_above_fe_watermark __txn_pg_above_fe_watermark@DB_VERSION_UNIQUE_NAME@
+#define __db_rmid_to_env __db_rmid_to_env@DB_VERSION_UNIQUE_NAME@
+#define __db_xid_to_txn __db_xid_to_txn@DB_VERSION_UNIQUE_NAME@
+#define __db_map_rmid __db_map_rmid@DB_VERSION_UNIQUE_NAME@
+#define __db_unmap_rmid __db_unmap_rmid@DB_VERSION_UNIQUE_NAME@
+#define __db_unmap_xid __db_unmap_xid@DB_VERSION_UNIQUE_NAME@
+#define __db_global_values __db_global_values@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_guesstimated_max __repmgr_guesstimated_max@DB_VERSION_UNIQUE_NAME@
+#define db_xa_switch db_xa_switch@DB_VERSION_UNIQUE_NAME@
+
+#endif /* !_DB_INT_DEF_IN_ */
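The long run of defines above implements Berkeley DB's symbol-versioning scheme: every internal symbol is rewritten to carry the @DB_VERSION_UNIQUE_NAME@ autoconf placeholder, which configure replaces with a per-release suffix so that differently versioned copies of the library can be linked into one process without symbol collisions. A minimal sketch of the effect, assuming configure substitutes a hypothetical suffix _5003 (the suffix and the simplified signature below are illustrative, not taken from this diff):

/*
 * Illustrative only: configure turns a line such as
 *     #define __txn_begin __txn_begin@DB_VERSION_UNIQUE_NAME@
 * into the following, so every reference to __txn_begin compiles
 * and links against the suffixed symbol.
 */
#define __txn_begin __txn_begin_5003

int __txn_begin_5003(void) { return 0; }   /* stand-in body, real signature differs */

static int start_txn(void)
{
    return __txn_begin();   /* preprocesses to __txn_begin_5003() */
}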
diff --git a/src/dbinc_auto/lock_ext.h b/src/dbinc_auto/lock_ext.h
new file mode 100644
index 00000000..d5981e18
--- /dev/null
+++ b/src/dbinc_auto/lock_ext.h
@@ -0,0 +1,78 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _lock_ext_h_
+#define _lock_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __lock_vec_pp __P((DB_ENV *, u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+int __lock_vec __P((ENV *, DB_LOCKER *, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+int __lock_get_pp __P((DB_ENV *, u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *));
+int __lock_get __P((ENV *, DB_LOCKER *, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
+int __lock_get_internal __P((DB_LOCKTAB *, DB_LOCKER *, u_int32_t, const DBT *, db_lockmode_t, db_timeout_t, DB_LOCK *));
+int __lock_put_pp __P((DB_ENV *, DB_LOCK *));
+int __lock_put __P((ENV *, DB_LOCK *));
+int __lock_downgrade __P((ENV *, DB_LOCK *, db_lockmode_t, u_int32_t));
+int __lock_locker_same_family __P((ENV *, DB_LOCKER *, DB_LOCKER *, int *));
+int __lock_wakeup __P((ENV *, const DBT *));
+int __lock_promote __P((DB_LOCKTAB *, DB_LOCKOBJ *, int *, u_int32_t));
+int __lock_change __P((ENV *, DB_LOCK *, DB_LOCK *));
+int __lock_detect_pp __P((DB_ENV *, u_int32_t, u_int32_t, int *));
+int __lock_detect __P((ENV *, u_int32_t, int *));
+int __lock_failchk __P((ENV *));
+int __lock_id_pp __P((DB_ENV *, u_int32_t *));
+int __lock_id __P((ENV *, u_int32_t *, DB_LOCKER **));
+void __lock_set_thread_id __P((void *, pid_t, db_threadid_t));
+int __lock_id_free_pp __P((DB_ENV *, u_int32_t));
+int __lock_id_free __P((ENV *, DB_LOCKER *));
+int __lock_id_set __P((ENV *, u_int32_t, u_int32_t));
+int __lock_getlocker __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **));
+int __lock_getlocker_int __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **));
+int __lock_addfamilylocker __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *));
+int __lock_familyremove __P((DB_LOCKTAB *, DB_LOCKER *));
+int __lock_fix_list __P((ENV *, DBT *, u_int32_t));
+int __lock_get_list __P((ENV *, DB_LOCKER *, u_int32_t, db_lockmode_t, DBT *));
+void __lock_list_print __P((ENV *, DB_MSGBUF *, DBT *));
+int __lock_env_create __P((DB_ENV *));
+void __lock_env_destroy __P((DB_ENV *));
+int __lock_get_lk_conflicts __P((DB_ENV *, const u_int8_t **, int *));
+int __lock_set_lk_conflicts __P((DB_ENV *, u_int8_t *, int));
+int __lock_get_lk_detect __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_detect __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_max_locks __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_max_locks __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_max_lockers __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_max_lockers __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_max_objects __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_max_objects __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_partitions __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_partitions __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_tablesize __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_tablesize __P((DB_ENV *, u_int32_t));
+int __lock_set_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t));
+int __lock_get_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t *));
+int __lock_get_env_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t));
+int __lock_set_env_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+int __lock_open __P((ENV *));
+int __lock_env_refresh __P((ENV *));
+u_int32_t __lock_region_mutex_count __P((ENV *));
+u_int32_t __lock_region_mutex_max __P((ENV *));
+size_t __lock_region_max __P((ENV *));
+size_t __lock_region_size __P((ENV *, size_t));
+int __lock_stat_pp __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
+int __lock_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __lock_stat_print __P((ENV *, u_int32_t));
+void __lock_printlock __P((DB_LOCKTAB *, DB_MSGBUF *mbp, struct __db_lock *, int));
+int __lock_set_timeout __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
+int __lock_set_timeout_internal __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
+int __lock_inherit_timeout __P((ENV *, DB_LOCKER *, DB_LOCKER *));
+u_int32_t __lock_ohash __P((const DBT *));
+u_int32_t __lock_lhash __P((DB_LOCKOBJ *));
+int __lock_nomem __P((ENV *, const char *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_lock_ext_h_ */
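Every prototype in these generated headers is wrapped in __P((...)), the old BSD portability macro that keeps the declarations usable under pre-ANSI compilers. A sketch of the conventional definition follows; the actual definition lives elsewhere in the tree, and example_fn is a hypothetical name used only for illustration:

#if defined(__STDC__) || defined(__cplusplus)
#define __P(protos) protos     /* ANSI C: keep the full prototype */
#else
#define __P(protos) ()         /* K&R C: reduce to an empty parameter list */
#endif

int example_fn __P((int, char *));
/* ANSI C expands to: int example_fn(int, char *);
 * K&R C expands to:  int example_fn(); */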
diff --git a/src/dbinc_auto/log_ext.h b/src/dbinc_auto/log_ext.h
new file mode 100644
index 00000000..dde6742d
--- /dev/null
+++ b/src/dbinc_auto/log_ext.h
@@ -0,0 +1,208 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _log_ext_h_
+#define _log_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __log_open __P((ENV *));
+int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
+int __log_valid __P((DB_LOG *, u_int32_t, int, DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
+int __log_env_refresh __P((ENV *));
+int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
+u_int32_t __log_region_mutex_count __P((ENV *));
+u_int32_t __log_region_mutex_max __P((ENV *));
+size_t __log_region_size __P((ENV *));
+size_t __log_region_max __P((ENV *));
+int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
+int __log_is_outdated __P((ENV *, u_int32_t, int *));
+int __log_zero __P((ENV *, DB_LSN *));
+int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
+int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
+int __log_inmem_chkspace __P((DB_LOG *, size_t));
+void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
+void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
+void __log_set_version __P((ENV *, u_int32_t));
+int __log_get_oldversion __P((ENV *, u_int32_t *));
+int __log_archive_pp __P((DB_ENV *, char **[], u_int32_t));
+int __log_archive __P((ENV *, char **[], u_int32_t));
+int __log_get_stable_lsn __P((ENV *, DB_LSN *, int));
+void __log_autoremove __P((ENV *));
+int __log_check_page_lsn __P((ENV *, DB *, DB_LSN *));
+int __log_printf_capi __P((DB_ENV *, DB_TXN *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+int __log_printf_pp __P((DB_ENV *, DB_TXN *, const char *, va_list));
+int __log_printf __P((ENV *, DB_TXN *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+int __log_cursor_pp __P((DB_ENV *, DB_LOGC **, u_int32_t));
+int __log_cursor __P((ENV *, DB_LOGC **));
+int __logc_close __P((DB_LOGC *));
+int __logc_version __P((DB_LOGC *, u_int32_t *));
+int __logc_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+void __log_hdrswap __P((HDR *, int));
+void __log_persistswap __P((LOGP *));
+int __log_read_record_pp __P((DB_ENV *, DB **, void *, void *, DB_LOG_RECSPEC *, u_int32_t, void **));
+int __log_read_record __P((ENV *, DB **, void *, void *, DB_LOG_RECSPEC *, u_int32_t, void **));
+int __log_env_create __P((DB_ENV *));
+void __log_env_destroy __P((DB_ENV *));
+int __log_get_lg_bsize __P((DB_ENV *, u_int32_t *));
+int __log_set_lg_bsize __P((DB_ENV *, u_int32_t));
+int __log_get_lg_filemode __P((DB_ENV *, int *));
+int __log_set_lg_filemode __P((DB_ENV *, int));
+int __log_get_lg_max __P((DB_ENV *, u_int32_t *));
+int __log_set_lg_max __P((DB_ENV *, u_int32_t));
+int __log_get_lg_regionmax __P((DB_ENV *, u_int32_t *));
+int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t));
+int __log_get_lg_dir __P((DB_ENV *, const char **));
+int __log_set_lg_dir __P((DB_ENV *, const char *));
+void __log_get_flags __P((DB_ENV *, u_int32_t *));
+void __log_set_flags __P((ENV *, u_int32_t, int));
+int __log_get_config __P((DB_ENV *, u_int32_t, int *));
+int __log_set_config __P((DB_ENV *, u_int32_t, int));
+int __log_set_config_int __P((DB_ENV *, u_int32_t, int, int));
+int __log_check_sizes __P((ENV *, u_int32_t, u_int32_t));
+int __log_print_record __P((ENV *, DBT *, DB_LSN *, char *, DB_LOG_RECSPEC *, void *));
+int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_current_lsn_int __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+int __log_current_lsn __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
+int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
+int __log_flush __P((ENV *, const DB_LSN *));
+int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
+int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
+int __log_name __P((DB_LOG *, u_int32_t, char **, DB_FH **, u_int32_t));
+int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...));
+int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...));
+int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
+int __log_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __log_stat_print __P((ENV *, u_int32_t));
+int __log_verify_pp __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+int __log_verify __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *, DB_THREAD_INFO *));
+int __log_verify_wrap __P((ENV *, const char *, u_int32_t, const char *, const char *, time_t, time_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t, int, int));
+int __crdel_init_verify __P((ENV *, DB_DISTAB *));
+int __db_init_verify __P((ENV *, DB_DISTAB *));
+int __dbreg_init_verify __P((ENV *, DB_DISTAB *));
+int __bam_init_verify __P((ENV *, DB_DISTAB *));
+int __fop_init_verify __P((ENV *, DB_DISTAB *));
+int __ham_init_verify __P((ENV *, DB_DISTAB *));
+int __heap_init_verify __P((ENV *, DB_DISTAB *));
+int __qam_init_verify __P((ENV *, DB_DISTAB *));
+int __txn_init_verify __P((ENV *, DB_DISTAB *));
+void __db_log_verify_global_report __P((const DB_LOG_VRFY_INFO *));
+int __crdel_metasub_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_create_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_rename_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_addrem_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_ovref_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_debug_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_noop_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_cksum_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_init_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_sort_44_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_trunc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_realloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_merge_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pgno_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_register_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rsplit_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_adj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_irep_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cadjust_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cdel_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_repl_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_root_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_curadj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rcuradj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_relink_43_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_merge_44_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_insdel_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_newpage_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_splitdata_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_copypage_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_changeslot_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_contract_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_curadj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_chgpg_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_meta_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_page_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_incfirst_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_mvptr_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_del_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_add_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_delext_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_child_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_xa_regop_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_prepare_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_recycle_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __create_log_vrfy_info __P((const DB_LOG_VERIFY_CONFIG *, DB_LOG_VRFY_INFO **, DB_THREAD_INFO *));
+int __destroy_log_vrfy_info __P((DB_LOG_VRFY_INFO *));
+int __put_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *, const VRFY_TXN_INFO *));
+int __get_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *, u_int32_t, VRFY_TXN_INFO **));
+int __add_recycle_lsn_range __P((DB_LOG_VRFY_INFO *, const DB_LSN *, u_int32_t, u_int32_t));
+int __iterate_txninfo __P((DB_LOG_VRFY_INFO *, u_int32_t, u_int32_t, TXNINFO_HANDLER, void *));
+int __rem_last_recycle_lsn __P((VRFY_TXN_INFO *));
+int __add_file_updated __P((VRFY_TXN_INFO *, const DBT *, int32_t));
+int __del_file_updated __P((VRFY_TXN_INFO *, const DBT *));
+int __clear_fileups __P((VRFY_TXN_INFO *));
+int __free_txninfo_stack __P((VRFY_TXN_INFO *));
+int __free_txninfo __P((VRFY_TXN_INFO *));
+int __put_filereg_info __P((const DB_LOG_VRFY_INFO *, const VRFY_FILEREG_INFO *));
+int __del_filelife __P((const DB_LOG_VRFY_INFO *, int32_t));
+int __put_filelife __P((const DB_LOG_VRFY_INFO *, VRFY_FILELIFE *));
+int __get_filelife __P((const DB_LOG_VRFY_INFO *, int32_t, VRFY_FILELIFE **));
+int __get_filereg_by_dbregid __P((const DB_LOG_VRFY_INFO *, int32_t, VRFY_FILEREG_INFO **));
+int __add_dbregid __P((DB_LOG_VRFY_INFO *, VRFY_FILEREG_INFO *, int32_t, u_int32_t, DB_LSN, DBTYPE, db_pgno_t, int *));
+int __get_filereg_info __P((const DB_LOG_VRFY_INFO *, const DBT *, VRFY_FILEREG_INFO **));
+int __free_filereg_info __P((VRFY_FILEREG_INFO *));
+int __get_ckp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN, VRFY_CKP_INFO **));
+int __get_last_ckp_info __P((const DB_LOG_VRFY_INFO *, VRFY_CKP_INFO **));
+int __put_ckp_info __P((const DB_LOG_VRFY_INFO *, const VRFY_CKP_INFO *));
+int __get_timestamp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN, VRFY_TIMESTAMP_INFO **));
+int __get_latest_timestamp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN, VRFY_TIMESTAMP_INFO **));
+int __put_timestamp_info __P((const DB_LOG_VRFY_INFO *, const VRFY_TIMESTAMP_INFO *));
+int __find_lsnrg_by_timerg __P((DB_LOG_VRFY_INFO *, time_t, time_t, DB_LSN *, DB_LSN *));
+int __add_txnrange __P((DB_LOG_VRFY_INFO *, u_int32_t, DB_LSN, int32_t, int));
+int __get_aborttxn __P((DB_LOG_VRFY_INFO *, DB_LSN));
+int __txn_started __P((DB_LOG_VRFY_INFO *, DB_LSN, u_int32_t, int *));
+int __set_logvrfy_dbfuid __P((DB_LOG_VRFY_INFO *));
+int __add_page_to_txn __P((DB_LOG_VRFY_INFO *, int32_t, db_pgno_t, u_int32_t, u_int32_t *, int *));
+int __del_txn_pages __P((DB_LOG_VRFY_INFO *, u_int32_t));
+int __is_ancestor_txn __P((DB_LOG_VRFY_INFO *, u_int32_t, u_int32_t, DB_LSN, int *));
+int __return_txn_pages __P((DB_LOG_VRFY_INFO *, u_int32_t, u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_log_ext_h_ */
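Note that __log_printf_capi and __log_printf above are annotated with __attribute__ ((__format__ (__printf__, 3, 4))), telling GCC-compatible compilers that the third parameter is a printf-style format string whose variadic arguments begin at position 4, so mismatched formats are caught at compile time. A self-contained sketch of the same idiom, where log_note is a hypothetical stand-in rather than a Berkeley DB function:

#include <stdarg.h>
#include <stdio.h>

/* Argument 3 is the format, variadic args start at position 4. */
int log_note(void *env, void *txn, const char *fmt, ...)
    __attribute__((__format__(__printf__, 3, 4)));

int log_note(void *env, void *txn, const char *fmt, ...)
{
    va_list ap;
    int n;

    (void)env;
    (void)txn;
    va_start(ap, fmt);
    n = vfprintf(stderr, fmt, ap);  /* stand-in for the real log write */
    va_end(ap);
    return n;
}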
diff --git a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h
new file mode 100644
index 00000000..d142b584
--- /dev/null
+++ b/src/dbinc_auto/mp_ext.h
@@ -0,0 +1,106 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _mp_ext_h_
+#define _mp_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __memp_alloc __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+void __memp_free __P((REGINFO *, void *));
+int __memp_backup_open __P((ENV *, DB_MPOOLFILE *, const char *, const char *, u_int32_t, DB_FH **, void**));
+int __memp_backup_mpf __P((ENV *, DB_MPOOLFILE *, DB_THREAD_INFO *, db_pgno_t, db_pgno_t, DB_FH *, void *, u_int32_t));
+int __memp_backup_close __P((ENV *, DB_MPOOLFILE *, const char *, DB_FH *, void *HANDLE));
+int __memp_failchk __P((ENV *));
+int __memp_bhwrite __P((DB_MPOOL *, DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
+int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
+int __memp_bhfree __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
+int __memp_fget_pp __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
+int __memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
+int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
+int __memp_fcreate __P((ENV *, DB_MPOOLFILE **));
+int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
+int __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+int __memp_get_flags __P((DB_MPOOLFILE *, u_int32_t *));
+int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int));
+int __memp_get_ftype __P((DB_MPOOLFILE *, int *));
+int __memp_set_ftype __P((DB_MPOOLFILE *, int));
+int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
+int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *));
+int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
+int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
+int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+char * __memp_fn __P((DB_MPOOLFILE *));
+char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *));
+int __memp_fopen_pp __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
+int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *, const char *, const char **, u_int32_t, int, size_t));
+int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
+int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
+int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int));
+int __memp_inmemlist __P((ENV *, char ***, int *));
+int __memp_fput_pp __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
+int __memp_fput __P((DB_MPOOLFILE *, DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
+int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
+int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
+int __memp_shared __P((DB_MPOOLFILE *, void *));
+int __memp_env_create __P((DB_ENV *));
+void __memp_env_destroy __P((DB_ENV *));
+int __memp_get_cachesize __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
+int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+int __memp_set_config __P((DB_ENV *, u_int32_t, int));
+int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
+int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
+int __memp_set_mp_max_openfd __P((DB_ENV *, int));
+int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
+int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
+int __memp_get_mp_mmapsize __P((DB_ENV *, size_t *));
+int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+int __memp_get_mp_pagesize __P((DB_ENV *, u_int32_t *));
+int __memp_set_mp_pagesize __P((DB_ENV *, u_int32_t));
+int __memp_get_mp_tablesize __P((DB_ENV *, u_int32_t *));
+int __memp_set_mp_tablesize __P((DB_ENV *, u_int32_t));
+int __memp_get_mp_mtxcount __P((DB_ENV *, u_int32_t *));
+int __memp_set_mp_mtxcount __P((DB_ENV *, u_int32_t));
+int __memp_nameop __P((ENV *, u_int8_t *, const char *, const char *, const char *, int));
+int __memp_ftruncate __P((DB_MPOOLFILE *, DB_TXN *, DB_THREAD_INFO *, db_pgno_t, u_int32_t));
+int __memp_alloc_freelist __P((DB_MPOOLFILE *, u_int32_t, db_pgno_t **));
+int __memp_free_freelist __P((DB_MPOOLFILE *));
+int __memp_get_freelist __P((DB_MPOOLFILE *, u_int32_t *, db_pgno_t **));
+int __memp_extend_freelist __P((DB_MPOOLFILE *, u_int32_t, db_pgno_t **));
+int __memp_set_last_pgno __P((DB_MPOOLFILE *, db_pgno_t));
+int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *mfp, BH *, void *));
+int __memp_skip_curadj __P((DBC *, db_pgno_t));
+int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, int *));
+int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, BH *));
+int __memp_open __P((ENV *, int));
+int __memp_init __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
+u_int32_t __memp_max_regions __P((ENV *));
+u_int32_t __memp_region_mutex_count __P((ENV *));
+int __memp_env_refresh __P((ENV *));
+int __memp_register_pp __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+int __memp_register __P((ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+int __memp_get_bucket __P((ENV *, MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *));
+int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
+int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
+int __memp_stat_pp __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+int __memp_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __memp_stat_print __P((ENV *, u_int32_t));
+void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
+int __memp_walk_files __P((ENV *, MPOOL *, int (*) __P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
+int __memp_discard_all_mpfs __P((ENV *, MPOOL *));
+int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
+int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
+int __memp_fsync_pp __P((DB_MPOOLFILE *));
+int __memp_fsync __P((DB_MPOOLFILE *));
+int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
+int __memp_sync_int __P((ENV *, DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
+int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
+int __memp_trickle_pp __P((DB_ENV *, int, int *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_mp_ext_h_ */
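The central pairing in this header is __memp_fget/__memp_fput: a page fetched from the buffer cache stays pinned until it is explicitly returned. A hedged sketch of that discipline using the public DB_MPOOLFILE get/put methods that front these internal calls (error handling is trimmed, and creating the environment and opening mpf are assumed to happen elsewhere):

#include <db.h>

int touch_page(DB_MPOOLFILE *mpf, db_pgno_t pgno)
{
    void *page;
    int ret;

    /* Pin the page in the cache; pgno selects which page. */
    if ((ret = mpf->get(mpf, &pgno, NULL, 0, &page)) != 0)
        return (ret);

    /* ... examine the pinned page here ... */

    /* Unpin it so the cache may evict or write it back. */
    return (mpf->put(mpf, page, DB_PRIORITY_UNCHANGED, 0));
}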
diff --git a/src/dbinc_auto/mutex_ext.h b/src/dbinc_auto/mutex_ext.h
new file mode 100644
index 00000000..1a2a1b2b
--- /dev/null
+++ b/src/dbinc_auto/mutex_ext.h
@@ -0,0 +1,91 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _mutex_ext_h_
+#define _mutex_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __mutex_alloc __P((ENV *, int, u_int32_t, db_mutex_t *));
+int __mutex_alloc_int __P((ENV *, int, int, u_int32_t, db_mutex_t *));
+int __mutex_free __P((ENV *, db_mutex_t *));
+int __mutex_free_int __P((ENV *, int, db_mutex_t *));
+int __mutex_refresh __P((ENV *, db_mutex_t));
+int __mut_failchk __P((ENV *));
+int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t));
+int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t));
+int __mutex_alloc_pp __P((DB_ENV *, u_int32_t, db_mutex_t *));
+int __mutex_free_pp __P((DB_ENV *, db_mutex_t));
+int __mutex_lock_pp __P((DB_ENV *, db_mutex_t));
+int __mutex_unlock_pp __P((DB_ENV *, db_mutex_t));
+int __mutex_get_align __P((DB_ENV *, u_int32_t *));
+int __mutex_set_align __P((DB_ENV *, u_int32_t));
+int __mutex_get_increment __P((DB_ENV *, u_int32_t *));
+int __mutex_set_increment __P((DB_ENV *, u_int32_t));
+int __mutex_get_init __P((DB_ENV *, u_int32_t *));
+int __mutex_set_init __P((DB_ENV *, u_int32_t));
+int __mutex_get_max __P((DB_ENV *, u_int32_t *));
+int __mutex_set_max __P((DB_ENV *, u_int32_t));
+int __mutex_get_tas_spins __P((DB_ENV *, u_int32_t *));
+int __mutex_set_tas_spins __P((DB_ENV *, u_int32_t));
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+atomic_value_t __atomic_inc __P((ENV *, db_atomic_t *));
+atomic_value_t __atomic_dec __P((ENV *, db_atomic_t *));
+int atomic_compare_exchange __P((ENV *, db_atomic_t *, atomic_value_t, atomic_value_t));
+#endif
+int __db_pthread_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+#ifndef HAVE_MUTEX_HYBRID
+int __db_pthread_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+int __db_pthread_mutex_readlock __P((ENV *, db_mutex_t));
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+int __db_hybrid_mutex_suspend __P((ENV *, db_mutex_t, db_timespec *, int));
+#endif
+int __db_pthread_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_pthread_mutex_destroy __P((ENV *, db_mutex_t));
+int __mutex_open __P((ENV *, int));
+int __mutex_env_refresh __P((ENV *));
+void __mutex_resource_return __P((ENV *, REGINFO *));
+int __mutex_stat_pp __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
+int __mutex_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __mutex_stat_print __P((ENV *, u_int32_t));
+void __mutex_print_debug_single __P((ENV *, const char *, db_mutex_t, u_int32_t));
+void __mutex_print_debug_stats __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t));
+void __mutex_set_wait_info __P((ENV *, db_mutex_t, uintmax_t *, uintmax_t *));
+void __mutex_clear __P((ENV *, db_mutex_t));
+int __db_tas_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+int __db_tas_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+int __db_tas_mutex_trylock __P((ENV *, db_mutex_t));
+#if defined(HAVE_SHARED_LATCHES)
+int __db_tas_mutex_readlock __P((ENV *, db_mutex_t));
+int __db_tas_mutex_tryreadlock __P((ENV *, db_mutex_t));
+#endif
+int __db_tas_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_tas_mutex_destroy __P((ENV *, db_mutex_t));
+int __db_win32_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+int __db_win32_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+int __db_win32_mutex_trylock __P((ENV *, db_mutex_t));
+#if defined(HAVE_SHARED_LATCHES)
+int __db_win32_mutex_readlock __P((ENV *, db_mutex_t));
+int __db_win32_mutex_tryreadlock __P((ENV *, db_mutex_t));
+#endif
+int __db_win32_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_win32_mutex_destroy __P((ENV *, db_mutex_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_mutex_ext_h_ */
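The _pp ("pre-process") entries above are the argument-checking wrappers
bound to the public DB_ENV method slots; the unsuffixed names are the
internal workers they dispatch to. A hedged usage sketch through the
public methods those wrappers back, assuming an opened environment:

static int
use_one_mutex(DB_ENV *dbenv)
{
	db_mutex_t mutex;
	int ret, t_ret;

	/* Allocate, lock, unlock and free a single mutex. */
	if ((ret = dbenv->mutex_alloc(dbenv, 0, &mutex)) != 0)
		return (ret);
	if ((ret = dbenv->mutex_lock(dbenv, mutex)) == 0) {
		/* ... critical section ... */
		ret = dbenv->mutex_unlock(dbenv, mutex);
	}
	if ((t_ret = dbenv->mutex_free(dbenv, mutex)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}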
diff --git a/src/dbinc_auto/os_ext.h b/src/dbinc_auto/os_ext.h
new file mode 100644
index 00000000..a0a7b791
--- /dev/null
+++ b/src/dbinc_auto/os_ext.h
@@ -0,0 +1,78 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _os_ext_h_
+#define _os_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __os_abort __P((ENV *));
+int __os_abspath __P((const char *));
+#if defined(HAVE_REPLICATION_THREADS)
+int __os_getaddrinfo __P((ENV *, const char *, u_int, const char *, const ADDRINFO *, ADDRINFO **));
+void __os_freeaddrinfo __P((ENV *, ADDRINFO *));
+#endif
+int __os_umalloc __P((ENV *, size_t, void *));
+int __os_urealloc __P((ENV *, size_t, void *));
+void __os_ufree __P((ENV *, void *));
+int __os_strdup __P((ENV *, const char *, void *));
+int __os_calloc __P((ENV *, size_t, size_t, void *));
+int __os_malloc __P((ENV *, size_t, void *));
+int __os_realloc __P((ENV *, size_t, void *));
+void __os_free __P((ENV *, void *));
+void *__ua_memcpy __P((void *, const void *, size_t));
+void __os_gettime __P((ENV *, db_timespec *, int));
+int __os_fs_notzero __P((void));
+int __os_support_direct_io __P((void));
+int __os_support_db_register __P((void));
+int __os_support_replication __P((void));
+u_int32_t __os_cpu_count __P((void));
+char *__os_ctime __P((const time_t *, char *));
+int __os_dirlist __P((ENV *, const char *, int, char ***, int *));
+void __os_dirfree __P((ENV *, char **, int));
+int __os_get_errno_ret_zero __P((void));
+int __os_get_errno __P((void));
+int __os_get_neterr __P((void));
+int __os_get_syserr __P((void));
+void __os_set_errno __P((int));
+char *__os_strerror __P((int, char *, size_t));
+int __os_posix_err __P((int));
+int __os_fileid __P((ENV *, const char *, int, u_int8_t *));
+int __os_fdlock __P((ENV *, DB_FH *, off_t, int, int));
+int __os_fsync __P((ENV *, DB_FH *));
+int __os_getenv __P((ENV *, const char *, char **, size_t));
+int __os_openhandle __P((ENV *, const char *, int, int, DB_FH **));
+int __os_closehandle __P((ENV *, DB_FH *));
+int __os_attach __P((ENV *, REGINFO *, REGION *));
+int __os_detach __P((ENV *, REGINFO *, int));
+int __os_mapfile __P((ENV *, char *, DB_FH *, size_t, int, void **));
+int __os_unmapfile __P((ENV *, void *, size_t));
+int __os_mkdir __P((ENV *, const char *, int));
+int __os_open __P((ENV *, const char *, u_int32_t, u_int32_t, int, DB_FH **));
+int __os_concat_path __P((char *, size_t, const char *, const char *));
+void __os_id __P((DB_ENV *, pid_t *, db_threadid_t *));
+int __os_rename __P((ENV *, const char *, const char *, u_int32_t));
+int __os_isroot __P((void));
+char *__db_rpath __P((const char *));
+int __os_io __P((ENV *, int, DB_FH *, db_pgno_t, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
+int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
+int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
+int __os_physwrite __P((ENV *, DB_FH *, void *, size_t, size_t *));
+int __os_seek __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t));
+void __os_stack __P((ENV *));
+int __os_exists __P((ENV *, const char *, int *));
+int __os_ioinfo __P((ENV *, const char *, DB_FH *, u_int32_t *, u_int32_t *, u_int32_t *));
+int __os_tmpdir __P((ENV *, u_int32_t));
+int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t));
+void __os_unique_id __P((ENV *, u_int32_t *));
+int __os_unlink __P((ENV *, const char *, int));
+void __os_yield __P((ENV *, u_long, u_long));
+#ifdef HAVE_QNX
+int __os_qnx_region_open __P((ENV *, const char *, int, int, DB_FH **));
+#endif
+int __os_is_winnt __P((void));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_os_ext_h_ */
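The allocation entries above follow the OS layer's uniform convention:
every routine returns zero or an errno-style error code and hands its
result back through an out parameter (the trailing void * in
__os_malloc's signature receives a pointer-to-pointer). A hedged sketch,
assuming an initialized ENV handle:

static int
scratch_buffer_sketch(ENV *env)
{
	char *buf;
	int ret;

	if ((ret = __os_malloc(env, 1024, &buf)) != 0)
		return (ret);		/* Errno-style error code. */
	buf[0] = '\0';			/* Use the allocation... */
	__os_free(env, buf);		/* ...then return it to the env. */
	return (0);
}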
diff --git a/src/dbinc_auto/qam_auto.h b/src/dbinc_auto/qam_auto.h
new file mode 100644
index 00000000..fe7c2437
--- /dev/null
+++ b/src/dbinc_auto/qam_auto.h
@@ -0,0 +1,174 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __qam_AUTO_H
+#define __qam_AUTO_H
+#ifdef HAVE_QUEUE
+#include "dbinc/log.h"
+#define DB___qam_incfirst 84
+typedef struct ___qam_incfirst_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_recno_t recno;
+ db_pgno_t meta_pgno;
+} __qam_incfirst_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_incfirst_desc[];
+static inline int
+__qam_incfirst_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_recno_t recno, db_pgno_t meta_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_incfirst, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __qam_incfirst_desc, recno, meta_pgno));
+}
+
+static inline int __qam_incfirst_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_incfirst_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_incfirst_desc, sizeof(__qam_incfirst_args), (void**)arg));
+}
+#define DB___qam_mvptr 85
+typedef struct ___qam_mvptr_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_recno_t old_first;
+ db_recno_t new_first;
+ db_recno_t old_cur;
+ db_recno_t new_cur;
+ DB_LSN metalsn;
+ db_pgno_t meta_pgno;
+} __qam_mvptr_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_mvptr_desc[];
+static inline int
+__qam_mvptr_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_recno_t old_first, db_recno_t new_first, db_recno_t old_cur,
+ db_recno_t new_cur, DB_LSN * metalsn, db_pgno_t meta_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_mvptr, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*metalsn) + sizeof(u_int32_t),
+ __qam_mvptr_desc,
+ opcode, old_first, new_first, old_cur, new_cur, metalsn, meta_pgno));
+}
+
+static inline int __qam_mvptr_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_mvptr_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_mvptr_desc, sizeof(__qam_mvptr_args), (void**)arg));
+}
+#define DB___qam_del 79
+typedef struct ___qam_del_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN lsn;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ db_recno_t recno;
+} __qam_del_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_del_desc[];
+static inline int
+__qam_del_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * lsn, db_pgno_t pgno, u_int32_t indx, db_recno_t recno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_del, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __qam_del_desc, lsn, pgno, indx, recno));
+}
+
+static inline int __qam_del_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_del_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_del_desc, sizeof(__qam_del_args), (void**)arg));
+}
+#define DB___qam_add 80
+typedef struct ___qam_add_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN lsn;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ db_recno_t recno;
+ DBT data;
+ u_int32_t vflag;
+ DBT olddata;
+} __qam_add_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_add_desc[];
+static inline int
+__qam_add_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * lsn, db_pgno_t pgno, u_int32_t indx, db_recno_t recno,
+ const DBT *data, u_int32_t vflag, const DBT *olddata)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_add, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(data) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(olddata),
+ __qam_add_desc, lsn, pgno, indx, recno, data, vflag, olddata));
+}
+
+static inline int __qam_add_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_add_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_add_desc, sizeof(__qam_add_args), (void**)arg));
+}
+#define DB___qam_delext 83
+typedef struct ___qam_delext_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN lsn;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ db_recno_t recno;
+ DBT data;
+} __qam_delext_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_delext_desc[];
+static inline int
+__qam_delext_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * lsn, db_pgno_t pgno, u_int32_t indx, db_recno_t recno,
+ const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_delext, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(data),
+ __qam_delext_desc, lsn, pgno, indx, recno, data));
+}
+
+static inline int __qam_delext_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_delext_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_delext_desc, sizeof(__qam_delext_args), (void**)arg));
+}
+#endif /* HAVE_QUEUE */
+#endif
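Each queue record above is generated as a triplet: the _args struct, an
inline _log writer whose size expression adds one u_int32_t per fixed
field plus LOG_DBT_SIZE() per DBT payload, and an inline _read parser
that allocates and fills the struct from a raw log record. A hedged
sketch of the read side, shaped like the recovery functions that consume
it (NULL stands in for the recovery thread token; error paths trimmed):

static int
qam_del_read_sketch(ENV *env, DBT *rec)
{
	__qam_del_args *argp;
	DB *file_dbp;
	int ret;

	if ((ret = __qam_del_read(env,
	    &file_dbp, NULL, rec->data, &argp)) != 0)
		return (ret);
	/* argp->pgno, argp->indx and argp->recno locate the record. */
	__os_free(env, argp);		/* The caller owns argp. */
	return (0);
}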
diff --git a/src/dbinc_auto/qam_ext.h b/src/dbinc_auto/qam_ext.h
new file mode 100644
index 00000000..3f143664
--- /dev/null
+++ b/src/dbinc_auto/qam_ext.h
@@ -0,0 +1,68 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _qam_ext_h_
+#define _qam_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __qam_position __P((DBC *, db_recno_t *, u_int32_t, int *));
+int __qam_pitem __P((DBC *, QPAGE *, u_int32_t, db_recno_t, DBT *));
+int __qam_append __P((DBC *, DBT *, DBT *));
+int __qamc_dup __P((DBC *, DBC *));
+int __qamc_init __P((DBC *));
+int __qam_truncate __P((DBC *, u_int32_t *));
+int __qam_delete __P((DBC *, DBT *, u_int32_t));
+int __qam_init_recover __P((ENV *, DB_DISTAB *));
+int __qam_incfirst_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_mvptr_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_del_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_add_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_delext_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_init_print __P((ENV *, DB_DISTAB *));
+int __qam_mswap __P((ENV *, PAGE *));
+int __qam_pgin_out __P((ENV *, db_pgno_t, void *, DBT *));
+int __qam_fprobe __P((DBC *, db_pgno_t, void *, qam_probe_mode, DB_CACHE_PRIORITY, u_int32_t));
+int __qam_fclose __P((DB *, db_pgno_t));
+int __qam_fremove __P((DB *, db_pgno_t));
+int __qam_sync __P((DB *));
+int __qam_gen_filelist __P((DB *, DB_THREAD_INFO *, QUEUE_FILELIST **));
+int __qam_extent_names __P((ENV *, char *, char ***));
+void __qam_exid __P((DB *, u_int8_t *, u_int32_t));
+int __qam_nameop __P((DB *, DB_TXN *, const char *, qam_name_op));
+int __qam_lsn_reset __P((DB *, DB_THREAD_INFO *));
+int __qam_backup_extents __P((DB *, DB_THREAD_INFO *, const char *, u_int32_t));
+int __qam_db_create __P((DB *));
+int __qam_db_close __P((DB *, u_int32_t));
+int __qam_get_extentsize __P((DB *, u_int32_t *));
+int __queue_pageinfo __P((DB *, db_pgno_t *, db_pgno_t *, int *, int, u_int32_t));
+int __db_prqueue __P((DB *, u_int32_t));
+int __qam_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __qam_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *));
+void __qam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+int __qam_set_flags __P((DB *, u_int32_t *));
+int __qam_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, int, u_int32_t));
+int __qam_set_ext_data __P((DB *, const char *));
+int __qam_metachk __P((DB *, const char *, QMETA *));
+int __qam_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __qam_incfirst_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_mvptr_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_del_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_delext_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_add_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_stat __P((DBC *, void *, u_int32_t));
+int __qam_stat_print __P((DBC *, u_int32_t));
+int __db_no_queue_am __P((ENV *));
+int __qam_31_qammeta __P((DB *, char *, u_int8_t *));
+int __qam_32_qammeta __P((DB *, char *, u_int8_t *));
+int __qam_vrfy_meta __P((DB *, VRFY_DBINFO *, QMETA *, db_pgno_t, u_int32_t));
+int __qam_meta2pgset __P((DB *, VRFY_DBINFO *, DB *));
+int __qam_vrfy_data __P((DB *, VRFY_DBINFO *, QPAGE *, db_pgno_t, u_int32_t));
+int __qam_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+int __qam_vrfy_walkqueue __P((DB *, VRFY_DBINFO *, void *, int (*)(void *, const void *), u_int32_t));
+int __qam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_qam_ext_h_ */
diff --git a/src/dbinc_auto/rep_automsg.h b/src/dbinc_auto/rep_automsg.h
new file mode 100644
index 00000000..584040cf
--- /dev/null
+++ b/src/dbinc_auto/rep_automsg.h
@@ -0,0 +1,125 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#ifndef __rep_AUTOMSG_H
+#define __rep_AUTOMSG_H
+
+/*
+ * Message sizes are simply the sum of field sizes (not
+ * counting variable size parts, when DBTs are present),
+ * and may be different from struct sizes due to padding.
+ */
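+/*
+ * For example, __REP_CONTROL_SIZE below is 36: seven u_int32_t fields
+ * (28 bytes) plus one DB_LSN (two u_int32_t, 8 bytes).  A DBT counts
+ * only for its 4-byte length word, so __REP_BULK_SIZE is 4 + 8 + 4.
+ */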
+#define __REP_BULK_SIZE 16
+typedef struct ___rep_bulk_args {
+ u_int32_t len;
+ DB_LSN lsn;
+ DBT bulkdata;
+} __rep_bulk_args;
+
+#define __REP_CONTROL_SIZE 36
+typedef struct ___rep_control_args {
+ u_int32_t rep_version;
+ u_int32_t log_version;
+ DB_LSN lsn;
+ u_int32_t rectype;
+ u_int32_t gen;
+ u_int32_t msg_sec;
+ u_int32_t msg_nsec;
+ u_int32_t flags;
+} __rep_control_args;
+
+#define __REP_EGEN_SIZE 4
+typedef struct ___rep_egen_args {
+ u_int32_t egen;
+} __rep_egen_args;
+
+#define __REP_FILEINFO_SIZE 40
+typedef struct ___rep_fileinfo_args {
+ u_int32_t pgsize;
+ db_pgno_t pgno;
+ db_pgno_t max_pgno;
+ u_int32_t filenum;
+ u_int32_t finfo_flags;
+ u_int32_t type;
+ u_int32_t db_flags;
+ DBT uid;
+ DBT info;
+ DBT dir;
+} __rep_fileinfo_args;
+
+#define __REP_FILEINFO_V6_SIZE 36
+typedef struct ___rep_fileinfo_v6_args {
+ u_int32_t pgsize;
+ db_pgno_t pgno;
+ db_pgno_t max_pgno;
+ u_int32_t filenum;
+ u_int32_t finfo_flags;
+ u_int32_t type;
+ u_int32_t db_flags;
+ DBT uid;
+ DBT info;
+} __rep_fileinfo_v6_args;
+
+#define __REP_GRANT_INFO_SIZE 8
+typedef struct ___rep_grant_info_args {
+ u_int32_t msg_sec;
+ u_int32_t msg_nsec;
+} __rep_grant_info_args;
+
+#define __REP_LOGREQ_SIZE 8
+typedef struct ___rep_logreq_args {
+ DB_LSN endlsn;
+} __rep_logreq_args;
+
+#define __REP_NEWFILE_SIZE 4
+typedef struct ___rep_newfile_args {
+ u_int32_t version;
+} __rep_newfile_args;
+
+#define __REP_UPDATE_SIZE 16
+typedef struct ___rep_update_args {
+ DB_LSN first_lsn;
+ u_int32_t first_vers;
+ u_int32_t num_files;
+} __rep_update_args;
+
+#define __REP_VOTE_INFO_SIZE 28
+typedef struct ___rep_vote_info_args {
+ u_int32_t egen;
+ u_int32_t nsites;
+ u_int32_t nvotes;
+ u_int32_t priority;
+ u_int32_t spare_pri;
+ u_int32_t tiebreaker;
+ u_int32_t data_gen;
+} __rep_vote_info_args;
+
+#define __REP_VOTE_INFO_V5_SIZE 20
+typedef struct ___rep_vote_info_v5_args {
+ u_int32_t egen;
+ u_int32_t nsites;
+ u_int32_t nvotes;
+ u_int32_t priority;
+ u_int32_t tiebreaker;
+} __rep_vote_info_v5_args;
+
+#define __REP_LSN_HIST_KEY_SIZE 8
+typedef struct ___rep_lsn_hist_key_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __rep_lsn_hist_key_args;
+
+#define __REP_LSN_HIST_DATA_SIZE 20
+typedef struct ___rep_lsn_hist_data_args {
+ u_int32_t envid;
+ DB_LSN lsn;
+ u_int32_t hist_sec;
+ u_int32_t hist_nsec;
+} __rep_lsn_hist_data_args;
+
+#define __REP_MAXMSG_SIZE 40
+#endif
diff --git a/src/dbinc_auto/rep_ext.h b/src/dbinc_auto/rep_ext.h
new file mode 100644
index 00000000..89bdc797
--- /dev/null
+++ b/src/dbinc_auto/rep_ext.h
@@ -0,0 +1,151 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _rep_ext_h_
+#define _rep_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __rep_bulk_marshal __P((ENV *, __rep_bulk_args *, u_int8_t *, size_t, size_t *));
+int __rep_bulk_unmarshal __P((ENV *, __rep_bulk_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_control_marshal __P((ENV *, __rep_control_args *, u_int8_t *, size_t, size_t *));
+int __rep_control_unmarshal __P((ENV *, __rep_control_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_egen_marshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, size_t *));
+int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_fileinfo_marshal __P((ENV *, u_int32_t, __rep_fileinfo_args *, u_int8_t *, size_t, size_t *));
+int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *));
+int __rep_fileinfo_v6_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_grant_info_marshal __P((ENV *, __rep_grant_info_args *, u_int8_t *, size_t, size_t *));
+int __rep_grant_info_unmarshal __P((ENV *, __rep_grant_info_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_logreq_marshal __P((ENV *, __rep_logreq_args *, u_int8_t *, size_t, size_t *));
+int __rep_logreq_unmarshal __P((ENV *, __rep_logreq_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_newfile_marshal __P((ENV *, __rep_newfile_args *, u_int8_t *, size_t, size_t *));
+int __rep_newfile_unmarshal __P((ENV *, __rep_newfile_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_update_marshal __P((ENV *, u_int32_t, __rep_update_args *, u_int8_t *, size_t, size_t *));
+int __rep_update_unmarshal __P((ENV *, u_int32_t, __rep_update_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_vote_info_marshal __P((ENV *, __rep_vote_info_args *, u_int8_t *, size_t, size_t *));
+int __rep_vote_info_unmarshal __P((ENV *, __rep_vote_info_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_vote_info_v5_marshal __P((ENV *, __rep_vote_info_v5_args *, u_int8_t *, size_t, size_t *));
+int __rep_vote_info_v5_unmarshal __P((ENV *, __rep_vote_info_v5_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_lsn_hist_key_marshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t *));
+int __rep_lsn_hist_key_unmarshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_lsn_hist_data_marshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *));
+int __rep_lsn_hist_data_unmarshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_update_req __P((ENV *, __rep_control_args *));
+int __rep_page_req __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_update_setup __P((ENV *, int, __rep_control_args *, DBT *, time_t, DB_LSN *));
+int __rep_bulk_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_init_cleanup __P((ENV *, REP *, int));
+int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *, u_int32_t));
+int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *, __rep_fileinfo_args **));
+int __rep_remove_init_file __P((ENV *));
+int __rep_reset_init __P((ENV *));
+int __rep_elect_pp __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __rep_elect_int __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_update_grant __P((ENV *, db_timespec *));
+int __rep_islease_granted __P((ENV *));
+int __rep_lease_table_alloc __P((ENV *, u_int32_t));
+int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_lease_check __P((ENV *, int));
+int __rep_lease_refresh __P((ENV *));
+int __rep_lease_expire __P((ENV *));
+db_timeout_t __rep_lease_waittime __P((ENV *));
+int __rep_allreq __P((ENV *, __rep_control_args *, int));
+int __rep_log __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, int, time_t, DB_LSN *));
+int __rep_bulk_log __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, time_t, DB_LSN *));
+int __rep_logreq __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_loggap_req __P((ENV *, REP *, DB_LSN *, u_int32_t));
+int __rep_logready __P((ENV *, REP *, time_t, DB_LSN *));
+int __rep_env_create __P((DB_ENV *));
+void __rep_env_destroy __P((DB_ENV *));
+int __rep_get_config __P((DB_ENV *, u_int32_t, int *));
+int __rep_set_config __P((DB_ENV *, u_int32_t, int));
+int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t));
+int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+int __rep_open_sysdb __P((ENV *, DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, DB **));
+int __rep_client_dbinit __P((ENV *, int, repdb_t));
+int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t));
+int __rep_set_nsites_pp __P((DB_ENV *, u_int32_t));
+int __rep_set_nsites_int __P((ENV *, u_int32_t));
+int __rep_get_nsites __P((DB_ENV *, u_int32_t *));
+int __rep_set_priority __P((DB_ENV *, u_int32_t));
+int __rep_get_priority __P((DB_ENV *, u_int32_t *));
+int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *));
+int __rep_get_request __P((DB_ENV *, db_timeout_t *, db_timeout_t *));
+int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t));
+int __rep_set_transport_pp __P((DB_ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+int __rep_set_transport_int __P((ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t));
+int __rep_flush __P((DB_ENV *));
+int __rep_sync __P((DB_ENV *, u_int32_t));
+int __rep_txn_applied __P((ENV *, DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+int __rep_process_message_pp __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+int __rep_process_message_int __P((ENV *, DBT *, DBT *, int, DB_LSN *));
+int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, DB_LSN *, int *, DB_LSN *));
+int __rep_process_txn __P((ENV *, DBT *));
+int __rep_resend_req __P((ENV *, int));
+int __rep_check_doreq __P((ENV *, REP *));
+int __rep_check_missing __P((ENV *, u_int32_t, DB_LSN *));
+int __rep_open __P((ENV *));
+int __rep_close_diagfiles __P((ENV *));
+int __rep_env_refresh __P((ENV *));
+int __rep_env_close __P((ENV *));
+int __rep_preclose __P((ENV *));
+int __rep_closefiles __P((ENV *));
+int __rep_write_egen __P((ENV *, REP *, u_int32_t));
+int __rep_write_gen __P((ENV *, REP *, u_int32_t));
+int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+int __rep_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __rep_stat_print __P((ENV *, u_int32_t));
+int __rep_bulk_message __P((ENV *, REP_BULK *, REP_THROTTLE *, DB_LSN *, const DBT *, u_int32_t));
+int __rep_send_bulk __P((ENV *, REP_BULK *, u_int32_t));
+int __rep_bulk_alloc __P((ENV *, REP_BULK *, int, uintptr_t *, u_int32_t *, u_int32_t));
+int __rep_bulk_free __P((ENV *, REP_BULK *, u_int32_t));
+int __rep_send_message __P((ENV *, int, u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t));
+int __rep_new_master __P((ENV *, __rep_control_args *, int));
+void __rep_elect_done __P((ENV *, REP *));
+int __env_rep_enter __P((ENV *, int));
+int __env_db_rep_exit __P((ENV *));
+int __db_rep_enter __P((DB *, int, int, int));
+int __op_handle_enter __P((ENV *));
+int __op_rep_enter __P((ENV *, int, int));
+int __op_rep_exit __P((ENV *));
+int __archive_rep_enter __P((ENV *));
+int __archive_rep_exit __P((ENV *));
+int __rep_lockout_archive __P((ENV *, REP *));
+int __rep_lockout_api __P((ENV *, REP *));
+int __rep_take_apilockout __P((ENV *));
+int __rep_clear_apilockout __P((ENV *));
+int __rep_lockout_apply __P((ENV *, REP *, u_int32_t));
+int __rep_lockout_msg __P((ENV *, REP *, u_int32_t));
+int __rep_send_throttle __P((ENV *, int, REP_THROTTLE *, u_int32_t, u_int32_t));
+u_int32_t __rep_msg_to_old __P((u_int32_t, u_int32_t));
+u_int32_t __rep_msg_from_old __P((u_int32_t, u_int32_t));
+int __rep_print_system __P((ENV *, u_int32_t, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+int __rep_print __P((ENV *, u_int32_t, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __rep_print_message __P((ENV *, int, __rep_control_args *, char *, u_int32_t));
+void __rep_fire_event __P((ENV *, u_int32_t, void *));
+void __rep_msg __P((const ENV *, const char *));
+int __rep_notify_threads __P((ENV *, rep_waitreason_t));
+int __rep_check_goal __P((ENV *, struct rep_waitgoal *));
+int __rep_log_backup __P((ENV *, DB_LOGC *, DB_LSN *, u_int32_t));
+int __rep_get_maxpermlsn __P((ENV *, DB_LSN *));
+int __rep_is_internal_rep_file __P((char *));
+int __rep_get_datagen __P((ENV *, u_int32_t *));
+int __rep_verify __P((ENV *, __rep_control_args *, DBT *, int, time_t));
+int __rep_verify_fail __P((ENV *, __rep_control_args *));
+int __rep_verify_req __P((ENV *, __rep_control_args *, int));
+int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
+int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_rep_ext_h_ */
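The marshal/unmarshal pairs above produce and consume the fixed-size
wire images sized by the __REP_*_SIZE constants in rep_automsg.h. A
hedged round-trip sketch for the smallest record, assuming a valid ENV:

static int
egen_roundtrip_sketch(ENV *env)
{
	__rep_egen_args out, in;
	u_int8_t buf[__REP_EGEN_SIZE], *next;
	size_t len;
	int ret;

	out.egen = 42;
	if ((ret = __rep_egen_marshal(env, &out, buf, sizeof(buf), &len)) != 0)
		return (ret);
	/* On success in.egen == 42; next points past the bytes consumed. */
	return (__rep_egen_unmarshal(env, &in, buf, len, &next));
}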
diff --git a/src/dbinc_auto/repmgr_auto.h b/src/dbinc_auto/repmgr_auto.h
new file mode 100644
index 00000000..5e9f386d
--- /dev/null
+++ b/src/dbinc_auto/repmgr_auto.h
@@ -0,0 +1,41 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __repmgr_AUTO_H
+#define __repmgr_AUTO_H
+#ifdef HAVE_REPLICATION_THREADS
+#include "dbinc/log.h"
+#define DB___repmgr_member 200
+typedef struct ___repmgr_member_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t version;
+ u_int32_t prev_status;
+ u_int32_t status;
+ DBT host;
+ u_int32_t port;
+} __repmgr_member_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __repmgr_member_desc[];
+static inline int
+__repmgr_member_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t version, u_int32_t prev_status, u_int32_t status, const DBT *host, u_int32_t port)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___repmgr_member, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(host) + sizeof(u_int32_t),
+ __repmgr_member_desc,
+ version, prev_status, status, host, port));
+}
+
+static inline int __repmgr_member_read(ENV *env,
+ void *data, __repmgr_member_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __repmgr_member_desc, sizeof(__repmgr_member_args), (void**)arg));
+}
+#endif /* HAVE_REPLICATION_THREADS */
+#endif
diff --git a/src/dbinc_auto/repmgr_automsg.h b/src/dbinc_auto/repmgr_automsg.h
new file mode 100644
index 00000000..1b2b928c
--- /dev/null
+++ b/src/dbinc_auto/repmgr_automsg.h
@@ -0,0 +1,118 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#ifndef __repmgr_AUTOMSG_H
+#define __repmgr_AUTOMSG_H
+
+/*
+ * Message sizes are simply the sum of field sizes (not
+ * counting variable size parts, when DBTs are present),
+ * and may be different from struct sizes due to padding.
+ */
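+/*
+ * For example, __REPMGR_MSG_HDR_SIZE below is 9: one u_int8_t type byte
+ * plus two u_int32_t words.  Each DBT field again counts only for its
+ * 4-byte length word, so __REPMGR_MEMBERSHIP_KEY_SIZE is 4 + 2.
+ */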
+#define __REPMGR_HANDSHAKE_SIZE 12
+typedef struct ___repmgr_handshake_args {
+ u_int16_t port;
+ u_int16_t alignment;
+ u_int32_t ack_policy;
+ u_int32_t flags;
+} __repmgr_handshake_args;
+
+#define __REPMGR_V3HANDSHAKE_SIZE 10
+typedef struct ___repmgr_v3handshake_args {
+ u_int16_t port;
+ u_int32_t priority;
+ u_int32_t flags;
+} __repmgr_v3handshake_args;
+
+#define __REPMGR_V2HANDSHAKE_SIZE 6
+typedef struct ___repmgr_v2handshake_args {
+ u_int16_t port;
+ u_int32_t priority;
+} __repmgr_v2handshake_args;
+
+#define __REPMGR_PARM_REFRESH_SIZE 8
+typedef struct ___repmgr_parm_refresh_args {
+ u_int32_t ack_policy;
+ u_int32_t flags;
+} __repmgr_parm_refresh_args;
+
+#define __REPMGR_PERMLSN_SIZE 12
+typedef struct ___repmgr_permlsn_args {
+ u_int32_t generation;
+ DB_LSN lsn;
+} __repmgr_permlsn_args;
+
+#define __REPMGR_VERSION_PROPOSAL_SIZE 8
+typedef struct ___repmgr_version_proposal_args {
+ u_int32_t min;
+ u_int32_t max;
+} __repmgr_version_proposal_args;
+
+#define __REPMGR_VERSION_CONFIRMATION_SIZE 4
+typedef struct ___repmgr_version_confirmation_args {
+ u_int32_t version;
+} __repmgr_version_confirmation_args;
+
+#define __REPMGR_MSG_HDR_SIZE 9
+typedef struct ___repmgr_msg_hdr_args {
+ u_int8_t type;
+ u_int32_t word1;
+ u_int32_t word2;
+} __repmgr_msg_hdr_args;
+
+#define __REPMGR_MSG_METADATA_SIZE 12
+typedef struct ___repmgr_msg_metadata_args {
+ u_int32_t tag;
+ u_int32_t limit;
+ u_int32_t flags;
+} __repmgr_msg_metadata_args;
+
+#define __REPMGR_MEMBERSHIP_KEY_SIZE 6
+typedef struct ___repmgr_membership_key_args {
+ DBT host;
+ u_int16_t port;
+} __repmgr_membership_key_args;
+
+#define __REPMGR_MEMBERSHIP_DATA_SIZE 4
+typedef struct ___repmgr_membership_data_args {
+ u_int32_t flags;
+} __repmgr_membership_data_args;
+
+#define __REPMGR_MEMBER_METADATA_SIZE 8
+typedef struct ___repmgr_member_metadata_args {
+ u_int32_t format;
+ u_int32_t version;
+} __repmgr_member_metadata_args;
+
+#define __REPMGR_GM_FWD_SIZE 10
+typedef struct ___repmgr_gm_fwd_args {
+ DBT host;
+ u_int16_t port;
+ u_int32_t gen;
+} __repmgr_gm_fwd_args;
+
+#define __REPMGR_MEMBR_VERS_SIZE 8
+typedef struct ___repmgr_membr_vers_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __repmgr_membr_vers_args;
+
+#define __REPMGR_SITE_INFO_SIZE 10
+typedef struct ___repmgr_site_info_args {
+ DBT host;
+ u_int16_t port;
+ u_int32_t flags;
+} __repmgr_site_info_args;
+
+#define __REPMGR_CONNECT_REJECT_SIZE 8
+typedef struct ___repmgr_connect_reject_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __repmgr_connect_reject_args;
+
+#define __REPMGR_MAXMSG_SIZE 12
+#endif
diff --git a/src/dbinc_auto/repmgr_ext.h b/src/dbinc_auto/repmgr_ext.h
new file mode 100644
index 00000000..b1237950
--- /dev/null
+++ b/src/dbinc_auto/repmgr_ext.h
@@ -0,0 +1,223 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _repmgr_ext_h_
+#define _repmgr_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+void __repmgr_handshake_marshal __P((ENV *, __repmgr_handshake_args *, u_int8_t *));
+int __repmgr_handshake_unmarshal __P((ENV *, __repmgr_handshake_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_v3handshake_marshal __P((ENV *, __repmgr_v3handshake_args *, u_int8_t *));
+int __repmgr_v3handshake_unmarshal __P((ENV *, __repmgr_v3handshake_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_v2handshake_marshal __P((ENV *, __repmgr_v2handshake_args *, u_int8_t *));
+int __repmgr_v2handshake_unmarshal __P((ENV *, __repmgr_v2handshake_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_parm_refresh_marshal __P((ENV *, __repmgr_parm_refresh_args *, u_int8_t *));
+int __repmgr_parm_refresh_unmarshal __P((ENV *, __repmgr_parm_refresh_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_permlsn_marshal __P((ENV *, __repmgr_permlsn_args *, u_int8_t *));
+int __repmgr_permlsn_unmarshal __P((ENV *, __repmgr_permlsn_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_version_proposal_marshal __P((ENV *, __repmgr_version_proposal_args *, u_int8_t *));
+int __repmgr_version_proposal_unmarshal __P((ENV *, __repmgr_version_proposal_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_version_confirmation_marshal __P((ENV *, __repmgr_version_confirmation_args *, u_int8_t *));
+int __repmgr_version_confirmation_unmarshal __P((ENV *, __repmgr_version_confirmation_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_msg_hdr_marshal __P((ENV *, __repmgr_msg_hdr_args *, u_int8_t *));
+int __repmgr_msg_hdr_unmarshal __P((ENV *, __repmgr_msg_hdr_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_msg_metadata_marshal __P((ENV *, __repmgr_msg_metadata_args *, u_int8_t *));
+int __repmgr_msg_metadata_unmarshal __P((ENV *, __repmgr_msg_metadata_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_membership_key_marshal __P((ENV *, __repmgr_membership_key_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_membership_key_unmarshal __P((ENV *, __repmgr_membership_key_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_membership_data_marshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *));
+int __repmgr_membership_data_unmarshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_member_metadata_marshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *));
+int __repmgr_member_metadata_unmarshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_gm_fwd_marshal __P((ENV *, __repmgr_gm_fwd_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_gm_fwd_unmarshal __P((ENV *, __repmgr_gm_fwd_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_membr_vers_marshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_t *));
+int __repmgr_membr_vers_unmarshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_site_info_marshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_site_info_unmarshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_connect_reject_marshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *));
+int __repmgr_connect_reject_unmarshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_member_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __repmgr_init_print __P((ENV *, DB_DISTAB *));
+int __repmgr_init_election __P((ENV *, u_int32_t));
+int __repmgr_claim_victory __P((ENV *));
+int __repmgr_turn_on_elections __P((ENV *));
+int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_valid_config __P((ENV *, u_int32_t));
+int __repmgr_autostart __P((ENV *));
+int __repmgr_start_selector __P((ENV *));
+int __repmgr_close __P((ENV *));
+int __repmgr_stop __P((ENV *));
+int __repmgr_set_ack_policy __P((DB_ENV *, int));
+int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+int __repmgr_env_create __P((ENV *, DB_REP *));
+void __repmgr_env_destroy __P((ENV *, DB_REP *));
+int __repmgr_stop_threads __P((ENV *));
+int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+int __repmgr_set_msg_dispatch __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t));
+int __repmgr_send_msg __P((DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+int __repmgr_send_request __P((DB_CHANNEL *, DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+int __repmgr_send_response __P((DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+int __repmgr_channel_close __P((DB_CHANNEL *, u_int32_t));
+int __repmgr_channel_timeout __P((DB_CHANNEL *, db_timeout_t));
+int __repmgr_send_request_inval __P((DB_CHANNEL *, DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+int __repmgr_channel_close_inval __P((DB_CHANNEL *, u_int32_t));
+int __repmgr_channel_timeout_inval __P((DB_CHANNEL *, db_timeout_t));
+int __repmgr_join_group __P((ENV *));
+int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+int __repmgr_get_site_address __P((DB_SITE *, const char **, u_int *));
+int __repmgr_get_eid __P((DB_SITE *, int *));
+int __repmgr_get_config __P((DB_SITE *, u_int32_t, u_int32_t *));
+int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t));
+int __repmgr_site_close __P((DB_SITE *));
+void *__repmgr_msg_thread __P((void *));
+int __repmgr_send_err_resp __P((ENV *, CHANNEL *, int));
+int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+int __repmgr_update_membership __P((ENV *, DB_THREAD_INFO *, int, u_int32_t));
+int __repmgr_set_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+int __repmgr_setup_gmdb_op __P((ENV *, DB_THREAD_INFO *, DB_TXN **, u_int32_t));
+int __repmgr_cleanup_gmdb_op __P((ENV *, int));
+int __repmgr_hold_master_role __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_rlse_master_role __P((ENV *));
+void __repmgr_set_sites __P((ENV *));
+int __repmgr_connect __P((ENV *, repmgr_netaddr_t *, REPMGR_CONNECTION **, int *));
+int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+int __repmgr_sync_siteaddr __P((ENV *));
+int __repmgr_send_broadcast __P((ENV *, u_int, const DBT *, const DBT *, u_int *, u_int *, int *));
+int __repmgr_send_one __P((ENV *, REPMGR_CONNECTION *, u_int, const DBT *, const DBT *, db_timeout_t));
+int __repmgr_send_many __P((ENV *, REPMGR_CONNECTION *, REPMGR_IOVECS *, db_timeout_t));
+int __repmgr_send_own_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
+int __repmgr_write_iovecs __P((ENV *, REPMGR_CONNECTION *, REPMGR_IOVECS *, size_t *));
+int __repmgr_bust_connection __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_disable_connection __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_cleanup_defunct __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_close_connection __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_decr_conn_ref __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_destroy_conn __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_pack_netaddr __P((ENV *, const char *, u_int, repmgr_netaddr_t *));
+int __repmgr_getaddr __P((ENV *, const char *, u_int, int, ADDRINFO **));
+int __repmgr_listen __P((ENV *));
+int __repmgr_net_close __P((ENV *));
+void __repmgr_net_destroy __P((ENV *, DB_REP *));
+int __repmgr_thread_start __P((ENV *, REPMGR_RUNNABLE *));
+int __repmgr_thread_join __P((REPMGR_RUNNABLE *));
+int __repmgr_set_nonblock_conn __P((REPMGR_CONNECTION *));
+int __repmgr_set_nonblocking __P((socket_t));
+int __repmgr_wake_waiters __P((ENV *, waiter_t *));
+int __repmgr_await_cond __P((ENV *, PREDICATE, void *, db_timeout_t, waiter_t *));
+int __repmgr_await_gmdbop __P((ENV *));
+void __repmgr_compute_wait_deadline __P((ENV *, struct timespec *, db_timeout_t));
+int __repmgr_await_drain __P((ENV *, REPMGR_CONNECTION *, db_timeout_t));
+int __repmgr_alloc_cond __P((cond_var_t *));
+int __repmgr_free_cond __P((cond_var_t *));
+void __repmgr_env_create_pf __P((DB_REP *));
+int __repmgr_create_mutex_pf __P((mgr_mutex_t *));
+int __repmgr_destroy_mutex_pf __P((mgr_mutex_t *));
+int __repmgr_init __P((ENV *));
+int __repmgr_deinit __P((ENV *));
+int __repmgr_init_waiters __P((ENV *, waiter_t *));
+int __repmgr_destroy_waiters __P((ENV *, waiter_t *));
+int __repmgr_lock_mutex __P((mgr_mutex_t *));
+int __repmgr_unlock_mutex __P((mgr_mutex_t *));
+int __repmgr_signal __P((cond_var_t *));
+int __repmgr_wake_msngers __P((ENV *, u_int));
+int __repmgr_wake_main_thread __P((ENV *));
+int __repmgr_writev __P((socket_t, db_iovec_t *, int, size_t *));
+int __repmgr_readv __P((socket_t, db_iovec_t *, int, size_t *));
+int __repmgr_select_loop __P((ENV *));
+int __repmgr_queue_destroy __P((ENV *));
+int __repmgr_queue_get __P((ENV *, REPMGR_MESSAGE **, REPMGR_RUNNABLE *));
+int __repmgr_queue_put __P((ENV *, REPMGR_MESSAGE *));
+int __repmgr_queue_size __P((ENV *));
+int __repmgr_member_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+void *__repmgr_select_thread __P((void *));
+int __repmgr_bow_out __P((ENV *));
+int __repmgr_accept __P((ENV *));
+int __repmgr_compute_timeout __P((ENV *, db_timespec *));
+REPMGR_SITE *__repmgr_connected_master __P((ENV *));
+int __repmgr_check_timeouts __P((ENV *));
+int __repmgr_first_try_connections __P((ENV *));
+int __repmgr_send_v1_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t));
+int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_read_conn __P((REPMGR_CONNECTION *));
+int __repmgr_prepare_simple_input __P((ENV *, REPMGR_CONNECTION *, __repmgr_msg_hdr_args *));
+int __repmgr_send_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t, u_int32_t));
+int __repmgr_find_version_info __P((ENV *, REPMGR_CONNECTION *, DBT *));
+int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __repmgr_stat_print __P((ENV *, u_int32_t));
+int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_close __P((ENV *));
+int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+int __repmgr_set_ack_policy __P((DB_ENV *, int));
+int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+int __repmgr_set_msg_dispatch __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t));
+int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+#endif
+int __repmgr_schedule_connection_attempt __P((ENV *, int, int));
+int __repmgr_is_server __P((ENV *, REPMGR_SITE *));
+void __repmgr_reset_for_reading __P((REPMGR_CONNECTION *));
+int __repmgr_new_connection __P((ENV *, REPMGR_CONNECTION **, socket_t, int));
+int __repmgr_set_keepalive __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_new_site __P((ENV *, REPMGR_SITE **, const char *, u_int));
+int __repmgr_create_mutex __P((ENV *, mgr_mutex_t **));
+int __repmgr_destroy_mutex __P((ENV *, mgr_mutex_t *));
+void __repmgr_cleanup_netaddr __P((ENV *, repmgr_netaddr_t *));
+void __repmgr_iovec_init __P((REPMGR_IOVECS *));
+void __repmgr_add_buffer __P((REPMGR_IOVECS *, void *, size_t));
+void __repmgr_add_dbt __P((REPMGR_IOVECS *, const DBT *));
+int __repmgr_update_consumed __P((REPMGR_IOVECS *, size_t));
+int __repmgr_prepare_my_addr __P((ENV *, DBT *));
+int __repmgr_get_nsites __P((ENV *, u_int32_t *));
+int __repmgr_thread_failure __P((ENV *, int));
+char *__repmgr_format_eid_loc __P((DB_REP *, REPMGR_CONNECTION *, char *));
+char *__repmgr_format_site_loc __P((REPMGR_SITE *, char *));
+char *__repmgr_format_addr_loc __P((repmgr_netaddr_t *, char *));
+int __repmgr_repstart __P((ENV *, u_int32_t));
+int __repmgr_become_master __P((ENV *));
+int __repmgr_each_connection __P((ENV *, CONNECTION_ACTION, void *, int));
+int __repmgr_open __P((ENV *, void *));
+int __repmgr_join __P((ENV *, void *));
+int __repmgr_env_refresh __P((ENV *));
+int __repmgr_share_netaddrs __P((ENV *, void *, u_int, u_int));
+int __repmgr_copy_in_added_sites __P((ENV *));
+int __repmgr_init_new_sites __P((ENV *, int, int));
+int __repmgr_failchk __P((ENV *));
+int __repmgr_master_is_known __P((ENV *));
+int __repmgr_stable_lsn __P((ENV *, DB_LSN *));
+int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
+int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *));
+int __repmgr_refresh_membership __P((ENV *, u_int8_t *, size_t));
+int __repmgr_reload_gmdb __P((ENV *));
+int __repmgr_gmdb_version_cmp __P((ENV *, u_int32_t, u_int32_t));
+int __repmgr_init_save __P((ENV *, DBT *));
+int __repmgr_init_restore __P((ENV *, DBT *));
+int __repmgr_defer_op __P((ENV *, u_int32_t));
+void __repmgr_fire_conn_err_event __P((ENV *, REPMGR_CONNECTION *, int));
+void __repmgr_print_conn_err __P((ENV *, repmgr_netaddr_t *, int));
+int __repmgr_become_client __P((ENV *));
+REPMGR_SITE *__repmgr_lookup_site __P((ENV *, const char *, u_int));
+int __repmgr_find_site __P((ENV *, const char *, u_int, int *));
+int __repmgr_set_membership __P((ENV *, const char *, u_int, u_int32_t));
+int __repmgr_bcast_parm_refresh __P((ENV *));
+int __repmgr_chg_prio __P((ENV *, u_int32_t, u_int32_t));
+int __repmgr_bcast_own_msg __P((ENV *, u_int32_t, u_int8_t *, size_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_repmgr_ext_h_ */
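The #ifndef HAVE_REPLICATION_THREADS block above redeclares a subset of
the same names because builds without the Replication Manager link stub
versions that fail cleanly at run time. A hedged sketch of that stub
pattern (the error text is illustrative, not the library's):

int
__repmgr_start(DB_ENV *dbenv, int nthreads, u_int32_t flags)
{
	COMPQUIET(nthreads, 0);
	COMPQUIET(flags, 0);
	__db_errx(dbenv->env,
	    "library build did not include replication manager support");
	return (DB_OPNOTSUP);
}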
diff --git a/src/dbinc_auto/sequence_ext.h b/src/dbinc_auto/sequence_ext.h
new file mode 100644
index 00000000..a2c114cf
--- /dev/null
+++ b/src/dbinc_auto/sequence_ext.h
@@ -0,0 +1,16 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _sequence_ext_h_
+#define _sequence_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __seq_stat __P((DB_SEQUENCE *, DB_SEQUENCE_STAT **, u_int32_t));
+int __seq_stat_print __P((DB_SEQUENCE *, u_int32_t));
+const FN * __db_get_seq_flags_fn __P((void));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_sequence_ext_h_ */
diff --git a/src/dbinc_auto/tcl_ext.h b/src/dbinc_auto/tcl_ext.h
new file mode 100644
index 00000000..8b076c8b
--- /dev/null
+++ b/src/dbinc_auto/tcl_ext.h
@@ -0,0 +1,134 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _tcl_ext_h_
+#define _tcl_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int bdb_HCommand __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+#if DB_DBM_HSEARCH != 0
+int bdb_NdbmOpen __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DBM **));
+#endif
+#if DB_DBM_HSEARCH != 0
+int bdb_DbmCommand __P((Tcl_Interp *, int, Tcl_Obj * CONST*, int, DBM *));
+#endif
+int ndbm_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+void _DbInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
+int db_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_CompactStat __P((Tcl_Interp *, DBTCL_INFO *));
+int tcl_rep_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+int dbc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int env_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_EnvRemove __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_EnvClose __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_EnvIdReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvLsnReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvVerbose __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
+int tcl_EnvAttr __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvSetFlags __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
+int tcl_EnvTest __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvGetEncryptFlags __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+void tcl_EnvSetErrfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+void tcl_EnvSetMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+int tcl_EnvSetErrpfx __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+int tcl_EnvStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+DBTCL_INFO *_NewInfo __P((Tcl_Interp *, void *, char *, enum INFOTYPE));
+void *_NameToPtr __P((CONST char *));
+DBTCL_INFO *_PtrToInfo __P((CONST void *));
+DBTCL_INFO *_NameToInfo __P((CONST char *));
+void _SetInfoData __P((DBTCL_INFO *, void *));
+void _DeleteInfo __P((DBTCL_INFO *));
+int _SetListElem __P((Tcl_Interp *, Tcl_Obj *, void *, u_int32_t, void *, u_int32_t));
+int _SetListElemInt __P((Tcl_Interp *, Tcl_Obj *, void *, long));
+int _SetListElemWideInt __P((Tcl_Interp *, Tcl_Obj *, void *, int64_t));
+int _SetListRecnoElem __P((Tcl_Interp *, Tcl_Obj *, db_recno_t, u_char *, u_int32_t));
+int _SetListHeapElem __P((Tcl_Interp *, Tcl_Obj *, DB_HEAP_RID, u_char *, u_int32_t));
+int _Set3DBTList __P((Tcl_Interp *, Tcl_Obj *, DBT *, int, DBT *, int, DBT *));
+int _SetMultiList __P((Tcl_Interp *, Tcl_Obj *, DBT *, DBT*, DBTYPE, u_int32_t, DBC*));
+int _GetGlobPrefix __P((char *, char **));
+int _ReturnSetup __P((Tcl_Interp *, int, int, char *));
+int _ErrorSetup __P((Tcl_Interp *, int, char *));
+void _ErrorFunc __P((const DB_ENV *, CONST char *, const char *));
+#ifdef CONFIG_TEST
+void _EventFunc __P((DB_ENV *, u_int32_t, void *));
+#endif
+int _GetLsn __P((Tcl_Interp *, Tcl_Obj *, DB_LSN *));
+int _GetRid __P((Tcl_Interp *, Tcl_Obj *, DB_HEAP_RID *));
+int _GetUInt32 __P((Tcl_Interp *, Tcl_Obj *, u_int32_t *));
+Tcl_Obj *_GetFlagsList __P((Tcl_Interp *, u_int32_t, const FN *));
+void _debug_check __P((void));
+int _CopyObjBytes __P((Tcl_Interp *, Tcl_Obj *, void *, u_int32_t *, int *));
+int tcl_LockDetect __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockGet __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockTimeout __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockVec __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogArchive __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogCompare __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_LogFile __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogFlush __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogGet __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogPut __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int logc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_LogConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
+int tcl_LogGetConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+void _MpInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
+int tcl_MpSync __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MpTrickle __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_Mp __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_MpStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MpStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_Mutex __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutFree __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutGet __P((Tcl_Interp *, DB_ENV *, int));
+int tcl_MutLock __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutSet __P((Tcl_Interp *, Tcl_Obj *, DB_ENV *, int));
+int tcl_MutStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutUnlock __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_RepConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_RepGetTwo __P((Tcl_Interp *, DB_ENV *, int));
+int tcl_RepGetConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_RepGetTimeout __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_RepGetAckPolicy __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepGetLocalSite __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepElect __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepFlush __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepSync __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepLease __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepInmemFiles __P((Tcl_Interp *, DB_ENV *));
+int tcl_RepLimit __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepNSites __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepRequest __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepNoarchiveTimeout __P((Tcl_Interp *, DB_ENV *));
+int tcl_RepTransport __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *, DBTCL_INFO *));
+int tcl_RepStart __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepProcessMessage __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepStat __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_RepMgr __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepMgrSiteList __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepMgrStat __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepMgrStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_RepApplied __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int seq_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+void _TxnInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
+int tcl_TxnCheckpoint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_Txn __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_CDSGroup __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_TxnStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_TxnStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_TxnTimeout __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_TxnRecover __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int bdb_RandCommand __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_LockMutex __P((DB_ENV *, db_mutex_t));
+int tcl_UnlockMutex __P((DB_ENV *, db_mutex_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_tcl_ext_h_ */
diff --git a/src/dbinc_auto/txn_auto.h b/src/dbinc_auto/txn_auto.h
new file mode 100644
index 00000000..48cb066d
--- /dev/null
+++ b/src/dbinc_auto/txn_auto.h
@@ -0,0 +1,220 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __txn_AUTO_H
+#define __txn_AUTO_H
+#include "dbinc/log.h"
+#define DB___txn_regop_42 10
+typedef struct ___txn_regop_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t timestamp;
+ DBT locks;
+} __txn_regop_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_regop_42_desc[];
+static inline int __txn_regop_42_read(ENV *env,
+ void *data, __txn_regop_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_regop_42_desc, sizeof(__txn_regop_42_args), (void**)arg));
+}
+#define DB___txn_regop 10
+typedef struct ___txn_regop_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t timestamp;
+ u_int32_t envid;
+ DBT locks;
+} __txn_regop_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_regop_desc[];
+static inline int
+__txn_regop_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, int32_t timestamp, u_int32_t envid, const DBT *locks)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_regop, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(locks),
+ __txn_regop_desc,
+ opcode, timestamp, envid, locks));
+}
+
+static inline int __txn_regop_read(ENV *env,
+ void *data, __txn_regop_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_regop_desc, sizeof(__txn_regop_args), (void**)arg));
+}
+#define DB___txn_ckp_42 11
+typedef struct ___txn_ckp_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DB_LSN ckp_lsn;
+ DB_LSN last_ckp;
+ int32_t timestamp;
+ u_int32_t rep_gen;
+} __txn_ckp_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_ckp_42_desc[];
+static inline int __txn_ckp_42_read(ENV *env,
+ void *data, __txn_ckp_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_ckp_42_desc, sizeof(__txn_ckp_42_args), (void**)arg));
+}
+#define DB___txn_ckp 11
+typedef struct ___txn_ckp_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DB_LSN ckp_lsn;
+ DB_LSN last_ckp;
+ int32_t timestamp;
+ u_int32_t envid;
+ u_int32_t spare;
+} __txn_ckp_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_ckp_desc[];
+static inline int
+__txn_ckp_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ DB_LSN * ckp_lsn, DB_LSN * last_ckp, int32_t timestamp, u_int32_t envid, u_int32_t spare)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_ckp, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(*ckp_lsn) + sizeof(*last_ckp) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __txn_ckp_desc,
+ ckp_lsn, last_ckp, timestamp, envid, spare));
+}
+
+static inline int __txn_ckp_read(ENV *env,
+ void *data, __txn_ckp_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_ckp_desc, sizeof(__txn_ckp_args), (void**)arg));
+}
+#define DB___txn_child 12
+typedef struct ___txn_child_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t child;
+ DB_LSN c_lsn;
+} __txn_child_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_child_desc[];
+static inline int
+__txn_child_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t child, DB_LSN * c_lsn)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_child, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*c_lsn),
+ __txn_child_desc,
+ child, c_lsn));
+}
+
+static inline int __txn_child_read(ENV *env,
+ void *data, __txn_child_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_child_desc, sizeof(__txn_child_args), (void**)arg));
+}
+#define DB___txn_xa_regop_42 13
+typedef struct ___txn_xa_regop_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT xid;
+ int32_t formatID;
+ u_int32_t gtrid;
+ u_int32_t bqual;
+ DB_LSN begin_lsn;
+ DBT locks;
+} __txn_xa_regop_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_xa_regop_42_desc[];
+static inline int __txn_xa_regop_42_read(ENV *env,
+ void *data, __txn_xa_regop_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_xa_regop_42_desc, sizeof(__txn_xa_regop_42_args), (void**)arg));
+}
+#define DB___txn_prepare 13
+typedef struct ___txn_prepare_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT gid;
+ DB_LSN begin_lsn;
+ DBT locks;
+} __txn_prepare_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_prepare_desc[];
+static inline int
+__txn_prepare_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, const DBT *gid, DB_LSN * begin_lsn, const DBT *locks)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_prepare, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(gid) + sizeof(*begin_lsn) +
+ LOG_DBT_SIZE(locks),
+ __txn_prepare_desc,
+ opcode, gid, begin_lsn, locks));
+}
+
+static inline int __txn_prepare_read(ENV *env,
+ void *data, __txn_prepare_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_prepare_desc, sizeof(__txn_prepare_args), (void**)arg));
+}
+#define DB___txn_recycle 14
+typedef struct ___txn_recycle_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t min;
+ u_int32_t max;
+} __txn_recycle_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_recycle_desc[];
+static inline int
+__txn_recycle_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t min, u_int32_t max)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_recycle, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __txn_recycle_desc,
+ min, max));
+}
+
+static inline int __txn_recycle_read(ENV *env,
+ void *data, __txn_recycle_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_recycle_desc, sizeof(__txn_recycle_args), (void**)arg));
+}
+#endif
diff --git a/src/dbinc_auto/txn_ext.h b/src/dbinc_auto/txn_ext.h
new file mode 100644
index 00000000..7c21455f
--- /dev/null
+++ b/src/dbinc_auto/txn_ext.h
@@ -0,0 +1,93 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _txn_ext_h_
+#define _txn_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __txn_begin_pp __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t));
+int __txn_begin __P((ENV *, DB_THREAD_INFO *, DB_TXN *, DB_TXN **, u_int32_t));
+int __txn_recycle_id __P((ENV *, int));
+int __txn_continue __P((ENV *, DB_TXN *, TXN_DETAIL *, DB_THREAD_INFO *, int));
+int __txn_commit __P((DB_TXN *, u_int32_t));
+int __txn_abort __P((DB_TXN *));
+int __txn_discard_int __P((DB_TXN *, u_int32_t flags));
+int __txn_prepare __P((DB_TXN *, u_int8_t *));
+u_int32_t __txn_id __P((DB_TXN *));
+int __txn_get_name __P((DB_TXN *, const char **));
+int __txn_set_name __P((DB_TXN *, const char *));
+int __txn_get_priority __P((DB_TXN *, u_int32_t *));
+int __txn_set_priority __P((DB_TXN *, u_int32_t));
+int __txn_set_timeout __P((DB_TXN *, db_timeout_t, u_int32_t));
+int __txn_activekids __P((ENV *, u_int32_t, DB_TXN *));
+int __txn_force_abort __P((ENV *, u_int8_t *));
+int __txn_preclose __P((ENV *));
+int __txn_reset __P((ENV *));
+int __txn_applied_pp __P((DB_ENV *, DB_TXN_TOKEN *, db_timeout_t, u_int32_t));
+int __txn_init_recover __P((ENV *, DB_DISTAB *));
+int __txn_regop_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_child_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_xa_regop_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_prepare_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_recycle_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_init_print __P((ENV *, DB_DISTAB *));
+int __txn_checkpoint_pp __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __txn_checkpoint __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __txn_getactive __P((ENV *, DB_LSN *));
+int __txn_getckp __P((ENV *, DB_LSN *));
+int __txn_updateckp __P((ENV *, DB_LSN *));
+int __txn_failchk __P((ENV *));
+int __txn_env_create __P((DB_ENV *));
+void __txn_env_destroy __P((DB_ENV *));
+int __txn_get_tx_max __P((DB_ENV *, u_int32_t *));
+int __txn_set_tx_max __P((DB_ENV *, u_int32_t));
+int __txn_get_tx_timestamp __P((DB_ENV *, time_t *));
+int __txn_set_tx_timestamp __P((DB_ENV *, time_t *));
+int __txn_regop_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_prepare_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_child_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_prepare_args *));
+int __txn_recycle_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_recover_pp __P((DB_ENV *, DB_PREPLIST *, long, long *, u_int32_t));
+int __txn_recover __P((ENV *, DB_PREPLIST *, long, long *, u_int32_t));
+int __txn_get_prepared __P((ENV *, XID *, DB_PREPLIST *, long, long *, u_int32_t));
+int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int));
+int __txn_open __P((ENV *));
+int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *));
+int __txn_env_refresh __P((ENV *));
+u_int32_t __txn_region_mutex_count __P((ENV *));
+u_int32_t __txn_region_mutex_max __P((ENV *));
+size_t __txn_region_size __P((ENV *));
+size_t __txn_region_max __P((ENV *));
+int __txn_id_set __P((ENV *, u_int32_t, u_int32_t));
+int __txn_oldest_reader __P((ENV *, DB_LSN *));
+int __txn_add_buffer __P((ENV *, TXN_DETAIL *));
+int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t));
+int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
+int __txn_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __txn_stat_print __P((ENV *, u_int32_t));
+int __txn_closeevent __P((ENV *, DB_TXN *, DB *));
+int __txn_remevent __P((ENV *, DB_TXN *, const char *, u_int8_t *, int));
+void __txn_remrem __P((ENV *, DB_TXN *, const char *));
+int __txn_lockevent __P((ENV *, DB_TXN *, DB *, DB_LOCK *, DB_LOCKER *));
+void __txn_remlock __P((ENV *, DB_TXN *, DB_LOCK *, DB_LOCKER *));
+int __txn_doevents __P((ENV *, DB_TXN *, int, int));
+int __txn_record_fname __P((ENV *, DB_TXN *, FNAME *));
+int __txn_dref_fname __P((ENV *, DB_TXN *));
+void __txn_reset_fe_watermarks __P((DB_TXN *));
+void __txn_remove_fe_watermark __P((DB_TXN *,DB *));
+void __txn_add_fe_watermark __P((DB_TXN *, DB *, db_pgno_t));
+int __txn_flush_fe_files __P((DB_TXN *));
+int __txn_pg_above_fe_watermark __P((DB_TXN*, MPOOLFILE*, db_pgno_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_txn_ext_h_ */
diff --git a/src/dbinc_auto/xa_ext.h b/src/dbinc_auto/xa_ext.h
new file mode 100644
index 00000000..47a167f9
--- /dev/null
+++ b/src/dbinc_auto/xa_ext.h
@@ -0,0 +1,18 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _xa_ext_h_
+#define _xa_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __db_rmid_to_env __P((int, ENV **));
+int __db_xid_to_txn __P((ENV *, XID *, TXN_DETAIL **));
+void __db_map_rmid __P((int, ENV *));
+int __db_unmap_rmid __P((int));
+void __db_unmap_xid __P((ENV *, XID *, size_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_xa_ext_h_ */
diff --git a/src/dbreg/dbreg.c b/src/dbreg/dbreg.c
new file mode 100644
index 00000000..5067edac
--- /dev/null
+++ b/src/dbreg/dbreg.c
@@ -0,0 +1,1012 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+static int __dbreg_push_id __P((ENV *, int32_t));
+static int __dbreg_pop_id __P((ENV *, int32_t *));
+static int __dbreg_pluck_id __P((ENV *, int32_t));
+
+/*
+ * The dbreg subsystem, as its name implies, registers database handles so
+ * that we can associate log messages with them without logging a filename
+ * or a full, unique DB ID. Instead, we assign each dbp an int32_t which is
+ * easy and cheap to log, and use this subsystem to map back and forth.
+ *
+ * Overview of how dbreg ids are managed:
+ *
+ * OPEN
+ * dbreg_setup (Creates FNAME struct.)
+ * dbreg_new_id (Assigns new ID to dbp and logs it. May be postponed
+ * until we attempt to log something else using that dbp, if the dbp
+ * was opened on a replication client.)
+ *
+ * CLOSE
+ * dbreg_close_id (Logs closure of dbp/revocation of ID.)
+ * dbreg_revoke_id (As name implies, revokes ID.)
+ * dbreg_teardown (Destroys FNAME.)
+ *
+ * RECOVERY
+ * dbreg_setup
+ * dbreg_assign_id (Assigns a particular ID we have in the log to a dbp.)
+ *
+ * sometimes: dbreg_revoke_id; dbreg_teardown
+ * other times: normal close path
+ *
+ * A note about locking:
+ *
+ * FNAME structures are referenced only by their corresponding dbp's
+ * until they have a valid id.
+ *
+ * Once they have a valid id, they must get linked into the log
+ * region list so they can get logged on checkpoints.
+ *
+ * An FNAME that may/does have a valid id must be accessed under
+ * protection of the mtx_filelist, with the following exception:
+ *
+ * We don't want to have to grab the mtx_filelist on every log
+ * record, and it should be safe not to do so when we're just
+ * looking at the id, because once allocated, the id should
+ * not change under a handle until the handle is closed.
+ *
+ * If a handle is closed during an attempt by another thread to
+ * log with it, well, the application doing the close deserves to
+ * go down in flames and a lot else is about to fail anyway.
+ *
+ * When in the course of logging we encounter an invalid id
+ * and go to allocate it lazily, we *do* need to check again
+ * after grabbing the mutex, because it's possible to race with
+ * another thread that has also decided that it needs to allocate
+ * an id lazily.
+ *
+ * See SR #5623 for further discussion of the new dbreg design.
+ */
+
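+/*
+ * To make the lifecycle above concrete, here is an illustrative (not
+ * literal) call sequence for a logged handle, error handling omitted:
+ *
+ *    __dbreg_setup(dbp, "a.db", NULL, txnid);    create the FNAME
+ *    __dbreg_new_id(dbp, txn);                   assign and log an id
+ *    ... log records refer to the handle by fnp->id ...
+ *    __dbreg_close_id(dbp, txn, DBREG_CLOSE);    log the close, revoke
+ *                                                the id, tear down FNAME
+ */
+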
+/*
+ * __dbreg_setup --
+ * Allocate and initialize an FNAME structure. The FNAME structures
+ * live in the log shared region and map one-to-one with open database handles.
+ * When the handle needs to be logged, the FNAME should have a valid fid
+ * allocated. If the handle currently isn't logged, it still has an FNAME
+ * entry. If we later discover that the handle needs to be logged, we can
+ * allocate an id for it later. (This happens when the handle is on a
+ * replication client that later becomes a master.)
+ *
+ * PUBLIC: int __dbreg_setup __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__dbreg_setup(dbp, fname, dname, create_txnid)
+ DB *dbp;
+ const char *fname, *dname;
+ u_int32_t create_txnid;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+#ifdef HAVE_STATISTICS
+ LOG *lp;
+#endif
+ REGINFO *infop;
+ int ret;
+ size_t len;
+ void *p;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ infop = &dblp->reginfo;
+
+ fnp = NULL;
+ p = NULL;
+
+ /* Allocate an FNAME and, if necessary, a buffer for the name itself. */
+ LOG_SYSTEM_LOCK(env);
+ if ((ret = __env_alloc(infop, sizeof(FNAME), &fnp)) != 0)
+ goto err;
+
+#ifdef HAVE_STATISTICS
+ lp = dblp->reginfo.primary;
+ if (++lp->stat.st_nfileid > lp->stat.st_maxnfileid)
+ lp->stat.st_maxnfileid = lp->stat.st_nfileid;
+#endif
+
+ memset(fnp, 0, sizeof(FNAME));
+ if (fname == NULL)
+ fnp->fname_off = INVALID_ROFF;
+ else {
+ len = strlen(fname) + 1;
+ if ((ret = __env_alloc(infop, len, &p)) != 0)
+ goto err;
+ fnp->fname_off = R_OFFSET(infop, p);
+ memcpy(p, fname, len);
+ }
+ if (dname == NULL)
+ fnp->dname_off = INVALID_ROFF;
+ else {
+ len = strlen(dname) + 1;
+ if ((ret = __env_alloc(infop, len, &p)) != 0)
+ goto err;
+ fnp->dname_off = R_OFFSET(infop, p);
+ memcpy(p, dname, len);
+ }
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * Fill in all the remaining info that we'll need later to register
+ * the file, if we use it for logging.
+ */
+ fnp->id = fnp->old_id = DB_LOGFILEID_INVALID;
+ fnp->s_type = dbp->type;
+ memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
+ fnp->meta_pgno = dbp->meta_pgno;
+ fnp->create_txnid = create_txnid;
+ dbp->dbenv->thread_id(dbp->dbenv, &fnp->pid, NULL);
+
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ F_SET(fnp, DB_FNAME_INMEM);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ F_SET(fnp, DB_FNAME_RECOVER);
+ /*
+ * The DB is big-endian if its bytes are swapped XOR
+ * the machine is big-endian.
+ */
+ if ((F_ISSET(dbp, DB_AM_SWAP) != 0) ^
+ (F_ISSET(env, ENV_LITTLEENDIAN) == 0))
+ F_SET(fnp, DBREG_BIGEND);
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ F_SET(fnp, DBREG_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT))
+ F_SET(fnp, DBREG_ENCRYPT);
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ F_SET(fnp, DBREG_EXCL);
+ fnp->txn_ref = 1;
+ fnp->mutex = dbp->mutex;
+
+ dbp->log_filename = fnp;
+
+ return (0);
+
+err: LOG_SYSTEM_UNLOCK(env);
+ if (ret == ENOMEM)
+ __db_errx(env, DB_STR("1501",
+ "Logging region out of memory; you may need to increase its size"));
+
+ return (ret);
+}
+
+/*
+ * __dbreg_teardown --
+ * Destroy a DB handle's FNAME struct. This is only called when closing
+ * the DB.
+ *
+ * PUBLIC: int __dbreg_teardown __P((DB *));
+ */
+int
+__dbreg_teardown(dbp)
+ DB *dbp;
+{
+ int ret;
+
+ /*
+ * We may not have an FNAME if we were never opened. This is not an
+ * error.
+ */
+ if (dbp->log_filename == NULL)
+ return (0);
+
+ ret = __dbreg_teardown_int(dbp->env, dbp->log_filename);
+
+ /* We freed the copy of the mutex from the FNAME. */
+ dbp->log_filename = NULL;
+ dbp->mutex = MUTEX_INVALID;
+
+ return (ret);
+}
+
+/*
+ * __dbreg_teardown_int --
+ * Destroy an FNAME struct.
+ *
+ * PUBLIC: int __dbreg_teardown_int __P((ENV *, FNAME *));
+ */
+int
+__dbreg_teardown_int(env, fnp)
+ ENV *env;
+ FNAME *fnp;
+{
+ DB_LOG *dblp;
+#ifdef HAVE_STATISTICS
+ LOG *lp;
+#endif
+ REGINFO *infop;
+ int ret;
+
+ if (F_ISSET(fnp, DB_FNAME_NOTLOGGED))
+ return (0);
+ dblp = env->lg_handle;
+ infop = &dblp->reginfo;
+#ifdef HAVE_STATISTICS
+ lp = dblp->reginfo.primary;
+#endif
+
+ DB_ASSERT(env, fnp->id == DB_LOGFILEID_INVALID);
+ ret = __mutex_free(env, &fnp->mutex);
+
+ LOG_SYSTEM_LOCK(env);
+ if (fnp->fname_off != INVALID_ROFF)
+ __env_alloc_free(infop, R_ADDR(infop, fnp->fname_off));
+ if (fnp->dname_off != INVALID_ROFF)
+ __env_alloc_free(infop, R_ADDR(infop, fnp->dname_off));
+ __env_alloc_free(infop, fnp);
+ STAT(lp->stat.st_nfileid--);
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_new_id --
+ * Get an unused dbreg id for this database handle.
+ * Used as a wrapper to acquire the mutex and
+ * only set the id on success.
+ *
+ * PUBLIC: int __dbreg_new_id __P((DB *, DB_TXN *));
+ */
+int
+__dbreg_new_id(dbp, txn)
+ DB *dbp;
+ DB_TXN *txn;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /* The mtx_filelist protects the FNAME list and id management. */
+ MUTEX_LOCK(env, lp->mtx_filelist);
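+ /*
+ * Re-check under the mutex: another thread may have allocated an
+ * id for this handle while we waited (see the race discussion in
+ * the comment at the top of this file).
+ */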
+ if (fnp->id != DB_LOGFILEID_INVALID) {
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (0);
+ }
+ if ((ret = __dbreg_get_id(dbp, txn, &id)) == 0)
+ fnp->id = id;
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_get_id --
+ * Assign an unused dbreg id to this database handle.
+ * Assume the caller holds the mtx_filelist locked. Assume the
+ * caller will set the fnp->id field with the id we return.
+ *
+ * PUBLIC: int __dbreg_get_id __P((DB *, DB_TXN *, int32_t *));
+ */
+int
+__dbreg_get_id(dbp, txn, idp)
+ DB *dbp;
+ DB_TXN *txn;
+ int32_t *idp;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /*
+ * It's possible that after deciding we needed to call this function,
+ * someone else allocated an ID before we grabbed the lock. Check
+ * to make sure there was no race and we have something useful to do.
+ */
+ /* Get an unused ID from the free list. */
+ if ((ret = __dbreg_pop_id(env, &id)) != 0)
+ goto err;
+
+ /* If no ID was found, allocate a new one. */
+ if (id == DB_LOGFILEID_INVALID)
+ id = lp->fid_max++;
+
+ /* If the file is durable (i.e., not flagged not-durable), mark it as such. */
+ if (!F_ISSET(dbp, DB_AM_NOT_DURABLE))
+ F_SET(fnp, DB_FNAME_DURABLE);
+
+ /* Hook the FNAME into the list of open files. */
+ SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname);
+
+ /*
+ * Log the registry. We should only request a new ID in situations
+ * where logging is reasonable.
+ */
+ DB_ASSERT(env, !F_ISSET(dbp, DB_AM_RECOVER));
+
+ if ((ret = __dbreg_log_id(dbp, txn, id, 0)) != 0)
+ goto err;
+
+ /*
+ * Once we log the create_txnid, we need to make sure we never
+ * log it again (as might happen if this is a replication client
+ * that later upgrades to a master).
+ */
+ fnp->create_txnid = TXN_INVALID;
+
+ DB_ASSERT(env, dbp->type == fnp->s_type);
+ DB_ASSERT(env, dbp->meta_pgno == fnp->meta_pgno);
+
+ if ((ret = __dbreg_add_dbentry(env, dblp, dbp, id)) != 0)
+ goto err;
+ /*
+ * If we have a successful call, set the ID. Otherwise
+ * we have to revoke it and remove it from all the lists
+ * it has been added to, and return an invalid id.
+ */
+err:
+ if (ret != 0 && id != DB_LOGFILEID_INVALID) {
+ (void)__dbreg_revoke_id(dbp, 1, id);
+ id = DB_LOGFILEID_INVALID;
+ }
+ *idp = id;
+ return (ret);
+}
+
+/*
+ * __dbreg_assign_id --
+ * Assign a particular dbreg id to this database handle.
+ *
+ * PUBLIC: int __dbreg_assign_id __P((DB *, int32_t, int));
+ */
+int
+__dbreg_assign_id(dbp, id, deleted)
+ DB *dbp;
+ int32_t id;
+ int deleted;
+{
+ DB *close_dbp;
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *close_fnp, *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ close_dbp = NULL;
+ close_fnp = NULL;
+
+ /* The mtx_filelist protects the FNAME list and id management. */
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ /* We should only call this on DB handles that have no ID. */
+ DB_ASSERT(env, fnp->id == DB_LOGFILEID_INVALID);
+
+ /*
+ * Make sure there isn't already a file open with this ID. There can
+ * be in recovery, if we're recovering across a point where an ID got
+ * reused.
+ */
+ if (__dbreg_id_to_fname(dblp, id, 1, &close_fnp) == 0) {
+ /*
+ * We want to save off any dbp we have open with this id. We
+ * can't safely close it now, because we hold the mtx_filelist,
+ * but we should be able to rely on it being open in this
+ * process, and we're running recovery, so no other thread
+ * should muck with it if we just put off closing it until
+ * we're ready to return.
+ *
+ * Once we have the dbp, revoke its id; we're about to
+ * reuse it.
+ */
+ ret = __dbreg_id_to_db(env, NULL, &close_dbp, id, 0);
+ if (ret == ENOENT) {
+ ret = 0;
+ goto cont;
+ } else if (ret != 0)
+ goto err;
+
+ if ((ret = __dbreg_revoke_id(close_dbp, 1,
+ DB_LOGFILEID_INVALID)) != 0)
+ goto err;
+ }
+
+ /*
+ * Remove this ID from the free list, if it's there, and make sure
+ * we don't allocate it anew.
+ */
+cont: if ((ret = __dbreg_pluck_id(env, id)) != 0)
+ goto err;
+ if (id >= lp->fid_max)
+ lp->fid_max = id + 1;
+
+ /* Now go ahead and assign the id to our dbp. */
+ fnp->id = id;
+ /* If the file is durable (i.e., not flagged not-durable), mark it as such. */
+ if (!F_ISSET(dbp, DB_AM_NOT_DURABLE))
+ F_SET(fnp, DB_FNAME_DURABLE);
+ SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname);
+
+ /*
+ * If we get an error adding the dbentry, revoke the id.
+ * We void the return value since we want to retain and
+ * return the original error in ret anyway.
+ */
+ if ((ret = __dbreg_add_dbentry(env, dblp, dbp, id)) != 0)
+ (void)__dbreg_revoke_id(dbp, 1, id);
+ else
+ dblp->dbentry[id].deleted = deleted;
+
+err: MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ /* There's nothing useful that our caller can do if this close fails. */
+ if (close_dbp != NULL)
+ (void)__db_close(close_dbp, NULL, DB_NOSYNC);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_revoke_id --
+ * Take a log id away from a dbp, in preparation for closing it,
+ * but without logging the close.
+ *
+ * PUBLIC: int __dbreg_revoke_id __P((DB *, int, int32_t));
+ */
+int
+__dbreg_revoke_id(dbp, have_lock, force_id)
+ DB *dbp;
+ int have_lock;
+ int32_t force_id;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int push;
+
+ env = dbp->env;
+
+ /*
+ * If we are not in recovery but the file was opened for a recovery
+ * operation, then this process aborted a transaction for another
+ * process and the id may still be in use, so don't reuse this id.
+ * If our fid generation in replication has changed, this fid
+ * should not be reused
+ */
+ db_rep = env->rep_handle;
+ push = (!F_ISSET(dbp, DB_AM_RECOVER) || IS_RECOVERING(env)) &&
+ (!REP_ON(env) || ((REP *)db_rep->region)->gen == dbp->fid_gen);
+
+ return (__dbreg_revoke_id_int(dbp->env,
+ dbp->log_filename, have_lock, push, force_id));
+}
+
+/*
+ * __dbreg_revoke_id_int --
+ * Revoke a log, in preparation for closing it, but without logging
+ * the close.
+ *
+ * PUBLIC: int __dbreg_revoke_id_int
+ * PUBLIC: __P((ENV *, FNAME *, int, int, int32_t));
+ */
+int
+__dbreg_revoke_id_int(env, fnp, have_lock, push, force_id)
+ ENV *env;
+ FNAME *fnp;
+ int have_lock, push;
+ int32_t force_id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+ /* If we lack an ID, this is a no-op. */
+ if (fnp == NULL)
+ return (0);
+
+ /*
+ * If we have a force_id, we had an error after allocating
+ * the id, and putting it on the fq list, but before we
+ * finished setting up fnp. So, if we have a force_id use it.
+ */
+ if (force_id != DB_LOGFILEID_INVALID)
+ id = force_id;
+ else if (fnp->id == DB_LOGFILEID_INVALID) {
+ if (fnp->old_id == DB_LOGFILEID_INVALID)
+ return (0);
+ id = fnp->old_id;
+ } else
+ id = fnp->id;
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ fnp->id = DB_LOGFILEID_INVALID;
+ fnp->old_id = DB_LOGFILEID_INVALID;
+
+ /* Remove the FNAME from the list of open files. */
+ SH_TAILQ_REMOVE(&lp->fq, fnp, q, __fname);
+
+ /*
+ * This FNAME may be for a DBP which is already closed. Its ID may
+ * still be in use by an aborting transaction. If not,
+ * remove this id from the dbentry table and push it onto the
+ * free list.
+ */
+ if ((ret = __dbreg_rem_dbentry(dblp, id)) == 0 && push)
+ ret = __dbreg_push_id(env, id);
+
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_close_id --
+ * Take a dbreg id away from a dbp that we're closing, and log
+ * the unregistry if the refcount goes to 0.
+ *
+ * PUBLIC: int __dbreg_close_id __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__dbreg_close_id(dbp, txn, op)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t op;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret, t_ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /* If we lack an ID, this is a no-op. */
+ if (fnp == NULL)
+ return (0);
+
+ if (fnp->id == DB_LOGFILEID_INVALID) {
+ ret = __dbreg_revoke_id(dbp, 0, DB_LOGFILEID_INVALID);
+ goto done;
+ }
+
+ /*
+ * If we are the last reference to this db then we need to log it
+ * as closed. Otherwise the last transaction will do the logging.
+ * Remove the DBP from the db entry table since it can no longer
+ * be used. If we abort it will have to be reopened.
+ */
+ ret = 0;
+ DB_ASSERT(env, fnp->txn_ref > 0);
+ if (fnp->txn_ref > 1) {
+ MUTEX_LOCK(env, dbp->mutex);
+ if (fnp->txn_ref > 1) {
+ if ((t_ret = __dbreg_rem_dbentry(
+ env->lg_handle, fnp->id)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * The DB handle has been closed in the logging system.
+ * Transactions may still have a ref to this name.
+ * Mark it so that if recovery reopens the file id
+ * the transaction will not close the wrong handle.
+ */
+ F_SET(fnp, DB_FNAME_CLOSED);
+ fnp->txn_ref--;
+ MUTEX_UNLOCK(env, dbp->mutex);
+ /* The mutex now lives only in the FNAME. */
+ dbp->mutex = MUTEX_INVALID;
+ dbp->log_filename = NULL;
+ goto no_log;
+ }
+ }
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ if ((ret = __dbreg_log_close(env, fnp, txn, op)) != 0)
+ goto err;
+ ret = __dbreg_revoke_id(dbp, 1, DB_LOGFILEID_INVALID);
+
+err: MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+done: if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+no_log:
+ return (ret);
+}
+
+/*
+ * __dbreg_close_id_int --
+ * Close down a dbreg id and log the unregistry. This is called only
+ * when a transaction has the last ref to the fname.
+ *
+ * PUBLIC: int __dbreg_close_id_int __P((ENV *, FNAME *, u_int32_t, int));
+ */
+int
+__dbreg_close_id_int(env, fnp, op, locked)
+ ENV *env;
+ FNAME *fnp;
+ u_int32_t op;
+ int locked;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int ret, t_ret;
+
+ DB_ASSERT(env, fnp->txn_ref == 1);
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (fnp->id == DB_LOGFILEID_INVALID)
+ return (__dbreg_revoke_id_int(env,
+ fnp, locked, 1, DB_LOGFILEID_INVALID));
+
+ if (F_ISSET(fnp, DB_FNAME_RECOVER))
+ return (__dbreg_close_file(env, fnp));
+ /*
+ * If log_close fails then it will mark the name DB_FNAME_NOTLOGGED
+ * and the id must persist.
+ */
+ if (!locked)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ if ((ret = __dbreg_log_close(env, fnp, NULL, op)) != 0)
+ goto err;
+
+ ret = __dbreg_revoke_id_int(env, fnp, 1, 1, DB_LOGFILEID_INVALID);
+
+err: if (!locked)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ if ((t_ret = __dbreg_teardown_int(env, fnp)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbreg_failchk --
+ *
+ * Look for entries that belong to dead processes and either close them
+ * out or, if there are pending transactions, just remove the mutex which
+ * will get discarded later.
+ *
+ * PUBLIC: int __dbreg_failchk __P((ENV *));
+ */
+int
+__dbreg_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ FNAME *fnp, *nnp;
+ LOG *lp;
+ int ret, t_ret;
+ char buf[DB_THREADID_STRLEN];
+ db_threadid_t unused;
+
+ if ((dblp = env->lg_handle) == NULL)
+ return (0);
+
+ DB_THREADID_INIT(unused);
+
+ lp = dblp->reginfo.primary;
+ dbenv = env->dbenv;
+ ret = 0;
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); fnp != NULL; fnp = nnp) {
+ nnp = SH_TAILQ_NEXT(fnp, q, __fname);
+ if (dbenv->is_alive(dbenv,
+ fnp->pid, unused, DB_MUTEX_PROCESS_ONLY))
+ continue;
+ MUTEX_LOCK(env, fnp->mutex);
+ __db_msg(env, DB_STR_A("1502",
+ "Freeing log information for process: %s, (ref %lu)",
+ "%s %lu"),
+ dbenv->thread_id_string(dbenv, fnp->pid, unused, buf),
+ (u_long)fnp->txn_ref);
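+ /*
+ * If transactions still reference the name (or it was already
+ * closed), just detach the dead process; otherwise close the
+ * id out here and now.
+ */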
+ if (fnp->txn_ref > 1 || F_ISSET(fnp, DB_FNAME_CLOSED)) {
+ if (!F_ISSET(fnp, DB_FNAME_CLOSED)) {
+ fnp->txn_ref--;
+ F_SET(fnp, DB_FNAME_CLOSED);
+ }
+ MUTEX_UNLOCK(env, fnp->mutex);
+ fnp->mutex = MUTEX_INVALID;
+ fnp->pid = 0;
+ } else {
+ F_SET(fnp, DB_FNAME_CLOSED);
+ if ((t_ret = __dbreg_close_id_int(env,
+ fnp, DBREG_CLOSE, 1)) && ret == 0)
+ ret = t_ret;
+ }
+ }
+
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_log_close --
+ *
+ * Log a close of a database. Called when closing a file or when a
+ * replication client is becoming a master. That closes all the
+ * files it previously had open.
+ *
+ * Assumes the caller already holds the lp->mtx_filelist lock.
+ *
+ * PUBLIC: int __dbreg_log_close __P((ENV *, FNAME *,
+ * PUBLIC: DB_TXN *, u_int32_t));
+ */
+int
+__dbreg_log_close(env, fnp, txn, op)
+ ENV *env;
+ FNAME *fnp;
+ DB_TXN *txn;
+ u_int32_t op;
+{
+ DBT fid_dbt, r_name, *dbtp;
+ DB_LOG *dblp;
+ DB_LSN r_unused;
+ int ret;
+
+ dblp = env->lg_handle;
+ ret = 0;
+
+ if (fnp->fname_off == INVALID_ROFF)
+ dbtp = NULL;
+ else {
+ memset(&r_name, 0, sizeof(r_name));
+ r_name.data = R_ADDR(&dblp->reginfo, fnp->fname_off);
+ r_name.size = (u_int32_t)strlen((char *)r_name.data) + 1;
+ dbtp = &r_name;
+ }
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ fid_dbt.data = fnp->ufid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+ if ((ret = __dbreg_register_log(env, txn, &r_unused,
+ F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE,
+ op, dbtp, &fid_dbt, fnp->id,
+ fnp->s_type, fnp->meta_pgno, TXN_INVALID)) != 0) {
+ /*
+ * We are trying to close, but the log write failed.
+ * Unfortunately, close needs to plow forward, because
+ * the application can't do anything with the handle.
+ * Make the entry in the shared memory region so that
+ * when we close the environment, we know that this
+ * happened. Also, make sure we remove this from the
+ * per-process table, so that we don't try to close it
+ * later.
+ */
+ F_SET(fnp, DB_FNAME_NOTLOGGED);
+ (void)__dbreg_rem_dbentry(dblp, fnp->id);
+ }
+ return (ret);
+}
+
+/*
+ * __dbreg_push_id and __dbreg_pop_id --
+ * Dbreg ids from closed files are kept on a stack in shared memory
+ * for recycling. (We want to reuse them as much as possible because each
+ * process keeps open files in an array by ID.) Push them to the stack and
+ * pop them from it, managing memory as appropriate.
+ *
+ * The stack is protected by the mtx_filelist, and both functions assume it
+ * is already locked.
+ */
+static int
+__dbreg_push_id(env, id)
+ ENV *env;
+ int32_t id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ REGINFO *infop;
+ int32_t *stack, *newstack;
+ int ret;
+
+ dblp = env->lg_handle;
+ infop = &dblp->reginfo;
+ lp = infop->primary;
+
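+ /*
+ * Recycling the highest id is free: just shrink fid_max instead
+ * of using a stack slot.
+ */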
+ if (id == lp->fid_max - 1) {
+ lp->fid_max--;
+ return (0);
+ }
+
+ /* Check if we have room on the stack. */
+ if (lp->free_fid_stack == INVALID_ROFF ||
+ lp->free_fids_alloced <= lp->free_fids + 1) {
+ LOG_SYSTEM_LOCK(env);
+ if ((ret = __env_alloc(infop,
+ (lp->free_fids_alloced + 20) * sizeof(u_int32_t),
+ &newstack)) != 0) {
+ LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ if (lp->free_fid_stack != INVALID_ROFF) {
+ stack = R_ADDR(infop, lp->free_fid_stack);
+ memcpy(newstack, stack,
+ lp->free_fids_alloced * sizeof(u_int32_t));
+ __env_alloc_free(infop, stack);
+ }
+ lp->free_fid_stack = R_OFFSET(infop, newstack);
+ lp->free_fids_alloced += 20;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
+ stack = R_ADDR(infop, lp->free_fid_stack);
+ stack[lp->free_fids++] = id;
+ return (0);
+}
+
+static int
+__dbreg_pop_id(env, id)
+ ENV *env;
+ int32_t *id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int32_t *stack;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Do we have anything to pop? */
+ if (lp->free_fid_stack != INVALID_ROFF && lp->free_fids > 0) {
+ stack = R_ADDR(&dblp->reginfo, lp->free_fid_stack);
+ *id = stack[--lp->free_fids];
+ } else
+ *id = DB_LOGFILEID_INVALID;
+
+ return (0);
+}
+
+/*
+ * __dbreg_pluck_id --
+ * Remove a particular dbreg id from the stack of free ids. This is
+ * used when we open a file, as in recovery, with a specific ID that might
+ * be on the stack.
+ *
+ * Returns success whether or not the particular id was found, and like
+ * push and pop, assumes that the mtx_filelist is locked.
+ */
+static int
+__dbreg_pluck_id(env, id)
+ ENV *env;
+ int32_t id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int32_t *stack;
+ u_int i;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (id >= lp->fid_max)
+ return (0);
+
+ /* Do we have anything to look at? */
+ if (lp->free_fid_stack != INVALID_ROFF) {
+ stack = R_ADDR(&dblp->reginfo, lp->free_fid_stack);
+ for (i = 0; i < lp->free_fids; i++)
+ if (id == stack[i]) {
+ /*
+ * Found it. Overwrite it with the top
+ * id (which may harmlessly be itself),
+ * and shorten the stack by one.
+ */
+ stack[i] = stack[lp->free_fids - 1];
+ lp->free_fids--;
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __dbreg_log_id --
+ * Used for in-memory named files. They are created in mpool and
+ * are given ids early in the open process so that we can read and
+ * create pages in the mpool for the files. However, at the time that
+ * the mpf is created, the file may not be fully created and/or its
+ * meta-data may not be fully known, so we can't do a full dbregister.
+ * This is a routine exported that will log a complete dbregister
+ * record that will allow for both recovery and replication.
+ *
+ * PUBLIC: int __dbreg_log_id __P((DB *, DB_TXN *, int32_t, int));
+ */
+int
+__dbreg_log_id(dbp, txn, id, needlock)
+ DB *dbp;
+ DB_TXN *txn;
+ int32_t id;
+ int needlock;
+{
+ DBT fid_dbt, r_name;
+ DB_LOG *dblp;
+ DB_LSN unused;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ u_int32_t op;
+ int i, ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /*
+ * Verify that the fnp has been initialized, by seeing if it
+ * has any non-zero bytes in it.
+ */
+ for (i = 0; i < DB_FILE_ID_LEN; i++)
+ if (fnp->ufid[i] != 0)
+ break;
+ if (i == DB_FILE_ID_LEN)
+ memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
+
+ if (fnp->s_type == DB_UNKNOWN)
+ fnp->s_type = dbp->type;
+
+ /*
+ * Log the registry. We should only request a new ID in situations
+ * where logging is reasonable.
+ */
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ memset(&r_name, 0, sizeof(r_name));
+
+ if (needlock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ if (fnp->fname_off != INVALID_ROFF) {
+ r_name.data = R_ADDR(&dblp->reginfo, fnp->fname_off);
+ r_name.size = (u_int32_t)strlen((char *)r_name.data) + 1;
+ }
+
+ fid_dbt.data = dbp->fileid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+
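+ /*
+ * Choose the opcode: PREOPEN while the handle is not fully open,
+ * REOPEN for in-memory databases, OPEN otherwise; the X variants
+ * mark exclusive (DB2_AM_EXCL) handles.
+ */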
+ op = !F_ISSET(dbp, DB_AM_OPEN_CALLED) ? DBREG_PREOPEN :
+ (F_ISSET(dbp, DB_AM_INMEM) ?
+ (F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XREOPEN : DBREG_REOPEN):
+ (F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XOPEN : DBREG_OPEN));
+ ret = __dbreg_register_log(env, txn, &unused,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0,
+ op | F_ISSET(fnp, DB_FNAME_DBREG_MASK),
+ r_name.size == 0 ? NULL : &r_name, &fid_dbt, id,
+ fnp->s_type, fnp->meta_pgno, fnp->create_txnid);
+
+ if (needlock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
diff --git a/src/dbreg/dbreg.src b/src/dbreg/dbreg.src
new file mode 100644
index 00000000..c7740d63
--- /dev/null
+++ b/src/dbreg/dbreg.src
@@ -0,0 +1,37 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __dbreg
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * Used for registering name/id translations at open or close.
+ * opcode: register or unregister
+ * name: file name
+ * uid: unique file id
+ * fileid: dbreg id assigned to the handle
+ * ftype: database type
+ * meta_pgno: meta-data page number
+ * id: transaction id of the subtransaction that created the fs object
+ */
+BEGIN register 42 2
+DBOP opcode u_int32_t lu
+DBT name DBT s
+DBT uid DBT s
+ARG fileid int32_t ld
+ARG ftype DBTYPE lx
+ARG meta_pgno db_pgno_t lu
+ARG id u_int32_t lx
+END
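+
+/*
+ * gen_rec.awk expands the description above into the __dbreg_register
+ * log/read/print plumbing in dbreg_auto.c and dbreg_autop.c (see their
+ * "automatically built" banners) and a matching header under dbinc_auto/.
+ */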
diff --git a/src/dbreg/dbreg_auto.c b/src/dbreg/dbreg_auto.c
new file mode 100644
index 00000000..a26e5527
--- /dev/null
+++ b/src/dbreg/dbreg_auto.c
@@ -0,0 +1,35 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __dbreg_register_desc[] = {
+ {LOGREC_DBOP, SSZ(__dbreg_register_args, opcode), "opcode", ""},
+ {LOGREC_DBT, SSZ(__dbreg_register_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__dbreg_register_args, uid), "uid", ""},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, fileid), "fileid", "%ld"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, ftype), "ftype", "%lx"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, id), "id", "%lx"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __dbreg_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__dbreg_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __dbreg_register_recover, DB___dbreg_register)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/dbreg/dbreg_autop.c b/src/dbreg/dbreg_autop.c
new file mode 100644
index 00000000..ea43addd
--- /dev/null
+++ b/src/dbreg/dbreg_autop.c
@@ -0,0 +1,43 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__dbreg_register_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__dbreg_register", __dbreg_register_desc, info));
+}
+
+/*
+ * PUBLIC: int __dbreg_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__dbreg_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __dbreg_register_print, DB___dbreg_register)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/dbreg/dbreg_rec.c b/src/dbreg/dbreg_rec.c
new file mode 100644
index 00000000..1b387bb7
--- /dev/null
+++ b/src/dbreg/dbreg_rec.c
@@ -0,0 +1,399 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+static int __dbreg_open_file __P((ENV *,
+ DB_TXN *, __dbreg_register_args *, void *));
+/*
+ * PUBLIC: int __dbreg_register_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__dbreg_register_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __dbreg_register_args *argp;
+ DB_ENTRY *dbe;
+ DB_LOG *dblp;
+ DB *dbp;
+ u_int32_t opcode, status;
+ int do_close, do_open, do_rem, ret, t_ret;
+
+ dblp = env->lg_handle;
+ dbp = NULL;
+
+#ifdef DEBUG_RECOVER
+ REC_PRINT(__dbreg_register_print);
+#endif
+ do_open = do_close = 0;
+ if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0)
+ goto out;
+
+ opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
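+ /*
+ * A summary of the dispatch below: the *OPEN opcodes open the
+ * file on redo and on the OPENFILES/POPENFILES passes and close
+ * it on undo (except that in-memory reopens are never closed
+ * here); CLOSE and RCLOSE do the reverse; the checkpoint opcodes
+ * only ever open.
+ */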
+ switch (opcode) {
+ case DBREG_OPEN:
+ case DBREG_PREOPEN:
+ case DBREG_REOPEN:
+ case DBREG_XOPEN:
+ case DBREG_XREOPEN:
+ /*
+ * In general, we redo the open on REDO and abort on UNDO.
+ * However, a reopen is a second instance of an open of
+ * in-memory files and we don't want to close them yet
+ * on abort, so just skip that here.
+ */
+ if ((DB_REDO(op) ||
+ op == DB_TXN_OPENFILES || op == DB_TXN_POPENFILES))
+ do_open = 1;
+ else if (opcode != DBREG_REOPEN && opcode != DBREG_XREOPEN)
+ do_close = 1;
+ break;
+ case DBREG_CLOSE:
+ if (DB_UNDO(op))
+ do_open = 1;
+ else
+ do_close = 1;
+ break;
+ case DBREG_RCLOSE:
+ /*
+ * DBREG_RCLOSE was generated by recover because a file was
+ * left open. The POPENFILES pass, which is run to open
+ * files to abort prepared transactions, may not include the
+ * open for this file so we open it here. Note that a normal
+ * CLOSE is not legal before the prepared transaction is
+ * committed or aborted.
+ */
+ if (DB_UNDO(op) || op == DB_TXN_POPENFILES)
+ do_open = 1;
+ else
+ do_close = 1;
+ break;
+ case DBREG_CHKPNT:
+ case DBREG_XCHKPNT:
+ if (DB_UNDO(op) ||
+ op == DB_TXN_OPENFILES || op == DB_TXN_POPENFILES)
+ do_open = 1;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__dbreg_register_recover");
+ goto out;
+ }
+
+ if (do_open) {
+ /*
+ * We must open the db even if the meta page is not
+ * yet written, as we may be creating a subdatabase.
+ */
+ if (op == DB_TXN_OPENFILES && opcode != DBREG_CHKPNT
+ && opcode != DBREG_XCHKPNT)
+ F_SET(dblp, DBLOG_FORCE_OPEN);
+
+ /*
+ * During an abort or an open pass to recover prepared txns,
+ * we need to make sure that we use the same locker id on the
+ * open. We pass the txnid along to ensure this.
+ */
+ ret = __dbreg_open_file(env,
+ op == DB_TXN_ABORT || op == DB_TXN_POPENFILES ?
+ argp->txnp : NULL, argp, info);
+ if (ret == DB_PAGE_NOTFOUND && argp->meta_pgno != PGNO_BASE_MD)
+ ret = ENOENT;
+ if (ret == ENOENT || ret == EINVAL) {
+ /*
+ * If this is an OPEN while rolling forward, it's
+ * possible that the file was recreated since last
+ * time we got here. In that case, we've got deleted
+ * set and probably shouldn't, so we need to check
+ * for that case and possibly retry.
+ */
+ if (DB_REDO(op) && argp->txnp != 0 &&
+ dblp->dbentry[argp->fileid].deleted) {
+ dblp->dbentry[argp->fileid].deleted = 0;
+ ret =
+ __dbreg_open_file(env, NULL, argp, info);
+ if (ret == DB_PAGE_NOTFOUND &&
+ argp->meta_pgno != PGNO_BASE_MD)
+ ret = ENOENT;
+ }
+ /*
+ * We treat ENOENT as OK since it's possible that
+ * the file was renamed or deleted.
+ * All other errors, we return.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ }
+ F_CLR(dblp, DBLOG_FORCE_OPEN);
+ }
+
+ if (do_close) {
+ /*
+ * If we are undoing an open, or redoing a close,
+ * then we need to close the file. If we are simply
+ * revoking then we just need to grab the DBP and revoke
+ * the log id.
+ *
+ * If the file is deleted, then we can just ignore this close.
+ * Otherwise, we should usually have a valid dbp we should
+ * close or whose reference count should be decremented.
+ * However, if we shut down without closing a file, we may, in
+ * fact, not have the file open, and that's OK.
+ */
+ do_rem = 0;
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ if (argp->fileid < dblp->dbentry_cnt) {
+ /*
+ * Typically, closes should match an open which means
+ * that if this is a close, there should be a valid
+ * entry in the dbentry table when we get here,
+ * however there are exceptions. 1. If this is an
+ * OPENFILES pass, then we may have started from
+ * a log file other than the first, and the
+ * corresponding open appears in an earlier file.
+ * 2. If we are undoing an open on an abort or
+ * recovery, it's possible that we failed after
+ * the log record, but before we actually entered
+ * a handle here.
+ * 3. If we aborted an open, then we wrote a non-txnal
+ * RCLOSE into the log. During the forward pass, the
+ * file won't be open, and that's OK.
+ */
+ dbe = &dblp->dbentry[argp->fileid];
+ if (dbe->dbp == NULL && !dbe->deleted) {
+ /* No valid entry here. Nothing to do. */
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ goto done;
+ }
+
+ /* We have either an open entry or a deleted entry. */
+ if ((dbp = dbe->dbp) != NULL) {
+ /*
+ * If we're a replication client, it's
+ * possible to get here with a dbp that
+ * the user opened, but which we later
+ * assigned a fileid to. Be sure that
+ * we only close dbps that we opened in
+ * the recovery code or that were opened
+ * inside a currently aborting transaction
+ * but not by the recovery code.
+ */
+ do_rem = (F_ISSET(dbp, DB_AM_RECOVER) ||
+ F2_ISSET(dbp, DB2_AM_EXCL)) ?
+ op != DB_TXN_ABORT :
+ op == DB_TXN_ABORT;
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ } else if (dbe->deleted) {
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ if ((ret = __dbreg_rem_dbentry(
+ dblp, argp->fileid)) != 0)
+ goto out;
+ }
+ } else
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * During recovery, all files are closed. On an abort, we only
+ * close the file if we opened it during the abort
+ * (DB_AM_RECOVER set), otherwise we simply do a __db_refresh.
+ * For the close case, if remove or rename has closed the file,
+ * don't request a sync, because a NULL mpf would be a problem.
+ *
+ * If we are undoing a create we'd better discard any buffers
+ * from the memory pool. We identify creates because the
+ * argp->id field contains the transaction containing the file
+ * create; if that id is invalid, we are not creating.
+ *
+ * On the backward pass, we need to "undo" opens even if the
+ * transaction in which they appeared committed, because we have
+ * already undone the corresponding close. In that case, the
+ * id will be valid, but we do not want to discard buffers.
+ */
+ if (do_rem && dbp != NULL) {
+ if (argp->id != TXN_INVALID) {
+ if ((ret = __db_txnlist_find(env,
+ info, argp->txnp->txnid, &status))
+ != DB_NOTFOUND && ret != 0)
+ goto out;
+ if (ret == DB_NOTFOUND || status != TXN_COMMIT)
+ F_SET(dbp, DB_AM_DISCARD);
+ ret = 0;
+ }
+
+ if (op == DB_TXN_ABORT) {
+ if ((t_ret = __db_refresh(dbp,
+ NULL, DB_NOSYNC, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if ((t_ret = __db_close(
+ dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
+ }
+done: if (ret == 0)
+ *lsnp = argp->prev_lsn;
+out: if (argp != NULL)
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * __dbreg_open_file --
+ * Called during log_register recovery. Make sure that we have an
+ * entry in the dbentry table for this ndx. Returns 0 on success,
+ * non-zero on error.
+ */
+static int
+__dbreg_open_file(env, txn, argp, info)
+ ENV *env;
+ DB_TXN *txn;
+ __dbreg_register_args *argp;
+ void *info;
+{
+ DB *dbp;
+ DB_ENTRY *dbe;
+ DB_LOG *dblp;
+ u_int32_t id, opcode, status;
+ int ret;
+
+ dblp = env->lg_handle;
+ opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
+
+ /*
+ * When we're opening, we have to check that the name we are opening
+ * is what we expect. If it's not, then we close the old file and
+ * open the new one.
+ */
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ if (argp->fileid != DB_LOGFILEID_INVALID &&
+ argp->fileid < dblp->dbentry_cnt)
+ dbe = &dblp->dbentry[argp->fileid];
+ else
+ dbe = NULL;
+
+ if (dbe != NULL) {
+ if (dbe->deleted) {
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ENOENT);
+ }
+
+ /*
+ * At the end of OPENFILES, we may have a file open. If this
+ * is a reopen, then we will always close and reopen. If the
+ * open was part of a committed transaction, it doesn't
+ * get undone. However, if the fileid was previously used,
+ * we'll see a close that may need to get undone. There are
+ * three ways we can detect this. 1) the meta-pgno in the
+ * current file does not match that of the open file, 2) the
+ * file uid of the current file does not match that of the
+ * previously opened file, 3) the current file is unnamed, in
+ * which case it should never be opened during recovery.
+ * It is also possible that the db open previously failed
+ * because the file was missing. Check the DB_AM_OPEN_CALLED
+ * bit and try to open it again.
+ */
+ if ((dbp = dbe->dbp) != NULL) {
+ if (opcode == DBREG_REOPEN ||
+ opcode == DBREG_XREOPEN ||
+ !F_ISSET(dbp, DB_AM_OPEN_CALLED) ||
+ dbp->meta_pgno != argp->meta_pgno ||
+ argp->name.size == 0 ||
+ memcmp(dbp->fileid, argp->uid.data,
+ DB_FILE_ID_LEN) != 0) {
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ (void)__dbreg_revoke_id(dbp, 0,
+ DB_LOGFILEID_INVALID);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ (void)__db_close(dbp, NULL, DB_NOSYNC);
+ goto reopen;
+ }
+
+ DB_ASSERT(env, dbe->dbp == dbp);
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * This is a successful open. We need to record that
+ * in the txnlist so that we know how to handle the
+ * subtransaction that created the file system object.
+ */
+ if (argp != NULL && argp->id != TXN_INVALID &&
+ (ret = __db_txnlist_update(env, info,
+ argp->id, TXN_EXPECTED, NULL, &status, 1)) != 0)
+ return (ret);
+ return (0);
+ }
+ }
+
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+reopen:
+ /*
+ * We never re-open temporary files. Temp files are only useful during
+ * aborts in which case the dbp was entered when the file was
+ * registered. During recovery, we treat temp files as properly deleted
+ * files, allowing the open to fail and not reporting any errors when
+ * recovery fails to get a valid dbp from __dbreg_id_to_db.
+ */
+ if (argp->name.size == 0) {
+ (void)__dbreg_add_dbentry(env, dblp, NULL, argp->fileid);
+ return (ENOENT);
+ }
+
+ /*
+ * We are about to pass a recovery txn pointer into the main library.
+ * We need to make sure that any accessed fields are set appropriately.
+ */
+ if (txn != NULL) {
+ id = txn->txnid;
+ memset(txn, 0, sizeof(DB_TXN));
+ txn->txnid = id;
+ txn->mgrp = env->tx_handle;
+ }
+
+ return (__dbreg_do_open(env,
+ txn, dblp, argp->uid.data, argp->name.data, argp->ftype,
+ argp->fileid, argp->meta_pgno, info, argp->id, opcode));
+}
diff --git a/src/dbreg/dbreg_stat.c b/src/dbreg/dbreg_stat.c
new file mode 100644
index 00000000..6dfb3869
--- /dev/null
+++ b/src/dbreg/dbreg_stat.c
@@ -0,0 +1,140 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static int __dbreg_print_all __P((ENV *, u_int32_t));
+
+/*
+ * __dbreg_stat_print --
+ * Print the dbreg statistics.
+ *
+ * PUBLIC: int __dbreg_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__dbreg_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ int ret;
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __dbreg_print_all(env, flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __dbreg_print_fname --
+ * Display the contents of an FNAME structure.
+ *
+ * PUBLIC: void __dbreg_print_fname __P((ENV *, FNAME *));
+ */
+void
+__dbreg_print_fname(env, fnp)
+ ENV *env;
+ FNAME *fnp;
+{
+ static const FN fn[] = {
+ { DB_FNAME_DURABLE, "DB_FNAME_DURABLE" },
+ { DB_FNAME_NOTLOGGED, "DB_FNAME_NOTLOGGED" },
+ { DB_FNAME_CLOSED, "DB_FNAME_CLOSED" },
+ { DB_FNAME_RECOVER, "DB_FNAME_RECOVER" },
+ { 0, NULL }
+ };
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB handle FNAME contents:");
+ STAT_LONG("log ID", fnp->id);
+ STAT_ULONG("Meta pgno", fnp->meta_pgno);
+ __db_print_fileid(env, fnp->ufid, "\tFile ID");
+ STAT_ULONG("create txn", fnp->create_txnid);
+ STAT_ULONG("refcount", fnp->txn_ref);
+ __db_prflags(env, NULL, fnp->flags, fn, NULL, "\tFlags");
+}
+
+/*
+ * __dbreg_print_all --
+ * Display the ENV's list of files.
+ */
+static int
+__dbreg_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t *stack;
+ int del, first;
+ u_int32_t i;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ __db_msg(env, "LOG FNAME list:");
+ __mutex_print_debug_single(
+ env, "File name mutex", lp->mtx_filelist, flags);
+
+ STAT_LONG("Fid max", lp->fid_max);
+ STAT_LONG("Log buffer size", lp->buffer_size);
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ first = 1;
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) {
+ if (first) {
+ first = 0;
+ __db_msg(env,
+ "ID\tName\t\tType\tPgno\tPid\tTxnid\tFlags\tRef\tDBP-info");
+ }
+ dbp = fnp->id >= dblp->dbentry_cnt ? NULL :
+ dblp->dbentry[fnp->id].dbp;
+ del = fnp->id >= dblp->dbentry_cnt ? 0 :
+ dblp->dbentry[fnp->id].deleted;
+ __db_msg(env,
+ "%ld\t%-8s%s%-8s%s\t%lu\t%lu\t%lx\t%lx\t%lx\t%s",
+ (long)fnp->id,
+ fnp->fname_off == INVALID_ROFF ?
+ "" : (char *)R_ADDR(&dblp->reginfo, fnp->fname_off),
+ fnp->dname_off == INVALID_ROFF ? "" : ":",
+ fnp->dname_off == INVALID_ROFF ?
+ "" : (char *)R_ADDR(&dblp->reginfo, fnp->dname_off),
+ __db_dbtype_to_string(fnp->s_type),
+ (u_long)fnp->meta_pgno, (u_long)fnp->pid,
+ (u_long)fnp->create_txnid, (u_long)fnp->flags,
+ (u_long)fnp->txn_ref,
+ dbp == NULL ? "No DBP" : "DBP");
+ if (dbp != NULL)
+ __db_msg(env, " (%d %lx %lx)", del, P_TO_ULONG(dbp),
+ (u_long)(dbp == NULL ? 0 : dbp->flags));
+ }
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG region list of free IDs.");
+ if (lp->free_fid_stack == INVALID_ROFF)
+ __db_msg(env, "Free id stack is empty.");
+ else {
+ STAT_ULONG("Free id array size", lp->free_fids_alloced);
+ STAT_ULONG("Number of ids on the free stack", lp->free_fids);
+ stack = R_ADDR(&dblp->reginfo, lp->free_fid_stack);
+ for (i = 0; i < lp->free_fids; i++)
+ STAT_LONG("fid", stack[i]);
+ }
+
+ return (0);
+}
+#endif
diff --git a/src/dbreg/dbreg_util.c b/src/dbreg/dbreg_util.c
new file mode 100644
index 00000000..80de4d91
--- /dev/null
+++ b/src/dbreg/dbreg_util.c
@@ -0,0 +1,847 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __dbreg_check_master __P((ENV *, u_int8_t *, char *));
+
+/*
+ * __dbreg_add_dbentry --
+ * Adds a DB entry to the dbreg DB entry table.
+ *
+ * PUBLIC: int __dbreg_add_dbentry __P((ENV *, DB_LOG *, DB *, int32_t));
+ */
+int
+__dbreg_add_dbentry(env, dblp, dbp, ndx)
+ ENV *env;
+ DB_LOG *dblp;
+ DB *dbp;
+ int32_t ndx;
+{
+ int32_t i;
+ int ret;
+
+ ret = 0;
+
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * Check if we need to grow the table. Note, ndx is 0-based (the
+	 * index into the DB entry table) and dbentry_cnt is 1-based, the
+ * number of available slots.
+ */
+ if (dblp->dbentry_cnt <= ndx) {
+ if ((ret = __os_realloc(env,
+ (size_t)(ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY),
+ &dblp->dbentry)) != 0)
+ goto err;
+
+ /* Initialize the new entries. */
+ for (i = dblp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) {
+ dblp->dbentry[i].dbp = NULL;
+ dblp->dbentry[i].deleted = 0;
+ }
+ dblp->dbentry_cnt = i;
+ }
+
+ DB_ASSERT(env, dblp->dbentry[ndx].dbp == NULL);
+ dblp->dbentry[ndx].deleted = dbp == NULL;
+ dblp->dbentry[ndx].dbp = dbp;
+
+err: MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ret);
+}
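+
+/*
+ * Illustrative example of the growth arithmetic above (hypothetical
+ * numbers): if dbentry_cnt is 10 and a log record assigns ndx 12, the
+ * table is reallocated to hold (12 + DB_GROW_SIZE) slots, slots 10
+ * through (12 + DB_GROW_SIZE - 1) are cleared, and dbentry_cnt becomes
+ * 12 + DB_GROW_SIZE. Growing past the requested index amortizes the
+ * reallocation cost over future id assignments.
+ */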
+
+/*
+ * __dbreg_rem_dbentry
+ * Remove an entry from the DB entry table.
+ *
+ * PUBLIC: int __dbreg_rem_dbentry __P((DB_LOG *, int32_t));
+ */
+int
+__dbreg_rem_dbentry(dblp, ndx)
+ DB_LOG *dblp;
+ int32_t ndx;
+{
+ MUTEX_LOCK(dblp->env, dblp->mtx_dbreg);
+ if (dblp->dbentry_cnt > ndx) {
+ dblp->dbentry[ndx].dbp = NULL;
+ dblp->dbentry[ndx].deleted = 0;
+ }
+ MUTEX_UNLOCK(dblp->env, dblp->mtx_dbreg);
+
+ return (0);
+}
+
+/*
+ * __dbreg_log_files --
+ * Put a DBREG_CHKPNT/CLOSE log record for each open database.
+ *
+ * PUBLIC: int __dbreg_log_files __P((ENV *, u_int32_t));
+ */
+int
+__dbreg_log_files(env, opcode)
+ ENV *env;
+ u_int32_t opcode;
+{
+ DBT *dbtp, fid_dbt, t;
+ DB_LOG *dblp;
+ DB_LSN r_unused;
+ FNAME *fnp;
+ LOG *lp;
+ u_int32_t lopcode;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = 0;
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) {
+ /* This id was revoked by a switch in replication master. */
+ if (fnp->id == DB_LOGFILEID_INVALID)
+ continue;
+ if (fnp->fname_off == INVALID_ROFF)
+ dbtp = NULL;
+ else {
+ memset(&t, 0, sizeof(t));
+ t.data = R_ADDR(&dblp->reginfo, fnp->fname_off);
+ t.size = (u_int32_t)strlen(t.data) + 1;
+ dbtp = &t;
+ }
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ fid_dbt.data = fnp->ufid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+ /*
+ * Output DBREG_CHKPNT records which will be processed during
+ * the OPENFILES pass of recovery. At the end of recovery we
+ * want to output the files that were open so a future recovery
+ * run will have the correct files open during a backward pass.
+ * For this we output DBREG_RCLOSE records so the files will be
+ * closed on the forward pass.
+ */
+ lopcode = opcode;
+		if (opcode == DBREG_CHKPNT && F_ISSET(fnp, DBREG_EXCL))
+ lopcode = DBREG_XCHKPNT;
+ if ((ret = __dbreg_register_log(env, NULL, &r_unused,
+ F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE,
+ lopcode | F_ISSET(fnp, DB_FNAME_DBREG_MASK),
+ dbtp, &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno,
+ TXN_INVALID)) != 0)
+ break;
+ }
+
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_log_nofiles --
+ *	Return non-zero if no files are currently registered in the log.
+ *
+ * PUBLIC: int __dbreg_log_nofiles __P((ENV *));
+ */
+int
+__dbreg_log_nofiles(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ return (SH_TAILQ_EMPTY(&lp->fq));
+}
+
+/*
+ * __dbreg_close_files --
+ * Remove the id's of open files and actually close those
+ * files that were opened by the recovery daemon. We sync the
+ * file, unless its mpf pointer has been NULLed by a db_remove or
+ * db_rename. We may not have flushed the log_register record that
+ * closes the file.
+ *
+ * PUBLIC: int __dbreg_close_files __P((ENV *, int));
+ */
+int
+__dbreg_close_files(env, do_restored)
+ ENV *env;
+ int do_restored;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ int ret, t_ret;
+ int32_t i;
+
+ /* If we haven't initialized logging, we have nothing to do. */
+ if (!LOGGING_ON(env))
+ return (0);
+
+ dblp = env->lg_handle;
+ ret = 0;
+
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ for (i = 0; i < dblp->dbentry_cnt; i++) {
+ /*
+ * We only want to close dbps that recovery opened. Any
+ * dbps that weren't opened by recovery but show up here
+ * are about to be unconditionally removed from the table.
+ * Before doing so, we need to revoke their log fileids
+ * so that we don't end up leaving around FNAME entries
+ * for dbps that shouldn't have them.
+ */
+ if ((dbp = dblp->dbentry[i].dbp) != NULL) {
+ /*
+ * It's unsafe to call DB->close or revoke_id
+ * while holding the thread lock, because
+ * we'll call __dbreg_rem_dbentry and grab it again.
+ *
+ * Just drop it. Since dbreg ids go monotonically
+ * upward, concurrent opens should be safe, and the
+ * user should have no business closing files while
+ * we're in this loop anyway--we're in the process of
+ * making all outstanding dbps invalid.
+ */
+ /*
+ * If we only want to close those FNAMES marked
+ * as restored, check now.
+ */
+ if (do_restored &&
+ !F_ISSET(dbp->log_filename, DB_FNAME_RESTORED))
+ continue;
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ t_ret = __db_close(dbp,
+ NULL, dbp->mpf == NULL ? DB_NOSYNC : 0);
+ else
+ t_ret = __dbreg_revoke_id(
+ dbp, 0, DB_LOGFILEID_INVALID);
+ if (ret == 0)
+ ret = t_ret;
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ }
+
+ dblp->dbentry[i].deleted = 0;
+ dblp->dbentry[i].dbp = NULL;
+ }
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ret);
+}
+
+/*
+ * __dbreg_close_file --
+ * Close a database file opened by recovery.
+ * PUBLIC: int __dbreg_close_file __P((ENV *, FNAME *));
+ */
+int
+__dbreg_close_file(env, fnp)
+ ENV *env;
+ FNAME *fnp;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+
+ dblp = env->lg_handle;
+
+ dbp = dblp->dbentry[fnp->id].dbp;
+ if (dbp == NULL)
+ return (0);
+ DB_ASSERT(env, dbp->log_filename == fnp);
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_RECOVER));
+ return (__db_close(dbp, NULL, DB_NOSYNC));
+}
+
+/*
+ * __dbreg_mark_restored --
+ * Mark files when we change replication roles and there are outstanding
+ * prepared txns that may use these files. These will be invalidated later
+ * when all outstanding prepared txns are resolved.
+ *
+ * PUBLIC: int __dbreg_mark_restored __P((ENV *));
+ */
+int
+__dbreg_mark_restored(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ FNAME *fnp;
+ LOG *lp;
+
+ /* If we haven't initialized logging, we have nothing to do. */
+ if (!LOGGING_ON(env))
+ return (0);
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (fnp->id != DB_LOGFILEID_INVALID)
+ F_SET(fnp, DB_FNAME_RESTORED);
+
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (0);
+}
+
+/*
+ * __dbreg_invalidate_files --
+ * Invalidate files when we change replication roles. Save the
+ * id so that another process will be able to clean up the information
+ * when it notices.
+ *
+ * PUBLIC: int __dbreg_invalidate_files __P((ENV *, int));
+ */
+int
+__dbreg_invalidate_files(env, do_restored)
+ ENV *env;
+ int do_restored;
+{
+ DB_LOG *dblp;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ /* If we haven't initialized logging, we have nothing to do. */
+ if (!LOGGING_ON(env))
+ return (0);
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = 0;
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) {
+ /*
+ * Normally, skip any file with DB_FNAME_RESTORED
+ * set. If do_restored is set, only invalidate
+ * those files with the flag set and skip all others.
+ */
+ if (F_ISSET(fnp, DB_FNAME_RESTORED) && !do_restored)
+ continue;
+ if (!F_ISSET(fnp, DB_FNAME_RESTORED) && do_restored)
+ continue;
+ if (fnp->id != DB_LOGFILEID_INVALID) {
+ if ((ret = __dbreg_log_close(env,
+ fnp, NULL, DBREG_RCLOSE)) != 0)
+ goto err;
+ fnp->old_id = fnp->id;
+ fnp->id = DB_LOGFILEID_INVALID;
+ }
+ }
+err: MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_id_to_db --
+ * Return the DB corresponding to the specified dbreg id.
+ *
+ * PUBLIC: int __dbreg_id_to_db __P((ENV *, DB_TXN *, DB **, int32_t, int));
+ */
+int
+__dbreg_id_to_db(env, txn, dbpp, ndx, tryopen)
+ ENV *env;
+ DB_TXN *txn;
+ DB **dbpp;
+ int32_t ndx;
+ int tryopen;
+{
+ DB_LOG *dblp;
+ FNAME *fname;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ ret = 0;
+
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * We take a final parameter that indicates whether we should attempt
+ * to open the file if no mapping is found. During recovery, the
+ * recovery routines all want to try to open the file (and this is
+ * called from __dbreg_id_to_db), however, if we have a multi-process
+ * environment where some processes may not have the files open,
+ * then we also get called from __dbreg_assign_id and it's OK if
+ * there is no mapping.
+ *
+ * Under failchk, a process different than the one issuing DB
+ * operations may abort a transaction. In this case, the "recovery"
+ * routines are run by a process that does not necessarily have the
+	 * file open, so we must open the file explicitly.
+ */
+ if (ndx >= dblp->dbentry_cnt ||
+ (!dblp->dbentry[ndx].deleted && dblp->dbentry[ndx].dbp == NULL)) {
+ if (!tryopen || F_ISSET(dblp, DBLOG_RECOVER)) {
+ ret = ENOENT;
+ goto err;
+ }
+
+ /*
+ * __dbreg_id_to_fname acquires the mtx_filelist mutex, which
+ * we can't safely acquire while we hold the thread lock. We
+ * no longer need it anyway--the dbentry table didn't have what
+ * we needed.
+ */
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+ if (__dbreg_id_to_fname(dblp, ndx, 0, &fname) != 0)
+ /*
+ * With transactional opens, we may actually have
+ * closed this file in the transaction in which
+ * case this will fail too. Then it's up to the
+ * caller to reopen the file.
+ */
+ return (ENOENT);
+
+ /*
+ * Note that we're relying on fname not to change, even though
+ * we released the mutex that protects it (mtx_filelist) inside
+ * __dbreg_id_to_fname. This should be a safe assumption, the
+ * other process that has the file open shouldn't be closing it
+ * while we're trying to abort.
+ */
+ name = fname->fname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fname->fname_off);
+
+ /*
+ * At this point, we are not holding the thread lock, so exit
+ * directly instead of going through the exit code at the
+ * bottom. If the __dbreg_do_open succeeded, then we don't need
+ * to do any of the remaining error checking at the end of this
+ * routine.
+ * If TXN_INVALID is passed then no txnlist is needed.
+ */
+ if ((ret = __dbreg_do_open(env, txn, dblp,
+ fname->ufid, name, fname->s_type, ndx, fname->meta_pgno,
+ NULL, TXN_INVALID, F_ISSET(fname, DB_FNAME_INMEM) ?
+ DBREG_REOPEN : DBREG_OPEN)) != 0)
+ return (ret);
+
+ *dbpp = dblp->dbentry[ndx].dbp;
+ return (*dbpp == NULL ? DB_DELETED : 0);
+ }
+
+ /*
+ * Return DB_DELETED if the file has been deleted (it's not an error).
+ */
+ if (dblp->dbentry[ndx].deleted) {
+ ret = DB_DELETED;
+ goto err;
+ }
+
+ /* It's an error if we don't have a corresponding writable DB. */
+ if ((*dbpp = dblp->dbentry[ndx].dbp) == NULL)
+ ret = ENOENT;
+ else
+ /*
+ * If we are in recovery, then set that the file has
+ * been written. It is possible to run recovery,
+ * find all the pages in their post update state
+ * in the OS buffer pool, put a checkpoint in the log
+ * and then crash the system without forcing the pages
+ * to disk. If this is an in-memory file, we may not have
+ * an mpf yet.
+ */
+ if ((*dbpp)->mpf != NULL && (*dbpp)->mpf->mfp != NULL)
+ (*dbpp)->mpf->mfp->file_written = 1;
+
+err: MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ret);
+}
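+
+/*
+ * Illustrative caller sketch (not in the original source; names are
+ * hypothetical): a recovery routine typically resolves the dbreg id
+ * from its log record before touching any pages, e.g.:
+ *
+ *	DB *file_dbp;
+ *
+ *	if ((ret = __dbreg_id_to_db(env,
+ *	    txn, &file_dbp, argp->fileid, 1)) != 0)
+ *		return (ret == DB_DELETED ? 0 : ret);
+ *
+ * DB_DELETED means the record refers to a file that no longer exists,
+ * so the operation can usually be skipped rather than treated as an
+ * error.
+ */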
+
+/*
+ * __dbreg_id_to_fname --
+ * Traverse the shared-memory region looking for the entry that
+ * matches the passed dbreg id. Returns 0 on success; -1 on error.
+ *
+ * PUBLIC: int __dbreg_id_to_fname __P((DB_LOG *, int32_t, int, FNAME **));
+ */
+int
+__dbreg_id_to_fname(dblp, id, have_lock, fnamep)
+ DB_LOG *dblp;
+ int32_t id;
+ int have_lock;
+ FNAME **fnamep;
+{
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ ret = -1;
+
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (fnp->id == id) {
+ *fnamep = fnp;
+ ret = 0;
+ break;
+ }
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_fid_to_fname --
+ * Traverse the shared-memory region looking for the entry that
+ * matches the passed file unique id. Returns 0 on success; -1 on error.
+ *
+ * PUBLIC: int __dbreg_fid_to_fname __P((DB_LOG *, u_int8_t *, int, FNAME **));
+ */
+int
+__dbreg_fid_to_fname(dblp, fid, have_lock, fnamep)
+ DB_LOG *dblp;
+ u_int8_t *fid;
+ int have_lock;
+ FNAME **fnamep;
+{
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ ret = -1;
+
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (memcmp(fnp->ufid, fid, DB_FILE_ID_LEN) == 0) {
+ *fnamep = fnp;
+ ret = 0;
+ break;
+ }
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_get_name
+ *
+ * Interface to get the name of a registered file. This is mainly for
+ * diagnostics; the name returned could be transient unless something
+ * ensures that the file cannot be closed.
+ *
+ * PUBLIC: int __dbreg_get_name __P((ENV *, u_int8_t *, char **, char **));
+ */
+int
+__dbreg_get_name(env, fid, fnamep, dnamep)
+ ENV *env;
+ u_int8_t *fid;
+ char **fnamep, **dnamep;
+{
+ DB_LOG *dblp;
+ FNAME *fnp;
+
+ dblp = env->lg_handle;
+
+ if (dblp != NULL && __dbreg_fid_to_fname(dblp, fid, 0, &fnp) == 0) {
+ *fnamep = fnp->fname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fnp->fname_off);
+ *dnamep = fnp->dname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fnp->dname_off);
+ return (0);
+ }
+
+ *fnamep = *dnamep = NULL;
+ return (-1);
+}
+
+/*
+ * __dbreg_do_open --
+ * Open files referenced in the log. This is the part of the open that
+ * is not protected by the thread mutex.
+ * PUBLIC: int __dbreg_do_open __P((ENV *,
+ * PUBLIC: DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE,
+ * PUBLIC: int32_t, db_pgno_t, void *, u_int32_t, u_int32_t));
+ */
+int
+__dbreg_do_open(env,
+ txn, lp, uid, name, ftype, ndx, meta_pgno, info, id, opcode)
+ ENV *env;
+ DB_TXN *txn;
+ DB_LOG *lp;
+ u_int8_t *uid;
+ char *name;
+ DBTYPE ftype;
+ int32_t ndx;
+ db_pgno_t meta_pgno;
+ void *info;
+ u_int32_t id, opcode;
+{
+ DB *dbp;
+ u_int32_t cstat, ret_stat;
+ int ret, t_ret, try_inmem;
+ char *dname, *fname;
+
+ cstat = TXN_EXPECTED;
+ fname = name;
+ dname = NULL;
+ try_inmem = 0;
+
+retry_inmem:
+ if ((ret = __db_create_internal(&dbp, lp->env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We can open files under a number of different scenarios.
+ * First, we can open a file during a normal txn_abort, if that file
+ * was opened and closed during the transaction (as is the master
+ * database of a sub-database).
+ * Second, we might be aborting a transaction in a process other than
+ * the one that did it (failchk).
+ * Third, we might be in recovery.
+ * In case 3, there is no locking, so there is no issue.
+ * In cases 1 and 2, we are guaranteed to already hold any locks
+ * that we need, since we're still in the same transaction, so by
+ * setting DB_AM_RECOVER, we guarantee that we don't log and that
+ * we don't try to acquire locks on behalf of a different locker id.
+ */
+ F_SET(dbp, DB_AM_RECOVER);
+ if (meta_pgno != PGNO_BASE_MD) {
+ memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
+ dbp->meta_pgno = meta_pgno;
+ }
+
+ if (opcode == DBREG_PREOPEN) {
+ dbp->type = ftype;
+ if ((ret = __dbreg_setup(dbp, name, NULL, id)) != 0)
+ goto err;
+ MAKE_INMEM(dbp);
+ goto skip_open;
+ }
+
+ if (opcode == DBREG_REOPEN || opcode == DBREG_XREOPEN || try_inmem) {
+ MAKE_INMEM(dbp);
+ fname = NULL;
+ dname = name;
+ }
+
+ if (opcode == DBREG_XOPEN || opcode == DBREG_XCHKPNT ||
+ opcode == DBREG_XREOPEN)
+ F2_SET(dbp, DB2_AM_EXCL|DB2_AM_INTEXCL);
+
+ if ((ret = __db_open(dbp, NULL, txn, fname, dname, ftype,
+ DB_DURABLE_UNKNOWN | DB_ODDFILESIZE,
+ DB_MODE_600, meta_pgno)) == 0) {
+skip_open:
+ /*
+ * Verify that we are opening the same file that we were
+ * referring to when we wrote this log record.
+ */
+ if ((meta_pgno != PGNO_BASE_MD &&
+ __dbreg_check_master(env, uid, name) != 0) ||
+ memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
+ cstat = TXN_UNEXPECTED;
+ else
+ cstat = TXN_EXPECTED;
+
+ /* Assign the specific dbreg id to this dbp. */
+ if ((ret = __dbreg_assign_id(dbp, ndx, 0)) != 0)
+ goto err;
+
+ /*
+		 * Record the newly-opened file in the transaction so it is
+		 * closed when the transaction ends. Decrement the reference count
+ * because there will be no explicit close for this handle and
+ * we want it to be closed when the transaction ends.
+ */
+ if (txn != NULL && (ret =
+ __txn_record_fname(env, txn, dbp->log_filename)) != 0)
+ goto err;
+ --dbp->log_filename->txn_ref;
+
+ /*
+ * If we successfully opened this file, then we need to
+ * convey that information to the txnlist so that we
+ * know how to handle the subtransaction that created
+ * the file system object.
+ */
+ if (id != TXN_INVALID)
+ ret = __db_txnlist_update(env,
+ info, id, cstat, NULL, &ret_stat, 1);
+
+err: if (cstat == TXN_UNEXPECTED)
+ goto not_right;
+ return (ret);
+ } else if (ret == ENOENT) {
+ /*
+ * If the open failed with ENOENT, retry it as a named in-mem
+ * database. Some record types do not distinguish between a
+ * named in-memory database and one on-disk. Therefore, an
+ * internal init via replication that is trying to open and
+ * access this as a named in-mem database will not find it
+ * on-disk, and we need to try to open it in-memory too.
+ * But don't do this for [P]REOPEN, since we're already
+ * handling those cases specially, above.
+ */
+ if (try_inmem == 0 &&
+ opcode != DBREG_PREOPEN && opcode != DBREG_REOPEN &&
+ opcode != DBREG_XREOPEN) {
+ if ((ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ return (ret);
+ try_inmem = 1;
+ goto retry_inmem;
+ } else if (try_inmem != 0)
+ CLR_INMEM(dbp);
+
+ /*
+ * If it exists neither on disk nor in memory
+ * record that the open failed in the txnlist.
+ */
+ if (id != TXN_INVALID && (ret = __db_txnlist_update(env,
+ info, id, TXN_UNEXPECTED, NULL, &ret_stat, 1)) != 0)
+ goto not_right;
+
+ /*
+		 * If this file is missing, then we may have crashed
+		 * without writing the corresponding close; record
+		 * the open so recovery will write a close record
+ * with its checkpoint. If this is a backward pass then
+ * we are closing a non-existent file and need to mark
+ * it as deleted.
+ */
+ if (dbp->log_filename == NULL &&
+ (ret = __dbreg_setup(dbp, name, NULL, id)) != 0)
+ return (ret);
+ ret = __dbreg_assign_id(dbp, ndx, 1);
+ return (ret);
+ }
+not_right:
+ if ((t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ return (ret == 0 ? t_ret : ret);
+
+ /* Add this file as deleted. */
+ if ((t_ret = __dbreg_add_dbentry(env, lp, NULL, ndx)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
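+
+/*
+ * Sketch of the fallback path above (illustrative): when __db_open of
+ * an on-disk name returns ENOENT, the open is retried once with the
+ * same name treated as a named in-memory database (try_inmem set,
+ * fname moved to dname). Only if both attempts fail is the id entered
+ * in the dbentry table as a deleted file.
+ */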
+
+static int
+__dbreg_check_master(env, uid, name)
+ ENV *env;
+ u_int8_t *uid;
+ char *name;
+{
+ DB *dbp;
+ int ret;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ F_SET(dbp, DB_AM_RECOVER);
+ ret = __db_open(dbp, NULL, NULL,
+ name, NULL, DB_BTREE, 0, DB_MODE_600, PGNO_BASE_MD);
+
+ if (ret == 0 && memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
+ ret = EINVAL;
+
+ (void)__db_close(dbp, NULL, 0);
+ return (ret);
+}
+
+/*
+ * __dbreg_lazy_id --
+ * When a replication client gets upgraded to being a replication master,
+ * it may have database handles open that have not been assigned an ID, but
+ * which have become legal to use for logging.
+ *
+ * This function lazily allocates a new ID for such a handle, in a
+ * new transaction created for the purpose. We need to do this in a new
+ * transaction because we definitely wish to commit the dbreg_register, but
+ * at this point we have no way of knowing whether the log record that incited
+ * us to call this will be part of a committed transaction.
+ *
+ * We first revoke any old id this handle may have had. That can happen
+ * if a master becomes a client and then becomes a master again and
+ * there are other processes with valid open handles to this env.
+ *
+ * PUBLIC: int __dbreg_lazy_id __P((DB *));
+ */
+int
+__dbreg_lazy_id(dbp)
+ DB *dbp;
+{
+ DB_LOG *dblp;
+ DB_TXN *txn;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ env = dbp->env;
+
+ DB_ASSERT(env, IS_REP_MASTER(env) || F_ISSET(dbp, DB_AM_NOT_DURABLE));
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /* The mtx_filelist protects the FNAME list and id management. */
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ if (fnp->id != DB_LOGFILEID_INVALID) {
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (0);
+ }
+ id = DB_LOGFILEID_INVALID;
+ /*
+ * When we became master we moved the fnp->id to old_id in
+ * every FNAME structure that was open. If our id was changed,
+ * we need to revoke and give back that id.
+ */
+ if (fnp->old_id != DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_revoke_id(dbp, 1, DB_LOGFILEID_INVALID)) != 0)
+ goto err;
+ if ((ret = __txn_begin(env, NULL, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+
+ if ((ret = __dbreg_get_id(dbp, txn, &id)) != 0) {
+ (void)__txn_abort(txn);
+ goto err;
+ }
+
+ if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
+ goto err;
+
+ /*
+ * All DB related logging routines check the id value *without*
+ * holding the mtx_filelist to know whether we need to call
+ * dbreg_lazy_id to begin with. We must set the ID after a
+ * *successful* commit so that there is no possibility of a second
+ * modification call finding a valid ID in the dbp before the
+ * dbreg_register and commit records are in the log.
+ * If there was an error, then we call __dbreg_revoke_id to
+ * remove the entry from the lists.
+ */
+ fnp->id = id;
+err:
+ if (ret != 0 && id != DB_LOGFILEID_INVALID)
+ (void)__dbreg_revoke_id(dbp, 1, id);
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
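+
+/*
+ * The protocol above, in brief (illustrative): while holding
+ * mtx_filelist, revoke any stale pre-switch id, then log the
+ * dbreg_register in its own transaction:
+ *
+ *	__txn_begin(...); __dbreg_get_id(dbp, txn, &id); __txn_commit(...);
+ *
+ * and publish fnp->id only after the commit succeeds, so a concurrent
+ * writer can never log against an id whose registration might still
+ * be rolled back.
+ */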
diff --git a/src/env/env_alloc.c b/src/env/env_alloc.c
new file mode 100644
index 00000000..700bfb27
--- /dev/null
+++ b/src/env/env_alloc.c
@@ -0,0 +1,759 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Implement shared memory region allocation. The initial list is a single
+ * memory "chunk" which is carved up as memory is requested. Chunks are
+ * coalesced when free'd. We maintain two types of linked-lists: a list of
+ * all chunks sorted by address, and a set of lists with free chunks sorted
+ * by size.
+ *
+ * The ALLOC_LAYOUT structure is the governing structure for the allocator.
+ *
+ * The ALLOC_ELEMENT structure is the structure that describes any single
+ * chunk of memory, and is immediately followed by the user's memory.
+ *
+ * The internal memory chunks are always aligned to a uintmax_t boundary so
+ * we don't drop core accessing the fields of the ALLOC_ELEMENT structure.
+ *
+ * The memory chunks returned to the user are aligned to a uintmax_t boundary.
+ * This is enforced by terminating the ALLOC_ELEMENT structure with a uintmax_t
+ * field as that immediately precedes the user's memory. Any caller needing
+ * more than uintmax_t alignment is responsible for doing alignment themselves.
+ */
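+
+/*
+ * A sketch of the resulting region layout (illustrative only):
+ *
+ *	[ALLOC_LAYOUT][ALLOC_ELEMENT|user mem][ALLOC_ELEMENT|user mem]...
+ *
+ * Every chunk is linked on the address queue; only chunks with
+ * ulen == 0 (not in use) are additionally linked on a size queue.
+ */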
+
+typedef SH_TAILQ_HEAD(__sizeq) SIZEQ_HEAD;
+
+typedef struct __alloc_layout {
+ SH_TAILQ_HEAD(__addrq) addrq; /* Sorted by address */
+
+ /*
+ * A perfect Berkeley DB application does little allocation because
+ * most things are allocated on startup and never free'd. This is
+ * true even for the cache, because we don't free and re-allocate
+ * the memory associated with a cache buffer when swapping a page
+ * in memory for a page on disk -- unless the page is changing size.
+ * The latter problem is why we have multiple size queues. If the
+ * application's working set fits in cache, it's not a problem. If
+ * the application's working set doesn't fit in cache, but all of
+ * the databases have the same size pages, it's still not a problem.
+ * If the application's working set doesn't fit in cache, and its
+ * databases have different page sizes, we can end up walking a lot
+ * of 512B chunk allocations looking for an available 64KB chunk.
+ *
+ * So, we keep a set of queues, where we expect to find a chunk of
+ * roughly the right size at the front of the list. The first queue
+ * is chunks <= 1024, the second is <= 2048, and so on. With 11
+ * queues, we have separate queues for chunks up to 1MB.
+ */
+#define DB_SIZE_Q_COUNT 11
+ SIZEQ_HEAD sizeq[DB_SIZE_Q_COUNT]; /* Sorted by size */
+#ifdef HAVE_STATISTICS
+ u_int32_t pow2_size[DB_SIZE_Q_COUNT];
+#endif
+
+#ifdef HAVE_STATISTICS
+ u_int32_t success; /* Successful allocations */
+ u_int32_t failure; /* Failed allocations */
+ u_int32_t freed; /* Free calls */
+ u_int32_t longest; /* Longest chain walked */
+#endif
+ uintmax_t unused; /* Guarantee alignment */
+} ALLOC_LAYOUT;
+
+typedef struct __alloc_element {
+ SH_TAILQ_ENTRY addrq; /* List by address */
+ SH_TAILQ_ENTRY sizeq; /* List by size */
+
+ /*
+ * The "len" field is the total length of the chunk, not the size
+ * available to the caller. Use a uintmax_t to guarantee that the
+ * size of this struct will be aligned correctly.
+ */
+ uintmax_t len; /* Chunk length */
+
+ /*
+ * The "ulen" field is the length returned to the caller.
+ *
+ * Set to 0 if the chunk is not currently in use.
+ */
+ uintmax_t ulen; /* User's length */
+} ALLOC_ELEMENT;
+
+/*
+ * If the chunk can be split into two pieces, with the fragment holding at
+ * least 64 bytes of memory, we divide the chunk into two parts.
+ */
+#define SHALLOC_FRAGMENT (sizeof(ALLOC_ELEMENT) + 64)
+
+/* Macro to find the appropriate queue for a specific size chunk. */
+#undef SET_QUEUE_FOR_SIZE
+#define SET_QUEUE_FOR_SIZE(head, q, i, len) do { \
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i) { \
+ q = &(head)->sizeq[i]; \
+ if ((len) <= (u_int64_t)1024 << i) \
+ break; \
+ } \
+} while (0)
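+
+/*
+ * For example (illustrative): a 700-byte chunk maps to sizeq[0]
+ * (chunks <= 1KB) and a 100,000-byte chunk maps to sizeq[7]
+ * (chunks <= 128KB); anything larger than 1MB leaves q pointing at
+ * the last (largest) queue with i == DB_SIZE_Q_COUNT.
+ */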
+
+static void __env_size_insert __P((ALLOC_LAYOUT *, ALLOC_ELEMENT *));
+
+/*
+ * __env_alloc_init --
+ * Initialize the area as one large chunk.
+ *
+ * PUBLIC: void __env_alloc_init __P((REGINFO *, size_t));
+ */
+void
+__env_alloc_init(infop, size)
+ REGINFO *infop;
+ size_t size;
+{
+ ALLOC_ELEMENT *elp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ u_int i;
+
+ env = infop->env;
+
+ /* No initialization needed for heap memory regions. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ return;
+
+ /*
+ * The first chunk of memory is the ALLOC_LAYOUT structure.
+ */
+ head = infop->head;
+ memset(head, 0, sizeof(*head));
+ SH_TAILQ_INIT(&head->addrq);
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i)
+ SH_TAILQ_INIT(&head->sizeq[i]);
+ COMPQUIET(head->unused, 0);
+
+ /*
+ * The rest of the memory is the first available chunk.
+ */
+ elp = (ALLOC_ELEMENT *)((u_int8_t *)head + sizeof(ALLOC_LAYOUT));
+ elp->len = size - sizeof(ALLOC_LAYOUT);
+ elp->ulen = 0;
+
+ SH_TAILQ_INSERT_HEAD(&head->addrq, elp, addrq, __alloc_element);
+ SH_TAILQ_INSERT_HEAD(
+ &head->sizeq[DB_SIZE_Q_COUNT - 1], elp, sizeq, __alloc_element);
+}
+
+/*
+ * The length, the ALLOC_ELEMENT structure and an optional guard byte,
+ * rounded up to standard alignment.
+ */
+#ifdef DIAGNOSTIC
+#define DB_ALLOC_SIZE(len) \
+ (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT) + 1, sizeof(uintmax_t))
+#else
+#define DB_ALLOC_SIZE(len) \
+ (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT), sizeof(uintmax_t))
+#endif
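+
+/*
+ * Worked example (illustrative, assuming an 8-byte uintmax_t and a
+ * 48-byte ALLOC_ELEMENT): a 100-byte request becomes
+ * DB_ALIGN(100 + 48, 8) = 152 bytes, or DB_ALIGN(100 + 48 + 1, 8) =
+ * 152 bytes under DIAGNOSTIC -- the alignment rounding often absorbs
+ * the guard byte.
+ */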
+
+/*
+ * __env_alloc_overhead --
+ * Return the overhead needed for an allocation.
+ *
+ * PUBLIC: size_t __env_alloc_overhead __P((void));
+ */
+size_t
+__env_alloc_overhead()
+{
+ return (sizeof(ALLOC_ELEMENT));
+}
+
+/*
+ * __env_alloc_size --
+ * Return the space needed for an allocation, including alignment.
+ *
+ * PUBLIC: size_t __env_alloc_size __P((size_t));
+ */
+size_t
+__env_alloc_size(len)
+ size_t len;
+{
+ return (DB_ALLOC_SIZE(len));
+}
+
+/*
+ * __env_alloc --
+ * Allocate space from the shared region.
+ *
+ * PUBLIC: int __env_alloc __P((REGINFO *, size_t, void *));
+ */
+int
+__env_alloc(infop, len, retp)
+ REGINFO *infop;
+ size_t len;
+ void *retp;
+{
+ SIZEQ_HEAD *q;
+ ALLOC_ELEMENT *elp, *frag, *elp_tmp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ REGION_MEM *mem;
+ REGINFO *envinfop;
+ size_t total_len;
+ u_int8_t *p;
+ u_int i;
+ int ret;
+#ifdef HAVE_STATISTICS
+ u_int32_t st_search;
+#endif
+ env = infop->env;
+ *(void **)retp = NULL;
+#ifdef HAVE_MUTEX_SUPPORT
+ MUTEX_REQUIRED(env, infop->mtx_alloc);
+#endif
+
+ PERFMON3(env, mpool, env_alloc, len, infop->id, infop->type);
+ /*
+ * In a heap-backed environment, we call malloc for additional space.
+ * (Malloc must return memory correctly aligned for our use.)
+ *
+ * In a heap-backed environment, memory is laid out as follows:
+ *
+ * { uintmax_t total-length } { user-memory } { guard-byte }
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ /*
+ * If we are shared then we must track the allocation
+ * in the main environment region.
+ */
+ if (F_ISSET(infop, REGION_SHARED))
+ envinfop = env->reginfo;
+ else
+ envinfop = infop;
+ /*
+ * We need an additional uintmax_t to hold the length (and
+ * keep the buffer aligned on 32-bit systems).
+ */
+ len += sizeof(uintmax_t);
+ if (F_ISSET(infop, REGION_TRACKED))
+ len += sizeof(REGION_MEM);
+
+#ifdef DIAGNOSTIC
+ /* Plus one byte for the guard byte. */
+ ++len;
+#endif
+ /* Check if we're over the limit. */
+ if (envinfop->max_alloc != 0 &&
+ envinfop->allocated + len > envinfop->max_alloc)
+ return (ENOMEM);
+
+ /* Allocate the space. */
+ if ((ret = __os_malloc(env, len, &p)) != 0)
+ return (ret);
+ infop->allocated += len;
+ if (infop != envinfop)
+ envinfop->allocated += len;
+
+ *(uintmax_t *)p = len;
+#ifdef DIAGNOSTIC
+ p[len - 1] = GUARD_BYTE;
+#endif
+ if (F_ISSET(infop, REGION_TRACKED)) {
+ mem = (REGION_MEM *)(p + sizeof(uintmax_t));
+ mem->next = infop->mem;
+ infop->mem = mem;
+			p += sizeof(*mem);
+ }
+ *(void **)retp = p + sizeof(uintmax_t);
+ return (0);
+ }
+
+ head = infop->head;
+ total_len = DB_ALLOC_SIZE(len);
+
+ /* Find the first size queue that could satisfy the request. */
+ COMPQUIET(q, NULL);
+#ifdef HAVE_MMAP_EXTEND
+retry:
+#endif
+ SET_QUEUE_FOR_SIZE(head, q, i, total_len);
+
+#ifdef HAVE_STATISTICS
+ if (i >= DB_SIZE_Q_COUNT)
+ i = DB_SIZE_Q_COUNT - 1;
+ ++head->pow2_size[i]; /* Note the size of the request. */
+#endif
+
+ /*
+ * Search this queue, and, if necessary, queues larger than this queue,
+ * looking for a chunk we can use.
+ */
+ STAT(st_search = 0);
+ for (elp = NULL;; ++q) {
+ SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) {
+ STAT(++st_search);
+
+ /*
+ * Chunks are sorted from largest to smallest -- if
+ * this chunk is less than what we need, no chunk
+ * further down the list will be large enough.
+ */
+ if (elp_tmp->len < total_len)
+ break;
+
+ /*
+ * This chunk will do... maybe there's a better one,
+ * but this one will do.
+ */
+ elp = elp_tmp;
+
+ /*
+ * We might have many chunks of the same size. Stop
+ * looking if we won't fragment memory by picking the
+ * current one.
+ */
+ if (elp_tmp->len - total_len <= SHALLOC_FRAGMENT)
+ break;
+ }
+ if (elp != NULL || ++i >= DB_SIZE_Q_COUNT)
+ break;
+ }
+
+#ifdef HAVE_STATISTICS
+ if (head->longest < st_search) {
+ head->longest = st_search;
+ STAT_PERFMON3(env,
+ mpool, longest_search, len, infop->id, st_search);
+ }
+#endif
+
+ /*
+ * If we don't find an element of the right size, try to extend
+ * the region, if not then we are done.
+ */
+ if (elp == NULL) {
+ ret = ENOMEM;
+#ifdef HAVE_MMAP_EXTEND
+ if (infop->rp->size < infop->rp->max &&
+ (ret = __env_region_extend(env, infop)) == 0)
+ goto retry;
+#endif
+ STAT_INC_VERB(env, mpool, fail, head->failure, len, infop->id);
+ return (ret);
+ }
+ STAT_INC_VERB(env, mpool, alloc, head->success, len, infop->id);
+
+ /* Pull the chunk off of the size queue. */
+ SH_TAILQ_REMOVE(q, elp, sizeq, __alloc_element);
+
+ if (elp->len - total_len > SHALLOC_FRAGMENT) {
+ frag = (ALLOC_ELEMENT *)((u_int8_t *)elp + total_len);
+ frag->len = elp->len - total_len;
+ frag->ulen = 0;
+
+ elp->len = total_len;
+
+ /* The fragment follows the chunk on the address queue. */
+ SH_TAILQ_INSERT_AFTER(
+ &head->addrq, elp, frag, addrq, __alloc_element);
+
+ /* Insert the frag into the correct size queue. */
+ __env_size_insert(head, frag);
+ }
+
+ p = (u_int8_t *)elp + sizeof(ALLOC_ELEMENT);
+ elp->ulen = len;
+#ifdef DIAGNOSTIC
+ p[len] = GUARD_BYTE;
+#endif
+ *(void **)retp = p;
+
+ return (0);
+}
+
+/*
+ * __env_alloc_free --
+ * Free space into the shared region.
+ *
+ * PUBLIC: void __env_alloc_free __P((REGINFO *, void *));
+ */
+void
+__env_alloc_free(infop, ptr)
+ REGINFO *infop;
+ void *ptr;
+{
+ ALLOC_ELEMENT *elp, *elp_tmp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ SIZEQ_HEAD *q;
+ size_t len;
+ u_int8_t i, *p;
+
+ env = infop->env;
+
+ /* In a private region, we call free. */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ /* Find the start of the memory chunk and its length. */
+ p = (u_int8_t *)((uintmax_t *)ptr - 1);
+ len = (size_t)*(uintmax_t *)p;
+
+ infop->allocated -= len;
+ if (F_ISSET(infop, REGION_SHARED))
+ env->reginfo->allocated -= len;
+
+#ifdef DIAGNOSTIC
+ /* Check the guard byte. */
+ DB_ASSERT(env, p[len - 1] == GUARD_BYTE);
+
+ /* Trash the memory chunk. */
+ memset(p, CLEAR_BYTE, len);
+#endif
+ __os_free(env, p);
+ return;
+ }
+
+#ifdef HAVE_MUTEX_SUPPORT
+ MUTEX_REQUIRED(env, infop->mtx_alloc);
+#endif
+
+ head = infop->head;
+
+ p = ptr;
+ elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT));
+
+ STAT_INC_VERB(env, mpool, free, head->freed, elp->ulen, infop->id);
+
+#ifdef DIAGNOSTIC
+ /* Check the guard byte. */
+ DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE);
+
+ /* Trash the memory chunk. */
+ memset(p, CLEAR_BYTE, (size_t)elp->len - sizeof(ALLOC_ELEMENT));
+#endif
+
+ /* Mark the memory as no longer in use. */
+ elp->ulen = 0;
+
+ /*
+ * Try and merge this chunk with chunks on either side of it. Two
+ * chunks can be merged if they're contiguous and not in use.
+ */
+ if ((elp_tmp =
+ SH_TAILQ_PREV(&head->addrq, elp, addrq, __alloc_element)) != NULL &&
+ elp_tmp->ulen == 0 &&
+ (u_int8_t *)elp_tmp + elp_tmp->len == (u_int8_t *)elp) {
+ /*
+ * If we're merging the entry into a previous entry, remove the
+ * current entry from the addr queue and the previous entry from
+ * its size queue, and merge.
+ */
+ SH_TAILQ_REMOVE(&head->addrq, elp, addrq, __alloc_element);
+ SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
+ SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);
+
+ elp_tmp->len += elp->len;
+ elp = elp_tmp;
+ }
+ if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL &&
+ elp_tmp->ulen == 0 &&
+ (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) {
+ /*
+ * If we're merging the current entry into a subsequent entry,
+ * remove the subsequent entry from the addr and size queues
+ * and merge.
+ */
+ SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element);
+ SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
+ SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);
+
+ elp->len += elp_tmp->len;
+ }
+
+ /* Insert in the correct place in the size queues. */
+ __env_size_insert(head, elp);
+}
+
+/*
+ * __env_alloc_extend --
+ * Extend a previously allocated chunk at the end of a region.
+ *
+ * PUBLIC: int __env_alloc_extend __P((REGINFO *, void *, size_t *));
+ */
+int
+__env_alloc_extend(infop, ptr, lenp)
+ REGINFO *infop;
+ void *ptr;
+ size_t *lenp;
+{
+ ALLOC_ELEMENT *elp, *elp_tmp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ SIZEQ_HEAD *q;
+ size_t len, tlen;
+ u_int8_t i, *p;
+ int ret;
+
+ env = infop->env;
+
+ DB_ASSERT(env, !F_ISSET(env, ENV_PRIVATE));
+
+#ifdef HAVE_MUTEX_SUPPORT
+ MUTEX_REQUIRED(env, infop->mtx_alloc);
+#endif
+
+ head = infop->head;
+
+ p = ptr;
+ len = *lenp;
+ elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT));
+#ifdef DIAGNOSTIC
+ /* Check the guard byte. */
+ DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE);
+#endif
+
+ /* See if there is anything left in the region. */
+again: if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL &&
+ elp_tmp->ulen == 0 &&
+ (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) {
+ /*
+ * If we're merging the current entry into a subsequent entry,
+ * remove the subsequent entry from the addr and size queues
+ * and merge.
+ */
+ SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element);
+ SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
+ SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);
+ if (elp_tmp->len < len + SHALLOC_FRAGMENT) {
+ elp->len += elp_tmp->len;
+ if (elp_tmp->len < len)
+ len -= (size_t)elp_tmp->len;
+ else
+ len = 0;
+ } else {
+ tlen = (size_t)elp_tmp->len;
+ elp_tmp = (ALLOC_ELEMENT *) ((u_int8_t *)elp_tmp + len);
+ elp_tmp->len = tlen - len;
+ elp_tmp->ulen = 0;
+ elp->len += len;
+ len = 0;
+
+			/* The fragment follows the chunk on the address queue. */
+ SH_TAILQ_INSERT_AFTER(
+ &head->addrq, elp, elp_tmp, addrq, __alloc_element);
+
+ /* Insert the frag into the correct size queue. */
+ __env_size_insert(head, elp_tmp);
+ }
+ } else if (elp_tmp != NULL) {
+ __db_errx(env, DB_STR("1583", "block not at end of region"));
+ return (__env_panic(env, EINVAL));
+ }
+ if (len == 0)
+ goto done;
+
+ if ((ret = __env_region_extend(env, infop)) != 0) {
+ if (ret != ENOMEM)
+ return (ret);
+ goto done;
+ }
+ goto again;
+
+done: elp->ulen = elp->len - sizeof(ALLOC_ELEMENT);
+#ifdef DIAGNOSTIC
+ elp->ulen -= sizeof(uintmax_t);
+ /* There was room for the guard byte in the chunk that came in. */
+ p[elp->ulen] = GUARD_BYTE;
+#endif
+ *lenp -= len;
+ infop->allocated += *lenp;
+ if (F_ISSET(infop, REGION_SHARED))
+ env->reginfo->allocated += *lenp;
+ return (0);
+}
+
+/*
+ * __env_size_insert --
+ * Insert into the correct place in the size queues.
+ */
+static void
+__env_size_insert(head, elp)
+ ALLOC_LAYOUT *head;
+ ALLOC_ELEMENT *elp;
+{
+ SIZEQ_HEAD *q;
+ ALLOC_ELEMENT *elp_tmp;
+ u_int i;
+
+ /* Find the appropriate queue for the chunk. */
+ SET_QUEUE_FOR_SIZE(head, q, i, elp->len);
+
+ /* Find the correct slot in the size queue. */
+ SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element)
+ if (elp->len >= elp_tmp->len)
+ break;
+ if (elp_tmp == NULL)
+ SH_TAILQ_INSERT_TAIL(q, elp, sizeq);
+ else
+ SH_TAILQ_INSERT_BEFORE(q, elp_tmp, elp, sizeq, __alloc_element);
+}
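+
+/*
+ * Illustrative example: if sizeq[2] holds free chunks of lengths
+ * {4096, 3000, 2500} and a 3500-byte chunk is freed, it is inserted
+ * before the 3000-byte entry. Queues stay sorted largest to smallest,
+ * which is what lets the search loop in __env_alloc stop as soon as a
+ * chunk is too small.
+ */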
+
+/*
+ * __env_region_extend --
+ * Extend a region.
+ *
+ * PUBLIC: int __env_region_extend __P((ENV *, REGINFO *));
+ */
+int
+__env_region_extend(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ ALLOC_ELEMENT *elp;
+ REGION *rp;
+ int ret;
+
+ DB_ASSERT(env, !F_ISSET(env, ENV_PRIVATE));
+
+ ret = 0;
+ rp = infop->rp;
+ if (rp->size >= rp->max)
+ return (ENOMEM);
+ elp = (ALLOC_ELEMENT *)((u_int8_t *)infop->addr + rp->size);
+ if (rp->size + rp->alloc > rp->max)
+ rp->alloc = rp->max - rp->size;
+ rp->size += rp->alloc;
+ rp->size = (size_t)ALIGNP_INC(rp->size, sizeof(size_t));
+ if (rp->max - rp->size <= SHALLOC_FRAGMENT)
+ rp->size = rp->max;
+ if (infop->fhp &&
+ (ret = __db_file_extend(env, infop->fhp, rp->size)) != 0)
+ return (ret);
+ elp->len = rp->alloc;
+ elp->ulen = 0;
+#ifdef DIAGNOSTIC
+ *(u_int8_t *)(elp+1) = GUARD_BYTE;
+#endif
+
+ SH_TAILQ_INSERT_TAIL(&((ALLOC_LAYOUT *)infop->head)->addrq, elp, addrq);
+ __env_alloc_free(infop, elp + 1);
+ if (rp->alloc < MEGABYTE)
+ rp->alloc += rp->size;
+ if (rp->alloc > MEGABYTE)
+ rp->alloc = MEGABYTE;
+ return (ret);
+}
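+
+/*
+ * Illustrative growth pattern: while rp->alloc is below MEGABYTE, each
+ * successful extension adds the current region size to the next
+ * increment, and once it exceeds MEGABYTE it is capped there, so a
+ * region grows roughly geometrically at first and then 1MB at a time
+ * until rp->max is reached.
+ */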
+
+/*
+ * __env_elem_size --
+ * Return the size of an allocated element.
+ * PUBLIC: uintmax_t __env_elem_size __P((ENV *, void *));
+ */
+uintmax_t
+__env_elem_size(env, p)
+ ENV *env;
+ void *p;
+{
+ ALLOC_ELEMENT *elp;
+ uintmax_t size;
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ size = *((uintmax_t *)p - 1);
+ size -= sizeof(uintmax_t);
+ } else {
+ elp = (ALLOC_ELEMENT *)((u_int8_t *)p - sizeof(ALLOC_ELEMENT));
+ size = elp->ulen;
+ }
+ return (size);
+}
+
+/*
+ * __env_get_chunk --
+ * Return the next chunk allocated in a private region.
+ * PUBLIC: void * __env_get_chunk __P((REGINFO *, void **, uintmax_t *));
+ */
+void *
+__env_get_chunk(infop, nextp, sizep)
+ REGINFO *infop;
+ void **nextp;
+ uintmax_t *sizep;
+{
+ REGION_MEM *mem;
+
+ if (infop->mem == NULL)
+ return (NULL);
+ if (*nextp == NULL)
+ *nextp = infop->mem;
+ mem = *(REGION_MEM **)nextp;
+ *nextp = mem->next;
+
+ *sizep = __env_elem_size(infop->env, mem);
+ *sizep -= sizeof(*mem);
+
+ return ((void *)(mem + 1));
+}
+
+#ifdef HAVE_STATISTICS
+/*
+ * __env_alloc_print --
+ * Display the lists of memory chunks.
+ *
+ * PUBLIC: void __env_alloc_print __P((REGINFO *, u_int32_t));
+ */
+void
+__env_alloc_print(infop, flags)
+ REGINFO *infop;
+ u_int32_t flags;
+{
+ ALLOC_ELEMENT *elp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ u_int i;
+
+ env = infop->env;
+ head = infop->head;
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ return;
+
+ __db_msg(env,
+ "Region allocations: %lu allocations, %lu failures, %lu frees, %lu longest",
+ (u_long)head->success, (u_long)head->failure, (u_long)head->freed,
+ (u_long)head->longest);
+
+ if (!LF_ISSET(DB_STAT_ALL))
+ return;
+
+ __db_msg(env, "%s", "Allocations by power-of-two sizes:");
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i)
+ __db_msg(env, "%3dKB\t%lu",
+ (1024 << i) / 1024, (u_long)head->pow2_size[i]);
+
+ if (!LF_ISSET(DB_STAT_ALLOC))
+ return;
+ /*
+	 * We don't normally display the list of address/chunk pairs; a few
+ * thousand lines of output is too voluminous for even DB_STAT_ALL.
+ */
+ __db_msg(env,
+ "Allocation list by address, offset: {chunk length, user length}");
+ SH_TAILQ_FOREACH(elp, &head->addrq, addrq, __alloc_element)
+ __db_msg(env, "\t%#lx, %lu {%lu, %lu}",
+ P_TO_ULONG(elp), (u_long)R_OFFSET(infop, elp),
+ (u_long)elp->len, (u_long)elp->ulen);
+
+ __db_msg(env, "Allocation free list by size: KB {chunk length}");
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i) {
+ __db_msg(env, "%3dKB", (1024 << i) / 1024);
+ SH_TAILQ_FOREACH(elp, &head->sizeq[i], sizeq, __alloc_element)
+ __db_msg(env,
+ "\t%#lx {%lu}", P_TO_ULONG(elp), (u_long)elp->len);
+ }
+}
+#endif
diff --git a/src/env/env_backup.c b/src/env/env_backup.c
new file mode 100644
index 00000000..9c79dbb4
--- /dev/null
+++ b/src/env/env_backup.c
@@ -0,0 +1,166 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __env_backup_alloc __P((DB_ENV *));
+
+static int
+__env_backup_alloc(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+
+ env = dbenv->env;
+ if (env->backup_handle != NULL)
+ return (0);
+ return (__os_calloc(env, 1,
+ sizeof(*env->backup_handle), &env->backup_handle));
+}
+
+/*
+ * __env_get_backup_config --
+ *
+ * PUBLIC: int __env_get_backup_config __P((DB_ENV *,
+ * PUBLIC: DB_BACKUP_CONFIG, u_int32_t*));
+ */
+int
+__env_get_backup_config(dbenv, config, valuep)
+ DB_ENV *dbenv;
+ DB_BACKUP_CONFIG config;
+ u_int32_t *valuep;
+{
+ DB_BACKUP *backup;
+
+ backup = dbenv->env->backup_handle;
+ if (backup == NULL)
+ return (EINVAL);
+
+ switch (config) {
+ case DB_BACKUP_WRITE_DIRECT:
+ *valuep = F_ISSET(backup, BACKUP_WRITE_DIRECT);
+ break;
+
+ case DB_BACKUP_READ_COUNT:
+ *valuep = backup->read_count;
+ break;
+
+ case DB_BACKUP_READ_SLEEP:
+ *valuep = backup->read_sleep;
+ break;
+
+ case DB_BACKUP_SIZE:
+ *valuep = backup->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __env_set_backup_config --
+ *
+ * PUBLIC: int __env_set_backup_config __P((DB_ENV *,
+ * PUBLIC: DB_BACKUP_CONFIG, u_int32_t));
+ */
+int
+__env_set_backup_config(dbenv, config, value)
+ DB_ENV *dbenv;
+ DB_BACKUP_CONFIG config;
+ u_int32_t value;
+{
+ DB_BACKUP *backup;
+ int ret;
+
+ if ((ret = __env_backup_alloc(dbenv)) != 0)
+ return (ret);
+
+ backup = dbenv->env->backup_handle;
+ switch (config) {
+ case DB_BACKUP_WRITE_DIRECT:
+ if (value == 0)
+ F_CLR(backup, BACKUP_WRITE_DIRECT);
+ else
+ F_SET(backup, BACKUP_WRITE_DIRECT);
+ break;
+
+ case DB_BACKUP_READ_COUNT:
+ backup->read_count = value;
+ break;
+
+ case DB_BACKUP_READ_SLEEP:
+ backup->read_sleep = value;
+ break;
+
+ case DB_BACKUP_SIZE:
+ backup->size = value;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __env_get_backup_callbacks --
+ *
+ * PUBLIC: int __env_get_backup_callbacks __P((DB_ENV *,
+ * PUBLIC: int (**)(DB_ENV *, const char *, const char *, void **),
+ * PUBLIC: int (**)(DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ * PUBLIC: int (**)(DB_ENV *, const char *, void *)));
+ */
+int
+__env_get_backup_callbacks(dbenv, openp, writep, closep)
+ DB_ENV *dbenv;
+ int (**openp)(DB_ENV *, const char *, const char *, void **);
+ int (**writep)(DB_ENV *,
+ u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *);
+ int (**closep)(DB_ENV *, const char *, void *);
+{
+ DB_BACKUP *backup;
+
+ backup = dbenv->env->backup_handle;
+ if (backup == NULL)
+ return (EINVAL);
+
+ *openp = backup->open;
+ *writep = backup->write;
+ *closep = backup->close;
+ return (0);
+}
+
+/*
+ * __env_set_backup_callbacks --
+ *
+ * PUBLIC: int __env_set_backup_callbacks __P((DB_ENV *,
+ * PUBLIC: int (*)(DB_ENV *, const char *, const char *, void **),
+ * PUBLIC: int (*)(DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ * PUBLIC: int (*)(DB_ENV *, const char *, void *)));
+ */
+int
+__env_set_backup_callbacks(dbenv, open_func, write_func, close_func)
+ DB_ENV *dbenv;
+ int (*open_func)(DB_ENV *, const char *, const char *, void **);
+ int (*write_func)(DB_ENV *,
+ u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *);
+ int (*close_func)(DB_ENV *, const char *, void *);
+{
+ DB_BACKUP *backup;
+ int ret;
+
+ if ((ret = __env_backup_alloc(dbenv)) != 0)
+ return (ret);
+
+ backup = dbenv->env->backup_handle;
+ backup->open = open_func;
+ backup->write = write_func;
+ backup->close = close_func;
+ return (0);
+}
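+
+/*
+ * Illustrative application usage (hypothetical callback names): an
+ * application streaming hot-backup data to its own storage might call
+ *
+ *	dbenv->set_backup_callbacks(dbenv,
+ *	    my_backup_open, my_backup_write, my_backup_close);
+ *
+ * where my_backup_open creates a destination handle per file,
+ * my_backup_write receives the file offset (split into two u_int32_t
+ * halves), a byte count and the buffer, and my_backup_close releases
+ * the handle.
+ */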
diff --git a/src/env/env_config.c b/src/env/env_config.c
new file mode 100644
index 00000000..57496909
--- /dev/null
+++ b/src/env/env_config.c
@@ -0,0 +1,737 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc_auto/db_ext.h"
+
+/*
+ * DB_CONFIG lines are processed primarily by interpreting the command
+ * description tables initialized below.
+ *
+ * Most DB_CONFIG commands consist of a single token name followed by one or two
+ * integer or string arguments. These commands are described by entries in the
+ * config_descs[] array.
+ *
+ * The remaining, usually more complex, DB_CONFIG commands are handled by small
+ * code blocks in __config_parse(). Many of those commands need to translate
+ * option names to the integer values needed by the API configuration functions.
+ * Below the __config_descs[] initialization there are many FN array
+ * initializations which provide the mapping between user-specifiable strings
+ * and internally-used integer values. Typically there is one of these mappings
+ * defined for each complex DB_CONFIG command. Use __db_name_to_val()
+ * to translate a string to its integer value.
+ */
+typedef enum {
+ CFG_INT, /* The argument is 1 signed integer. */
+ CFG_LONG, /* The argument is 1 signed long int. */
+ CFG_UINT, /* The argument is 1 unsigned integer. */
+ CFG_2INT, /* The arguments are 2 signed integers. */
+ CFG_2UINT, /* The arguments are 2 unsigned integers. */
+ CFG_STRING /* The rest of the line is a string. */
+} __db_config_type;
+
+typedef struct __db_config_desc {
+ char *name; /* The name of a simple DB_CONFIG command. */
+ __db_config_type type; /* The enum describing its argument type(s). */
+ int (*func)(); /* The function to call with the argument(s). */
+} CFG_DESC;
+
+/* These typedefs help eliminate lint warnings where "func" above is used. */
+typedef int (*CFG_FUNC_STRING) __P((DB_ENV *, const char *));
+typedef int (*CFG_FUNC_INT) __P((DB_ENV *, int));
+typedef int (*CFG_FUNC_LONG) __P((DB_ENV *, long));
+typedef int (*CFG_FUNC_UINT) __P((DB_ENV *, u_int32_t));
+typedef int (*CFG_FUNC_2INT) __P((DB_ENV *, int, int));
+typedef int (*CFG_FUNC_2UINT) __P((DB_ENV *, u_int32_t, u_int32_t));
+
+/*
+ * This table lists the simple DB_CONFIG configuration commands. It is sorted by
+ * the command name, so that __config_scan() can bsearch() it. After making an
+ * addition to this table, please be sure that it remains sorted. With vi or
+ * vim, the following command line will do it:
+ * :/^static const CFG_DESC config_descs/+1, /^}/-1 ! sort
+ *
+ * This table can contain aliases. Aliases have different names with identical
+ * types and functions. At this time there are four aliases:
+ * Outdated Name Current Name
+ * db_data_dir set_data_dir
+ * db_log_dir set_lg_dir
+ * db_tmp_dir set_tmp_dir
+ * set_tas_spins mutex_set_tas_spins
+ */
+static const CFG_DESC config_descs[] = {
+ { "add_data_dir", CFG_STRING, __env_add_data_dir },
+ { "db_data_dir", CFG_STRING, __env_set_data_dir },
+ { "db_log_dir", CFG_STRING, __log_set_lg_dir },
+ { "db_tmp_dir", CFG_STRING, __env_set_tmp_dir },
+ { "mutex_set_align", CFG_UINT, __mutex_set_align },
+ { "mutex_set_increment", CFG_UINT, __mutex_set_increment },
+ { "mutex_set_init", CFG_UINT, __mutex_set_init },
+ { "mutex_set_max", CFG_UINT, __mutex_set_max },
+ { "mutex_set_tas_spins", CFG_UINT, __mutex_set_tas_spins },
+ { "rep_set_clockskew", CFG_2UINT, __rep_set_clockskew },
+ { "rep_set_limit", CFG_2UINT, __rep_set_limit },
+ { "rep_set_nsites", CFG_UINT, __rep_set_nsites_pp },
+ { "rep_set_priority", CFG_UINT, __rep_set_priority },
+ { "rep_set_request", CFG_2UINT, __rep_set_request },
+ { "set_cache_max", CFG_2UINT, __memp_set_cache_max },
+ { "set_create_dir", CFG_STRING, __env_set_create_dir },
+ { "set_data_dir", CFG_STRING, __env_set_data_dir },
+ { "set_data_len", CFG_UINT, __env_set_data_len },
+ { "set_intermediate_dir_mode",CFG_STRING, __env_set_intermediate_dir_mode },
+ { "set_lg_bsize", CFG_UINT, __log_set_lg_bsize },
+ { "set_lg_dir", CFG_STRING, __log_set_lg_dir },
+ { "set_lg_filemode", CFG_INT, __log_set_lg_filemode },
+ { "set_lg_max", CFG_UINT, __log_set_lg_max },
+ { "set_lg_regionmax", CFG_UINT, __log_set_lg_regionmax },
+ { "set_lk_max_lockers", CFG_UINT, __lock_set_lk_max_lockers },
+ { "set_lk_max_locks", CFG_UINT, __lock_set_lk_max_locks },
+ { "set_lk_max_objects", CFG_UINT, __lock_set_lk_max_objects },
+ { "set_lk_partitions", CFG_UINT, __lock_set_lk_partitions },
+ { "set_lk_tablesize", CFG_UINT, __lock_set_lk_tablesize },
+ { "set_memory_max", CFG_2UINT, __env_set_memory_max },
+ { "set_metadata_dir", CFG_STRING, __env_set_metadata_dir },
+ { "set_mp_max_openfd", CFG_INT, __memp_set_mp_max_openfd },
+ { "set_mp_max_write", CFG_2INT, __memp_set_mp_max_write },
+ { "set_mp_mmapsize", CFG_UINT, __memp_set_mp_mmapsize },
+ { "set_mp_mtxcount", CFG_UINT, __memp_set_mp_mtxcount },
+ { "set_mp_pagesize", CFG_UINT, __memp_set_mp_pagesize },
+ { "set_shm_key", CFG_LONG, __env_set_shm_key },
+ { "set_tas_spins", CFG_UINT, __mutex_set_tas_spins },
+ { "set_thread_count", CFG_UINT, __env_set_thread_count },
+ { "set_tmp_dir", CFG_STRING, __env_set_tmp_dir },
+ { "set_tx_max", CFG_UINT, __txn_set_tx_max }
+};
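+
+/*
+ * For example (illustrative), these DB_CONFIG lines would be handled
+ * through the table above:
+ *
+ *	set_lg_dir /var/dbenv/logs	CFG_STRING -> __log_set_lg_dir
+ *	set_tx_max 500			CFG_UINT   -> __txn_set_tx_max
+ *	set_cache_max 1 0		CFG_2UINT  -> __memp_set_cache_max
+ */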
+
+/*
+ * Here are the option-name to option-value mappings used by complex commands.
+ */
+static const FN config_mem_init[] = {
+ { (u_int32_t) DB_MEM_LOCK, "DB_MEM_LOCK" },
+ { (u_int32_t) DB_MEM_LOCKER, "DB_MEM_LOCKER" },
+ { (u_int32_t) DB_MEM_LOCKOBJECT, "DB_MEM_LOCKOBJECT" },
+ { (u_int32_t) DB_MEM_TRANSACTION, "DB_MEM_TRANSACTION" },
+ { (u_int32_t) DB_MEM_THREAD, "DB_MEM_THREAD" },
+ { (u_int32_t) DB_MEM_LOGID, "DB_MEM_LOGID" },
+ { 0, NULL }
+};
+
+static const FN config_rep_config[] = {
+ { DB_REP_CONF_AUTOINIT, "db_rep_conf_autoinit" },
+ { DB_REP_CONF_AUTOROLLBACK, "db_rep_conf_autorollback" },
+ { DB_REP_CONF_BULK, "db_rep_conf_bulk" },
+ { DB_REP_CONF_DELAYCLIENT, "db_rep_conf_delayclient" },
+ { DB_REP_CONF_INMEM, "db_rep_conf_inmem" },
+ { DB_REP_CONF_LEASE, "db_rep_conf_lease" },
+ { DB_REP_CONF_NOWAIT, "db_rep_conf_nowait" },
+ { DB_REPMGR_CONF_2SITE_STRICT, "db_repmgr_conf_2site_strict" },
+ { DB_REPMGR_CONF_ELECTIONS, "db_repmgr_conf_elections" },
+ { 0, NULL }
+};
+
+static const FN config_rep_timeout[] = {
+ { DB_REP_ACK_TIMEOUT, "db_rep_ack_timeout" },
+ { DB_REP_CHECKPOINT_DELAY, "db_rep_checkpoint_delay" },
+ { DB_REP_CONNECTION_RETRY, "db_rep_connection_retry" },
+ { DB_REP_ELECTION_TIMEOUT, "db_rep_election_timeout" },
+ { DB_REP_ELECTION_RETRY, "db_rep_election_retry" },
+ { DB_REP_FULL_ELECTION_TIMEOUT, "db_rep_full_election_timeout" },
+ { DB_REP_HEARTBEAT_MONITOR, "db_rep_heartbeat_monitor" },
+ { DB_REP_HEARTBEAT_SEND, "db_rep_heartbeat_send" },
+ { DB_REP_LEASE_TIMEOUT, "db_rep_lease_timeout" },
+ { 0, NULL }
+};
+
+static const FN config_repmgr_ack_policy[] = {
+ { DB_REPMGR_ACKS_ALL, "db_repmgr_acks_all" },
+ { DB_REPMGR_ACKS_ALL_AVAILABLE, "db_repmgr_acks_all_available" },
+ { DB_REPMGR_ACKS_ALL_PEERS, "db_repmgr_acks_all_peers" },
+ { DB_REPMGR_ACKS_NONE, "db_repmgr_acks_none" },
+ { DB_REPMGR_ACKS_ONE, "db_repmgr_acks_one" },
+ { DB_REPMGR_ACKS_ONE_PEER, "db_repmgr_acks_one_peer" },
+ { DB_REPMGR_ACKS_QUORUM, "db_repmgr_acks_quorum" },
+ { 0, NULL }
+};
+
+static const FN config_repmgr_site[] = {
+ { DB_BOOTSTRAP_HELPER, "db_bootstrap_helper" },
+ { DB_GROUP_CREATOR, "db_group_creator" },
+ { DB_LEGACY, "db_legacy" },
+ { DB_LOCAL_SITE, "db_local_site" },
+ { DB_REPMGR_PEER, "db_repmgr_peer" },
+ { 0, NULL }
+};
+
+static const FN config_set_flags[] = {
+ { DB_AUTO_COMMIT, "db_auto_commit" },
+ { DB_CDB_ALLDB, "db_cdb_alldb" },
+ { DB_DIRECT_DB, "db_direct_db" },
+ { DB_DSYNC_DB, "db_dsync_db" },
+ { DB_MULTIVERSION, "db_multiversion" },
+ { DB_NOLOCKING, "db_nolocking" },
+ { DB_NOMMAP, "db_nommap" },
+ { DB_NOPANIC, "db_nopanic" },
+ { DB_OVERWRITE, "db_overwrite" },
+ { DB_REGION_INIT, "db_region_init" },
+ { DB_TIME_NOTGRANTED, "db_time_notgranted" },
+ { DB_TXN_NOSYNC, "db_txn_nosync" },
+ { DB_TXN_NOWAIT, "db_txn_nowait" },
+ { DB_TXN_SNAPSHOT, "db_txn_snapshot" },
+ { DB_TXN_WRITE_NOSYNC, "db_txn_write_nosync" },
+ { DB_YIELDCPU, "db_yieldcpu" },
+ { 0, NULL }
+};
+
+static const FN config_set_flags_forlog[] = {
+ { DB_LOG_DIRECT, "db_direct_log" },
+ { DB_LOG_DSYNC, "db_dsync_log" },
+ { DB_LOG_AUTO_REMOVE, "db_log_autoremove" },
+ { DB_LOG_IN_MEMORY, "db_log_inmemory" },
+ { 0, NULL }
+};
+
+static const FN config_log_set_config[] = {
+ { DB_LOG_DIRECT, "db_log_direct" },
+ { DB_LOG_DSYNC, "db_log_dsync" },
+ { DB_LOG_AUTO_REMOVE, "db_log_auto_remove" },
+ { DB_LOG_IN_MEMORY, "db_log_in_memory" },
+ { DB_LOG_ZERO, "db_log_zero" },
+ { 0, NULL }
+};
+
+static const FN config_set_lk_detect[] = {
+ { DB_LOCK_DEFAULT, "db_lock_default" },
+ { DB_LOCK_EXPIRE, "db_lock_expire" },
+ { DB_LOCK_MAXLOCKS, "db_lock_maxlocks" },
+ { DB_LOCK_MAXWRITE, "db_lock_maxwrite" },
+ { DB_LOCK_MINLOCKS, "db_lock_minlocks" },
+ { DB_LOCK_MINWRITE, "db_lock_minwrite" },
+ { DB_LOCK_OLDEST, "db_lock_oldest" },
+ { DB_LOCK_RANDOM, "db_lock_random" },
+ { DB_LOCK_YOUNGEST, "db_lock_youngest" },
+ { 0, NULL }
+};
+
+static const FN config_set_open_flags[] = {
+ { DB_INIT_REP, "db_init_rep" },
+ { DB_PRIVATE, "db_private" },
+ { DB_REGISTER, "db_register" },
+ { DB_THREAD, "db_thread" },
+ { 0, NULL }
+};
+
+static const FN config_set_verbose[] = {
+ { DB_VERB_BACKUP, "db_verb_backup" },
+ { DB_VERB_DEADLOCK, "db_verb_deadlock" },
+ { DB_VERB_FILEOPS, "db_verb_fileops" },
+ { DB_VERB_FILEOPS_ALL, "db_verb_fileops_all" },
+ { DB_VERB_RECOVERY, "db_verb_recovery" },
+ { DB_VERB_REGISTER, "db_verb_register" },
+ { DB_VERB_REPLICATION, "db_verb_replication" },
+ { DB_VERB_REP_ELECT, "db_verb_rep_elect" },
+ { DB_VERB_REP_LEASE, "db_verb_rep_lease" },
+ { DB_VERB_REP_MISC, "db_verb_rep_misc" },
+ { DB_VERB_REP_MSGS, "db_verb_rep_msgs" },
+ { DB_VERB_REP_SYNC, "db_verb_rep_sync" },
+ { DB_VERB_REP_SYSTEM, "db_verb_rep_system" },
+ { DB_VERB_REP_TEST, "db_verb_rep_test" },
+ { DB_VERB_REPMGR_CONNFAIL, "db_verb_repmgr_connfail" },
+ { DB_VERB_REPMGR_MISC, "db_verb_repmgr_misc" },
+ { DB_VERB_WAITSFOR, "db_verb_waitsfor" },
+ { 0, NULL}
+};
+
+static int __config_parse __P((ENV *, char *, int));
+static int __config_scan __P((char *, char **, const CFG_DESC **));
+static int cmp_cfg_name __P((const void *, const void *element));
+
+/*
+ * __env_read_db_config --
+ * Read the DB_CONFIG file.
+ *
+ * PUBLIC: int __env_read_db_config __P((ENV *));
+ */
+int
+__env_read_db_config(env)
+ ENV *env;
+{
+ FILE *fp;
+ int lc, ret;
+ char *p, buf[256];
+
+ /* Parse the config file. */
+ p = NULL;
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, "DB_CONFIG", NULL, &p)) != 0)
+ return (ret);
+ if (p == NULL)
+ fp = NULL;
+ else {
+ fp = fopen(p, "r");
+ __os_free(env, p);
+ }
+
+ if (fp == NULL)
+ return (0);
+
+ for (lc = 1; fgets(buf, sizeof(buf), fp) != NULL; ++lc) {
+ if ((p = strchr(buf, '\n')) == NULL)
+ p = buf + strlen(buf);
+ if (p > buf && p[-1] == '\r')
+ --p;
+ *p = '\0';
+ for (p = buf; *p != '\0' && isspace((int)*p); ++p)
+ ;
+ if (*p == '\0' || *p == '#')
+ continue;
+
+ if ((ret = __config_parse(env, p, lc)) != 0)
+ break;
+ }
+ (void)fclose(fp);
+
+ return (ret);
+}
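+
+/*
+ * For illustration only (not part of the library source): the DB_CONFIG
+ * file read above is plain text with one name-value pair per line; blank
+ * lines and lines starting with '#' are skipped.  A hypothetical file:
+ *
+ *	# 0GB + 1MB of cache in a single region
+ *	set_cachesize 0 1048576 1
+ *	set_lg_dir logs
+ *	set_flags db_txn_nosync on
+ */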
+
+#undef CFG_GET_INT
+#define CFG_GET_INT(s, vp) do { \
+ int __ret; \
+ if ((__ret = \
+ __db_getlong(env->dbenv, NULL, s, 0, INT_MAX, vp)) != 0) \
+ return (__ret); \
+} while (0)
+#undef CFG_GET_LONG
+#define CFG_GET_LONG(s, vp) do { \
+ int __ret; \
+ if ((__ret = \
+ __db_getlong(env->dbenv, NULL, s, 0, LONG_MAX, vp)) != 0) \
+ return (__ret); \
+} while (0)
+#undef CFG_GET_UINT
+#define CFG_GET_UINT(s, vp) do { \
+ int __ret; \
+ if ((__ret = \
+ __db_getulong(env->dbenv, NULL, s, 0, UINT_MAX, vp)) != 0) \
+ return (__ret); \
+} while (0)
+#undef CFG_GET_UINT32
+#define CFG_GET_UINT32(s, vp) do { \
+ if (__db_getulong(env->dbenv, NULL, s, 0, UINT32_MAX, vp) != 0) \
+ return (EINVAL); \
+} while (0)
+
+/* This is the maximum number of tokens in a DB_CONFIG line. */
+#undef CFG_SLOTS
+#define CFG_SLOTS 10
+
+/*
+ * __config_parse --
+ * Parse a single NAME VALUE pair.
+ */
+static int
+__config_parse(env, s, lc)
+ ENV *env;
+ char *s;
+ int lc;
+{
+ DB_ENV *dbenv;
+ DB_SITE *site;
+ u_long uv1, uv2;
+ long lv1, lv2;
+ u_int port;
+ int i, nf, onoff, bad, ret, t_ret;
+ char *argv[CFG_SLOTS];
+ const CFG_DESC *desc;
+
+ bad = 0;
+ dbenv = env->dbenv;
+
+ /*
+ * Split the input line in 's' into its argv-like components, returning
+ * the number of fields. If the command is one of the "simple" ones in
+ * config_descs, also return its command descriptor.
+ */
+ if ((nf = __config_scan(s, argv, &desc)) < 2) {
+format: __db_errx(env, DB_STR_A("1584",
+ "line %d: %s: incorrect name-value pair", "%d %s"),
+ lc, argv[0]);
+ return (EINVAL);
+ }
+
+ /* Handle simple configuration lines here. */
+ if (desc != NULL) {
+ ret = 0;
+ switch (desc->type) {
+ case CFG_INT: /* <command> <int> */
+ if (nf != 2)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ ret = ((CFG_FUNC_INT)desc->func)(dbenv, (int) lv1);
+ break;
+
+ case CFG_LONG: /* <command> <long int> */
+ if (nf != 2)
+ goto format;
+ CFG_GET_LONG(argv[1], &lv1);
+ ret = ((CFG_FUNC_LONG)desc->func)(dbenv, lv1);
+ break;
+
+ case CFG_UINT: /* <command> <uint> */
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT(argv[1], &uv1);
+ ret = ((CFG_FUNC_UINT)desc->func)
+ (dbenv, (u_int32_t) uv1);
+ break;
+
+ case CFG_2INT: /* <command> <int1> <int2> */
+ if (nf != 3)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ CFG_GET_INT(argv[2], &lv2);
+ ret = ((CFG_FUNC_2INT)desc->func)
+ (dbenv, (int) lv1, (int) lv2);
+ break;
+
+ case CFG_2UINT: /* <command> <uint1> <uint2> */
+ if (nf != 3)
+ goto format;
+ CFG_GET_UINT(argv[1], &uv1);
+ CFG_GET_UINT(argv[2], &uv2);
+ ret = ((CFG_FUNC_2UINT)desc->func)
+ (dbenv, (u_int32_t) uv1, (u_int32_t) uv2);
+ break;
+
+ case CFG_STRING: /* <command> <rest of line as string> */
+ ret = ((CFG_FUNC_STRING) desc->func)(dbenv, argv[1]);
+ break;
+ }
+ return (ret);
+ }
+
+ /*
+ * The commands not covered in config_descs are handled below, each
+	 * with its own command-specific block of code. Most of them are
+	 * fairly similar to each other, but not quite similar enough to
+	 * warrant making them all table-driven too.
+ */
+
+ /* set_memory_init db_mem_XXX <unsigned> */
+ if (strcasecmp(argv[0], "set_memory_init") == 0) {
+ if (nf != 3)
+ goto format;
+ if ((lv1 = __db_name_to_val(config_mem_init, argv[1])) == -1)
+ goto format;
+ CFG_GET_UINT32(argv[2], &uv2);
+ return (__env_set_memory_init(dbenv,
+ (DB_MEM_CONFIG) lv1, (u_int32_t)uv2));
+ }
+
+ /* rep_set_config { db_rep_conf_XXX | db_repmgr_conf_XXX } [on|off] */
+ if (strcasecmp(argv[0], "rep_set_config") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 = __db_name_to_val(config_rep_config, argv[1])) == -1)
+ goto format;
+ return (__rep_set_config(dbenv, (u_int32_t)lv1, onoff));
+ }
+
+ /* rep_set_timeout db_rep_XXX <unsigned> */
+ if (strcasecmp(argv[0], "rep_set_timeout") == 0) {
+ if (nf != 3)
+ goto format;
+ if ((lv1 = __db_name_to_val(config_rep_timeout, argv[1])) == -1)
+ goto format;
+ CFG_GET_UINT32(argv[2], &uv2);
+ return (__rep_set_timeout(dbenv, lv1, (db_timeout_t)uv2));
+ }
+
+ /* repmgr_set_ack_policy db_repmgr_acks_XXX */
+ if (strcasecmp(argv[0], "repmgr_set_ack_policy") == 0) {
+ if (nf != 2)
+ goto format;
+ if ((lv1 =
+ __db_name_to_val(config_repmgr_ack_policy, argv[1])) == -1)
+ goto format;
+ return (__repmgr_set_ack_policy(dbenv, lv1));
+ }
+
+ /*
+ * Configure name/value pairs of config information for a site (local or
+ * remote).
+ *
+ * repmgr_site host port [which value (on | off | unsigned)] ...
+ */
+ if (strcasecmp(argv[0], "repmgr_site") == 0) {
+ if (nf < 3 || (nf % 2) == 0)
+ goto format;
+ CFG_GET_UINT(argv[2], &uv2);
+ port = (u_int)uv2;
+
+ if ((ret = __repmgr_site(dbenv, argv[1], port, &site, 0)) != 0)
+ return (ret);
+#ifdef HAVE_REPLICATION_THREADS
+ for (i = 3; i < nf; i += 2) {
+ if ((lv1 = __db_name_to_val(
+ config_repmgr_site, argv[i])) == -1) {
+ bad = 1;
+ break;
+ }
+
+ if (strcasecmp(argv[i + 1], "on") == 0)
+ uv2 = 1;
+ else if (strcasecmp(argv[i + 1], "off") == 0)
+ uv2 = 0;
+ else
+ CFG_GET_UINT32(argv[i + 1], &uv2);
+ if ((ret = __repmgr_site_config(site,
+ (u_int32_t)lv1, (u_int32_t)uv2)) != 0)
+ break;
+ }
+ if ((t_ret = __repmgr_site_close(site)) != 0 && ret == 0)
+ ret = t_ret;
+ if (bad)
+ goto format;
+#else
+		/* If repmgr is not built, __repmgr_site() returns DB_OPNOTSUP. */
+ COMPQUIET(i, 0);
+ COMPQUIET(t_ret, 0);
+ DB_ASSERT(env, 0);
+#endif
+ return (ret);
+ }
+
+ /* set_cachesize <unsigned gbytes> <unsigned bytes> <int ncaches> */
+ if (strcasecmp(argv[0], "set_cachesize") == 0) {
+ if (nf != 4)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ CFG_GET_UINT32(argv[2], &uv2);
+ CFG_GET_INT(argv[3], &lv1);
+ return (__memp_set_cachesize(
+ dbenv, (u_int32_t)uv1, (u_int32_t)uv2, (int)lv1));
+ }
+
+ /* set_intermediate_dir <integer dir permission> */
+ if (strcasecmp(argv[0], "set_intermediate_dir") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ if (lv1 <= 0)
+ goto format;
+ env->dir_mode = (int)lv1;
+ return (0);
+ }
+
+ /* set_flags <env or log flag name> [on | off] */
+ if (strcasecmp(argv[0], "set_flags") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ /* First see whether it is an env flag, then a log flag. */
+ if ((lv1 = __db_name_to_val(config_set_flags, argv[1])) != -1)
+ return (__env_set_flags(dbenv, (u_int32_t)lv1, onoff));
+ else if ((lv1 =
+ __db_name_to_val(config_set_flags_forlog, argv[1])) != -1)
+ return (__log_set_config(dbenv, (u_int32_t)lv1, onoff));
+ goto format;
+ }
+
+ /* log_set_config <log flag name> [on | off] */
+ if (strcasecmp(argv[0], "log_set_config") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 =
+ __db_name_to_val(config_log_set_config, argv[1])) == -1)
+ goto format;
+ return (__log_set_config(dbenv, (u_int32_t)lv1, onoff));
+ }
+
+ /* set_lk_detect db_lock_xxx */
+ if (strcasecmp(argv[0], "set_lk_detect") == 0) {
+ if (nf != 2)
+ goto format;
+ if ((lv1 =
+ __db_name_to_val(config_set_lk_detect, argv[1])) == -1)
+ goto format;
+ return (__lock_set_lk_detect(dbenv, (u_int32_t)lv1));
+ }
+
+ /* set_lock_timeout <unsigned lock timeout> */
+ if (strcasecmp(argv[0], "set_lock_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__lock_set_env_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_LOCK_TIMEOUT));
+ }
+
+ /* set_open_flags <env open flag name> [on | off] */
+ if (strcasecmp(argv[0], "set_open_flags") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 =
+ __db_name_to_val(config_set_open_flags, argv[1])) == -1)
+ goto format;
+ if (onoff == 1)
+ FLD_SET(env->open_flags, (u_int32_t)lv1);
+ else
+ FLD_CLR(env->open_flags, (u_int32_t)lv1);
+ return (0);
+ }
+
+ /* set_region_init <0 or 1> */
+ if (strcasecmp(argv[0], "set_region_init") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ if (lv1 != 0 && lv1 != 1)
+ goto format;
+ return (__env_set_flags(
+ dbenv, DB_REGION_INIT, lv1 == 0 ? 0 : 1));
+ }
+
+ /* set_reg_timeout <unsigned timeout> */
+ if (strcasecmp(argv[0], "set_reg_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__env_set_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_REG_TIMEOUT));
+ }
+
+ /* set_txn_timeout <unsigned timeout> */
+ if (strcasecmp(argv[0], "set_txn_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__lock_set_env_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_TXN_TIMEOUT));
+ }
+
+ /* set_verbose db_verb_XXX [on | off] */
+ if (strcasecmp(argv[0], "set_verbose") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 = __db_name_to_val(config_set_verbose, argv[1])) == -1)
+ goto format;
+ return (__env_set_verbose(dbenv, (u_int32_t)lv1, onoff));
+ }
+
+ __db_errx(env,
+ DB_STR_A("1585", "unrecognized name-value pair: %s", "%s"), s);
+ return (EINVAL);
+}
+
+/*
+ * cmp_cfg_name --
+ *	Bsearch comparison function for CFG_DESC.name, for looking up
+ *	the names of simple commands.
+ */
+static int
+cmp_cfg_name(sought, element)
+ const void *sought;
+ const void *element;
+{
+ return
+ (strcmp((const char *) sought, ((const CFG_DESC *) element)->name));
+}
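+
+/*
+ * Since the lookup uses bsearch(), config_descs must remain sorted by
+ * name.  For illustration only, a hypothetical lookup mirroring the call
+ * in __config_scan():
+ *
+ *	const CFG_DESC *d = bsearch("set_lg_dir", config_descs,
+ *	    sizeof(config_descs) / sizeof(config_descs[0]),
+ *	    sizeof(config_descs[0]), cmp_cfg_name);
+ *
+ * Here d is non-NULL and d->type is CFG_STRING, so __config_scan()
+ * treats the rest of the line as a single string argument.
+ */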
+
+/*
+ * __config_scan --
+ *	Split DB_CONFIG lines into fields. Usually each whitespace-separated
+ *	field is scanned as a distinct argument. However, if the command is
+ *	recognized as one taking a single string value, then the rest of the
+ *	line is returned as that one argument. This supports strings which
+ *	contain whitespace, such as some directory paths.
+ *
+ * This returns the number of fields. It sets *descptr to the command
+ * descriptor (if it is recognized), or NULL.
+ */
+static int
+__config_scan(input, argv, descptr)
+ char *input, *argv[CFG_SLOTS];
+ const CFG_DESC **descptr;
+{
+ size_t tablecount;
+ int count;
+ char **ap;
+
+ tablecount = sizeof(config_descs) / sizeof(config_descs[0]);
+ *descptr = NULL;
+ for (count = 0, ap = argv; (*ap = strsep(&input, " \t\n")) != NULL;) {
+		/* Empty tokens come from adjacent whitespace; skip them. */
+ if (**ap == '\0')
+ continue;
+ /* Accept a non-empty token as the next field. */
+ count++;
+ ap++;
+ /*
+ * If that was the first token, look it up in the simple command
+ * table. If it is there and takes a single string value, then
+ * return the remainder of the line (after skipping over any
+		 * leading whitespace) without splitting it further.
+ */
+ if (count == 1) {
+ *descptr = bsearch(argv[0], config_descs,
+ tablecount, sizeof(config_descs[0]), cmp_cfg_name);
+ if (*descptr != NULL &&
+ (*descptr)->type == CFG_STRING) {
+ count++;
+ while (isspace(*input))
+ input++;
+ *ap++ = input;
+ break;
+ }
+ }
+ /* Stop scanning if the line has too many tokens. */
+ if (count >= CFG_SLOTS)
+ break;
+ }
+ return (count);
+}
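+
+/*
+ * For illustration only, two hypothetical inputs to __config_scan():
+ *
+ *	"set_cachesize 0 1048576 1" returns 4, with argv holding
+ *	{"set_cachesize", "0", "1048576", "1"} and *descptr NULL,
+ *	as set_cachesize is not one of the simple config_descs commands;
+ *
+ *	"set_lg_dir log dir" returns 2, with argv holding
+ *	{"set_lg_dir", "log dir"} and *descptr pointing at the CFG_STRING
+ *	descriptor, the rest of the line kept as one argument.
+ */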
diff --git a/src/env/env_failchk.c b/src/env/env_failchk.c
new file mode 100644
index 00000000..05752f07
--- /dev/null
+++ b/src/env/env_failchk.c
@@ -0,0 +1,558 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#ifndef HAVE_SIMPLE_THREAD_TYPE
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h" /* Needed for call to __ham_func5. */
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __env_in_api __P((ENV *));
+static void __env_clear_state __P((ENV *));
+
+/*
+ * __env_failchk_pp --
+ * ENV->failchk pre/post processing.
+ *
+ * PUBLIC: int __env_failchk_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_failchk_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->failchk");
+
+ /*
+ * ENV->failchk requires self and is-alive functions. We
+ * have a default self function, but no is-alive function.
+ */
+ if (!ALIVE_ON(env)) {
+ __db_errx(env, DB_STR("1503",
+ "DB_ENV->failchk requires DB_ENV->is_alive be configured"));
+ return (EINVAL);
+ }
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB_ENV->failchk", 0));
+
+ ENV_ENTER(env, ip);
+ FAILCHK_THREAD(env, ip); /* mark as failchk thread */
+ ret = __env_failchk_int(dbenv);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __env_failchk_int --
+ *	Process the subsystem failchk routines.
+ *
+ * PUBLIC: int __env_failchk_int __P((DB_ENV *));
+ */
+int
+__env_failchk_int(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ F_SET(dbenv, DB_ENV_FAILCHK);
+
+ /*
+ * We check for dead threads in the API first as this would be likely
+ * to hang other things we try later, like locks and transactions.
+ */
+ if ((ret = __env_in_api(env)) != 0)
+ goto err;
+
+ if (LOCKING_ON(env) && (ret = __lock_failchk(env)) != 0)
+ goto err;
+
+ if (TXN_ON(env) &&
+ ((ret = __txn_failchk(env)) != 0 ||
+ (ret = __dbreg_failchk(env)) != 0))
+ goto err;
+
+ if ((ret = __memp_failchk(env)) != 0)
+ goto err;
+
+#ifdef HAVE_REPLICATION_THREADS
+ if (REP_ON(env) && (ret = __repmgr_failchk(env)) != 0)
+ goto err;
+#endif
+
+ /* Mark any dead blocked threads as dead. */
+ __env_clear_state(env);
+
+#ifdef HAVE_MUTEX_SUPPORT
+ ret = __mut_failchk(env);
+#endif
+
+err: F_CLR(dbenv, DB_ENV_FAILCHK);
+ return (ret);
+}
+
+/*
+ * __env_thread_size --
+ * Initial amount of memory for thread info blocks.
+ * PUBLIC: size_t __env_thread_size __P((ENV *, size_t));
+ */
+size_t
+__env_thread_size(env, other_alloc)
+ ENV *env;
+ size_t other_alloc;
+{
+ DB_ENV *dbenv;
+ size_t size;
+ u_int32_t max;
+
+ dbenv = env->dbenv;
+ size = 0;
+
+ max = dbenv->thr_max;
+ if (dbenv->thr_init != 0) {
+ size =
+ dbenv->thr_init * __env_alloc_size(sizeof(DB_THREAD_INFO));
+ if (max < dbenv->thr_init)
+ max = dbenv->thr_init;
+ } else if (max == 0 && ALIVE_ON(env)) {
+ if ((max = dbenv->tx_init) == 0) {
+ /*
+ * They want thread tracking, but don't say how much.
+ * Arbitrarily assume 1/10 of the remaining memory
+ * or at least 100. We just use this to size
+ * the hash table.
+ */
+ if (dbenv->memory_max != 0)
+ max = (u_int32_t)
+ (((dbenv->memory_max - other_alloc) / 10) /
+ sizeof(DB_THREAD_INFO));
+ if (max < 100)
+ max = 100;
+ }
+ }
+ /*
+ * Set the number of buckets to be 1/8th the number of
+ * thread control blocks. This is rather arbitrary.
+ */
+ dbenv->thr_max = max;
+ if (max != 0)
+ size += __env_alloc_size(sizeof(DB_HASHTAB) *
+ __db_tablesize(max / 8));
+ return (size);
+}
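+
+/*
+ * A worked example of the sizing above, with hypothetical numbers: if
+ * thr_init, thr_max and tx_init are all 0, is_alive is configured,
+ * memory_max is 10MB and other_alloc is 0, then max becomes
+ * (10MB / 10) / sizeof(DB_THREAD_INFO), raised to 100 if smaller, and
+ * the hash table is sized with __db_tablesize(max / 8) buckets.
+ */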
+
+/*
+ * __env_thread_max --
+ * Return the amount of extra memory to hold thread information.
+ * PUBLIC: size_t __env_thread_max __P((ENV *));
+ */
+size_t
+__env_thread_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t size;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Allocate space for thread info blocks. Max is only advisory,
+ * so we allocate 25% more.
+ */
+ if (dbenv->thr_max > dbenv->thr_init) {
+ size = dbenv->thr_max - dbenv->thr_init;
+ size += size / 4;
+ } else {
+ dbenv->thr_max = dbenv->thr_init;
+ size = dbenv->thr_init / 4;
+ }
+
+ size = size * __env_alloc_size(sizeof(DB_THREAD_INFO));
+ return (size);
+}
+
+/*
+ * __env_thread_init --
+ * Initialize the thread control block table.
+ *
+ * PUBLIC: int __env_thread_init __P((ENV *, int));
+ */
+int
+__env_thread_init(env, during_creation)
+ ENV *env;
+ int during_creation;
+{
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ int ret;
+
+ dbenv = env->dbenv;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ if (renv->thread_off == INVALID_ROFF) {
+ if (dbenv->thr_max == 0) {
+ env->thr_hashtab = NULL;
+ if (ALIVE_ON(env)) {
+ __db_errx(env, DB_STR("1504",
+ "is_alive method specified but no thread region allocated"));
+ return (EINVAL);
+ }
+ return (0);
+ }
+
+ if (!during_creation) {
+ __db_errx(env, DB_STR("1505",
+"thread table must be allocated when the database environment is created"));
+ return (EINVAL);
+ }
+
+ if ((ret =
+ __env_alloc(infop, sizeof(THREAD_INFO), &thread)) != 0) {
+ __db_err(env, ret, DB_STR("1506",
+ "unable to allocate a thread status block"));
+ return (ret);
+ }
+ memset(thread, 0, sizeof(*thread));
+ renv->thread_off = R_OFFSET(infop, thread);
+ thread->thr_nbucket = __db_tablesize(dbenv->thr_max / 8);
+ if ((ret = __env_alloc(infop,
+ thread->thr_nbucket * sizeof(DB_HASHTAB), &htab)) != 0)
+ return (ret);
+ thread->thr_hashoff = R_OFFSET(infop, htab);
+ __db_hashinit(htab, thread->thr_nbucket);
+ thread->thr_max = dbenv->thr_max;
+ thread->thr_init = dbenv->thr_init;
+ } else {
+ thread = R_ADDR(infop, renv->thread_off);
+ htab = R_ADDR(infop, thread->thr_hashoff);
+ }
+
+ env->thr_hashtab = htab;
+ env->thr_nbucket = thread->thr_nbucket;
+ dbenv->thr_max = thread->thr_max;
+ dbenv->thr_init = thread->thr_init;
+ return (0);
+}
+
+/*
+ * __env_thread_destroy --
+ * Destroy the thread control block table.
+ *
+ * PUBLIC: void __env_thread_destroy __P((ENV *));
+ */
+void
+__env_thread_destroy(env)
+ ENV *env;
+{
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip, *np;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t i;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ if (renv->thread_off == INVALID_ROFF)
+ return;
+
+ thread = R_ADDR(infop, renv->thread_off);
+ if ((htab = env->thr_hashtab) != NULL) {
+ for (i = 0; i < env->thr_nbucket; i++) {
+ ip = SH_TAILQ_FIRST(&htab[i], __db_thread_info);
+ for (; ip != NULL; ip = np) {
+ np = SH_TAILQ_NEXT(ip,
+ dbth_links, __db_thread_info);
+ __env_alloc_free(infop, ip);
+ }
+ }
+ __env_alloc_free(infop, htab);
+ }
+
+ __env_alloc_free(infop, thread);
+ return;
+}
+
+/*
+ * __env_in_api --
+ *	Look for threads which died in the API and complain.
+ *	If no threads died in the API but some died while blocked,
+ *	unpin any buffers they may have locked.
+ */
+static int
+__env_in_api(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t i;
+ int unpin, ret;
+
+ if ((htab = env->thr_hashtab) == NULL)
+ return (EINVAL);
+
+ dbenv = env->dbenv;
+ infop = env->reginfo;
+ renv = infop->primary;
+ thread = R_ADDR(infop, renv->thread_off);
+ unpin = 0;
+
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE ||
+ (ip->dbth_state == THREAD_OUT &&
+ thread->thr_count < thread->thr_max))
+ continue;
+ if (dbenv->is_alive(
+ dbenv, ip->dbth_pid, ip->dbth_tid, 0))
+ continue;
+ if (ip->dbth_state == THREAD_BLOCKED) {
+ ip->dbth_state = THREAD_BLOCKED_DEAD;
+ unpin = 1;
+ continue;
+ }
+ if (ip->dbth_state == THREAD_OUT) {
+ ip->dbth_state = THREAD_SLOT_NOT_IN_USE;
+ continue;
+ }
+ return (__db_failed(env, DB_STR("1507",
+ "Thread died in Berkeley DB library"),
+ ip->dbth_pid, ip->dbth_tid));
+ }
+
+ if (unpin == 0)
+ return (0);
+
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info)
+ if (ip->dbth_state == THREAD_BLOCKED_DEAD &&
+ (ret = __memp_unpin_buffers(env, ip)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __env_clear_state --
+ *	Look for threads which died while blocked and clear them.
+ */
+static void
+__env_clear_state(env)
+ ENV *env;
+{
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip;
+ u_int32_t i;
+
+ htab = env->thr_hashtab;
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info)
+ if (ip->dbth_state == THREAD_BLOCKED_DEAD)
+ ip->dbth_state = THREAD_SLOT_NOT_IN_USE;
+}
+
+struct __db_threadid {
+ pid_t pid;
+ db_threadid_t tid;
+};
+
+/*
+ * PUBLIC: int __env_set_state __P((ENV *, DB_THREAD_INFO **, DB_THREAD_STATE));
+ */
+int
+__env_set_state(env, ipp, state)
+ ENV *env;
+ DB_THREAD_INFO **ipp;
+ DB_THREAD_STATE state;
+{
+ struct __db_threadid id;
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t indx;
+ int ret;
+
+ dbenv = env->dbenv;
+ htab = env->thr_hashtab;
+
+ if (F_ISSET(dbenv, DB_ENV_NOLOCKING)) {
+ *ipp = NULL;
+ return (0);
+ }
+ dbenv->thread_id(dbenv, &id.pid, &id.tid);
+
+ /*
+ * Hashing of thread ids. This is simple but could be replaced with
+ * something more expensive if needed.
+ */
+#ifdef HAVE_SIMPLE_THREAD_TYPE
+ /*
+	 * A thread ID may be a pointer, so explicitly cast it to an
+	 * integer of the appropriate size before doing the bitwise XOR.
+ */
+ indx = (u_int32_t)((uintptr_t)id.pid ^ (uintptr_t)id.tid);
+#else
+ indx = __ham_func5(NULL, &id.tid, sizeof(id.tid));
+#endif
+ indx %= env->thr_nbucket;
+ SH_TAILQ_FOREACH(ip, &htab[indx], dbth_links, __db_thread_info) {
+#ifdef HAVE_SIMPLE_THREAD_TYPE
+ if (id.pid == ip->dbth_pid && id.tid == ip->dbth_tid)
+ break;
+#else
+ if (memcmp(&id.pid, &ip->dbth_pid, sizeof(id.pid)) != 0)
+ continue;
+#ifdef HAVE_MUTEX_PTHREADS
+ if (pthread_equal(id.tid, ip->dbth_tid) == 0)
+#else
+ if (memcmp(&id.tid, &ip->dbth_tid, sizeof(id.tid)) != 0)
+#endif
+ continue;
+ break;
+#endif
+ }
+
+ /*
+ * If ipp is not null, return the thread control block if found.
+ * Check to ensure the thread of control has been registered.
+ */
+ if (state == THREAD_VERIFY) {
+ DB_ASSERT(env, ip != NULL && ip->dbth_state != THREAD_OUT);
+ if (ipp != NULL) {
+ if (ip == NULL) /* The control block wasn't found */
+ return (EINVAL);
+ *ipp = ip;
+ }
+ return (0);
+ }
+
+ *ipp = NULL;
+ ret = 0;
+ if (ip == NULL) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ thread = R_ADDR(infop, renv->thread_off);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+		 * If we are past the specified max, try to reclaim one from
+		 * our queue. If failchk has marked the slot not in use, we
+ * can take it, otherwise we must call is_alive before freeing
+ * it.
+ */
+ if (thread->thr_count >= thread->thr_max) {
+ SH_TAILQ_FOREACH(
+ ip, &htab[indx], dbth_links, __db_thread_info)
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE ||
+ (ip->dbth_state == THREAD_OUT &&
+ ALIVE_ON(env) && !dbenv->is_alive(
+ dbenv, ip->dbth_pid, ip->dbth_tid, 0)))
+ break;
+
+ if (ip != NULL) {
+ DB_ASSERT(env, ip->dbth_pincount == 0);
+ goto init;
+ }
+ }
+
+ thread->thr_count++;
+ if ((ret = __env_alloc(infop,
+ sizeof(DB_THREAD_INFO), &ip)) == 0) {
+ memset(ip, 0, sizeof(*ip));
+ /*
+ * This assumes we can link atomically since we do
+ * no locking here. We never use the backpointer
+ * so we only need to be able to write an offset
+ * atomically.
+ */
+ SH_TAILQ_INSERT_HEAD(
+ &htab[indx], ip, dbth_links, __db_thread_info);
+ ip->dbth_pincount = 0;
+ ip->dbth_pinmax = PINMAX;
+ ip->dbth_pinlist = R_OFFSET(infop, ip->dbth_pinarray);
+
+init: ip->dbth_pid = id.pid;
+ ip->dbth_tid = id.tid;
+ ip->dbth_state = state;
+ SH_TAILQ_INIT(&ip->dbth_xatxn);
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ } else
+ ip->dbth_state = state;
+ *ipp = ip;
+
+ DB_ASSERT(env, ret == 0);
+ if (ret != 0)
+ __db_errx(env, DB_STR("1508",
+ "Unable to allocate thread control block"));
+ return (ret);
+}
+
+/*
+ * __env_thread_id_string --
+ * Convert a thread id to a string.
+ *
+ * PUBLIC: char *__env_thread_id_string
+ * PUBLIC: __P((DB_ENV *, pid_t, db_threadid_t, char *));
+ */
+char *
+__env_thread_id_string(dbenv, pid, tid, buf)
+ DB_ENV *dbenv;
+ pid_t pid;
+ db_threadid_t tid;
+ char *buf;
+{
+#ifdef HAVE_SIMPLE_THREAD_TYPE
+#ifdef UINT64_FMT
+ char fmt[20];
+
+ snprintf(fmt, sizeof(fmt), "%s/%s", UINT64_FMT, UINT64_FMT);
+ snprintf(buf,
+ DB_THREADID_STRLEN, fmt, (u_int64_t)pid, (u_int64_t)(uintptr_t)tid);
+#else
+ snprintf(buf, DB_THREADID_STRLEN, "%lu/%lu", (u_long)pid, (u_long)tid);
+#endif
+#else
+#ifdef UINT64_FMT
+ char fmt[20];
+
+ snprintf(fmt, sizeof(fmt), "%s/TID", UINT64_FMT);
+ snprintf(buf, DB_THREADID_STRLEN, fmt, (u_int64_t)pid);
+#else
+ snprintf(buf, DB_THREADID_STRLEN, "%lu/TID", (u_long)pid);
+#endif
+#endif
+ COMPQUIET(dbenv, NULL);
+ COMPQUIET(*(u_int8_t *)&tid, 0);
+
+ return (buf);
+}
diff --git a/src/env/env_file.c b/src/env/env_file.c
new file mode 100644
index 00000000..b102404d
--- /dev/null
+++ b/src/env/env_file.c
@@ -0,0 +1,128 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_file_extend --
+ * Initialize a regular file by writing the last page of the file.
+ *
+ * PUBLIC: int __db_file_extend __P((ENV *, DB_FH *, size_t));
+ */
+int
+__db_file_extend(env, fhp, size)
+ ENV *env;
+ DB_FH *fhp;
+ size_t size;
+{
+ db_pgno_t pages;
+ size_t nw;
+ u_int32_t relative;
+ int ret;
+ char buf;
+
+ buf = '\0';
+ /*
+ * Extend the file by writing the last page. If the region is >4Gb,
+ * increment may be larger than the maximum possible seek "relative"
+ * argument, as it's an unsigned 32-bit value. Break the offset into
+ * pages of 1MB each so we don't overflow -- (2^20 * 2^32 is bigger
+	 * than any memory I expect to see for a while).
+ */
+ pages = (db_pgno_t)((size - sizeof(buf)) / MEGABYTE);
+ relative = (u_int32_t)((size - sizeof(buf)) % MEGABYTE);
+ if ((ret = __os_seek(env, fhp, pages, MEGABYTE, relative)) == 0)
+ ret = __os_write(env, fhp, &buf, sizeof(buf), &nw);
+
+ return (ret);
+}
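+
+/*
+ * A worked example of the offset split above (the size is arbitrary):
+ * for size == 4GB + 512, the last byte written lands at offset
+ * 4GB + 511, so pages == 4096 and relative == 511; the seek goes to
+ * 4096 * MEGABYTE + 511 and the one-byte write extends the file to
+ * exactly "size" bytes, with neither value overflowing 32 bits.
+ */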
+
+/*
+ * __db_file_multi_write --
+ * Overwrite a file with multiple passes to corrupt the data.
+ *
+ * PUBLIC: int __db_file_multi_write __P((ENV *, const char *));
+ */
+int
+__db_file_multi_write(env, path)
+ ENV *env;
+ const char *path;
+{
+ DB_FH *fhp;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ if ((ret = __os_open(env, path, 0, DB_OSO_REGION, 0, &fhp)) == 0 &&
+ (ret = __os_ioinfo(env, path, fhp, &mbytes, &bytes, NULL)) == 0) {
+ /*
+ * !!!
+ * Overwrite a regular file with alternating 0xff, 0x00 and 0xff
+		 * byte patterns. This implies a fixed-block filesystem;
+		 * journaling or logging filesystems will require operating
+		 * system support.
+ */
+ if ((ret =
+ __db_file_write(env, fhp, mbytes, bytes, 255)) != 0)
+ goto err;
+ if ((ret =
+ __db_file_write(env, fhp, mbytes, bytes, 0)) != 0)
+ goto err;
+ if ((ret =
+ __db_file_write(env, fhp, mbytes, bytes, 255)) != 0)
+ goto err;
+ } else
+ __db_err(env, ret, "%s", path);
+
+err: if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ return (ret);
+}
+
+/*
+ * __db_file_write --
+ * A single pass over the file, writing the specified byte pattern.
+ *
+ * PUBLIC: int __db_file_write __P((ENV *,
+ * PUBLIC: DB_FH *, u_int32_t, u_int32_t, int));
+ */
+int
+__db_file_write(env, fhp, mbytes, bytes, pattern)
+ ENV *env;
+ DB_FH *fhp;
+ int pattern;
+ u_int32_t mbytes, bytes;
+{
+ size_t len, nw;
+ int i, ret;
+ char *buf;
+
+#undef FILE_WRITE_IO_SIZE
+#define FILE_WRITE_IO_SIZE (64 * 1024)
+ if ((ret = __os_malloc(env, FILE_WRITE_IO_SIZE, &buf)) != 0)
+ return (ret);
+ memset(buf, pattern, FILE_WRITE_IO_SIZE);
+
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ for (; mbytes > 0; --mbytes)
+ for (i = MEGABYTE / FILE_WRITE_IO_SIZE; i > 0; --i)
+ if ((ret = __os_write(
+ env, fhp, buf, FILE_WRITE_IO_SIZE, &nw)) != 0)
+ goto err;
+ for (; bytes > 0; bytes -= (u_int32_t)len) {
+ len = bytes < FILE_WRITE_IO_SIZE ? bytes : FILE_WRITE_IO_SIZE;
+ if ((ret = __os_write(env, fhp, buf, len, &nw)) != 0)
+ goto err;
+ }
+
+ ret = __os_fsync(env, fhp);
+
+err: __os_free(env, buf);
+ return (ret);
+}
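+
+/*
+ * For illustration, with hypothetical sizes mbytes == 2 and bytes == 1500:
+ * the loops above issue 2 * (MEGABYTE / FILE_WRITE_IO_SIZE) == 32 full
+ * 64KB writes, then a single 1500-byte write, and finally fsync the
+ * handle.
+ */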
diff --git a/src/env/env_globals.c b/src/env/env_globals.c
new file mode 100644
index 00000000..955e6738
--- /dev/null
+++ b/src/env/env_globals.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * A structure with static initialization values for all of the global fields
+ * used by Berkeley DB.
+ * See dbinc/globals.h for the structure definition.
+ */
+DB_GLOBALS __db_global_values = {
+#ifdef HAVE_VXWORKS
+ 0, /* VxWorks: db_global_init */
+ NULL, /* VxWorks: db_global_lock */
+#endif
+#ifdef DB_WIN32
+#ifndef DB_WINCE
+ { 0 }, /* SECURITY_DESCRIPTOR win_default_sec_desc */
+ { 0 }, /* SECURITY_ATTRIBUTES win_default_sec_attr */
+#endif
+ NULL, /* SECURITY_ATTRIBUTES *win_sec_attr */
+#endif
+ { NULL, NULL }, /* XA env list */
+
+ "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", /* db_line */
+ { 0 }, /* error_buf */
+ 0, /* uid_init */
+ 0, /* rand_next */
+ 0, /* fid_serial */
+ 0, /* db_errno */
+ 0, /* num_active_pids */
+ 0, /* size_active_pids */
+ NULL, /* active_pids */
+ NULL, /* saved_errstr */
+ NULL, /* j_assert */
+ NULL, /* j_close */
+ NULL, /* j_dirfree */
+ NULL, /* j_dirlist */
+	NULL,		/* j_exists */
+ NULL, /* j_free */
+ NULL, /* j_fsync */
+ NULL, /* j_ftruncate */
+ NULL, /* j_ioinfo */
+ NULL, /* j_malloc */
+ NULL, /* j_file_map */
+ NULL, /* j_file_unmap */
+ NULL, /* j_open */
+ NULL, /* j_pread */
+ NULL, /* j_pwrite */
+ NULL, /* j_read */
+ NULL, /* j_realloc */
+ NULL, /* j_region_map */
+ NULL, /* j_region_unmap */
+ NULL, /* j_rename */
+ NULL, /* j_seek */
+ NULL, /* j_unlink */
+ NULL, /* j_write */
+ NULL /* j_yield */
+};
diff --git a/src/env/env_method.c b/src/env/env_method.c
new file mode 100644
index 00000000..63deacea
--- /dev/null
+++ b/src/env/env_method.c
@@ -0,0 +1,1918 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id: env_method.c,v dabaaeb7d839 2010/08/03 17:28:53 mike $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_env_init __P((DB_ENV *));
+static void __env_err __P((const DB_ENV *, int, const char *, ...));
+static void __env_errx __P((const DB_ENV *, const char *, ...));
+static int __env_get_create_dir __P((DB_ENV *, const char **));
+static int __env_get_data_dirs __P((DB_ENV *, const char ***));
+static int __env_get_data_len __P((DB_ENV *, u_int32_t *));
+static int __env_get_flags __P((DB_ENV *, u_int32_t *));
+static int __env_get_home __P((DB_ENV *, const char **));
+static int __env_get_intermediate_dir_mode __P((DB_ENV *, const char **));
+static int __env_get_metadata_dir __P((DB_ENV *, const char **));
+static int __env_get_shm_key __P((DB_ENV *, long *));
+static int __env_get_thread_count __P((DB_ENV *, u_int32_t *));
+static int __env_get_thread_id_fn __P((DB_ENV *,
+ void (**)(DB_ENV *, pid_t *, db_threadid_t *)));
+static int __env_get_thread_id_string_fn __P((DB_ENV *,
+ char * (**)(DB_ENV *, pid_t, db_threadid_t, char *)));
+static int __env_get_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t));
+static int __env_get_tmp_dir __P((DB_ENV *, const char **));
+static int __env_get_verbose __P((DB_ENV *, u_int32_t, int *));
+static int __env_get_app_dispatch
+ __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+static int __env_set_app_dispatch
+ __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+static int __env_set_event_notify
+ __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *)));
+static int __env_get_feedback __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
+static int __env_set_feedback __P((DB_ENV *, void (*)(DB_ENV *, int, int)));
+static int __env_get_isalive __P((DB_ENV *,
+ int (**)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+static int __env_set_isalive __P((DB_ENV *,
+ int (*)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+static int __env_set_thread_id __P((DB_ENV *, void (*)(DB_ENV *,
+ pid_t *, db_threadid_t *)));
+static int __env_set_thread_id_string __P((DB_ENV *,
+ char * (*)(DB_ENV *, pid_t, db_threadid_t, char *)));
+
+/*
+ * db_env_create --
+ * DB_ENV constructor.
+ *
+ * EXTERN: int db_env_create __P((DB_ENV **, u_int32_t));
+ */
+int
+db_env_create(dbenvpp, flags)
+ DB_ENV **dbenvpp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ ENV *env;
+ int ret;
+
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+ *
+ * !!!
+	 * We can't call the flags-checking routines, as we don't have an
+ * environment yet.
+ */
+ if (flags != 0)
+ return (EINVAL);
+
+ /* Allocate the DB_ENV and ENV structures -- we always have both. */
+ if ((ret = __os_calloc(NULL, 1, sizeof(DB_ENV), &dbenv)) != 0)
+ return (ret);
+ if ((ret = __os_calloc(NULL, 1, sizeof(ENV), &env)) != 0)
+ goto err;
+ dbenv->env = env;
+ env->dbenv = dbenv;
+
+ if ((ret = __db_env_init(dbenv)) != 0 ||
+ (ret = __lock_env_create(dbenv)) != 0 ||
+ (ret = __log_env_create(dbenv)) != 0 ||
+ (ret = __memp_env_create(dbenv)) != 0 ||
+#ifdef HAVE_REPLICATION
+ (ret = __rep_env_create(dbenv)) != 0 ||
+#endif
+ (ret = __txn_env_create(dbenv)))
+ goto err;
+
+ *dbenvpp = dbenv;
+ return (0);
+
+err: __db_env_destroy(dbenv);
+ return (ret);
+}
+
+/*
+ * __db_env_destroy --
+ * DB_ENV destructor.
+ *
+ * PUBLIC: void __db_env_destroy __P((DB_ENV *));
+ */
+void
+__db_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ __lock_env_destroy(dbenv);
+ __log_env_destroy(dbenv);
+ __memp_env_destroy(dbenv);
+#ifdef HAVE_REPLICATION
+ __rep_env_destroy(dbenv);
+#endif
+ __txn_env_destroy(dbenv);
+
+ /*
+ * Discard the underlying ENV structure.
+ *
+ * XXX
+ * This is wrong, but can't be fixed until we finish the work of
+ * splitting up the DB_ENV and ENV structures so that we don't
+ * touch anything in the ENV as part of the above calls to subsystem
+ * DB_ENV cleanup routines.
+ */
+ memset(dbenv->env, CLEAR_BYTE, sizeof(ENV));
+ __os_free(NULL, dbenv->env);
+
+ memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV));
+ __os_free(NULL, dbenv);
+}
+
+/*
+ * __db_env_init --
+ * Initialize a DB_ENV structure.
+ */
+static int
+__db_env_init(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+ *
+ * Initialize the method handles.
+ */
+ /* DB_ENV PUBLIC HANDLE LIST BEGIN */
+ dbenv->add_data_dir = __env_add_data_dir;
+ dbenv->backup = __db_backup;
+ dbenv->dbbackup = __db_dbbackup_pp;
+ dbenv->cdsgroup_begin = __cdsgroup_begin_pp;
+ dbenv->close = __env_close_pp;
+ dbenv->dbremove = __env_dbremove_pp;
+ dbenv->dbrename = __env_dbrename_pp;
+ dbenv->err = __env_err;
+ dbenv->errx = __env_errx;
+ dbenv->failchk = __env_failchk_pp;
+ dbenv->fileid_reset = __env_fileid_reset_pp;
+ dbenv->get_alloc = __env_get_alloc;
+ dbenv->get_app_dispatch = __env_get_app_dispatch;
+ dbenv->get_cache_max = __memp_get_cache_max;
+ dbenv->get_cachesize = __memp_get_cachesize;
+ dbenv->get_backup_callbacks = __env_get_backup_callbacks;
+ dbenv->get_backup_config = __env_get_backup_config;
+ dbenv->get_create_dir = __env_get_create_dir;
+ dbenv->get_data_dirs = __env_get_data_dirs;
+ dbenv->get_data_len = __env_get_data_len;
+ dbenv->get_encrypt_flags = __env_get_encrypt_flags;
+ dbenv->get_errcall = __env_get_errcall;
+ dbenv->get_errfile = __env_get_errfile;
+ dbenv->get_errpfx = __env_get_errpfx;
+ dbenv->get_feedback = __env_get_feedback;
+ dbenv->get_flags = __env_get_flags;
+ dbenv->get_home = __env_get_home;
+ dbenv->get_intermediate_dir_mode = __env_get_intermediate_dir_mode;
+ dbenv->get_isalive = __env_get_isalive;
+ dbenv->get_lg_bsize = __log_get_lg_bsize;
+ dbenv->get_lg_dir = __log_get_lg_dir;
+ dbenv->get_lg_filemode = __log_get_lg_filemode;
+ dbenv->get_lg_max = __log_get_lg_max;
+ dbenv->get_lg_regionmax = __log_get_lg_regionmax;
+ dbenv->get_lk_conflicts = __lock_get_lk_conflicts;
+ dbenv->get_lk_detect = __lock_get_lk_detect;
+ dbenv->get_lk_max_lockers = __lock_get_lk_max_lockers;
+ dbenv->get_lk_max_locks = __lock_get_lk_max_locks;
+ dbenv->get_lk_max_objects = __lock_get_lk_max_objects;
+ dbenv->get_lk_partitions = __lock_get_lk_partitions;
+ dbenv->get_lk_priority = __lock_get_lk_priority;
+ dbenv->get_lk_tablesize = __lock_get_lk_tablesize;
+ dbenv->get_memory_init = __env_get_memory_init;
+ dbenv->get_memory_max = __env_get_memory_max;
+ dbenv->get_metadata_dir = __env_get_metadata_dir;
+ dbenv->get_mp_max_openfd = __memp_get_mp_max_openfd;
+ dbenv->get_mp_max_write = __memp_get_mp_max_write;
+ dbenv->get_mp_mmapsize = __memp_get_mp_mmapsize;
+ dbenv->get_mp_mtxcount = __memp_get_mp_mtxcount;
+ dbenv->get_mp_pagesize = __memp_get_mp_pagesize;
+ dbenv->get_mp_tablesize = __memp_get_mp_tablesize;
+ dbenv->get_msgcall = __env_get_msgcall;
+ dbenv->get_msgfile = __env_get_msgfile;
+ dbenv->get_open_flags = __env_get_open_flags;
+ dbenv->get_shm_key = __env_get_shm_key;
+ dbenv->get_thread_count = __env_get_thread_count;
+ dbenv->get_thread_id_fn = __env_get_thread_id_fn;
+ dbenv->get_thread_id_string_fn = __env_get_thread_id_string_fn;
+ dbenv->get_timeout = __env_get_timeout;
+ dbenv->get_tmp_dir = __env_get_tmp_dir;
+ dbenv->get_tx_max = __txn_get_tx_max;
+ dbenv->get_tx_timestamp = __txn_get_tx_timestamp;
+ dbenv->get_verbose = __env_get_verbose;
+ dbenv->is_bigendian = __db_isbigendian;
+ dbenv->lock_detect = __lock_detect_pp;
+ dbenv->lock_get = __lock_get_pp;
+ dbenv->lock_id = __lock_id_pp;
+ dbenv->lock_id_free = __lock_id_free_pp;
+ dbenv->lock_put = __lock_put_pp;
+ dbenv->lock_stat = __lock_stat_pp;
+ dbenv->lock_stat_print = __lock_stat_print_pp;
+ dbenv->lock_vec = __lock_vec_pp;
+ dbenv->log_archive = __log_archive_pp;
+ dbenv->log_cursor = __log_cursor_pp;
+ dbenv->log_file = __log_file_pp;
+ dbenv->log_flush = __log_flush_pp;
+ dbenv->log_get_config = __log_get_config;
+ dbenv->log_printf = __log_printf_capi;
+ dbenv->log_put = __log_put_pp;
+ dbenv->log_put_record = __log_put_record_pp;
+ dbenv->log_read_record = __log_read_record_pp;
+ dbenv->log_set_config = __log_set_config;
+ dbenv->log_stat = __log_stat_pp;
+ dbenv->log_stat_print = __log_stat_print_pp;
+ dbenv->log_verify = __log_verify_pp;
+ dbenv->lsn_reset = __env_lsn_reset_pp;
+ dbenv->memp_fcreate = __memp_fcreate_pp;
+ dbenv->memp_register = __memp_register_pp;
+ dbenv->memp_stat = __memp_stat_pp;
+ dbenv->memp_stat_print = __memp_stat_print_pp;
+ dbenv->memp_sync = __memp_sync_pp;
+ dbenv->memp_trickle = __memp_trickle_pp;
+ dbenv->mutex_alloc = __mutex_alloc_pp;
+ dbenv->mutex_free = __mutex_free_pp;
+ dbenv->mutex_get_align = __mutex_get_align;
+ dbenv->mutex_get_increment = __mutex_get_increment;
+ dbenv->mutex_get_init = __mutex_get_init;
+ dbenv->mutex_get_max = __mutex_get_max;
+ dbenv->mutex_get_tas_spins = __mutex_get_tas_spins;
+ dbenv->mutex_lock = __mutex_lock_pp;
+ dbenv->mutex_set_align = __mutex_set_align;
+ dbenv->mutex_set_increment = __mutex_set_increment;
+ dbenv->mutex_set_init = __mutex_set_init;
+ dbenv->mutex_set_max = __mutex_set_max;
+ dbenv->mutex_set_tas_spins = __mutex_set_tas_spins;
+ dbenv->mutex_stat = __mutex_stat_pp;
+ dbenv->mutex_stat_print = __mutex_stat_print_pp;
+ dbenv->mutex_unlock = __mutex_unlock_pp;
+ dbenv->open = __env_open_pp;
+ dbenv->remove = __env_remove;
+ dbenv->rep_elect = __rep_elect_pp;
+ dbenv->rep_flush = __rep_flush;
+ dbenv->rep_get_clockskew = __rep_get_clockskew;
+ dbenv->rep_get_config = __rep_get_config;
+ dbenv->rep_get_limit = __rep_get_limit;
+ dbenv->rep_get_nsites = __rep_get_nsites;
+ dbenv->rep_get_priority = __rep_get_priority;
+ dbenv->rep_get_request = __rep_get_request;
+ dbenv->rep_get_timeout = __rep_get_timeout;
+ dbenv->rep_process_message = __rep_process_message_pp;
+ dbenv->rep_set_clockskew = __rep_set_clockskew;
+ dbenv->rep_set_config = __rep_set_config;
+ dbenv->rep_set_limit = __rep_set_limit;
+ dbenv->rep_set_nsites = __rep_set_nsites_pp;
+ dbenv->rep_set_priority = __rep_set_priority;
+ dbenv->rep_set_request = __rep_set_request;
+ dbenv->rep_set_timeout = __rep_set_timeout;
+ dbenv->rep_set_transport = __rep_set_transport_pp;
+ dbenv->rep_start = __rep_start_pp;
+ dbenv->rep_stat = __rep_stat_pp;
+ dbenv->rep_stat_print = __rep_stat_print_pp;
+ dbenv->rep_sync = __rep_sync;
+ dbenv->repmgr_channel = __repmgr_channel;
+ dbenv->repmgr_get_ack_policy = __repmgr_get_ack_policy;
+ dbenv->repmgr_local_site = __repmgr_local_site;
+ dbenv->repmgr_msg_dispatch = __repmgr_set_msg_dispatch;
+ dbenv->repmgr_set_ack_policy = __repmgr_set_ack_policy;
+ dbenv->repmgr_site = __repmgr_site;
+ dbenv->repmgr_site_by_eid = __repmgr_site_by_eid;
+ dbenv->repmgr_site_list = __repmgr_site_list;
+ dbenv->repmgr_start = __repmgr_start;
+ dbenv->repmgr_stat = __repmgr_stat_pp;
+ dbenv->repmgr_stat_print = __repmgr_stat_print_pp;
+ dbenv->set_alloc = __env_set_alloc;
+ dbenv->set_app_dispatch = __env_set_app_dispatch;
+ dbenv->set_backup_callbacks = __env_set_backup_callbacks;
+ dbenv->set_backup_config = __env_set_backup_config;
+ dbenv->set_cache_max = __memp_set_cache_max;
+ dbenv->set_cachesize = __memp_set_cachesize;
+ dbenv->set_create_dir = __env_set_create_dir;
+ dbenv->set_data_dir = __env_set_data_dir;
+ dbenv->set_data_len = __env_set_data_len;
+ dbenv->set_encrypt = __env_set_encrypt;
+ dbenv->set_errcall = __env_set_errcall;
+ dbenv->set_errfile = __env_set_errfile;
+ dbenv->set_errpfx = __env_set_errpfx;
+ dbenv->set_event_notify = __env_set_event_notify;
+ dbenv->set_feedback = __env_set_feedback;
+ dbenv->set_flags = __env_set_flags;
+ dbenv->set_intermediate_dir_mode = __env_set_intermediate_dir_mode;
+ dbenv->set_isalive = __env_set_isalive;
+ dbenv->set_lg_bsize = __log_set_lg_bsize;
+ dbenv->set_lg_dir = __log_set_lg_dir;
+ dbenv->set_lg_filemode = __log_set_lg_filemode;
+ dbenv->set_lg_max = __log_set_lg_max;
+ dbenv->set_lg_regionmax = __log_set_lg_regionmax;
+ dbenv->set_lk_conflicts = __lock_set_lk_conflicts;
+ dbenv->set_lk_detect = __lock_set_lk_detect;
+ dbenv->set_lk_max_lockers = __lock_set_lk_max_lockers;
+ dbenv->set_lk_max_locks = __lock_set_lk_max_locks;
+ dbenv->set_lk_max_objects = __lock_set_lk_max_objects;
+ dbenv->set_lk_partitions = __lock_set_lk_partitions;
+ dbenv->set_lk_priority = __lock_set_lk_priority;
+ dbenv->set_lk_tablesize = __lock_set_lk_tablesize;
+ dbenv->set_memory_init = __env_set_memory_init;
+ dbenv->set_memory_max = __env_set_memory_max;
+ dbenv->set_metadata_dir = __env_set_metadata_dir;
+ dbenv->set_mp_max_openfd = __memp_set_mp_max_openfd;
+ dbenv->set_mp_max_write = __memp_set_mp_max_write;
+ dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+ dbenv->set_mp_mtxcount = __memp_set_mp_mtxcount;
+ dbenv->set_mp_pagesize = __memp_set_mp_pagesize;
+ dbenv->set_mp_tablesize = __memp_set_mp_tablesize;
+ dbenv->set_msgcall = __env_set_msgcall;
+ dbenv->set_msgfile = __env_set_msgfile;
+ dbenv->set_paniccall = __env_set_paniccall;
+ dbenv->set_shm_key = __env_set_shm_key;
+ dbenv->set_thread_count = __env_set_thread_count;
+ dbenv->set_thread_id = __env_set_thread_id;
+ dbenv->set_thread_id_string = __env_set_thread_id_string;
+ dbenv->set_timeout = __env_set_timeout;
+ dbenv->set_tmp_dir = __env_set_tmp_dir;
+ dbenv->set_tx_max = __txn_set_tx_max;
+ dbenv->set_tx_timestamp = __txn_set_tx_timestamp;
+ dbenv->set_verbose = __env_set_verbose;
+ dbenv->stat_print = __env_stat_print_pp;
+ dbenv->txn_applied = __txn_applied_pp;
+ dbenv->txn_begin = __txn_begin_pp;
+ dbenv->txn_checkpoint = __txn_checkpoint_pp;
+ dbenv->txn_recover = __txn_recover_pp;
+ dbenv->txn_stat = __txn_stat_pp;
+ dbenv->txn_stat_print = __txn_stat_print_pp;
+ /* DB_ENV PUBLIC HANDLE LIST END */
+
+ /* DB_ENV PRIVATE HANDLE LIST BEGIN */
+ dbenv->prdbt = __db_prdbt;
+ /* DB_ENV PRIVATE HANDLE LIST END */
+
+ dbenv->shm_key = INVALID_REGION_SEGID;
+ dbenv->thread_id = __os_id;
+ dbenv->thread_id_string = __env_thread_id_string;
+
+ env = dbenv->env;
+ __os_id(NULL, &env->pid_cache, NULL);
+
+ env->db_ref = 0;
+ env->log_verify_wrap = __log_verify_wrap;
+ env->data_len = ENV_DEF_DATA_LEN;
+ TAILQ_INIT(&env->fdlist);
+
+ if (!__db_isbigendian())
+ F_SET(env, ENV_LITTLEENDIAN);
+ F_SET(env, ENV_NO_OUTPUT_SET);
+
+ return (0);
+}
+
+/*
+ * __env_err --
+ * DbEnv.err method.
+ */
+static void
+#ifdef STDC_HEADERS
+__env_err(const DB_ENV *dbenv, int error, const char *fmt, ...)
+#else
+__env_err(dbenv, error, fmt, va_alist)
+ const DB_ENV *dbenv;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message with error string, to stderr by default. */
+ DB_REAL_ERR(dbenv, error, DB_ERROR_SET, 1, fmt);
+}
+
+/*
+ * __env_errx --
+ * DbEnv.errx method.
+ */
+static void
+#ifdef STDC_HEADERS
+__env_errx(const DB_ENV *dbenv, const char *fmt, ...)
+#else
+__env_errx(dbenv, fmt, va_alist)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message without error string, to stderr by default. */
+ DB_REAL_ERR(dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
+}
+
+static int
+__env_get_home(dbenv, homep)
+ DB_ENV *dbenv;
+ const char **homep;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->get_home");
+ *homep = env->db_home;
+
+ return (0);
+}
+
+/*
+ * __env_get_alloc --
+ * {DB_ENV,DB}->get_alloc.
+ *
+ * PUBLIC: int __env_get_alloc __P((DB_ENV *, void *(**)(size_t),
+ * PUBLIC: void *(**)(void *, size_t), void (**)(void *)));
+ */
+int
+__env_get_alloc(dbenv, mal_funcp, real_funcp, free_funcp)
+ DB_ENV *dbenv;
+ void *(**mal_funcp) __P((size_t));
+ void *(**real_funcp) __P((void *, size_t));
+ void (**free_funcp) __P((void *));
+{
+ if (mal_funcp != NULL)
+ *mal_funcp = dbenv->db_malloc;
+ if (real_funcp != NULL)
+ *real_funcp = dbenv->db_realloc;
+ if (free_funcp != NULL)
+ *free_funcp = dbenv->db_free;
+ return (0);
+}
+
+/*
+ * __env_set_alloc --
+ * {DB_ENV,DB}->set_alloc.
+ *
+ * PUBLIC: int __env_set_alloc __P((DB_ENV *, void *(*)(size_t),
+ * PUBLIC: void *(*)(void *, size_t), void (*)(void *)));
+ */
+int
+__env_set_alloc(dbenv, mal_func, real_func, free_func)
+ DB_ENV *dbenv;
+ void *(*mal_func) __P((size_t));
+ void *(*real_func) __P((void *, size_t));
+ void (*free_func) __P((void *));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_alloc");
+
+ dbenv->db_malloc = mal_func;
+ dbenv->db_realloc = real_func;
+ dbenv->db_free = free_func;
+ return (0);
+}
+
+/*
+ * __env_get_memory_init --
+ * DB_ENV->get_memory_init.
+ *
+ * PUBLIC: int __env_get_memory_init __P((DB_ENV *,
+ * PUBLIC: DB_MEM_CONFIG, u_int32_t *));
+ */
+int
+__env_get_memory_init(dbenv, type, countp)
+ DB_ENV *dbenv;
+ DB_MEM_CONFIG type;
+ u_int32_t *countp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ switch (type) {
+ case DB_MEM_LOCK:
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK);
+ if (LOCKING_ON(env))
+ *countp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_initlocks;
+ else
+ *countp = dbenv->lk_init;
+ break;
+ case DB_MEM_LOCKOBJECT:
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK);
+ if (LOCKING_ON(env))
+ *countp = ((DB_LOCKREGION *) env->
+ lk_handle->reginfo.primary)->stat.st_initobjects;
+ else
+ *countp = dbenv->lk_init_objects;
+ break;
+ case DB_MEM_LOCKER:
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK);
+ if (LOCKING_ON(env))
+ *countp = ((DB_LOCKREGION *) env->
+ lk_handle->reginfo.primary)->stat.st_initlockers;
+ else
+ *countp = dbenv->lk_init_lockers;
+ break;
+ case DB_MEM_LOGID:
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_memory_init", DB_INIT_LOG);
+
+ if (LOGGING_ON(env))
+ *countp = ((LOG *)env->lg_handle->
+ reginfo.primary)->stat.st_fileid_init;
+ else
+ *countp = dbenv->lg_fileid_init;
+ break;
+ case DB_MEM_TRANSACTION:
+ ENV_NOT_CONFIGURED(env,
+ env->tx_handle, "DB_ENV->memory_init", DB_INIT_TXN);
+
+ if (TXN_ON(env))
+ *countp = ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->inittxns;
+ else
+ *countp = dbenv->tx_init;
+ break;
+ case DB_MEM_THREAD:
+ /* We always update thr_init when joining an env. */
+ *countp = dbenv->thr_init;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __env_set_memory_init --
+ * DB_ENV->set_memory_init.
+ *
+ * PUBLIC: int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
+ */
+int
+__env_set_memory_init(dbenv, type, count)
+ DB_ENV *dbenv;
+ DB_MEM_CONFIG type;
+ u_int32_t count;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_memory_init");
+ switch (type) {
+ case DB_MEM_LOCK:
+ dbenv->lk_init = count;
+ break;
+ case DB_MEM_LOCKOBJECT:
+ dbenv->lk_init_objects = count;
+ break;
+ case DB_MEM_LOCKER:
+ dbenv->lk_init_lockers = count;
+ break;
+ case DB_MEM_LOGID:
+ dbenv->lg_fileid_init = count;
+ break;
+ case DB_MEM_TRANSACTION:
+ dbenv->tx_init = count;
+ break;
+ case DB_MEM_THREAD:
+ dbenv->thr_init = count;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __env_get_memory_max --
+ * DB_ENV->get_memory_max.
+ *
+ * PUBLIC: int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__env_get_memory_max(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t *gbytes, *bytes;
+{
+ ENV *env;
+ env = dbenv->env;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED)) {
+ *gbytes = (u_int32_t)(env->reginfo->rp->max / GIGABYTE);
+ *bytes = (u_int32_t)(env->reginfo->rp->max % GIGABYTE);
+ } else {
+ *gbytes = (u_int32_t)(dbenv->memory_max / GIGABYTE);
+ *bytes = (u_int32_t)(dbenv->memory_max % GIGABYTE);
+ }
+ return (0);
+}
+
+/*
+ * __env_set_memory_max --
+ * DB_ENV->set_memory_max.
+ *
+ * PUBLIC: int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__env_set_memory_max(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_memory_max");
+
+ /*
+	 * If they are asking for 4GB exactly on a 32-bit platform, they
+ * really meant 4GB - 1. Give it to them.
+ */
+ if (sizeof(roff_t) == 4 && gbytes == 4 && bytes == 0) {
+ --gbytes;
+ bytes = GIGABYTE - 1;
+ }
+ /*
+ * Make sure they wouldn't overflow the memory_max field on a
+ * 32 bit architecture.
+	 * 32-bit architecture.
+ if (sizeof(roff_t) == 4 && gbytes >= 4) {
+ __db_errx(env, DB_STR("1588",
+ "Maximum memory size too large: maximum is 4GB"));
+ return (EINVAL);
+ }
+ dbenv->memory_max = ((roff_t)gbytes * GIGABYTE) + bytes;
+ return (0);
+}
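+
+/*
+ * A worked example of the clamping above: on a 32-bit build, where
+ * sizeof(roff_t) == 4, a request for exactly 4GB (gbytes == 4, bytes == 0)
+ * is stored as 3 * GIGABYTE + (GIGABYTE - 1), i.e. 4GB - 1, while any
+ * other request of 4GB or more fails with EINVAL.
+ */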
+
+/*
+ * __env_get_app_dispatch --
+ * Get the transaction abort recover function.
+ */
+static int
+__env_get_app_dispatch(dbenv, app_dispatchp)
+ DB_ENV *dbenv;
+ int (**app_dispatchp) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+{
+
+ if (app_dispatchp != NULL)
+ *app_dispatchp = dbenv->app_dispatch;
+ return (0);
+}
+
+/*
+ * __env_set_app_dispatch --
+ * Set the transaction abort recover function.
+ */
+static int
+__env_set_app_dispatch(dbenv, app_dispatch)
+ DB_ENV *dbenv;
+ int (*app_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_app_dispatch");
+
+ dbenv->app_dispatch = app_dispatch;
+ return (0);
+}
+
+/*
+ * __env_get_encrypt_flags --
+ * {DB_ENV,DB}->get_encrypt_flags.
+ *
+ * PUBLIC: int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *));
+ */
+int
+__env_get_encrypt_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+#ifdef HAVE_CRYPTO
+ DB_CIPHER *db_cipher;
+#endif
+ ENV *env;
+
+ env = dbenv->env;
+
+#ifdef HAVE_CRYPTO
+ db_cipher = env->crypto_handle;
+ if (db_cipher != NULL && db_cipher->alg == CIPHER_AES)
+ *flagsp = DB_ENCRYPT_AES;
+ else
+ *flagsp = 0;
+ return (0);
+#else
+ COMPQUIET(flagsp, 0);
+ __db_errx(env, DB_STR("1555",
+ "library build did not include support for cryptography"));
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __env_set_encrypt --
+ * DB_ENV->set_encrypt.
+ *
+ * PUBLIC: int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_set_encrypt(dbenv, passwd, flags)
+ DB_ENV *dbenv;
+ const char *passwd;
+ u_int32_t flags;
+{
+#ifdef HAVE_CRYPTO
+ DB_THREAD_INFO *ip;
+ DB_CIPHER *db_cipher;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_encrypt");
+#define OK_CRYPTO_FLAGS (DB_ENCRYPT_AES)
+
+ if (flags != 0 && LF_ISSET(~OK_CRYPTO_FLAGS))
+ return (__db_ferr(env, "DB_ENV->set_encrypt", 0));
+
+ if (passwd == NULL || strlen(passwd) == 0) {
+ __db_errx(env, DB_STR("1556",
+ "Empty password specified to set_encrypt"));
+ return (EINVAL);
+ }
+ ENV_ENTER(env, ip);
+ if (!CRYPTO_ON(env)) {
+ if ((ret = __os_calloc(env, 1, sizeof(DB_CIPHER), &db_cipher))
+ != 0)
+ goto err;
+ env->crypto_handle = db_cipher;
+ } else
+ db_cipher = env->crypto_handle;
+
+ if (dbenv->passwd != NULL)
+ __os_free(env, dbenv->passwd);
+ if ((ret = __os_strdup(env, passwd, &dbenv->passwd)) != 0) {
+ __os_free(env, db_cipher);
+ goto err;
+ }
+ /*
+	 * We're going to need this often enough to keep it around.
+ */
+ dbenv->passwd_len = strlen(dbenv->passwd) + 1;
+ /*
+ * The MAC key is for checksumming, and is separate from
+ * the algorithm. So initialize it here, even if they
+ * are using CIPHER_ANY.
+ */
+ __db_derive_mac(
+ (u_int8_t *)dbenv->passwd, dbenv->passwd_len, db_cipher->mac_key);
+ switch (flags) {
+ case 0:
+ F_SET(db_cipher, CIPHER_ANY);
+ break;
+ case DB_ENCRYPT_AES:
+ if ((ret =
+ __crypto_algsetup(env, db_cipher, CIPHER_AES, 0)) != 0)
+ goto err1;
+ break;
+ default: /* Impossible. */
+ break;
+ }
+ ENV_LEAVE(env, ip);
+ return (0);
+
+err1:
+ __os_free(env, dbenv->passwd);
+ __os_free(env, db_cipher);
+ env->crypto_handle = NULL;
+err:
+ ENV_LEAVE(env, ip);
+ return (ret);
+#else
+ COMPQUIET(passwd, NULL);
+ COMPQUIET(flags, 0);
+
+ __db_errx(dbenv->env, DB_STR("1557",
+ "library build did not include support for cryptography"));
+ return (DB_OPNOTSUP);
+#endif
+}
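+
+/*
+ * Usage sketch (illustrative): an application enables AES encryption
+ * before opening the environment:
+ *
+ *	dbenv->set_encrypt(dbenv, "my passphrase", DB_ENCRYPT_AES);
+ *
+ * Passing flags of 0 selects CIPHER_ANY, deferring the algorithm
+ * choice while still deriving the MAC key for checksumming.
+ */
+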
+#ifndef HAVE_BREW
+static
+#endif
+const FLAG_MAP EnvMap[] = {
+ { DB_AUTO_COMMIT, DB_ENV_AUTO_COMMIT },
+ { DB_CDB_ALLDB, DB_ENV_CDB_ALLDB },
+ { DB_DATABASE_LOCKING, DB_ENV_DATABASE_LOCKING },
+ { DB_DIRECT_DB, DB_ENV_DIRECT_DB },
+ { DB_DSYNC_DB, DB_ENV_DSYNC_DB },
+ { DB_HOTBACKUP_IN_PROGRESS, DB_ENV_HOTBACKUP },
+ { DB_MULTIVERSION, DB_ENV_MULTIVERSION },
+ { DB_NOFLUSH, DB_ENV_NOFLUSH },
+ { DB_NOLOCKING, DB_ENV_NOLOCKING },
+ { DB_NOMMAP, DB_ENV_NOMMAP },
+ { DB_NOPANIC, DB_ENV_NOPANIC },
+ { DB_OVERWRITE, DB_ENV_OVERWRITE },
+ { DB_REGION_INIT, DB_ENV_REGION_INIT },
+ { DB_TIME_NOTGRANTED, DB_ENV_TIME_NOTGRANTED },
+ { DB_TXN_NOSYNC, DB_ENV_TXN_NOSYNC },
+ { DB_TXN_NOWAIT, DB_ENV_TXN_NOWAIT },
+ { DB_TXN_SNAPSHOT, DB_ENV_TXN_SNAPSHOT },
+ { DB_TXN_WRITE_NOSYNC, DB_ENV_TXN_WRITE_NOSYNC },
+ { DB_YIELDCPU, DB_ENV_YIELDCPU }
+};
+
+/*
+ * __env_map_flags -- map from external to internal flags.
+ * PUBLIC: void __env_map_flags __P((const FLAG_MAP *,
+ * PUBLIC: u_int, u_int32_t *, u_int32_t *));
+ */
+void
+__env_map_flags(flagmap, mapsize, inflagsp, outflagsp)
+ const FLAG_MAP *flagmap;
+ u_int mapsize;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ const FLAG_MAP *fmp;
+ u_int i;
+
+ for (i = 0, fmp = flagmap;
+ i < mapsize / sizeof(flagmap[0]); ++i, ++fmp)
+ if (FLD_ISSET(*inflagsp, fmp->inflag)) {
+ FLD_SET(*outflagsp, fmp->outflag);
+ FLD_CLR(*inflagsp, fmp->inflag);
+ if (*inflagsp == 0)
+ break;
+ }
+}
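+
+/*
+ * Note: the mapsize argument is the size of the flag table in bytes,
+ * not an entry count, so callers pass sizeof(table); for example, the
+ * call in __env_set_flags below:
+ *
+ *	__env_map_flags(EnvMap, sizeof(EnvMap), &flags, &mapped_flags);
+ */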
+
+/*
+ * __env_fetch_flags -- map from internal to external flags.
+ * PUBLIC: void __env_fetch_flags __P((const FLAG_MAP *,
+ * PUBLIC: u_int, u_int32_t *, u_int32_t *));
+ */
+void
+__env_fetch_flags(flagmap, mapsize, inflagsp, outflagsp)
+ const FLAG_MAP *flagmap;
+ u_int mapsize;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ const FLAG_MAP *fmp;
+	u_int i;
+
+ *outflagsp = 0;
+ for (i = 0, fmp = flagmap;
+ i < mapsize / sizeof(flagmap[0]); ++i, ++fmp)
+ if (FLD_ISSET(*inflagsp, fmp->outflag))
+ FLD_SET(*outflagsp, fmp->inflag);
+}
+
+static int
+__env_get_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+
+ __env_fetch_flags(EnvMap, sizeof(EnvMap), &dbenv->flags, flagsp);
+
+ env = dbenv->env;
+ /* Some flags are persisted in the regions. */
+ if (env->reginfo != NULL &&
+ ((REGENV *)env->reginfo->primary)->panic != 0)
+ FLD_SET(*flagsp, DB_PANIC_ENVIRONMENT);
+
+ /* If the hotbackup counter is positive, set the flag indicating so. */
+ if (TXN_ON(env)) {
+ ENV_ENTER(env, ip);
+ TXN_SYSTEM_LOCK(env);
+ if (((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->n_hotbackup > 0)
+ FLD_SET(*flagsp, DB_HOTBACKUP_IN_PROGRESS);
+ TXN_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ }
+
+ return (0);
+}
+
+/*
+ * __env_set_flags --
+ * DB_ENV->set_flags.
+ *
+ * PUBLIC: int __env_set_flags __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__env_set_flags(dbenv, flags, on)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ int on;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t mapped_flags;
+ int mem_on, ret;
+
+ env = dbenv->env;
+
+#define OK_FLAGS \
+ (DB_AUTO_COMMIT | DB_CDB_ALLDB | DB_DATABASE_LOCKING | \
+ DB_DIRECT_DB | DB_DSYNC_DB | DB_MULTIVERSION | \
+ DB_NOLOCKING | DB_NOMMAP | DB_NOPANIC | DB_OVERWRITE | \
+ DB_PANIC_ENVIRONMENT | DB_REGION_INIT | \
+ DB_TIME_NOTGRANTED | DB_TXN_NOSYNC | DB_TXN_NOWAIT | \
+ DB_TXN_SNAPSHOT | DB_TXN_WRITE_NOSYNC | DB_YIELDCPU | \
+ DB_HOTBACKUP_IN_PROGRESS | DB_NOFLUSH)
+
+ if (LF_ISSET(~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->set_flags", 0));
+ if (on) {
+ if ((ret = __db_fcchk(env, "DB_ENV->set_flags",
+ flags, DB_TXN_NOSYNC, DB_TXN_WRITE_NOSYNC)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_DIRECT_DB) && __os_support_direct_io() == 0) {
+ __db_errx(env,
+ "DB_ENV->set_flags: direct I/O either not configured or not supported");
+ return (EINVAL);
+ }
+ }
+
+ if (LF_ISSET(DB_CDB_ALLDB))
+ ENV_ILLEGAL_AFTER_OPEN(env,
+ "DB_ENV->set_flags: DB_CDB_ALLDB");
+ if (LF_ISSET(DB_PANIC_ENVIRONMENT)) {
+ ENV_ILLEGAL_BEFORE_OPEN(env,
+ "DB_ENV->set_flags: DB_PANIC_ENVIRONMENT");
+ if (on) {
+ __db_errx(env, DB_STR("1558",
+ "Environment panic set"));
+ (void)__env_panic(env, DB_RUNRECOVERY);
+ } else
+ __env_panic_set(env, 0);
+ }
+ if (LF_ISSET(DB_REGION_INIT))
+ ENV_ILLEGAL_AFTER_OPEN(env,
+ "DB_ENV->set_flags: DB_REGION_INIT");
+
+ /*
+ * DB_LOG_IN_MEMORY, DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC are
+ * mutually incompatible. If we're setting one of them, clear all
+ * current settings. If the environment is open, check to see that
+ * logging is not in memory.
+ */
+ if (on && LF_ISSET(DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC)) {
+ F_CLR(dbenv, DB_ENV_TXN_NOSYNC | DB_ENV_TXN_WRITE_NOSYNC);
+ if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+ if ((ret =
+ __log_set_config(dbenv, DB_LOG_IN_MEMORY, 0)) != 0)
+ return (ret);
+ } else if (LOGGING_ON(env)) {
+ if ((ret = __log_get_config(dbenv,
+ DB_LOG_IN_MEMORY, &mem_on)) != 0)
+ return (ret);
+ if (mem_on == 1) {
+ __db_errx(env, DB_STR("1559",
+ "DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC"
+ " may not be used with DB_LOG_IN_MEMORY"));
+ return (EINVAL);
+ }
+ }
+ }
+
+ /*
+ * Settings of DB_HOTBACKUP_IN_PROGRESS are reference-counted
+ * in REGENV.
+ */
+ if (LF_ISSET(DB_HOTBACKUP_IN_PROGRESS)) {
+ /* You can't take a hot backup without transactions. */
+ ENV_REQUIRES_CONFIG(env, env->tx_handle,
+ "DB_ENV->set_flags: DB_HOTBACKUP_IN_PROGRESS", DB_INIT_TXN);
+
+ ENV_ENTER(env, ip);
+ ret = __env_set_backup(env, on);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
+ return (ret);
+ }
+
+ mapped_flags = 0;
+ __env_map_flags(EnvMap, sizeof(EnvMap), &flags, &mapped_flags);
+ if (on)
+ F_SET(dbenv, mapped_flags);
+ else
+ F_CLR(dbenv, mapped_flags);
+
+ return (0);
+}
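+
+/*
+ * Usage sketch (illustrative): flags are turned on and off through the
+ * same method, e.g.:
+ *
+ *	dbenv->set_flags(dbenv, DB_TXN_NOSYNC, 1);
+ *	dbenv->set_flags(dbenv, DB_TXN_NOSYNC, 0);
+ */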
+
+/*
+ * __env_set_backup --
+ * PUBLIC: int __env_set_backup __P((ENV *, int));
+ */
+int
+__env_set_backup(env, on)
+ ENV *env;
+ int on;
+{
+ DB_TXNREGION *tenv;
+ int needs_checkpoint, ret;
+
+ tenv = (DB_TXNREGION *)env->tx_handle->reginfo.primary;
+ needs_checkpoint = 0;
+
+ TXN_SYSTEM_LOCK(env);
+ if (on) {
+ tenv->n_hotbackup++;
+ if (tenv->n_bulk_txn > 0)
+ needs_checkpoint = 1;
+ } else {
+ if (tenv->n_hotbackup == 0)
+ needs_checkpoint = -1; /* signal count error */
+ else
+ tenv->n_hotbackup--;
+ }
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (needs_checkpoint == -1) {
+ __db_errx(env, DB_STR("1560",
+ "Attempt to decrement hotbackup counter past zero"));
+ return (EINVAL);
+ }
+
+	if (needs_checkpoint && (ret = __txn_checkpoint(env, 0, 0, 0)) != 0)
+ return (ret);
+ return (0);
+}
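+
+/*
+ * Note on the checkpoint above (an inference from the n_bulk_txn test):
+ * bulk transactions reduce the logging done for page writes, so when a
+ * hot backup starts while bulk transactions are active we force a
+ * checkpoint to push their pages to disk where the backup can see them.
+ */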
+
+static int
+__env_get_data_dirs(dbenv, dirpp)
+ DB_ENV *dbenv;
+ const char ***dirpp;
+{
+ *dirpp = (const char **)dbenv->db_data_dir;
+ return (0);
+}
+
+/*
+ * __env_set_data_dir --
+ * DB_ENV->set_data_dir.
+ *
+ * PUBLIC: int __env_set_data_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_data_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ int ret;
+
+ if ((ret = __env_add_data_dir(dbenv, dir)) != 0)
+ return (ret);
+
+ if (dbenv->data_next == 1)
+ return (__env_set_create_dir(dbenv, dir));
+
+ return (0);
+}
+
+/*
+ * __env_add_data_dir --
+ * DB_ENV->add_data_dir.
+ *
+ * PUBLIC: int __env_add_data_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_add_data_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->add_data_dir");
+
+ /*
+ * The array is NULL-terminated so it can be returned by get_data_dirs
+ * without a length.
+ */
+
+#define DATA_INIT_CNT 20 /* Start with 20 data slots. */
+ if (dbenv->db_data_dir == NULL) {
+ if ((ret = __os_calloc(env, DATA_INIT_CNT,
+ sizeof(char **), &dbenv->db_data_dir)) != 0)
+ return (ret);
+ dbenv->data_cnt = DATA_INIT_CNT;
+ } else if (dbenv->data_next == dbenv->data_cnt - 2) {
+ dbenv->data_cnt *= 2;
+ if ((ret = __os_realloc(env,
+ (u_int)dbenv->data_cnt * sizeof(char **),
+ &dbenv->db_data_dir)) != 0)
+ return (ret);
+ }
+
+ ret = __os_strdup(env,
+ dir, &dbenv->db_data_dir[dbenv->data_next++]);
+ dbenv->db_data_dir[dbenv->data_next] = NULL;
+ return (ret);
+}
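+
+/*
+ * Usage sketch (illustrative): add two data directories and select the
+ * second as the creation target, before DB_ENV->open:
+ *
+ *	dbenv->add_data_dir(dbenv, "data1");
+ *	dbenv->add_data_dir(dbenv, "data2");
+ *	dbenv->set_create_dir(dbenv, "data2");
+ */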
+
+/*
+ * __env_set_create_dir --
+ * DB_ENV->set_create_dir.
+ * The list of directories cannot change after the environment is opened,
+ * and storing a pointer is atomic, so we do not need a mutex here even if
+ * multiple threads are using the DB_ENV handle.
+ *
+ * PUBLIC: int __env_set_create_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_create_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+ int i;
+
+ env = dbenv->env;
+
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+
+ if (i == dbenv->data_next) {
+ __db_errx(env, DB_STR_A("1561",
+ "Directory %s not in environment list.", "%s"), dir);
+ return (EINVAL);
+ }
+
+ dbenv->db_create_dir = dbenv->db_data_dir[i];
+ return (0);
+}
+
+static int
+__env_get_create_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_create_dir;
+ return (0);
+}
+
+static int
+__env_get_intermediate_dir_mode(dbenv, modep)
+ DB_ENV *dbenv;
+ const char **modep;
+{
+ *modep = dbenv->intermediate_dir_mode;
+ return (0);
+}
+
+/*
+ * __env_set_metadata_dir --
+ * DB_ENV->set_metadata_dir.
+ *
+ * PUBLIC: int __env_set_metadata_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_metadata_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+ int i, ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_metadata_dir");
+
+ /* If metadata_dir is not already on data_dir list, add it. */
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+ if (i == dbenv->data_next &&
+ (ret = __env_add_data_dir(dbenv, dir)) != 0) {
+ __db_errx(env, DB_STR_A("1590",
+ "Could not add %s to environment list.", "%s"), dir);
+ return (ret);
+ }
+
+ if (dbenv->db_md_dir != NULL)
+ __os_free(env, dbenv->db_md_dir);
+ return (__os_strdup(env, dir, &dbenv->db_md_dir));
+}
+
+static int
+__env_get_metadata_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_md_dir;
+ return (0);
+}
+
+/*
+ * __env_set_data_len --
+ * DB_ENV->set_data_len.
+ *
+ * PUBLIC: int __env_set_data_len __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_set_data_len(dbenv, data_len)
+ DB_ENV *dbenv;
+ u_int32_t data_len;
+{
+ dbenv->env->data_len = data_len;
+ return (0);
+}
+
+static int
+__env_get_data_len(dbenv, data_lenp)
+ DB_ENV *dbenv;
+ u_int32_t *data_lenp;
+{
+ *data_lenp = dbenv->env->data_len;
+ return (0);
+}
+
+/*
+ * __env_set_intermediate_dir_mode --
+ * DB_ENV->set_intermediate_dir_mode.
+ *
+ * PUBLIC: int __env_set_intermediate_dir_mode __P((DB_ENV *, const char *));
+ */
+int
+__env_set_intermediate_dir_mode(dbenv, mode)
+ DB_ENV *dbenv;
+ const char *mode;
+{
+ ENV *env;
+ u_int t;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_intermediate_dir_mode");
+
+#define __SETMODE(offset, valid_ch, mask) { \
+ if (mode[offset] == (valid_ch)) \
+ t |= (mask); \
+ else if (mode[offset] != '-') \
+ goto format_err; \
+}
+ t = 0;
+ __SETMODE(0, 'r', S_IRUSR);
+ __SETMODE(1, 'w', S_IWUSR);
+ __SETMODE(2, 'x', S_IXUSR);
+ __SETMODE(3, 'r', S_IRGRP);
+ __SETMODE(4, 'w', S_IWGRP);
+ __SETMODE(5, 'x', S_IXGRP);
+ __SETMODE(6, 'r', S_IROTH);
+ __SETMODE(7, 'w', S_IWOTH);
+ __SETMODE(8, 'x', S_IXOTH);
+ if (mode[9] != '\0' || t == 0) {
+ /*
+ * We disallow modes of 0 -- we use 0 to decide the application
+ * never configured intermediate directory permissions, and we
+ * shouldn't create intermediate directories. Besides, setting
+ * the permissions to 0 makes no sense.
+ */
+format_err: __db_errx(env,
+ "DB_ENV->set_intermediate_dir_mode: illegal mode \"%s\"", mode);
+ return (EINVAL);
+ }
+
+ if (dbenv->intermediate_dir_mode != NULL)
+ __os_free(env, dbenv->intermediate_dir_mode);
+ if ((ret = __os_strdup(env, mode, &dbenv->intermediate_dir_mode)) != 0)
+ return (ret);
+
+ env->dir_mode = (int)t;
+ return (0);
+}
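+
+/*
+ * Usage sketch (illustrative): the mode string mirrors ls(1) output,
+ * so intermediate directories created with mode 0750 would be:
+ *
+ *	dbenv->set_intermediate_dir_mode(dbenv, "rwxr-x---");
+ */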
+
+/*
+ * __env_get_errcall --
+ * {DB_ENV,DB}->get_errcall.
+ *
+ * PUBLIC: void __env_get_errcall __P((DB_ENV *,
+ * PUBLIC: void (**)(const DB_ENV *, const char *, const char *)));
+ */
+void
+__env_get_errcall(dbenv, errcallp)
+ DB_ENV *dbenv;
+ void (**errcallp) __P((const DB_ENV *, const char *, const char *));
+{
+ *errcallp = dbenv->db_errcall;
+}
+
+/*
+ * __env_set_errcall --
+ * {DB_ENV,DB}->set_errcall.
+ *
+ * PUBLIC: void __env_set_errcall __P((DB_ENV *,
+ * PUBLIC: void (*)(const DB_ENV *, const char *, const char *)));
+ */
+void
+__env_set_errcall(dbenv, errcall)
+ DB_ENV *dbenv;
+ void (*errcall) __P((const DB_ENV *, const char *, const char *));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ F_CLR(env, ENV_NO_OUTPUT_SET);
+ dbenv->db_errcall = errcall;
+}
+
+/*
+ * __env_get_errfile --
+ * {DB_ENV,DB}->get_errfile.
+ *
+ * PUBLIC: void __env_get_errfile __P((DB_ENV *, FILE **));
+ */
+void
+__env_get_errfile(dbenv, errfilep)
+ DB_ENV *dbenv;
+ FILE **errfilep;
+{
+ *errfilep = dbenv->db_errfile;
+}
+
+/*
+ * __env_set_errfile --
+ * {DB_ENV,DB}->set_errfile.
+ *
+ * PUBLIC: void __env_set_errfile __P((DB_ENV *, FILE *));
+ */
+void
+__env_set_errfile(dbenv, errfile)
+ DB_ENV *dbenv;
+ FILE *errfile;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ F_CLR(env, ENV_NO_OUTPUT_SET);
+ dbenv->db_errfile = errfile;
+}
+
+/*
+ * __env_get_errpfx --
+ * {DB_ENV,DB}->get_errpfx.
+ *
+ * PUBLIC: void __env_get_errpfx __P((DB_ENV *, const char **));
+ */
+void
+__env_get_errpfx(dbenv, errpfxp)
+ DB_ENV *dbenv;
+ const char **errpfxp;
+{
+ *errpfxp = dbenv->db_errpfx;
+}
+
+/*
+ * __env_set_errpfx --
+ * {DB_ENV,DB}->set_errpfx.
+ *
+ * PUBLIC: void __env_set_errpfx __P((DB_ENV *, const char *));
+ */
+void
+__env_set_errpfx(dbenv, errpfx)
+ DB_ENV *dbenv;
+ const char *errpfx;
+{
+ dbenv->db_errpfx = errpfx;
+}
+
+static int
+__env_get_feedback(dbenv, feedbackp)
+ DB_ENV *dbenv;
+ void (**feedbackp) __P((DB_ENV *, int, int));
+{
+ if (feedbackp != NULL)
+ *feedbackp = dbenv->db_feedback;
+ return (0);
+}
+
+static int
+__env_set_feedback(dbenv, feedback)
+ DB_ENV *dbenv;
+ void (*feedback) __P((DB_ENV *, int, int));
+{
+ dbenv->db_feedback = feedback;
+ return (0);
+}
+
+/*
+ * __env_get_thread_id_fn --
+ * DB_ENV->get_thread_id_fn
+ */
+static int
+__env_get_thread_id_fn(dbenv, idp)
+ DB_ENV *dbenv;
+ void (**idp) __P((DB_ENV *, pid_t *, db_threadid_t *));
+{
+ if (idp != NULL)
+ *idp = dbenv->thread_id;
+ return (0);
+}
+
+/*
+ * __env_set_thread_id --
+ * DB_ENV->set_thread_id
+ */
+static int
+__env_set_thread_id(dbenv, id)
+ DB_ENV *dbenv;
+ void (*id) __P((DB_ENV *, pid_t *, db_threadid_t *));
+{
+ dbenv->thread_id = id;
+ return (0);
+}
+
+/*
+ * __env_get_thread_id_string_fn --
+ *	DB_ENV->get_thread_id_string_fn
+ */
+static int
+__env_get_thread_id_string_fn(dbenv, thread_id_stringp)
+ DB_ENV *dbenv;
+ char *(**thread_id_stringp)
+ __P((DB_ENV *, pid_t, db_threadid_t, char *));
+{
+ if (thread_id_stringp != NULL)
+ *thread_id_stringp = dbenv->thread_id_string;
+ return (0);
+}
+
+/*
+ * __env_set_thread_id_string --
+ *	DB_ENV->set_thread_id_string
+ */
+static int
+__env_set_thread_id_string(dbenv, thread_id_string)
+ DB_ENV *dbenv;
+ char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
+{
+ dbenv->thread_id_string = thread_id_string;
+ return (0);
+}
+
+/*
+ * __env_get_isalive --
+ * DB_ENV->get_isalive
+ */
+static int
+__env_get_isalive(dbenv, is_alivep)
+ DB_ENV *dbenv;
+ int (**is_alivep) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED) && env->thr_nbucket == 0) {
+ __db_errx(env, DB_STR("1562",
+ "is_alive method specified but no thread region allocated"));
+ return (EINVAL);
+ }
+ if (is_alivep != NULL)
+ *is_alivep = dbenv->is_alive;
+ return (0);
+}
+
+/*
+ * __env_set_isalive --
+ * DB_ENV->set_isalive
+ */
+static int
+__env_set_isalive(dbenv, is_alive)
+ DB_ENV *dbenv;
+ int (*is_alive) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED) && env->thr_nbucket == 0) {
+ __db_errx(env, DB_STR("1563",
+ "is_alive method specified but no thread region allocated"));
+ return (EINVAL);
+ }
+ dbenv->is_alive = is_alive;
+ return (0);
+}
+
+/*
+ * __env_get_thread_count --
+ * DB_ENV->get_thread_count
+ */
+static int
+__env_get_thread_count(dbenv, countp)
+ DB_ENV *dbenv;
+ u_int32_t *countp;
+{
+ *countp = dbenv->thr_max;
+ return (0);
+}
+
+/*
+ * __env_set_thread_count --
+ * DB_ENV->set_thread_count
+ *
+ * PUBLIC: int __env_set_thread_count __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_set_thread_count(dbenv, count)
+ DB_ENV *dbenv;
+ u_int32_t count;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_thread_count");
+ dbenv->thr_max = count;
+
+ return (0);
+}
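+
+/*
+ * Usage sketch (illustrative; my_is_alive is a hypothetical application
+ * callback): opening with DB_FAILCHK requires both a thread count and
+ * an is_alive function, as enforced in __env_open_arg:
+ *
+ *	dbenv->set_thread_count(dbenv, 50);
+ *	dbenv->set_isalive(dbenv, my_is_alive);
+ */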
+
+/*
+ * __env_get_msgcall --
+ * {DB_ENV,DB}->get_msgcall.
+ *
+ * PUBLIC: void __env_get_msgcall
+ * PUBLIC: __P((DB_ENV *, void (**)(const DB_ENV *, const char *)));
+ */
+void
+__env_get_msgcall(dbenv, msgcallp)
+ DB_ENV *dbenv;
+ void (**msgcallp) __P((const DB_ENV *, const char *));
+{
+ if (msgcallp != NULL)
+ *msgcallp = dbenv->db_msgcall;
+}
+
+/*
+ * __env_set_msgcall --
+ * {DB_ENV,DB}->set_msgcall.
+ *
+ * PUBLIC: void __env_set_msgcall
+ * PUBLIC: __P((DB_ENV *, void (*)(const DB_ENV *, const char *)));
+ */
+void
+__env_set_msgcall(dbenv, msgcall)
+ DB_ENV *dbenv;
+ void (*msgcall) __P((const DB_ENV *, const char *));
+{
+ dbenv->db_msgcall = msgcall;
+}
+
+/*
+ * __env_get_msgfile --
+ * {DB_ENV,DB}->get_msgfile.
+ *
+ * PUBLIC: void __env_get_msgfile __P((DB_ENV *, FILE **));
+ */
+void
+__env_get_msgfile(dbenv, msgfilep)
+ DB_ENV *dbenv;
+ FILE **msgfilep;
+{
+ *msgfilep = dbenv->db_msgfile;
+}
+
+/*
+ * __env_set_msgfile --
+ * {DB_ENV,DB}->set_msgfile.
+ *
+ * PUBLIC: void __env_set_msgfile __P((DB_ENV *, FILE *));
+ */
+void
+__env_set_msgfile(dbenv, msgfile)
+ DB_ENV *dbenv;
+ FILE *msgfile;
+{
+ dbenv->db_msgfile = msgfile;
+}
+
+/*
+ * __env_set_paniccall --
+ * {DB_ENV,DB}->set_paniccall.
+ *
+ * PUBLIC: int __env_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int)));
+ */
+int
+__env_set_paniccall(dbenv, paniccall)
+ DB_ENV *dbenv;
+ void (*paniccall) __P((DB_ENV *, int));
+{
+ dbenv->db_paniccall = paniccall;
+ return (0);
+}
+
+/*
+ * __env_set_event_notify --
+ * DB_ENV->set_event_notify.
+ */
+static int
+__env_set_event_notify(dbenv, event_func)
+ DB_ENV *dbenv;
+ void (*event_func) __P((DB_ENV *, u_int32_t, void *));
+{
+ dbenv->db_event_func = event_func;
+ return (0);
+}
+
+static int
+__env_get_shm_key(dbenv, shm_keyp)
+ DB_ENV *dbenv;
+ long *shm_keyp; /* !!!: really a key_t *. */
+{
+ *shm_keyp = dbenv->shm_key;
+ return (0);
+}
+
+/*
+ * __env_set_shm_key --
+ * DB_ENV->set_shm_key.
+ *
+ * PUBLIC: int __env_set_shm_key __P((DB_ENV *, long));
+ */
+int
+__env_set_shm_key(dbenv, shm_key)
+ DB_ENV *dbenv;
+ long shm_key; /* !!!: really a key_t. */
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_shm_key");
+
+ dbenv->shm_key = shm_key;
+ return (0);
+}
+
+static int
+__env_get_tmp_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_tmp_dir;
+ return (0);
+}
+
+/*
+ * __env_set_tmp_dir --
+ * DB_ENV->set_tmp_dir.
+ *
+ * PUBLIC: int __env_set_tmp_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_tmp_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (dbenv->db_tmp_dir != NULL)
+ __os_free(env, dbenv->db_tmp_dir);
+ return (__os_strdup(env, dir, &dbenv->db_tmp_dir));
+}
+
+static int
+__env_get_verbose(dbenv, which, onoffp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onoffp;
+{
+ switch (which) {
+ case DB_VERB_BACKUP:
+ case DB_VERB_DEADLOCK:
+ case DB_VERB_FILEOPS:
+ case DB_VERB_FILEOPS_ALL:
+ case DB_VERB_RECOVERY:
+ case DB_VERB_REGISTER:
+ case DB_VERB_REPLICATION:
+ case DB_VERB_REP_ELECT:
+ case DB_VERB_REP_LEASE:
+ case DB_VERB_REP_MISC:
+ case DB_VERB_REP_MSGS:
+ case DB_VERB_REP_SYNC:
+ case DB_VERB_REP_SYSTEM:
+ case DB_VERB_REP_TEST:
+ case DB_VERB_REPMGR_CONNFAIL:
+ case DB_VERB_REPMGR_MISC:
+ case DB_VERB_WAITSFOR:
+ *onoffp = FLD_ISSET(dbenv->verbose, which) ? 1 : 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __env_set_verbose --
+ * DB_ENV->set_verbose.
+ *
+ * PUBLIC: int __env_set_verbose __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__env_set_verbose(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ switch (which) {
+ case DB_VERB_BACKUP:
+ case DB_VERB_DEADLOCK:
+ case DB_VERB_FILEOPS:
+ case DB_VERB_FILEOPS_ALL:
+ case DB_VERB_RECOVERY:
+ case DB_VERB_REGISTER:
+ case DB_VERB_REPLICATION:
+ case DB_VERB_REP_ELECT:
+ case DB_VERB_REP_LEASE:
+ case DB_VERB_REP_MISC:
+ case DB_VERB_REP_MSGS:
+ case DB_VERB_REP_SYNC:
+ case DB_VERB_REP_SYSTEM:
+ case DB_VERB_REP_TEST:
+ case DB_VERB_REPMGR_CONNFAIL:
+ case DB_VERB_REPMGR_MISC:
+ case DB_VERB_WAITSFOR:
+ if (on)
+ FLD_SET(dbenv->verbose, which);
+ else
+ FLD_CLR(dbenv->verbose, which);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
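+
+/*
+ * Usage sketch (illustrative): enable recovery-time diagnostics:
+ *
+ *	dbenv->set_verbose(dbenv, DB_VERB_RECOVERY, 1);
+ */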
+
+/*
+ * __db_mi_env --
+ * Method illegally called with public environment.
+ *
+ * PUBLIC: int __db_mi_env __P((ENV *, const char *));
+ */
+int
+__db_mi_env(env, name)
+ ENV *env;
+ const char *name;
+{
+ __db_errx(env, DB_STR_A("1564",
+ "%s: method not permitted when environment specified", "%s"),
+ name);
+ return (EINVAL);
+}
+
+/*
+ * __db_mi_open --
+ * Method illegally called after open.
+ *
+ * PUBLIC: int __db_mi_open __P((ENV *, const char *, int));
+ */
+int
+__db_mi_open(env, name, after)
+ ENV *env;
+ const char *name;
+ int after;
+{
+ __db_errx(env, DB_STR_A("1565",
+ "%s: method not permitted %s handle's open method", "%s %s"),
+ name, after ? DB_STR_P("after") : DB_STR_P("before"));
+ return (EINVAL);
+}
+
+/*
+ * __env_not_config --
+ * Method or function called without required configuration.
+ *
+ * PUBLIC: int __env_not_config __P((ENV *, char *, u_int32_t));
+ */
+int
+__env_not_config(env, i, flags)
+ ENV *env;
+ char *i;
+ u_int32_t flags;
+{
+ char *sub;
+ int is_sub;
+
+ is_sub = 1;
+
+ switch (flags) {
+ case DB_INIT_CDB:
+ sub = "DB_INIT_CDB";
+ is_sub = 0;
+ break;
+ case DB_INIT_LOCK:
+ sub = "locking";
+ break;
+ case DB_INIT_LOG:
+ sub = "logging";
+ break;
+ case DB_INIT_MPOOL:
+ sub = "memory pool";
+ break;
+ case DB_INIT_MUTEX:
+ sub = "mutex";
+ break;
+ case DB_INIT_REP:
+ sub = "replication";
+ break;
+ case DB_INIT_TXN:
+ sub = "transaction";
+ break;
+ default:
+ sub = "<unspecified>";
+ break;
+ }
+
+ if (is_sub) {
+ __db_errx(env, DB_STR_A("1566",
+ "%s interface requires an environment configured for the %s subsystem",
+ "%s %s"), i, sub);
+ } else {
+ __db_errx(env, DB_STR_A("1587",
+ "%s interface requires an environment configured with %s",
+ "%s %s"), i, sub);
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * __env_get_timeout --
+ * DB_ENV->get_timeout
+ */
+static int
+__env_get_timeout(dbenv, timeoutp, flags)
+ DB_ENV *dbenv;
+ db_timeout_t *timeoutp;
+ u_int32_t flags;
+{
+ int ret;
+
+ ret = 0;
+ if (flags == DB_SET_REG_TIMEOUT) {
+ *timeoutp = dbenv->envreg_timeout;
+ } else
+ ret = __lock_get_env_timeout(dbenv, timeoutp, flags);
+ return (ret);
+}
+
+/*
+ * __env_set_timeout --
+ * DB_ENV->set_timeout
+ *
+ * PUBLIC: int __env_set_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+ */
+int
+__env_set_timeout(dbenv, timeout, flags)
+ DB_ENV *dbenv;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ int ret;
+
+ ret = 0;
+ if (flags == DB_SET_REG_TIMEOUT)
+ dbenv->envreg_timeout = timeout;
+ else
+ ret = __lock_set_env_timeout(dbenv, timeout, flags);
+ return (ret);
+}
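+
+/*
+ * Usage sketch (illustrative): timeouts are expressed in microseconds;
+ * a one-second lock timeout, delegated to the lock subsystem as above:
+ *
+ *	dbenv->set_timeout(dbenv, 1000000, DB_SET_LOCK_TIMEOUT);
+ */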
diff --git a/src/env/env_name.c b/src/env/env_name.c
new file mode 100644
index 00000000..a3a0b371
--- /dev/null
+++ b/src/env/env_name.c
@@ -0,0 +1,285 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __db_fullpath
+ __P((ENV *, const char *, const char *, int, int, char **));
+
+#define DB_ADDSTR(add) { \
+ /* \
+ * The string might be NULL or zero-length, and the p[-1] \
+	 * The string might be NULL or zero-length; skip it, since \
+	 * p[-1] below could otherwise reference memory before the \
+	 * beginning of our buffer. \
+ if ((add) != NULL && (add)[0] != '\0') { \
+ /* If leading slash, start over. */ \
+ if (__os_abspath(add)) { \
+ p = str; \
+ slash = 0; \
+ } \
+ /* Append to the current string. */ \
+ len = strlen(add); \
+ if (slash) \
+ *p++ = PATH_SEPARATOR[0]; \
+ memcpy(p, add, len); \
+ p += len; \
+ slash = strchr(PATH_SEPARATOR, p[-1]) == NULL; \
+ } \
+}
+
+/*
+ * __db_fullpath --
+ * Constructs a path name relative to the environment home, and optionally
+ *	Construct a path name relative to the environment home, and optionally
+ *	check whether the file or directory exists.
+static int
+__db_fullpath(env, dir, file, check_file, check_dir, namep)
+ ENV *env;
+ const char *dir;
+ const char *file;
+ int check_file;
+ int check_dir;
+ char **namep;
+{
+ size_t len;
+ const char *home;
+ char *p, *str;
+ int isdir, ret, slash;
+
+ /* All paths are relative to the environment home. */
+ home = (env == NULL) ? NULL : env->db_home;
+
+ len =
+ (home == NULL ? 0 : strlen(home) + 1) +
+ (dir == NULL ? 0 : strlen(dir) + 1) +
+ (file == NULL ? 0 : strlen(file) + 1);
+
+ if ((ret = __os_malloc(env, len, &str)) != 0)
+ return (ret);
+
+ slash = 0;
+ p = str;
+ DB_ADDSTR(home);
+ DB_ADDSTR(dir);
+ *p = '\0';
+ if (check_dir && (__os_exists(env, str, &isdir) != 0 || !isdir)) {
+ __os_free(env, str);
+ return (ENOENT);
+ }
+ DB_ADDSTR(file);
+ *p = '\0';
+
+ /*
+ * If we're opening a data file, see if it exists. If not, keep
+ * trying.
+ */
+ if (check_file && __os_exists(env, str, NULL) != 0) {
+ __os_free(env, str);
+ return (ENOENT);
+ }
+
+ if (namep == NULL)
+ __os_free(env, str);
+ else
+ *namep = str;
+ return (0);
+}
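+
+/*
+ * For example (illustrative): with env->db_home "/env", dir "data" and
+ * file "a.db", the constructed path is "/env/data/a.db".  Per DB_ADDSTR,
+ * an absolute dir or file resets the buffer, so an absolute file name
+ * wins outright.
+ */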
+
+#define DB_CHECKFILE(file, dir, check_file, check_dir, namep, ret_dir) do { \
+ ret = __db_fullpath(env, dir, file, \
+ check_file, check_dir, namep); \
+ if (ret == 0 && (ret_dir) != NULL) \
+ *(ret_dir) = (dir); \
+ if (ret != ENOENT) \
+ return (ret); \
+} while (0)
+
+/*
+ * __db_appname --
+ * Given an optional DB environment, directory and file name and type
+ * of call, build a path based on the ENV->open rules, and return
+ *	it in allocated space.  Dirp can be used to specify a data directory
+ *	to use.  If it is not set and a data directory is used, *dirp will
+ *	be set to point to the directory name.
+ *
+ * PUBLIC: int __db_appname __P((ENV *, APPNAME,
+ * PUBLIC: const char *, const char **, char **));
+ */
+int
+__db_appname(env, appname, file, dirp, namep)
+ ENV *env;
+ APPNAME appname;
+ const char *file;
+ const char **dirp;
+ char **namep;
+{
+ DB_ENV *dbenv;
+ char **ddp;
+ const char *dir;
+ int ret;
+
+ dbenv = env->dbenv;
+ dir = NULL;
+
+ if (namep != NULL)
+ *namep = NULL;
+
+ /*
+ * Absolute path names are never modified. If the file is an absolute
+ * path, we're done.
+ */
+ if (file != NULL && __os_abspath(file))
+ return (__os_strdup(env, file, namep));
+
+ /*
+ * DB_APP_NONE:
+ * DB_HOME/file
+ * DB_APP_DATA:
+ * DB_HOME/DB_DATA_DIR/file
+ * DB_APP_LOG:
+	 *	DB_HOME/DB_LOG_DIR/file
+	 * DB_APP_META:
+	 *	DB_HOME/DB_METADATA_DIR/file
+	 * DB_APP_TMP:
+	 *	DB_HOME/DB_TMP_DIR/<create>
+ */
+ switch (appname) {
+ case DB_APP_NONE:
+ break;
+ case DB_APP_RECOVER:
+ case DB_APP_DATA:
+ /*
+ * First, step through the data_dir entries, if any, looking
+ * for the file.
+ */
+ if (dbenv != NULL && dbenv->db_data_dir != NULL)
+ for (ddp = dbenv->db_data_dir; *ddp != NULL; ddp++)
+ DB_CHECKFILE(file, *ddp, 1, 0, namep, dirp);
+
+ /* Second, look in the environment home directory. */
+ DB_CHECKFILE(file, NULL, 1, 0, namep, dirp);
+
+ /*
+ * Otherwise, we're going to create. Use the specified
+ * directory unless we're in recovery and it doesn't exist.
+ */
+ if (dirp != NULL && *dirp != NULL)
+ DB_CHECKFILE(file, *dirp, 0,
+ appname == DB_APP_RECOVER, namep, dirp);
+
+ /* Finally, use the create directory, if set. */
+ if (dbenv != NULL && dbenv->db_create_dir != NULL)
+ dir = dbenv->db_create_dir;
+ break;
+ case DB_APP_LOG:
+ if (dbenv != NULL)
+ dir = dbenv->db_log_dir;
+ break;
+ case DB_APP_TMP:
+ if (dbenv != NULL)
+ dir = dbenv->db_tmp_dir;
+ break;
+ case DB_APP_META:
+ if (dbenv != NULL)
+ dir = dbenv->db_md_dir;
+ break;
+ }
+
+ /*
+ * Construct the full path. For temporary files, it is an error if the
+ * directory does not exist: if it doesn't, checking whether millions
+ * of temporary files exist inside it takes a *very* long time.
+ */
+ DB_CHECKFILE(file, dir, 0, appname == DB_APP_TMP, namep, dirp);
+
+ return (ret);
+}
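+
+/*
+ * Illustrative lookup order for DB_APP_DATA: with db_home "/env" and
+ * data directories {"d1", "d2"}, a request for "x.db" probes
+ * "/env/d1/x.db", then "/env/d2/x.db", then "/env/x.db"; if none
+ * exists, the name is built for creation in the create directory (or
+ * the home when none is set).
+ */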
+
+/*
+ * __db_tmp_open --
+ * Create a temporary file.
+ *
+ * PUBLIC: int __db_tmp_open __P((ENV *, u_int32_t, DB_FH **));
+ */
+int
+__db_tmp_open(env, oflags, fhpp)
+ ENV *env;
+ u_int32_t oflags;
+ DB_FH **fhpp;
+{
+ pid_t pid;
+ int filenum, i, ipid, ret;
+ char *path;
+ char *firstx, *trv;
+
+ DB_ASSERT(env, fhpp != NULL);
+ *fhpp = NULL;
+
+#define DB_TRAIL "BDBXXXXX"
+ if ((ret = __db_appname(env, DB_APP_TMP, DB_TRAIL, NULL, &path)) != 0)
+ goto done;
+
+ /* Replace the X's with the process ID (in decimal). */
+ __os_id(env->dbenv, &pid, NULL);
+ ipid = (int)pid;
+ if (ipid < 0)
+ ipid = -ipid;
+ for (trv = path + strlen(path); *--trv == 'X'; ipid /= 10)
+ *trv = '0' + (u_char)(ipid % 10);
+ firstx = trv + 1;
+
+ /* Loop, trying to open a file. */
+ for (filenum = 1;; filenum++) {
+ if ((ret = __os_open(env, path, 0,
+ oflags | DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
+ DB_MODE_600, fhpp)) == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * !!!:
+ * If we don't get an EEXIST error, then there's something
+ * seriously wrong. Unfortunately, if the implementation
+ * doesn't return EEXIST for O_CREAT and O_EXCL regardless
+ * of other possible errors, we've lost.
+ */
+ if (ret != EEXIST) {
+ __db_err(env, ret, DB_STR_A("1586",
+ "temporary open: %s", "%s"), path);
+ goto done;
+ }
+
+ /*
+ * Generate temporary file names in a backwards-compatible way.
+ * If pid == 12345, the result is:
+ * <path>/DB12345 (tried above, the first time through).
+ * <path>/DBa2345 ... <path>/DBz2345
+ * <path>/DBaa345 ... <path>/DBaz345
+ * <path>/DBba345, and so on.
+ *
+ * XXX
+ * This algorithm is O(n**2) -- that is, creating 100 temporary
+ * files requires 5,000 opens, creating 1000 files requires
+ * 500,000. If applications open a lot of temporary files, we
+ * could improve performance by switching to timestamp-based
+ * file names.
+ */
+ for (i = filenum, trv = firstx; i > 0; i = (i - 1) / 26)
+ if (*trv++ == '\0') {
+ ret = EINVAL;
+ goto done;
+ }
+
+ for (i = filenum; i > 0; i = (i - 1) / 26)
+ *--trv = 'a' + ((i - 1) % 26);
+ }
+done:
+ __os_free(env, path);
+ return (ret);
+}
diff --git a/src/env/env_open.c b/src/env/env_open.c
new file mode 100644
index 00000000..7eddca3a
--- /dev/null
+++ b/src/env/env_open.c
@@ -0,0 +1,1262 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __env_open_arg __P((DB_ENV *, u_int32_t));
+static int __file_handle_cleanup __P((ENV *));
+
+/*
+ * db_version --
+ * Return legacy version information, including DB Major Version,
+ * DB Minor Version, and DB Patch/Build numbers.
+ *
+ * EXTERN: char *db_version __P((int *, int *, int *));
+ */
+char *
+db_version(majverp, minverp, patchp)
+ int *majverp, *minverp, *patchp;
+{
+ if (majverp != NULL)
+ *majverp = DB_VERSION_MAJOR;
+ if (minverp != NULL)
+ *minverp = DB_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = DB_VERSION_PATCH;
+ return ((char *)DB_VERSION_STRING);
+}
+
+/*
+ * db_full_version --
+ * Return complete version information, including Oracle Family,
+ * Oracle Release, DB Major Version, DB Minor Version, and DB
+ * Patch/Build numbers.
+ *
+ * EXTERN: char *db_full_version __P((int *, int *, int *, int *, int *));
+ */
+char *
+db_full_version(familyp, releasep, majverp, minverp, patchp)
+ int *familyp, *releasep, *majverp, *minverp, *patchp;
+{
+ if (familyp != NULL)
+ *familyp = DB_VERSION_FAMILY;
+ if (releasep != NULL)
+ *releasep = DB_VERSION_RELEASE;
+ if (majverp != NULL)
+ *majverp = DB_VERSION_MAJOR;
+ if (minverp != NULL)
+ *minverp = DB_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = DB_VERSION_PATCH;
+ return ((char *)DB_VERSION_FULL_STRING);
+}
+
+/*
+ * __env_open_pp --
+ * DB_ENV->open pre/post processing.
+ *
+ * PUBLIC: int __env_open_pp __P((DB_ENV *, const char *, u_int32_t, int));
+ */
+int
+__env_open_pp(dbenv, db_home, flags, mode)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t flags;
+ int mode;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->open");
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_CREATE | DB_FAILCHK | DB_FAILCHK_ISALIVE | DB_INIT_CDB | \
+ DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_REP | \
+ DB_INIT_TXN | DB_LOCKDOWN | DB_NO_CHECKPOINT | DB_PRIVATE | \
+ DB_RECOVER | DB_RECOVER_FATAL | DB_REGISTER | DB_SYSTEM_MEM | \
+ DB_THREAD | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+#undef OKFLAGS_CDB
+#define OKFLAGS_CDB \
+ (DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL | DB_LOCKDOWN | \
+ DB_PRIVATE | DB_SYSTEM_MEM | DB_THREAD | \
+ DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+
+ if ((ret = __db_fchk(env, "DB_ENV->open", flags, OKFLAGS)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(
+ env, "DB_ENV->open", flags, DB_INIT_CDB, ~OKFLAGS_CDB)) != 0)
+ return (ret);
+
+#if defined(HAVE_MIXED_SIZE_ADDRESSING) && (SIZEOF_CHAR_P == 8)
+	if (LF_ISSET(DB_PRIVATE)) {
+ __db_errx(env, DB_STR("1589", "DB_PRIVATE is not "
+ "supported by 64-bit applications in "
+ "mixed-size-addressing mode"));
+ return (EINVAL);
+ }
+#endif
+
+ return (__env_open(dbenv, db_home, flags, mode));
+}
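+
+/*
+ * Usage sketch (illustrative): a typical transactional open; with
+ * DB_REGISTER, recovery only actually runs when the registration code
+ * decides it is needed:
+ *
+ *	dbenv->open(dbenv, "/path/to/home", DB_CREATE | DB_INIT_LOCK |
+ *	    DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_REGISTER |
+ *	    DB_RECOVER, 0);
+ */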
+
+/*
+ * __env_open --
+ * DB_ENV->open.
+ *
+ * PUBLIC: int __env_open __P((DB_ENV *, const char *, u_int32_t, int));
+ */
+int
+__env_open(dbenv, db_home, flags, mode)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t flags;
+ int mode;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t orig_flags;
+ int register_recovery, ret, t_ret;
+
+ ip = NULL;
+ env = dbenv->env;
+ register_recovery = 0;
+
+ /* Initial configuration. */
+ if ((ret = __env_config(dbenv, db_home, &flags, mode)) != 0)
+ return (ret);
+
+ /*
+ * Save the DB_ENV handle's configuration flags as set by user-called
+ * configuration methods and the environment directory's DB_CONFIG
+ * file. If we use this DB_ENV structure to recover the existing
+ * environment or to remove an environment we created after failure,
+ * we'll restore the DB_ENV flags to these values.
+ */
+ orig_flags = dbenv->flags;
+
+ /* Check open flags. */
+ if ((ret = __env_open_arg(dbenv, flags)) != 0)
+ return (ret);
+
+ /*
+ * If we're going to register with the environment, that's the first
+ * thing we do.
+ */
+ if (LF_ISSET(DB_REGISTER)) {
+ /*
+ * Through the SQL interface (btree.c) we set
+ * DB_FAILCHK_ISALIVE. When set, we want to run failchk
+ * if a recovery is needed. Set up the infrastructure to run
+ * it. SQL applications have no way to specify the thread
+ * count or an isalive, so force it here. Failchk is run
+ * inside of register code.
+ */
+ if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
+ (void)__env_set_thread_count(dbenv, 50);
+ dbenv->is_alive = __envreg_isalive;
+ }
+
+ if ((ret =
+ __envreg_register(env, &register_recovery, flags)) != 0)
+ goto err;
+ if (register_recovery) {
+ if (!LF_ISSET(DB_RECOVER)) {
+ __db_errx(env, DB_STR("1567",
+ "The DB_RECOVER flag was not specified, and recovery is needed"));
+ ret = DB_RUNRECOVERY;
+ goto err;
+ }
+ } else
+ LF_CLR(DB_RECOVER);
+ }
+
+ /*
+ * If we're doing recovery, destroy the environment so that we create
+ * all the regions from scratch. The major concern I have is if the
+ * application stomps the environment with a rogue pointer. We have
+ * no way of detecting that, and we could be forced into a situation
+ * where we start up and then crash, repeatedly.
+ *
+ * We do not check any flags like DB_PRIVATE before calling remove.
+ * We don't care if the current environment was private or not, we
+ * want to remove files left over for any reason, from any session.
+ */
+retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))
+#ifdef HAVE_REPLICATION
+ if ((ret = __rep_reset_init(env)) != 0 ||
+ (ret = __env_remove_env(env)) != 0 ||
+#else
+ if ((ret = __env_remove_env(env)) != 0 ||
+#endif
+ (ret = __env_refresh(dbenv, orig_flags, 0)) != 0)
+ goto err;
+
+ if ((ret = __env_attach_regions(dbenv, flags, orig_flags, 1)) != 0)
+ goto err;
+
+ /*
+	 * After attaching to the env, run failchk if we are not doing
+	 * register recovery.  This pass is skipped for DB_FAILCHK_ISALIVE,
+	 * whose failchk runs inside the register code above.
+ */
+ if (LF_ISSET(DB_FAILCHK) && !register_recovery) {
+ ENV_ENTER(env, ip);
+ if ((ret = __env_failchk_int(dbenv)) != 0)
+ goto err;
+ ENV_LEAVE(env, ip);
+ }
+
+err: if (ret != 0)
+ (void)__env_refresh(dbenv, orig_flags, 0);
+
+ if (register_recovery) {
+ /*
+ * If recovery succeeded, release our exclusive lock, other
+ * processes can now proceed.
+ *
+ * If recovery failed, unregister now and let another process
+ * clean up.
+ */
+ if (ret == 0 && (t_ret = __envreg_xunlock(env)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ (void)__envreg_unregister(env, 1);
+ }
+
+ /*
+ * If the open is called with DB_REGISTER we can potentially skip
+ * running recovery on a panicked environment. We can't check the panic
+ * bit earlier since checking requires opening the environment.
+	 * Retry (with DB_RECOVER set) only if DB_RECOVER was not specified
+	 * originally; if it was, the register_recovery flag indicates that
+	 * recovery has already been run.
+ */
+ if (ret == DB_RUNRECOVERY && !register_recovery &&
+ !LF_ISSET(DB_RECOVER) && LF_ISSET(DB_REGISTER)) {
+ LF_SET(DB_RECOVER);
+ goto retry;
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_open_arg --
+ * DB_ENV->open flags checking.
+ */
+static int
+__env_open_arg(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ if (LF_ISSET(DB_REGISTER)) {
+ if (!__os_support_db_register()) {
+ __db_errx(env, DB_STR("1568",
+ "Berkeley DB library does not support DB_REGISTER on this system"));
+ return (EINVAL);
+ }
+ if ((ret = __db_fcchk(env, "DB_ENV->open", flags,
+ DB_PRIVATE, DB_REGISTER | DB_SYSTEM_MEM)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_CREATE) && !LF_ISSET(DB_INIT_TXN)) {
+ __db_errx(env, DB_STR("1569",
+ "registration requires transaction support"));
+ return (EINVAL);
+ }
+ }
+ /*
+ * Only check for flags compatible with DB_INIT_REP when creating
+ * since otherwise it'll be ignored anyway.
+ */
+ if (LF_ISSET(DB_INIT_REP) && LF_ISSET(DB_CREATE)) {
+ if (!__os_support_replication()) {
+ __db_errx(env, DB_STR("1570",
+ "Berkeley DB library does not support replication on this system"));
+ return (EINVAL);
+ }
+ if (!LF_ISSET(DB_INIT_LOCK)) {
+ __db_errx(env, DB_STR("1571",
+ "replication requires locking support"));
+ return (EINVAL);
+ }
+ if (!LF_ISSET(DB_INIT_TXN)) {
+ __db_errx(env, DB_STR("1572",
+ "replication requires transaction support"));
+ return (EINVAL);
+ }
+ }
+ if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) {
+ if ((ret = __db_fcchk(env,
+ "DB_ENV->open", flags, DB_RECOVER, DB_RECOVER_FATAL)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env,
+ "DB_ENV->open", flags, DB_REGISTER, DB_RECOVER_FATAL)) != 0)
+ return (ret);
+ if (!LF_ISSET(DB_CREATE)) {
+ __db_errx(env, DB_STR("1573",
+ "recovery requires the create flag"));
+ return (EINVAL);
+ }
+ if (!LF_ISSET(DB_INIT_TXN)) {
+ __db_errx(env, DB_STR("1574",
+ "recovery requires transaction support"));
+ return (EINVAL);
+ }
+ }
+ if (LF_ISSET(DB_FAILCHK)) {
+ if (!ALIVE_ON(env)) {
+ __db_errx(env, DB_STR("1575",
+ "DB_FAILCHK requires DB_ENV->is_alive be configured"));
+ return (EINVAL);
+ }
+ if (dbenv->thr_max == 0) {
+ __db_errx(env, DB_STR("1576",
+ "DB_FAILCHK requires DB_ENV->set_thread_count be configured"));
+ return (EINVAL);
+ }
+ }
+
+#ifdef HAVE_MUTEX_THREAD_ONLY
+ /*
+ * Currently we support one kind of mutex that is intra-process only,
+ * POSIX 1003.1 pthreads, because a variety of systems don't support
+ * the full pthreads API, and our only alternative is test-and-set.
+ */
+ if (!LF_ISSET(DB_PRIVATE)) {
+ __db_errx(env, DB_STR("1577",
+ "Berkeley DB library configured to support only private environments"));
+ return (EINVAL);
+ }
+#endif
+
+#ifdef HAVE_MUTEX_FCNTL
+ /*
+ * !!!
+ * We need a file descriptor for fcntl(2) locking. We use the file
+ * handle from the REGENV file for this purpose.
+ *
+ * Since we may be using shared memory regions, e.g., shmget(2), and
+ * not a mapped-in regular file, the backing file may be only a few
+ * bytes in length. So, this depends on the ability to call fcntl to
+ * lock file offsets much larger than the actual physical file. I
+ * think that's safe -- besides, very few systems actually need this
+ * kind of support, SunOS is the only one still in wide use of which
+ * I'm aware.
+ *
+ * The error case is if an application lacks spinlocks and wants to be
+ * threaded. That doesn't work because fcntl will lock the underlying
+ * process, including all its threads.
+ */
+ if (F_ISSET(env, ENV_THREAD)) {
+ __db_errx(env, DB_STR("1578",
+ "architecture lacks fast mutexes: applications cannot be threaded"));
+ return (EINVAL);
+ }
+#endif
+ return (ret);
+}
+
+/*
+ * __env_remove --
+ * DB_ENV->remove.
+ *
+ * PUBLIC: int __env_remove __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_remove(dbenv, db_home, flags)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbenv->env;
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_FORCE | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->remove", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->remove");
+
+ if ((ret = __env_config(dbenv, db_home, &flags, 0)) != 0)
+ return (ret);
+
+ /*
+ * Turn the environment off -- if the environment is corrupted, this
+ * could fail. Ignore any error if we're forcing the question.
+ */
+ if ((ret = __env_turn_off(env, flags)) == 0 || LF_ISSET(DB_FORCE))
+ ret = __env_remove_env(env);
+
+ if ((t_ret = __env_close(dbenv, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __env_config --
+ * Argument-based initialization.
+ *
+ * PUBLIC: int __env_config __P((DB_ENV *, const char *, u_int32_t *, int));
+ */
+int
+__env_config(dbenv, db_home, flagsp, mode)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t *flagsp;
+ int mode;
+{
+ ENV *env;
+ int ret;
+ u_int32_t flags;
+ char *home, home_buf[DB_MAXPATHLEN];
+
+ env = dbenv->env;
+ flags = *flagsp;
+
+ /*
+ * Set the database home.
+ *
+ * Use db_home by default, this allows utilities to reasonably
+ * override the environment either explicitly or by using a -h
+ * option. Otherwise, use the environment if it's permitted
+ * and initialized.
+ */
+ home = (char *)db_home;
+ if (home == NULL && (LF_ISSET(DB_USE_ENVIRON) ||
+ (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot()))) {
+ home = home_buf;
+ if ((ret = __os_getenv(
+ env, "DB_HOME", &home, sizeof(home_buf))) != 0)
+ return (ret);
+ /*
+		 * __os_getenv sets home to NULL if it fails to find DB_HOME.
+ */
+ }
+ if (home != NULL) {
+ if (env->db_home != NULL)
+ __os_free(env, env->db_home);
+ if ((ret = __os_strdup(env, home, &env->db_home)) != 0)
+ return (ret);
+ }
+
+ /* Save a copy of the DB_ENV->open method flags. */
+ env->open_flags = flags;
+
+ /* Default permissions are read-write for both owner and group. */
+ env->db_mode = mode == 0 ? DB_MODE_660 : mode;
+
+ /* Read the DB_CONFIG file. */
+ if ((ret = __env_read_db_config(env)) != 0)
+ return (ret);
+
+ /*
+ * Update the DB_ENV->open method flags. The copy of the flags might
+ * have been changed during reading DB_CONFIG file.
+ */
+ flags = env->open_flags;
+
+ /*
+ * If no temporary directory path was specified in the config file,
+ * choose one.
+ */
+ if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(env, flags)) != 0)
+ return (ret);
+
+ *flagsp = flags;
+ return (0);
+}
+
+/*
+ * __env_close_pp --
+ * DB_ENV->close pre/post processor.
+ *
+ * PUBLIC: int __env_close_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_close_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+ u_int32_t close_flags, flags_orig;
+
+ env = dbenv->env;
+ ret = 0;
+ close_flags = flags_orig = 0;
+
+ /*
+ * Validate arguments, but as a DB_ENV handle destructor, we can't
+ * fail.
+ */
+ if (flags != 0 && flags != DB_FORCESYNC &&
+ (t_ret = __db_ferr(env, "DB_ENV->close", 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+#define DBENV_FORCESYNC 0x00000001
+#define DBENV_CLOSE_REPCHECK 0x00000010
+ if (flags == DB_FORCESYNC)
+ close_flags |= DBENV_FORCESYNC;
+
+ /*
+ * If the environment has panic'd, all we do is try and discard
+ * the important resources.
+ */
+ if (PANIC_ISSET(env)) {
+ /* clean up from registry file */
+ if (dbenv->registry != NULL) {
+ /*
+ * Temporarily set no panic so we do not trigger the
+ * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwr
+ * thus allowing the unregister to happen correctly.
+ */
+ flags_orig = F_ISSET(dbenv, DB_ENV_NOPANIC);
+ F_SET(dbenv, DB_ENV_NOPANIC);
+ (void)__envreg_unregister(env, 0);
+ dbenv->registry = NULL;
+ if (!flags_orig)
+ F_CLR(dbenv, DB_ENV_NOPANIC);
+ }
+
+ /* Close all underlying threads and sockets. */
+ if (IS_ENV_REPLICATED(env))
+ (void)__repmgr_close(env);
+
+ /* Close all underlying file handles. */
+ (void)__file_handle_cleanup(env);
+
+ PANIC_CHECK(env);
+ }
+
+ ENV_ENTER(env, ip);
+
+ rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
+ if (rep_check) {
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * Shut down Replication Manager threads first of all. This
+ * must be done before __env_rep_enter to avoid a deadlock that
+ * could occur if repmgr's background threads try to do a rep
+ * operation that needs __rep_lockout.
+ */
+ if ((t_ret = __repmgr_close(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ if ((t_ret = __env_rep_enter(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (rep_check)
+ close_flags |= DBENV_CLOSE_REPCHECK;
+ if ((t_ret = __env_close(dbenv, close_flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Don't ENV_LEAVE as we have already detached from the region. */
+ return (ret);
+}
+
+/*
+ * __env_close --
+ * DB_ENV->close.
+ *
+ * PUBLIC: int __env_close __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_close(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, rep_check, t_ret;
+ char **p;
+ u_int32_t close_flags;
+
+ env = dbenv->env;
+ ret = 0;
+ close_flags = LF_ISSET(DBENV_FORCESYNC) ? 0 : DB_NOSYNC;
+ rep_check = LF_ISSET(DBENV_CLOSE_REPCHECK);
+
+ /*
+ * Check to see if we were in the middle of restoring transactions and
+ * need to close the open files.
+ */
+ if (TXN_ON(env) && (t_ret = __txn_preclose(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_REPLICATION
+ if ((t_ret = __rep_env_close(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /*
+ * Close all databases opened in this environment after the rep region
+ * is closed. Rep region's internal database is already closed now.
+ */
+ while ((dbp = TAILQ_FIRST(&env->dblist)) != NULL) {
+ /*
+ * Do not close the handle on a database partition, since it
+ * will be closed when closing the handle on the main database.
+ */
+ while (dbp != NULL && F_ISSET(dbp, DB_AM_PARTDB))
+ dbp = TAILQ_NEXT(dbp, dblistlinks);
+ DB_ASSERT(env, dbp != NULL);
+ /*
+		 * Record the error code but keep going: we can't do
+		 * anything about the dbp handle if the close operation
+		 * fails, yet we still want to return the first error to
+		 * the caller.  That is how this function handles errors
+		 * from all of the close operations.
+ */
+ if (dbp->alt_close != NULL)
+ t_ret = dbp->alt_close(dbp, close_flags);
+ else
+ t_ret = __db_close(dbp, NULL, close_flags);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /*
+ * Detach from the regions and undo the allocations done by
+ * DB_ENV->open.
+ */
+ if ((t_ret = __env_refresh(dbenv, 0, rep_check)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_CRYPTO
+ /*
+ * Crypto comes last, because higher level close functions need
+ * cryptography.
+ */
+ if ((t_ret = __crypto_env_close(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /* If we're registered, clean up. */
+ if (dbenv->registry != NULL) {
+ (void)__envreg_unregister(env, 0);
+ dbenv->registry = NULL;
+ }
+
+ /* Check we've closed all underlying file handles. */
+ if ((t_ret = __file_handle_cleanup(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Release any string-based configuration parameters we've copied. */
+ if (dbenv->db_log_dir != NULL)
+ __os_free(env, dbenv->db_log_dir);
+ dbenv->db_log_dir = NULL;
+ if (dbenv->db_tmp_dir != NULL)
+ __os_free(env, dbenv->db_tmp_dir);
+ dbenv->db_tmp_dir = NULL;
+ if (dbenv->db_md_dir != NULL)
+ __os_free(env, dbenv->db_md_dir);
+ dbenv->db_md_dir = NULL;
+ if (dbenv->db_data_dir != NULL) {
+ for (p = dbenv->db_data_dir; *p != NULL; ++p)
+ __os_free(env, *p);
+ __os_free(env, dbenv->db_data_dir);
+ dbenv->db_data_dir = NULL;
+ dbenv->data_next = 0;
+ }
+ if (dbenv->intermediate_dir_mode != NULL)
+ __os_free(env, dbenv->intermediate_dir_mode);
+ if (env->db_home != NULL) {
+ __os_free(env, env->db_home);
+ env->db_home = NULL;
+ }
+
+ if (env->backup_handle != NULL) {
+ __os_free(env, env->backup_handle);
+ env->backup_handle = NULL;
+ }
+
+ /* Discard the structure. */
+ __db_env_destroy(dbenv);
+
+ return (ret);
+}
+
+/*
+ * __env_refresh --
+ * Refresh the DB_ENV structure.
+ * PUBLIC: int __env_refresh __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__env_refresh(dbenv, orig_flags, rep_check)
+ DB_ENV *dbenv;
+ u_int32_t orig_flags;
+ int rep_check;
+{
+ DB *ldbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ /*
+ * Release resources allocated by DB_ENV->open, and return it to the
+ * state it was in just before __env_open was called. (This means
+ * state set by pre-open configuration functions must be preserved.)
+ *
+ * Refresh subsystems, in the reverse order they were opened (txn
+ * must be first, it may want to discard locks and flush the log).
+ *
+ * !!!
+ * Note that these functions, like all of __env_refresh, only undo
+ * the effects of __env_open. Functions that undo work done by
+ * db_env_create or by a configuration function should go in
+ * __env_close.
+ */
+ if (TXN_ON(env) &&
+ (t_ret = __txn_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (LOGGING_ON(env) &&
+ (t_ret = __log_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Locking should come after logging, because closing log results
+ * in files closing which may require locks being released.
+ */
+ if (LOCKING_ON(env)) {
+ if (!F_ISSET(env, ENV_THREAD) &&
+ env->env_lref != NULL && (t_ret =
+ __lock_id_free(env, env->env_lref)) != 0 && ret == 0)
+ ret = t_ret;
+ env->env_lref = NULL;
+
+ if ((t_ret = __lock_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard the DB_ENV, ENV handle mutexes. */
+ if ((t_ret = __mutex_free(env, &dbenv->mtx_db_env)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env, &env->mtx_env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Discard DB list and its mutex.
+ * Discard the MT mutex.
+ *
+ * !!!
+ * This must be done after we close the log region, because we close
+ * database handles and so acquire this mutex when we close log file
+ * handles.
+ */
+ if (env->db_ref != 0) {
+ __db_errx(env, DB_STR("1579",
+ "Database handles still open at environment close"));
+ TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks)
+ __db_errx(env, DB_STR_A("1580",
+ "Open database handle: %s%s%s", "%s %s %s"),
+ ldbp->fname == NULL ? "unnamed" : ldbp->fname,
+ ldbp->dname == NULL ? "" : "/",
+ ldbp->dname == NULL ? "" : ldbp->dname);
+ if (ret == 0)
+ ret = EINVAL;
+ }
+ TAILQ_INIT(&env->dblist);
+ if ((t_ret = __mutex_free(env, &env->mtx_dblist)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env, &env->mtx_mt)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (env->mt != NULL) {
+ __os_free(env, env->mt);
+ env->mt = NULL;
+ }
+
+ if (MPOOL_ON(env)) {
+ /*
+ * If it's a private environment, flush the contents to disk.
+ * Recovery would have put everything back together, but it's
+ * faster and cleaner to flush instead.
+ *
+ * Ignore application max-write configuration, we're shutting
+ * down.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ !F_ISSET(dbenv, DB_ENV_NOFLUSH) &&
+ (t_ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __memp_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /*
+ * If we're included in a shared replication handle count, this
+ * is our last chance to decrement that count.
+ *
+ * !!!
+ * We can't afford to do anything dangerous after we decrement the
+ * handle count, of course, as replication may be proceeding with
+ * client recovery. However, since we're discarding the regions
+ * as soon as we drop the handle count, there's little opportunity
+ * to do harm.
+ */
+ if (rep_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Refresh the replication region.
+ *
+ * Must come after we call __env_db_rep_exit above.
+ */
+ if (REP_ON(env) && (t_ret = __rep_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_CRYPTO
+ /*
+ * Crypto comes last, because higher level close functions need
+ * cryptography.
+ */
+ if (env->reginfo != NULL &&
+ (t_ret = __crypto_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /*
+ * Mark the thread as out of the env before we get rid of the handles
+ * needed to do so.
+ */
+ if (env->thr_hashtab != NULL &&
+ (t_ret = __env_set_state(env, &ip, THREAD_OUT)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * We are about to detach from the mutex region. This is the last
+ * chance we have to acquire/destroy a mutex -- acquire/destroy the
+ * mutex and release our reference.
+ *
+ * !!!
+ * There are two DbEnv methods that care about environment reference
+ * counts: DbEnv.close and DbEnv.remove. The DbEnv.close method is
+ * not a problem because it only decrements the reference count and
+ * no actual resources are discarded -- lots of threads of control
+ * can call DbEnv.close at the same time, and regardless of racing
+ * on the reference count mutex, we wouldn't have a problem. Since
+ * the DbEnv.remove method actually discards resources, we can have
+ * a problem.
+ *
+ * If we decrement the reference count to 0 here, go to sleep, and
+ * the DbEnv.remove method is called, by the time we run again, the
+ * underlying shared regions could have been removed. That's fine,
+ * except we might actually need the regions to resolve outstanding
+ * operations in the various subsystems, and if we don't have hard
+ * OS references to the regions, we could get screwed. Of course,
+ * we should have hard OS references to everything we need, but just
+ * in case, we put off decrementing the reference count as long as
+ * possible.
+ */
+ if ((t_ret = __env_ref_decrement(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_MUTEX_SUPPORT
+ if (MUTEX_ON(env) &&
+ (t_ret = __mutex_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ /* Free memory for thread tracking. */
+ if (env->reginfo != NULL) {
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ __env_thread_destroy(env);
+ t_ret = __env_detach(env, 1);
+ } else
+ t_ret = __env_detach(env, 0);
+
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * !!!
+ * Don't free env->reginfo or set the reference to NULL,
+ * that was done by __env_detach().
+ */
+ }
+
+ if (env->recover_dtab.int_dispatch != NULL) {
+ __os_free(env, env->recover_dtab.int_dispatch);
+ env->recover_dtab.int_size = 0;
+ env->recover_dtab.int_dispatch = NULL;
+ }
+ if (env->recover_dtab.ext_dispatch != NULL) {
+ __os_free(env, env->recover_dtab.ext_dispatch);
+ env->recover_dtab.ext_size = 0;
+ env->recover_dtab.ext_dispatch = NULL;
+ }
+
+ dbenv->flags = orig_flags;
+
+ return (ret);
+}
+
+/*
+ * __file_handle_cleanup --
+ * Close any underlying open file handles so we don't leak system
+ * resources.
+ */
+static int
+__file_handle_cleanup(env)
+ ENV *env;
+{
+ DB_FH *fhp;
+
+ if (TAILQ_FIRST(&env->fdlist) == NULL)
+ return (0);
+
+ __db_errx(env, DB_STR("1581",
+ "File handles still open at environment close"));
+ while ((fhp = TAILQ_FIRST(&env->fdlist)) != NULL) {
+ __db_errx(env, DB_STR_A("1582", "Open file handle: %s", "%s"),
+ fhp->name);
+ (void)__os_closehandle(env, fhp);
+ }
+ return (EINVAL);
+}
+
+/*
+ * __env_get_open_flags --
+ * DbEnv.get_open_flags method.
+ *
+ * PUBLIC: int __env_get_open_flags __P((DB_ENV *, u_int32_t *));
+ */
+int
+__env_get_open_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->get_open_flags");
+
+ *flagsp = env->open_flags;
+ return (0);
+}
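+
+/*
+ * Example (an illustrative sketch, not library code): an application that
+ * joined an existing environment can ask which subsystems it actually got,
+ * e.g. to decide whether a transactional code path is available:
+ *
+ *	u_int32_t oflags;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->get_open_flags(dbenv, &oflags)) == 0 &&
+ *	    (oflags & DB_INIT_TXN) != 0)
+ *		... the environment was opened with transactions ...
+ */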
+/*
+ * __env_attach_regions --
+ *	Perform attaches to env and required regions (subsystems).
+ *
+ * PUBLIC: int __env_attach_regions __P((DB_ENV *, u_int32_t, u_int32_t, int));
+ */
+int
+__env_attach_regions(dbenv, flags, orig_flags, retry_ok)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ u_int32_t orig_flags;
+ int retry_ok;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGINFO *infop;
+ u_int32_t init_flags;
+ int create_ok, rep_check, ret;
+
+ ip = NULL;
+ env = dbenv->env;
+ rep_check = 0;
+
+ /* Convert the DB_ENV->open flags to internal flags. */
+ create_ok = LF_ISSET(DB_CREATE) ? 1 : 0;
+ if (LF_ISSET(DB_LOCKDOWN))
+ F_SET(env, ENV_LOCKDOWN);
+ if (LF_ISSET(DB_PRIVATE))
+ F_SET(env, ENV_PRIVATE);
+ if (LF_ISSET(DB_RECOVER_FATAL))
+ F_SET(env, ENV_RECOVER_FATAL);
+ if (LF_ISSET(DB_SYSTEM_MEM))
+ F_SET(env, ENV_SYSTEM_MEM);
+ if (LF_ISSET(DB_THREAD))
+ F_SET(env, ENV_THREAD);
+
+ /*
+ * Create/join the environment. We pass in the flags of interest to
+ * a thread subsequently joining an environment we create. If we're
+ * not the ones to create the environment, our flags will be updated
+ * to match the existing environment.
+ */
+ init_flags = 0;
+ if (LF_ISSET(DB_INIT_CDB))
+ FLD_SET(init_flags, DB_INITENV_CDB);
+ if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB))
+ FLD_SET(init_flags, DB_INITENV_CDB_ALLDB);
+ if (LF_ISSET(DB_INIT_LOCK))
+ FLD_SET(init_flags, DB_INITENV_LOCK);
+ if (LF_ISSET(DB_INIT_LOG))
+ FLD_SET(init_flags, DB_INITENV_LOG);
+ if (LF_ISSET(DB_INIT_MPOOL))
+ FLD_SET(init_flags, DB_INITENV_MPOOL);
+ if (LF_ISSET(DB_INIT_REP))
+ FLD_SET(init_flags, DB_INITENV_REP);
+ if (LF_ISSET(DB_INIT_TXN))
+ FLD_SET(init_flags, DB_INITENV_TXN);
+ if ((ret = __env_attach(env, &init_flags, create_ok, retry_ok)) != 0)
+ goto err;
+
+ /*
+ * __env_attach will return the saved init_flags field, which contains
+ * the DB_INIT_* flags used when the environment was created.
+ *
+ * We may be joining an environment -- reset our flags to match the
+ * ones in the environment.
+ */
+ if (FLD_ISSET(init_flags, DB_INITENV_CDB))
+ LF_SET(DB_INIT_CDB);
+ if (FLD_ISSET(init_flags, DB_INITENV_LOCK))
+ LF_SET(DB_INIT_LOCK);
+ if (FLD_ISSET(init_flags, DB_INITENV_LOG))
+ LF_SET(DB_INIT_LOG);
+ if (FLD_ISSET(init_flags, DB_INITENV_MPOOL))
+ LF_SET(DB_INIT_MPOOL);
+ if (FLD_ISSET(init_flags, DB_INITENV_REP))
+ LF_SET(DB_INIT_REP);
+ if (FLD_ISSET(init_flags, DB_INITENV_TXN))
+ LF_SET(DB_INIT_TXN);
+ if (FLD_ISSET(init_flags, DB_INITENV_CDB_ALLDB) &&
+ (ret = __env_set_flags(dbenv, DB_CDB_ALLDB, 1)) != 0)
+ goto err;
+
+ /* Initialize for CDB product. */
+ if (LF_ISSET(DB_INIT_CDB)) {
+ LF_SET(DB_INIT_LOCK);
+ F_SET(env, ENV_CDB);
+ }
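+
+	/*
+	 * For example (a sketch): a handle joining with
+	 * dbenv->open(dbenv, home, DB_JOINENV, 0) arrives here with none
+	 * of the DB_INIT_* flags set; __env_attach returns the creator's
+	 * saved DB_INITENV_* bits and the LF_SET calls above rebuild the
+	 * corresponding DB_INIT_* flags in this handle.
+	 */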
+
+ /*
+ * Update the flags to match the database environment. The application
+ * may have specified flags of 0 to join the environment, and this line
+ * replaces that value with the flags corresponding to the existing,
+ * underlying set of subsystems. This means the DbEnv.get_open_flags
+ * method returns the flags to open the existing environment instead of
+ * the specific flags passed to the DbEnv.open method.
+ */
+ env->open_flags = flags;
+
+ /*
+ * The DB_ENV structure has now been initialized. Turn off further
+ * use of the DB_ENV structure and most initialization methods, we're
+ * about to act on the values we currently have.
+ */
+ F_SET(env, ENV_OPEN_CALLED);
+
+ infop = env->reginfo;
+
+#ifdef HAVE_MUTEX_SUPPORT
+ /*
+ * Initialize the mutex regions first before ENV_ENTER().
+ * Mutexes need to be 'on' when attaching to an existing env
+ * in order to safely allocate the thread tracking info.
+ */
+ if ((ret = __mutex_open(env, create_ok)) != 0)
+ goto err;
+ /* The MUTEX_REQUIRED() in __env_alloc() expects this to be set. */
+ infop->mtx_alloc = ((REGENV *)infop->primary)->mtx_regenv;
+#endif
+ /*
+ * Initialize thread tracking and enter the API.
+ */
+ if ((ret =
+ __env_thread_init(env, F_ISSET(infop, REGION_CREATE) ? 1 : 0)) != 0)
+ goto err;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Initialize the subsystems.
+ */
+ /*
+ * We can now acquire/create mutexes: increment the region's reference
+ * count.
+ */
+ if ((ret = __env_ref_increment(env)) != 0)
+ goto err;
+
+ /*
+ * Initialize the handle mutexes.
+ */
+ if ((ret = __mutex_alloc(env,
+ MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbenv->mtx_db_env)) != 0 ||
+ (ret = __mutex_alloc(env,
+ MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &env->mtx_env)) != 0)
+ goto err;
+
+ /*
+ * Initialize the replication area next, so that we can lock out this
+ * call if we're currently running recovery for replication.
+ */
+ if (LF_ISSET(DB_INIT_REP) && (ret = __rep_open(env)) != 0)
+ goto err;
+
+ rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
+ if (rep_check && (ret = __env_rep_enter(env, 0)) != 0)
+ goto err;
+
+ if (LF_ISSET(DB_INIT_MPOOL)) {
+ if ((ret = __memp_open(env, create_ok)) != 0)
+ goto err;
+
+ /*
+ * BDB does do cache I/O during recovery and when starting up
+ * replication. If creating a new environment, then suppress
+ * any application max-write configuration.
+ */
+ if (create_ok)
+ (void)__memp_set_config(
+ dbenv, DB_MEMP_SUPPRESS_WRITE, 1);
+
+ /*
+ * Initialize the DB list and its mutex. If the mpool is
+ * not initialized, we can't ever open a DB handle, which
+ * is why this code lives here.
+ */
+ TAILQ_INIT(&env->dblist);
+ if ((ret = __mutex_alloc(env, MTX_ENV_DBLIST,
+ DB_MUTEX_PROCESS_ONLY, &env->mtx_dblist)) != 0)
+ goto err;
+
+ /* Register DB's pgin/pgout functions. */
+ if ((ret = __memp_register(
+ env, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0)
+ goto err;
+ }
+
+ /*
+ * Initialize the ciphering area prior to any running of recovery so
+ * that we can initialize the keys, etc. before recovery, including
+ * the MT mutex.
+ *
+ * !!!
+ * This must be after the mpool init, but before the log initialization
+ * because log_open may attempt to run log_recover during its open.
+ */
+ if (LF_ISSET(DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN) &&
+ (ret = __crypto_region_init(env)) != 0)
+ goto err;
+ if ((ret = __mutex_alloc(
+ env, MTX_TWISTER, DB_MUTEX_PROCESS_ONLY, &env->mtx_mt)) != 0)
+ goto err;
+
+ /*
+ * Transactions imply logging but do not imply locking. While almost
+ * all applications want both locking and logging, it would not be
+ * unreasonable for a single threaded process to want transactions for
+ * atomicity guarantees, but not necessarily need concurrency.
+ */
+ if (LF_ISSET(DB_INIT_LOG | DB_INIT_TXN))
+ if ((ret = __log_open(env)) != 0)
+ goto err;
+ if (LF_ISSET(DB_INIT_LOCK))
+ if ((ret = __lock_open(env)) != 0)
+ goto err;
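+	/*
+	 * As an illustration (a sketch, not library code), a
+	 * single-threaded application wanting atomicity without
+	 * concurrency might open its environment with
+	 *
+	 *	dbenv->open(dbenv, home,
+	 *	    DB_CREATE | DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN, 0);
+	 *
+	 * in which case __log_open runs above and __txn_open just below,
+	 * while __lock_open is skipped.
+	 */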
+
+ if (LF_ISSET(DB_INIT_TXN)) {
+ if ((ret = __txn_open(env)) != 0)
+ goto err;
+
+ /*
+ * If the application is running with transactions, initialize
+ * the function tables.
+ */
+ if ((ret = __env_init_rec(env,
+ ((LOG *)env->lg_handle->reginfo.primary)->persist.version))
+ != 0)
+ goto err;
+ }
+
+ /* Perform recovery for any previous run. */
+ if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
+ (ret = __db_apprec(env, ip, NULL, NULL, 1,
+ LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL | DB_NO_CHECKPOINT))) != 0)
+ goto err;
+
+ /*
+ * If we've created the regions, are running with transactions, and did
+ * not just run recovery, we need to log the fact that the transaction
+ * IDs got reset.
+ *
+ * If we ran recovery, there may be prepared-but-not-yet-committed
+ * transactions that need to be resolved. Recovery resets the minimum
+ * transaction ID and logs the reset if that's appropriate, so we
+ * don't need to do anything here in the recover case.
+ */
+ if (TXN_ON(env) &&
+ !FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) &&
+ F_ISSET(infop, REGION_CREATE) &&
+ !LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
+ (ret = __txn_reset(env)) != 0)
+ goto err;
+
+ /* The database environment is ready for business. */
+ if ((ret = __env_turn_on(env)) != 0)
+ goto err;
+
+ if (rep_check)
+ ret = __env_db_rep_exit(env);
+
+ /* Turn any application-specific max-write configuration back on. */
+ if (LF_ISSET(DB_INIT_MPOOL))
+ (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0);
+
+err: if (ret == 0)
+ ENV_LEAVE(env, ip);
+ else {
+ /*
+ * If we fail after creating regions, panic and remove them.
+ *
+ * !!!
+ * No need to call __env_db_rep_exit, that work is done by the
+ * calls to __env_refresh.
+ */
+ infop = env->reginfo;
+ if (infop != NULL && F_ISSET(infop, REGION_CREATE)) {
+ ret = __env_panic(env, ret);
+
+			/* Refresh the DB_ENV so we can use it to call remove. */
+ (void)__env_refresh(dbenv, orig_flags, rep_check);
+ (void)__env_remove_env(env);
+ (void)__env_refresh(dbenv, orig_flags, 0);
+ } else
+ (void)__env_refresh(dbenv, orig_flags, rep_check);
+		/* Clear the fact that the region had been opened. */
+ F_CLR(env, ENV_OPEN_CALLED);
+ }
+
+ return (ret);
+}
diff --git a/src/env/env_recover.c b/src/env/env_recover.c
new file mode 100644
index 00000000..9636554a
--- /dev/null
+++ b/src/env/env_recover.c
@@ -0,0 +1,1093 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#ifndef lint
+static const char copyright[] =
+ "Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.\n";
+#endif
+
+static int __db_log_corrupt __P((ENV *, DB_LSN *));
+static int __env_init_rec_42 __P((ENV *));
+static int __env_init_rec_43 __P((ENV *));
+static int __env_init_rec_46 __P((ENV *));
+static int __env_init_rec_47 __P((ENV *));
+static int __env_init_rec_48 __P((ENV *));
+static int __log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *));
+
+static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
+static int __log_backup __P((ENV *, DB_LOGC *, DB_LSN *, DB_LSN*));
+
+/*
+ * __db_apprec --
+ * Perform recovery. If max_lsn is non-NULL, then we are trying
+ * to synchronize this system up with another system that has a max
+ * LSN of max_lsn, so we need to roll back sufficiently far for that
+ * to work. See __log_backup for details.
+ *
+ * PUBLIC: int __db_apprec __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t));
+ */
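+/*
+ * A typical (illustrative) route into this function: the application
+ * passes DB_RECOVER -- or DB_RECOVER_FATAL for catastrophic recovery --
+ * to DB_ENV->open, for example
+ *
+ *	dbenv->open(dbenv, home, DB_CREATE | DB_INIT_LOG | DB_INIT_MPOOL |
+ *	    DB_INIT_TXN | DB_RECOVER, 0);
+ *
+ * and __env_attach_regions calls __db_apprec with max_lsn == NULL.
+ */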
+int
+__db_apprec(env, ip, max_lsn, trunclsn, update, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_LSN *max_lsn, *trunclsn;
+ int update;
+ u_int32_t flags;
+{
+ DBT data;
+ DB_ENV *dbenv;
+ DB_LOGC *logc;
+ DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn, tlsn;
+ DB_LSN *vtrunc_ckp, *vtrunc_lsn;
+ DB_TXNHEAD *txninfo;
+ DB_TXNREGION *region;
+ REGENV *renv;
+ REGINFO *infop;
+ __txn_ckp_args *ckp_args;
+ time_t now, tlow;
+ double nfiles;
+ u_int32_t hi_txn, log_size, txnid;
+ int32_t low;
+ int all_recovered, progress, rectype, ret, t_ret;
+ char *p, *pass;
+ char t1[CTIME_BUFLEN], t2[CTIME_BUFLEN], time_buf[CTIME_BUFLEN];
+
+ COMPQUIET(nfiles, (double)0.001);
+
+ dbenv = env->dbenv;
+ logc = NULL;
+ ckp_args = NULL;
+ hi_txn = TXN_MAXIMUM;
+ txninfo = NULL;
+ pass = DB_STR_P("initial");
+ ZERO_LSN(lsn);
+
+ /*
+ * XXX
+ * Get the log size. No locking required because we're single-threaded
+ * during recovery.
+ */
+ log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;
+
+ /*
+ * If we need to, update the env handle timestamp.
+ */
+ if (update && REP_ON(env)) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ (void)time(&renv->rep_timestamp);
+ }
+
+ /* Set in-recovery flags. */
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ region = env->tx_handle->reginfo.primary;
+ F_SET(region, TXN_IN_RECOVERY);
+
+ /* Allocate a cursor for the log. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ /*
+ * If the user is specifying recovery to a particular point in time
+ * or to a particular LSN, find the point to start recovery from.
+ */
+ ZERO_LSN(lowlsn);
+ if (max_lsn != NULL) {
+ if ((ret = __log_backup(env, logc, max_lsn, &lowlsn)) != 0)
+ goto err;
+ } else if (dbenv->tx_timestamp != 0) {
+ if ((ret = __log_earliest(env, logc, &low, &lowlsn)) != 0)
+ goto err;
+ if ((int32_t)dbenv->tx_timestamp < low) {
+ t1[sizeof(t1) - 1] = '\0';
+ (void)strncpy(t1, __os_ctime(
+ &dbenv->tx_timestamp, time_buf), sizeof(t1) - 1);
+ if ((p = strchr(t1, '\n')) != NULL)
+ *p = '\0';
+
+ t2[sizeof(t2) - 1] = '\0';
+ tlow = (time_t)low;
+ (void)strncpy(t2, __os_ctime(
+ &tlow, time_buf), sizeof(t2) - 1);
+ if ((p = strchr(t2, '\n')) != NULL)
+ *p = '\0';
+
+ __db_errx(env, DB_STR_A("1509",
+ "Invalid recovery timestamp %s; earliest time is %s",
+ "%s %s"), t1, t2);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /*
+ * Recovery is done in three passes:
+ * Pass #0:
+ * We need to find the position from which we will open files.
+ * We need to open files beginning with the earlier of the
+ * most recent checkpoint LSN and a checkpoint LSN before the
+ * recovery timestamp, if specified. We need to be before the
+	 *	most recent checkpoint LSN because we are going to collect
+	 *	information about which transactions were begun before we
+	 *	start rolling forward.  Those that were must never be undone,
+	 *	because the queue access method cannot use LSNs to determine
+	 *	which operations can safely be aborted, and it cannot roll
+	 *	back operations in transactions for which there may be records
+	 *	not processed during recovery.  We need to consider earlier
+	 *	points in time in case we are recovering to a particular
+	 *	timestamp.
+ *
+ * Pass #1:
+ * Read forward through the log from the position found in pass 0
+ * opening and closing files, and recording transactions for which
+ * we've seen their first record (the transaction's prev_lsn is
+ * 0,0). At the end of this pass, we know all transactions for
+ * which we've seen begins and we have the "current" set of files
+ * open.
+ *
+ * Pass #2:
+ * Read backward through the log undoing any uncompleted TXNs.
+ * There are four cases:
+	 *	    1. If doing catastrophic recovery, we read to the
+	 *		beginning of the log.
+	 *	    2. If we are doing normal recovery, then we have to roll
+	 *		back to the most recent checkpoint LSN.
+ * 3. If we are recovering to a point in time, then we have
+ * to roll back to the checkpoint whose ckp_lsn is earlier
+ * than the specified time. __log_earliest will figure
+ * this out for us.
+ * 4. If we are recovering back to a particular LSN, then
+ * we have to roll back to the checkpoint whose ckp_lsn
+ * is earlier than the max_lsn. __log_backup will figure
+ * that out for us.
+	 *	In case 3, "uncompleted TXNs" also include all those that
+	 *	committed after the user's specified timestamp.
+ *
+ * Pass #3:
+ * Read forward through the log from the LSN found in pass #2,
+ * redoing any committed TXNs (which committed after any user-
+ * specified rollback point). During this pass, checkpoint
+ * file information is ignored, and file openings and closings
+ * are redone.
+ *
+ * ckp_lsn -- lsn of the last checkpoint or the first in the log.
+ * first_lsn -- the lsn where the forward passes begin.
+	 *	last_lsn -- the last lsn in the log, used for feedback.
+ * lowlsn -- the lsn we are rolling back to, if we are recovering
+ * to a point in time.
+ * lsn -- temporary use lsn.
+ * stop_lsn -- the point at which forward roll should stop
+ */
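+
+	/*
+	 * A concrete (illustrative) normal-recovery example: if the most
+	 * recent checkpoint record sits at [3][1000] and its ckp_lsn is
+	 * [2][500], pass #0 sets first_lsn to [2][500]; pass #1 rolls
+	 * forward from there opening files and recording transaction
+	 * begins; pass #2 rolls backward from the end of the log to
+	 * [2][500] undoing loser transactions; and pass #3 rolls forward
+	 * again redoing the committed ones.
+	 */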
+
+ /*
+ * Find out the last lsn, so that we can estimate how far along we
+ * are in recovery. This will help us determine how much log there
+ * is between the first LSN that we're going to be working with and
+ * the last one. We assume that each of the three phases takes the
+ * same amount of time (a false assumption) and then use the %-age
+ * of the amount of log traversed to figure out how much of the
+ * pass we've accomplished.
+ *
+ * If we can't find any log records, we're kind of done.
+ */
+#ifdef UMRW
+ ZERO_LSN(last_lsn);
+#endif
+ memset(&data, 0, sizeof(data));
+ /*
+ * Pass #0
+ * Find the LSN from which we begin OPENFILES.
+ *
+ * If this is a catastrophic recovery, or if no checkpoint exists
+ * in the log, the LSN is the first LSN in the log.
+ *
+ * Otherwise, it is the minimum of (1) the LSN in the last checkpoint
+ * and (2) the LSN in the checkpoint before any specified recovery
+ * timestamp or max_lsn.
+ */
+ /*
+ * Get the first LSN in the log; it's an initial default
+ * even if this is not a catastrophic recovery.
+ */
+ if ((ret = __logc_get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else
+ __db_errx(env, DB_STR("1510",
+ "First log record not found"));
+ goto err;
+ }
+ first_lsn = ckp_lsn;
+
+ if (!LF_ISSET(DB_RECOVER_FATAL)) {
+ if ((ret = __txn_getckp(env, &ckp_lsn)) == 0 &&
+ (ret = __logc_get(logc, &ckp_lsn, &data, DB_SET)) == 0) {
+ /* We have a recent checkpoint. This is LSN (1). */
+ if ((ret = __txn_ckp_read(env,
+ data.data, &ckp_args)) != 0) {
+ __db_errx(env, DB_STR_A("1511",
+ "Invalid checkpoint record at [%ld][%ld]",
+ "%ld %ld"), (u_long)ckp_lsn.file,
+ (u_long)ckp_lsn.offset);
+ goto err;
+ }
+ first_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+
+ /*
+ * If LSN (2) exists, use it if it's before LSN (1).
+ * (If LSN (1) doesn't exist, first_lsn is the
+ * beginning of the log, so will "win" this check.)
+ *
+ * XXX
+ * In the recovery-to-a-timestamp case, lowlsn is chosen by
+ * __log_earliest, and is the checkpoint LSN of the
+ * *earliest* checkpoint in the unreclaimed log. I
+ * (krinsky) believe that we could optimize this by looking
+ * instead for the LSN of the *latest* checkpoint before
+ * the timestamp of interest, but I'm not sure that this
+ * is worth doing right now. (We have to look for lowlsn
+ * and low anyway, to make sure the requested timestamp is
+ * somewhere in the logs we have, and all that's required
+ * is that we pick *some* checkpoint after the beginning of
+		 * the logs and before the timestamp.)
+ */
+ if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) &&
+ LOG_COMPARE(&lowlsn, &first_lsn) < 0) {
+ first_lsn = lowlsn;
+ }
+ }
+
+ if ((ret = __logc_get(logc, &last_lsn, &data, DB_LAST)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else
+ __db_errx(env, DB_STR("1512",
+ "Last log record not found"));
+ goto err;
+ }
+
+ rectype = 0;
+ txnid = 0;
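+	/*
+	 * Log records share a common header layout: a u_int32_t record
+	 * type followed by a u_int32_t transaction ID (and then the
+	 * previous LSN), which is why the reads below use fixed offsets
+	 * into data.data.
+	 */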
+ do {
+ if (LOG_COMPARE(&lsn, &first_lsn) == 0)
+ break;
+		/* Check whether we have a recycle record. */
+ if (rectype != DB___txn_recycle)
+ LOGCOPY_32(env, &rectype, data.data);
+		/* The txnid follows rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)data.data + sizeof(u_int32_t));
+
+ if (txnid != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &data, DB_PREV)) == 0);
+
+ /*
+ * There are no transactions, so there is nothing to do unless
+ * we're recovering to an LSN. If we are, we need to proceed since
+ * we'll still need to do a vtruncate based on information we haven't
+ * yet collected.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+
+ hi_txn = txnid;
+
+ /* Get the record at first_lsn. */
+ if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("1513",
+ "Checkpoint LSN record [%ld][%ld] not found", "%ld %ld"),
+ (u_long)first_lsn.file, (u_long)first_lsn.offset);
+ goto err;
+ }
+
+ if (dbenv->db_feedback != NULL) {
+ if (last_lsn.file == first_lsn.file)
+ nfiles = (double)
+ (last_lsn.offset - first_lsn.offset) / log_size;
+ else
+ nfiles = (double)(last_lsn.file - first_lsn.file) +
+ (double)((log_size - first_lsn.offset) +
+ last_lsn.offset) / log_size;
+ /* We are going to divide by nfiles; make sure it isn't 0. */
+ if (nfiles < 0.001)
+ nfiles = 0.001;
+ }
+
+ /* Find a low txnid. */
+ ret = 0;
+ if (hi_txn != 0) do {
+		/* The txnid follows rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)data.data + sizeof(u_int32_t));
+
+ if (txnid != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &data, DB_NEXT)) == 0);
+
+ /*
+ * There are no transactions and we're not recovering to an LSN (see
+ * above), so there is nothing to do.
+ */
+ if (ret == DB_NOTFOUND) {
+ if (LOG_COMPARE(&lsn, &last_lsn) != 0)
+ ret = __db_log_corrupt(env, &lsn);
+ else
+ ret = 0;
+ }
+
+ /* Reset to the first lsn. */
+ if (ret != 0 ||
+ (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
+ goto err;
+
+ /* Initialize the transaction list. */
+ if ((ret = __db_txnlist_init(env, ip,
+ txnid, hi_txn, max_lsn, &txninfo)) != 0)
+ goto err;
+
+ /*
+ * Pass #1
+ * Run forward through the log starting at the first relevant lsn.
+ */
+ if ((ret = __env_openfiles(env, logc,
+ txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
+ goto err;
+
+ /* If there were no transactions, then we can bail out early. */
+ if (hi_txn == 0 && max_lsn == NULL) {
+ lsn = last_lsn;
+ goto done;
+ }
+
+ /*
+ * Pass #2.
+ *
+	 * We used first_lsn to tell us how far back we need to recover;
+ * use it here.
+ */
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR_A("1514",
+ "Recovery starting from [%lu][%lu]", "%lu %lu"),
+ (u_long)first_lsn.file, (u_long)first_lsn.offset);
+
+ pass = DB_STR_P("backward");
+ for (ret = __logc_get(logc, &lsn, &data, DB_LAST);
+ ret == 0 && LOG_COMPARE(&lsn, &first_lsn) >= 0;
+ ret = __logc_get(logc, &lsn, &data, DB_PREV)) {
+ if (dbenv->db_feedback != NULL) {
+ progress = 34 + (int)(33 * (__lsn_diff(&first_lsn,
+ &last_lsn, &lsn, log_size, 0) / nfiles));
+ dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+ }
+
+ tlsn = lsn;
+ ret = __db_dispatch(env, &env->recover_dtab,
+ &data, &tlsn, DB_TXN_BACKWARD_ROLL, txninfo);
+ if (ret != 0) {
+ if (ret != DB_TXN_CKP)
+ goto msgerr;
+ else
+ ret = 0;
+ }
+ }
+ if (ret == DB_NOTFOUND) {
+ if (LOG_COMPARE(&lsn, &first_lsn) > 0)
+ ret = __db_log_corrupt(env, &lsn);
+ else
+ ret = 0;
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+	 * Pass #3.  If we are recovering to a timestamp or to an LSN, we
+	 * need to make sure that we don't roll forward beyond that point,
+	 * because there may be non-transactional operations (e.g., closes
+	 * that would fail).  The last_lsn variable is used for feedback
+	 * calculations; use it as the initial stopping point for the
+	 * forward pass, then reset stop_lsn appropriately to determine how
+	 * far the forward pass should really go.
+ */
+ pass = DB_STR_P("forward");
+ stop_lsn = last_lsn;
+ if (max_lsn != NULL || dbenv->tx_timestamp != 0)
+ stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn;
+
+ for (ret = __logc_get(logc, &lsn, &data, DB_NEXT);
+ ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
+ if (dbenv->db_feedback != NULL) {
+ progress = 67 + (int)(33 * (__lsn_diff(&first_lsn,
+ &last_lsn, &lsn, log_size, 1) / nfiles));
+ dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+ }
+
+ tlsn = lsn;
+ ret = __db_dispatch(env, &env->recover_dtab,
+ &data, &tlsn, DB_TXN_FORWARD_ROLL, txninfo);
+ if (ret != 0) {
+ if (ret != DB_TXN_CKP)
+ goto msgerr;
+ else
+ ret = 0;
+ }
+ /*
+ * If we are recovering to a timestamp or an LSN,
+ * we need to make sure that we don't try to roll
+ * forward beyond the soon-to-be end of log.
+ */
+ if (LOG_COMPARE(&lsn, &stop_lsn) >= 0)
+			break;
+	}
+ if (ret == DB_NOTFOUND)
+ ret = __db_log_corrupt(env, &lsn);
+ if (ret != 0)
+ goto err;
+
+ if (max_lsn == NULL)
+ region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;
+
+done:
+ /* We are going to truncate, so we'd best close the cursor. */
+ if (logc != NULL) {
+ if ((ret = __logc_close(logc)) != 0)
+ goto err;
+ logc = NULL;
+ }
+ /*
+	 * Also flush the cache before truncating the log. It's recovery;
+ * ignore any application max-write configuration.
+ */
+ if ((ret = __memp_sync_int(env,
+ NULL, 0, DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0)
+ goto err;
+ if (dbenv->tx_timestamp != 0) {
+ /* Run recovery up to this timestamp. */
+ region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
+ vtrunc_lsn = &((DB_TXNHEAD *)txninfo)->maxlsn;
+ vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn;
+ } else if (max_lsn != NULL) {
+ /* This is a HA client syncing to the master. */
+ if (!IS_ZERO_LSN(((DB_TXNHEAD *)txninfo)->ckplsn))
+ region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
+ else if ((ret =
+ __txn_findlastckp(env, &region->last_ckp, max_lsn)) != 0)
+ goto err;
+ vtrunc_lsn = max_lsn;
+ vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn;
+ } else {
+ /*
+ * The usual case: we recovered the whole (valid) log; clear
+ * out any partial record after the recovery point.
+ */
+ vtrunc_lsn = &lsn;
+ vtrunc_ckp = &region->last_ckp;
+ }
+ if ((ret = __log_vtruncate(env, vtrunc_lsn, vtrunc_ckp, trunclsn)) != 0)
+ goto err;
+
+ /* If we had no txns, figure out if we need a checkpoint. */
+ if (hi_txn == 0 && __dbreg_log_nofiles(env))
+ LF_SET(DB_NO_CHECKPOINT);
+ /*
+ * Usually we close all files at the end of recovery, unless there are
+ * prepared transactions or errors in the checkpoint.
+ */
+ all_recovered = region->stat.st_nrestores == 0;
+ /*
+ * Log a checkpoint here so subsequent recoveries can skip what's been
+ * done; this is unnecessary for HA rep clients, as they do not write
+ * log records.
+ */
+ if (max_lsn == NULL && !LF_ISSET(DB_NO_CHECKPOINT) &&
+ (ret = __txn_checkpoint(env,
+ 0, 0, DB_CKP_INTERNAL | DB_FORCE)) != 0) {
+ /*
+ * If there was no space for the checkpoint or flushing db
+ * pages we can still bring the environment up, if only for
+ * read-only access. We must not close the open files because a
+ * subsequent recovery might still need to redo this portion
+ * of the log [#18590].
+ */
+ if (max_lsn == NULL && ret == ENOSPC) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR_A("1515",
+ "Recovery continuing after non-fatal checkpoint error: %s",
+ "%s"), db_strerror(ret));
+ all_recovered = 0;
+		} else
+			goto err;
+ }
+
+	if (all_recovered) {
+ /* Close all the db files that are open. */
+ if ((ret = __dbreg_close_files(env, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __dbreg_mark_restored(env)) != 0)
+ goto err;
+ F_SET(env->lg_handle, DBLOG_OPENFILES);
+ }
+
+ if (max_lsn != NULL) {
+ /*
+ * Now we need to open files that should be open in order for
+ * client processing to continue. However, since we've
+ * truncated the log, we need to recompute from where the
+ * openfiles pass should begin.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ if ((ret =
+ __logc_get(logc, &first_lsn, &data, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else
+ __db_errx(env, DB_STR("1516",
+ "First log record not found"));
+ goto err;
+ }
+ if ((ret = __txn_getckp(env, &first_lsn)) == 0 &&
+ (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) == 0) {
+ /* We have a recent checkpoint. This is LSN (1). */
+ if ((ret = __txn_ckp_read(env,
+ data.data, &ckp_args)) != 0) {
+ __db_errx(env, DB_STR_A("1517",
+ "Invalid checkpoint record at [%ld][%ld]",
+ "%ld %ld"), (u_long)first_lsn.file,
+ (u_long)first_lsn.offset);
+ goto err;
+ }
+ first_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+ if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
+ goto err;
+ if ((ret = __env_openfiles(env, logc,
+ txninfo, &data, &first_lsn, max_lsn, nfiles, 1)) != 0)
+ goto err;
+ } else if (all_recovered) {
+ /*
+ * If there are no transactions that need resolution, whether
+ * because they are prepared or because recovery will need to
+ * process them, we need to reset the transaction ID space and
+ * log this fact.
+ */
+ if ((rectype != DB___txn_recycle || hi_txn != 0) &&
+ (ret = __txn_reset(env)) != 0)
+ goto err;
+ } else {
+ if ((ret = __txn_recycle_id(env, 0)) != 0)
+ goto err;
+ }
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
+ (void)time(&now);
+ __db_msg(env, DB_STR_A("1518",
+ "Recovery complete at %.24s", "%.24s"),
+ __os_ctime(&now, time_buf));
+ __db_msg(env, DB_STR_A("1519",
+ "Maximum transaction ID %lx recovery checkpoint [%lu][%lu]",
+ "%lx %lu %lu"), (u_long)(txninfo == NULL ?
+ TXN_MINIMUM : ((DB_TXNHEAD *)txninfo)->maxid),
+ (u_long)region->last_ckp.file,
+ (u_long)region->last_ckp.offset);
+ }
+
+ if (0) {
+msgerr: __db_errx(env, DB_STR_A("1520",
+ "Recovery function for LSN %lu %lu failed on %s pass",
+ "%lu %lu %s"), (u_long)lsn.file, (u_long)lsn.offset, pass);
+ }
+
+err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ dbenv->tx_timestamp = 0;
+
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+ F_CLR(region, TXN_IN_RECOVERY);
+
+ return (ret);
+}
+
+/*
+ * Figure out how many logfiles we have processed. If we are moving
+ * forward (is_forward != 0), then we're computing current - low. If
+ * we are moving backward, we are computing high - current. max is
+ * the number of bytes per logfile.
+ */
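+/*
+ * A worked example (illustrative): with max = 10485760 (10MB logfiles),
+ * low = [1][2097152] and current = [3][1048576], moving forward the
+ * offsets wrap, so nf = (3 - 1 - 1) + ((10485760 - 2097152) + 1048576) /
+ * 10485760 = 1.9 logfiles traversed.
+ */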
+static double
+__lsn_diff(low, high, current, max, is_forward)
+ DB_LSN *low, *high, *current;
+ u_int32_t max;
+ int is_forward;
+{
+ double nf;
+
+ /*
+	 * There are three cases in each direction.  If you are in the
+	 * same file, then all you need worry about is the difference in
+	 * offsets.  If you are in different files, then your offsets put
+	 * you either more or less than the integral difference in the
+	 * number of files -- we need to handle both of these cases.
+ */
+ if (is_forward) {
+ if (current->file == low->file)
+ nf = (double)(current->offset - low->offset) / max;
+ else if (current->offset < low->offset)
+ nf = (double)((current->file - low->file) - 1) +
+ (double)((max - low->offset) + current->offset) /
+ max;
+ else
+ nf = (double)(current->file - low->file) +
+ (double)(current->offset - low->offset) / max;
+ } else {
+ if (current->file == high->file)
+ nf = (double)(high->offset - current->offset) / max;
+ else if (current->offset > high->offset)
+ nf = (double)((high->file - current->file) - 1) +
+ (double)
+ ((max - current->offset) + high->offset) / max;
+ else
+ nf = (double)(high->file - current->file) +
+ (double)(high->offset - current->offset) / max;
+ }
+ return (nf);
+}
+
+/*
+ * __log_backup --
+ *
+ * This is used to find the earliest log record to process when a client
+ * is trying to sync up with a master whose max LSN is less than this
+ * client's max lsn; we want to roll back everything after that.
+ *
+ * Find the latest checkpoint whose ckp_lsn is less than the max lsn.
+ */
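+/*
+ * Checkpoint records form a backward-linked chain: each __txn_ckp record
+ * carries both ckp_lsn (the start of its active-transaction window) and
+ * last_ckp (the LSN of the previous checkpoint record), so the loop below
+ * can hop from checkpoint to checkpoint without scanning the whole log.
+ */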
+static int
+__log_backup(env, logc, max_lsn, start_lsn)
+ ENV *env;
+ DB_LOGC *logc;
+ DB_LSN *max_lsn, *start_lsn;
+{
+ DBT data;
+ DB_LSN lsn;
+ __txn_ckp_args *ckp_args;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ ckp_args = NULL;
+
+ if ((ret = __txn_getckp(env, &lsn)) != 0)
+ goto err;
+ while ((ret = __logc_get(logc, &lsn, &data, DB_SET)) == 0) {
+ if ((ret = __txn_ckp_read(env, data.data, &ckp_args)) != 0)
+ return (ret);
+ /*
+ * Follow checkpoints through the log until
+ * we find one with a ckp_lsn less than
+		 * or equal to max_lsn.
+ */
+ if (LOG_COMPARE(&ckp_args->ckp_lsn, max_lsn) <= 0) {
+ *start_lsn = ckp_args->ckp_lsn;
+ break;
+ }
+
+ lsn = ckp_args->last_ckp;
+ /*
+ * If there are no more checkpoints behind us, we're
+ * done. Break with DB_NOTFOUND.
+ */
+ if (IS_ZERO_LSN(lsn)) {
+ ret = DB_NOTFOUND;
+ break;
+ }
+ __os_free(env, ckp_args);
+ ckp_args = NULL;
+ }
+
+ if (ckp_args != NULL)
+ __os_free(env, ckp_args);
+ /*
+ * If we walked back through all the checkpoints,
+ * set the cursor on the first log record.
+ */
+err: if (IS_ZERO_LSN(*start_lsn) && (ret == 0 || ret == DB_NOTFOUND))
+ ret = __logc_get(logc, start_lsn, &data, DB_FIRST);
+ return (ret);
+}
+
+/*
+ * __log_earliest --
+ *
+ * Return the earliest recovery point for the log files present. The
+ * earliest recovery time is the timestamp of the first checkpoint record
+ * whose checkpoint LSN is greater than the first LSN we process.
+ */
+static int
+__log_earliest(env, logc, lowtime, lowlsn)
+ ENV *env;
+ DB_LOGC *logc;
+ int32_t *lowtime;
+ DB_LSN *lowlsn;
+{
+ __txn_ckp_args *ckpargs;
+ DB_LSN first_lsn, lsn;
+ DBT data;
+ u_int32_t rectype;
+ int cmp, ret;
+
+ memset(&data, 0, sizeof(data));
+
+ /*
+ * Read forward through the log looking for the first checkpoint
+ * record whose ckp_lsn is greater than first_lsn.
+ */
+ for (ret = __logc_get(logc, &first_lsn, &data, DB_FIRST);
+ ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
+ LOGCOPY_32(env, &rectype, data.data);
+ if (rectype != DB___txn_ckp)
+ continue;
+ if ((ret =
+ __txn_ckp_read(env, data.data, &ckpargs)) == 0) {
+ cmp = LOG_COMPARE(&ckpargs->ckp_lsn, &first_lsn);
+ *lowlsn = ckpargs->ckp_lsn;
+ *lowtime = ckpargs->timestamp;
+
+ __os_free(env, ckpargs);
+ if (cmp >= 0)
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_openfiles --
+ * Perform the pass of recovery that opens files. This is used
+ * both during regular recovery and an initial call to txn_recover (since
+ * we need files open in order to abort prepared, but not yet committed
+ * transactions).
+ *
+ * See the comments in db_apprec for a detailed description of the
+ * various recovery passes.
+ *
+ * If we are not doing feedback processing (i.e., we are doing txn_recover
+ * processing and in_recovery is zero), then last_lsn can be NULL.
+ *
+ * PUBLIC: int __env_openfiles __P((ENV *,
+ * PUBLIC: DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int));
+ */
+int
+__env_openfiles(env, logc, txninfo,
+ data, open_lsn, last_lsn, nfiles, in_recovery)
+ ENV *env;
+ DB_LOGC *logc;
+ void *txninfo;
+ DBT *data;
+ DB_LSN *open_lsn, *last_lsn;
+ double nfiles;
+ int in_recovery;
+{
+ DB_ENV *dbenv;
+ DB_LSN lsn, tlsn;
+ u_int32_t log_size;
+ int progress, ret;
+
+ dbenv = env->dbenv;
+
+ /*
+ * XXX
+ * Get the log size. No locking required because we're single-threaded
+ * during recovery.
+ */
+ log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;
+
+ lsn = *open_lsn;
+ for (;;) {
+ if (in_recovery && dbenv->db_feedback != NULL) {
+ DB_ASSERT(env, last_lsn != NULL);
+ progress = (int)(33 * (__lsn_diff(open_lsn,
+ last_lsn, &lsn, log_size, 1) / nfiles));
+ dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+ }
+
+ tlsn = lsn;
+ ret = __db_dispatch(env, &env->recover_dtab, data, &tlsn,
+ in_recovery ? DB_TXN_OPENFILES : DB_TXN_POPENFILES,
+ txninfo);
+ if (ret != 0 && ret != DB_TXN_CKP) {
+ __db_errx(env, DB_STR_A("1521",
+ "Recovery function for LSN %lu %lu failed",
+ "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset);
+ break;
+ }
+ if ((ret = __logc_get(logc, &lsn, data, DB_NEXT)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (last_lsn != NULL &&
+ LOG_COMPARE(&lsn, last_lsn) != 0)
+ ret = __db_log_corrupt(env, &lsn);
+ else
+ ret = 0;
+ }
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+static int
+__db_log_corrupt(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ __db_errx(env, DB_STR_A("1522",
+ "Log file corrupt at LSN: [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ return (EINVAL);
+}
+
+/*
+ * __env_init_rec --
+ *
+ * PUBLIC: int __env_init_rec __P((ENV *, u_int32_t));
+ */
+int
+__env_init_rec(env, version)
+ ENV *env;
+ u_int32_t version;
+{
+ int ret;
+
+ /*
+ * We need to prime the recovery table with the current recovery
+ * functions. Then we overwrite only specific entries based on
+ * each previous version we support.
+ */
+ if ((ret = __bam_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __crdel_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __db_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __dbreg_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __fop_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __ham_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __heap_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __qam_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __repmgr_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __txn_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+
+ /*
+ * After installing all the current recovery routines, we want to
+	 * override them with older versions if we are reading a downrev
+ * log (from a downrev replication master). If a log record is
+ * changed then we must use the previous version for all older
+ * logs. If a record is changed in multiple revisions then the
+ * oldest revision that applies must be used. Therefore we override
+ * the recovery functions in reverse log version order.
+ */
+ /*
+ * DB_LOGVERSION_53 is a strict superset of DB_LOGVERSION_50.
+ * So, only check > DB_LOGVERSION_48p2. If/When log records are
+ * altered, the condition below will need to change.
+ */
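+	/*
+	 * For example, reading a DB_LOGVERSION_46 log from a downrev
+	 * master installs the current functions above, then overrides
+	 * them with the 4.8, 4.7 and finally the 4.6 tables below, so
+	 * the oldest applicable revision wins for each record type.
+	 */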
+ if (version > DB_LOGVERSION_48p2)
+ goto done;
+ if ((ret = __env_init_rec_48(env)) != 0)
+ goto err;
+ /*
+	 * Patch 2 added __db_pg_trunc but did not replace any log records,
+	 * so we want to override the same functions as in the original
+	 * release.
+ */
+ if (version >= DB_LOGVERSION_48)
+ goto done;
+ if ((ret = __env_init_rec_47(env)) != 0)
+ goto err;
+ if (version == DB_LOGVERSION_47)
+ goto done;
+ if ((ret = __env_init_rec_46(env)) != 0)
+ goto err;
+ /*
+ * There are no log record/recovery differences between 4.4 and 4.5.
+ * The log version changed due to checksum. There are no log recovery
+ * differences between 4.5 and 4.6. The name of the rep_gen in
+ * txn_checkpoint changed (to spare, since we don't use it anymore).
+ */
+ if (version >= DB_LOGVERSION_44)
+ goto done;
+ if ((ret = __env_init_rec_43(env)) != 0)
+ goto err;
+ if (version == DB_LOGVERSION_43)
+ goto done;
+ if (version != DB_LOGVERSION_42) {
+ __db_errx(env, DB_STR_A("1523", "Unknown version %lu",
+ "%lu"), (u_long)version);
+ ret = EINVAL;
+ goto err;
+ }
+ ret = __env_init_rec_42(env);
+
+done:
+err: return (ret);
+}
+
+static int
+__env_init_rec_42(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_relink_42_recover, DB___db_relink_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_alloc_42_recover, DB___db_pg_alloc_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_free_42_recover, DB___db_pg_free_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_freedata_42_recover, DB___db_pg_freedata_42)) != 0)
+ goto err;
+#ifdef HAVE_HASH
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_metagroup_42_recover, DB___ham_metagroup_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_groupalloc_42_recover, DB___ham_groupalloc_42)) != 0)
+ goto err;
+#endif
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __txn_ckp_42_recover, DB___txn_ckp_42)) != 0)
+ goto err;
+err:
+ return (ret);
+}
+
+static int
+__env_init_rec_43(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_relink_43_recover, DB___bam_relink_43)) != 0)
+ goto err;
+ /*
+ * We want to use the 4.2-based txn_regop record.
+ */
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __txn_regop_42_recover, DB___txn_regop_42)) != 0)
+ goto err;
+err:
+ return (ret);
+}
+
+static int
+__env_init_rec_46(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_merge_44_recover, DB___bam_merge_44)) != 0)
+ goto err;
+
+err: return (ret);
+}
+
+static int
+__env_init_rec_47(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_split_42_recover, DB___bam_split_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_create_42_recover, DB___fop_create_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_write_42_recover, DB___fop_write_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_rename_42_recover, DB___fop_rename_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_rename_noundo_46_recover, DB___fop_rename_noundo_46)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+static int
+__env_init_rec_48(env)
+ ENV *env;
+{
+	int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_addrem_42_recover, DB___db_addrem_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_big_42_recover, DB___db_big_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_split_48_recover, DB___bam_split_48)) != 0)
+ goto err;
+#ifdef HAVE_HASH
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_insdel_42_recover, DB___ham_insdel_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_replace_42_recover, DB___ham_replace_42)) != 0)
+ goto err;
+#endif
+err:
+ return (ret);
+}
diff --git a/src/env/env_region.c b/src/env/env_region.c
new file mode 100644
index 00000000..113bea21
--- /dev/null
+++ b/src/env/env_region.c
@@ -0,0 +1,1497 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __env_des_get __P((ENV *, REGINFO *, REGINFO *, REGION **));
+static int __env_faultmem __P((ENV *, void *, size_t, int));
+static int __env_sys_attach __P((ENV *, REGINFO *, REGION *));
+static int __env_sys_detach __P((ENV *, REGINFO *, int));
+static void __env_des_destroy __P((ENV *, REGION *));
+static void __env_remove_file __P((ENV *));
+
+/*
+ * __env_attach
+ * Join/create the environment
+ *
+ * PUBLIC: int __env_attach __P((ENV *, u_int32_t *, int, int));
+ */
+int
+__env_attach(env, init_flagsp, create_ok, retry_ok)
+ ENV *env;
+ u_int32_t *init_flagsp;
+ int create_ok, retry_ok;
+{
+ DB_ENV *dbenv;
+ REGENV rbuf, *renv;
+ REGENV_REF ref;
+ REGINFO *infop;
+ REGION *rp, tregion;
+ size_t max, nrw, size;
+ long segid;
+ u_int32_t bytes, i, mbytes, nregions, signature;
+ u_int retry_cnt;
+ int majver, minver, patchver, ret;
+ char buf[sizeof(DB_REGION_FMT) + 20];
+
+ /* Initialization */
+ dbenv = env->dbenv;
+ retry_cnt = 0;
+ signature = __env_struct_sig();
+
+ /* Repeated initialization. */
+loop: renv = NULL;
+ rp = NULL;
+
+	/* Set up the ENV's REGINFO structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(REGINFO), &infop)) != 0)
+ return (ret);
+ infop->env = env;
+ infop->type = REGION_TYPE_ENV;
+ infop->id = REGION_ID_ENV;
+ infop->flags = REGION_JOIN_OK;
+ if (create_ok)
+ F_SET(infop, REGION_CREATE_OK);
+
+ /* Build the region name. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ ret = __os_strdup(env, "process-private", &infop->name);
+ else {
+ (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
+ ret = __db_appname(env, DB_APP_NONE, buf, NULL, &infop->name);
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+ * We have to single-thread the creation of the REGENV region. Once
+ * it exists, we can serialize using region mutexes, but until then
+ * we have to be the only player in the game.
+ *
+ * If this is a private environment, we are only called once and there
+ * are no possible race conditions.
+ *
+ * If this is a public environment, we use the filesystem to ensure
+ * the creation of the environment file is single-threaded.
+ *
+	 * If the application has specified its own mapping functions, try
+	 * to create the region.  The application will have to let us know
+	 * whether it's actually a creation or not, and we'll have to fall
+	 * back to a join if it's not a create.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL)
+ goto creation;
+
+ /*
+ * Try to create the file, if we have the authority. We have to ensure
+ * that multiple threads/processes attempting to simultaneously create
+ * the file are properly ordered. Open using the O_CREAT and O_EXCL
+ * flags so that multiple attempts to create the region will return
+ * failure in all but one. POSIX 1003.1 requires that EEXIST be the
+ * errno return value -- I sure hope they're right.
+ */
+ if (create_ok) {
+ if ((ret = __os_open(env, infop->name, 0,
+ DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION,
+ env->db_mode, &env->lockfhp)) == 0)
+ goto creation;
+ if (ret != EEXIST) {
+ __db_err(env, ret, "%s", infop->name);
+ goto err;
+ }
+ }
+
+	/* The region must exist; it's not okay to recreate it. */
+ F_CLR(infop, REGION_CREATE_OK);
+
+ /*
+	 * If we couldn't create the file, try to open it. (If that fails,
+ * we're done.)
+ */
+ if ((ret = __os_open(
+ env, infop->name, 0, DB_OSO_REGION, 0, &env->lockfhp)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * The region may be in system memory not backed by the filesystem
+ * (more specifically, not backed by this file), and we're joining
+ * it. In that case, the process that created it will have written
+ * out a REGENV_REF structure as its only contents. We read that
+ * structure before we do anything further, e.g., we can't just map
+ * that file in and then figure out what's going on.
+ *
+ * All of this noise is because some systems don't have a coherent VM
+ * and buffer cache, and what's worse, when you mix operations on the
+ * VM and buffer cache, half the time you hang the system.
+ *
+	 * If the file is the size of a REGENV_REF structure, then we know
+ * the real region is in some other memory. (The only way you get a
+ * file that size is to deliberately write it, as it's smaller than
+ * any possible disk sector created by writing a file or mapping the
+ * file into memory.) In which case, retrieve the structure from the
+ * file and use it to acquire the referenced memory.
+ *
+ * If the structure is larger than a REGENV_REF structure, then this
+ * file is backing the shared memory region, and we just map it into
+ * memory.
+ *
+ * And yes, this makes me want to take somebody and kill them. (I
+ * digress -- but you have no freakin' idea. This is unbelievably
+ * stupid and gross, and I've probably spent six months of my life,
+ * now, trying to make different versions of it work.)
+ */
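+	/*
+	 * Concretely (illustrative): for DB_SYSTEM_MEM environments the
+	 * DB_REGION_ENV file holds only a REGENV_REF { size, segid, max }
+	 * describing the shared memory segment, while for filesystem-backed
+	 * environments that same file is itself the mapped region.
+	 */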
+ if ((ret = __os_ioinfo(env, infop->name,
+ env->lockfhp, &mbytes, &bytes, NULL)) != 0) {
+ __db_err(env, ret, "%s", infop->name);
+ goto err;
+ }
+
+ /*
+ * !!!
+ * A size_t is OK -- regions get mapped into memory, and so can't
+ * be larger than a size_t.
+ */
+ size = mbytes * MEGABYTE + bytes;
+
+ /*
+ * If the size is less than the size of a REGENV_REF structure, the
+ * region (or, possibly, the REGENV_REF structure) has not yet been
+ * completely written. Shouldn't be possible, but there's no reason
+ * not to wait awhile and try again.
+ *
+ * If the region is precisely the size of a ref, then we don't
+	 * have the region here, just the meta-data, which implies that
+	 * we are using System V shared memory (SYSTEM_MEM). However,
+ * if the flags say that we are using SYSTEM_MEM and the region is
+ * bigger than the ref, something bad has happened -- we are storing
+ * something in the region file other than meta-data and that
+ * shouldn't happen.
+ */
+ if (size < sizeof(ref))
+ goto retry;
+	else {
+ if (size == sizeof(ref))
+ F_SET(env, ENV_SYSTEM_MEM);
+ else if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+ ret = EINVAL;
+ __db_err(env, ret, DB_STR_A("1535",
+ "%s: existing environment not created in system memory",
+ "%s"), infop->name);
+ goto err;
+ } else {
+ if ((ret = __os_read(env, env->lockfhp, &rbuf,
+ sizeof(rbuf), &nrw)) != 0 ||
+ nrw < (size_t)sizeof(rbuf) ||
+ (ret = __os_seek(env,
+ env->lockfhp, 0, 0, rbuf.region_off)) != 0) {
+ __db_err(env, ret, DB_STR_A("1536",
+ "%s: unable to read region info", "%s"),
+ infop->name);
+ goto err;
+ }
+ }
+
+ if ((ret = __os_read(env, env->lockfhp, &ref,
+ sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
+ if (ret == 0)
+ ret = EIO;
+ __db_err(env, ret, DB_STR_A("1537",
+ "%s: unable to read system-memory information",
+ "%s"), infop->name);
+ goto err;
+ }
+ size = ref.size;
+ max = ref.max;
+ segid = ref.segid;
+ }
+
+#ifndef HAVE_MUTEX_FCNTL
+ /*
+ * If we're not doing fcntl locking, we can close the file handle. We
+ * no longer need it and the less contact between the buffer cache and
+ * the VM, the better.
+ */
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+#endif
+
+ /* Call the region join routine to acquire the region. */
+ memset(&tregion, 0, sizeof(tregion));
+ tregion.size = (roff_t)size;
+ tregion.max = (roff_t)max;
+ tregion.segid = segid;
+ if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
+ goto err;
+
+user_map_functions:
+ /*
+ * The environment's REGENV structure has to live at offset 0 instead
+ * of the usual alloc information. Set the primary reference and
+ * correct the "head" value to reference the alloc region.
+ */
+ infop->primary = infop->addr;
+ infop->head = (u_int8_t *)infop->addr + sizeof(REGENV);
+ renv = infop->primary;
+
+ /*
+	 * Make sure the region matches our build.  Special-case a region
+	 * that's all nul bytes: just treat it like any other corruption.
+ */
+ if (renv->majver != DB_VERSION_MAJOR ||
+ renv->minver != DB_VERSION_MINOR) {
+ if (renv->majver != 0 || renv->minver != 0) {
+ __db_errx(env, DB_STR_A("1538",
+ "Program version %d.%d doesn't match environment version %d.%d",
+ "%d %d %d %d"), DB_VERSION_MAJOR, DB_VERSION_MINOR,
+ renv->majver, renv->minver);
+ ret = DB_VERSION_MISMATCH;
+ } else
+ ret = EINVAL;
+ goto err;
+ }
+ if (renv->signature != signature) {
+ __db_errx(env, DB_STR("1539",
+ "Build signature doesn't match environment"));
+ ret = DB_VERSION_MISMATCH;
+ goto err;
+ }
+
+ /*
+ * Check if the environment has had a catastrophic failure.
+ *
+ * Check the magic number to ensure the region is initialized. If the
+ * magic number isn't set, the lock may not have been initialized, and
+ * an attempt to use it could lead to random behavior.
+ *
+ * The panic and magic values aren't protected by any lock, so we never
+ * use them in any check that's more complex than set/not-set.
+ *
+ * !!!
+ * I'd rather play permissions games using the underlying file, but I
+ * can't because Windows/NT filesystems won't open files mode 0.
+ */
+ if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
+ ret = __env_panic_msg(env);
+ goto err;
+ }
+ if (renv->magic != DB_REGION_MAGIC)
+ goto retry;
+
+ /*
+ * Get a reference to the underlying REGION information for this
+ * environment.
+ */
+ if ((ret = __env_des_get(env, infop, infop, &rp)) != 0 || rp == NULL)
+ goto find_err;
+ infop->rp = rp;
+
+ /*
+ * There's still a possibility for inconsistent data. When we acquired
+ * the size of the region and attached to it, it might have still been
+ * growing as part of its creation. We can detect this by checking the
+ * size we originally found against the region's current size. (The
+ * region's current size has to be final, the creator finished growing
+ * it before setting the magic number in the region.)
+ *
+ * !!!
+ * Skip this test when the application specified its own map functions.
+ * The size of the region is essentially unknown in that case: some
+ * other process asked the application's map function for some bytes,
+ * but we were never told the final size of the region. We could get
+ * a size back from the map function, but for all we know, our process'
+ * map function only knows how to join regions, it has no clue how big
+ * those regions are.
+ */
+ if (DB_GLOBAL(j_region_map) == NULL && rp->size != size)
+ goto retry;
+
+ /*
+	 * Check our caller's configuration flags; it's an error to configure
+ * incompatible or additional subsystems in an existing environment.
+ * Return the total set of flags to the caller so they initialize the
+ * correct set of subsystems.
+ */
+ if (init_flagsp != NULL) {
+ FLD_CLR(*init_flagsp, renv->init_flags);
+ if (*init_flagsp != 0) {
+ __db_errx(env, DB_STR("1540",
+ "configured environment flags incompatible with existing environment"));
+ ret = EINVAL;
+ goto err;
+ }
+ *init_flagsp = renv->init_flags;
+ }
+
+ /*
+ * Fault the pages into memory. Note, do this AFTER releasing the
+ * lock, because we're only reading the pages, not writing them.
+ */
+ (void)__env_faultmem(env, infop->primary, rp->size, 0);
+
+ /* Everything looks good, we're done. */
+ env->reginfo = infop;
+ return (0);
+
+creation:
+ /* Create the environment region. */
+ F_SET(infop, REGION_CREATE);
+
+ /*
+ * Allocate room for REGION structures plus overhead.
+ */
+ memset(&tregion, 0, sizeof(tregion));
+ nregions = __memp_max_regions(env) + 5;
+ size = nregions * sizeof(REGION);
+ size += dbenv->passwd_len;
+ size += (dbenv->thr_max + dbenv->thr_max / 4) *
+ __env_alloc_size(sizeof(DB_THREAD_INFO));
+ /* Space for replication buffer. */
+ if (init_flagsp != NULL && FLD_ISSET(*init_flagsp, DB_INITENV_REP))
+ size += MEGABYTE;
+ size += __txn_region_size(env);
+ size += __log_region_size(env);
+ size += __env_thread_size(env, size);
+ size += __lock_region_size(env, size);
+
+ tregion.size = (roff_t)size;
+ tregion.segid = INVALID_REGION_SEGID;
+
+ if ((tregion.max = dbenv->memory_max) == 0) {
+ /* Add some slop. */
+ size += 16 * 1024;
+ tregion.max = (roff_t)size;
+
+ tregion.max += (roff_t)__lock_region_max(env);
+ tregion.max += (roff_t)__txn_region_max(env);
+ tregion.max += (roff_t)__log_region_max(env);
+ tregion.max += (roff_t)__env_thread_max(env);
+ } else if (tregion.size > tregion.max) {
+ __db_errx(env, DB_STR_A("1542",
+		    "Minimum environment memory size %ld is bigger than specified max %ld.",
+ "%ld %ld"), (u_long)tregion.size, (u_long)tregion.max);
+ ret = EINVAL;
+ goto err;
+ } else if (F_ISSET(env, ENV_PRIVATE))
+ infop->max_alloc = dbenv->memory_max;
+
+ if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
+ goto err;
+
+ /*
+ * If the application has specified its own mapping functions, we don't
+	 * know until we get here whether we are creating the region or not.
+	 * We find out because the underlying functions clear the
+	 * REGION_CREATE flag if they joined an existing region.
+ */
+ if (!F_ISSET(infop, REGION_CREATE))
+ goto user_map_functions;
+
+ /*
+ * Fault the pages into memory. Note, do this BEFORE we initialize
+ * anything, because we're writing the pages, not just reading them.
+ */
+ (void)__env_faultmem(env, infop->addr, tregion.size, 1);
+
+ /*
+ * The first object in the region is the REGENV structure. This is
+ * different from the other regions, and, from everything else in
+ * this region, where all objects are allocated from the pool, i.e.,
+ * there aren't any fixed locations. The remaining space is made
+ * available for later allocation.
+ *
+ * The allocation space must be size_t aligned, because that's what
+ * the initialization routine is going to store there. To make sure
+ * that happens, the REGENV structure was padded with a final size_t.
+ * No other region needs to worry about it because all of them treat
+ * the entire region as allocation space.
+ *
+ * Set the primary reference and correct the "head" value to reference
+ * the alloc region.
+ */
+ infop->primary = infop->addr;
+ infop->head = (u_int8_t *)infop->addr + sizeof(REGENV);
+ __env_alloc_init(infop, tregion.size - sizeof(REGENV));
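+
+	/*
+	 * Illustrative layout of the environment region from this point:
+	 *
+	 *	offset 0:		REGENV (fixed location)
+	 *	offset sizeof(REGENV):	allocation area -- the REGION
+	 *				array, thread table and other
+	 *				per-subsystem structures are
+	 *				carved out of this space.
+	 */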
+
+ /*
+	 * Initialize the rest of the REGENV structure.  (Don't set the magic
+	 * number to the correct value yet; that would validate the
+	 * environment.)
+ */
+ renv = infop->primary;
+ renv->magic = 0;
+ renv->panic = 0;
+
+ (void)db_version(&majver, &minver, &patchver);
+ renv->majver = (u_int32_t)majver;
+ renv->minver = (u_int32_t)minver;
+ renv->patchver = (u_int32_t)patchver;
+ renv->signature = signature;
+
+ (void)time(&renv->timestamp);
+ __os_unique_id(env, &renv->envid);
+
+ /*
+ * Initialize init_flags to store the flags that any other environment
+ * handle that uses DB_JOINENV to join this environment will need.
+ */
+ renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
+
+ /*
+ * Set up the region array. We use an array rather than a linked list
+	 * as we have to traverse the array after failure in some cases, and
+	 * we don't want to loop infinitely should the application fail while
+	 * we're manipulating it.
+ */
+ renv->region_cnt = nregions;
+ if ((ret = __env_alloc(infop, nregions * sizeof(REGION), &rp)) != 0) {
+ __db_err(env, ret, DB_STR("1543",
+ "unable to create new master region array"));
+ goto err;
+ }
+ renv->region_off = R_OFFSET(infop, rp);
+ for (i = 0; i < nregions; ++i, ++rp)
+ rp->id = INVALID_REGION_ID;
+
+ renv->cipher_off = renv->thread_off = renv->rep_off = INVALID_ROFF;
+ renv->flags = 0;
+ renv->op_timestamp = renv->rep_timestamp = 0;
+ renv->mtx_regenv = MUTEX_INVALID;
+ renv->reg_panic = 0;
+
+ /*
+ * Get the underlying REGION structure for this environment. Note,
+ * we created the underlying OS region before we acquired the REGION
+ * structure, which is backwards from the normal procedure. Update
+ * the REGION structure.
+ */
+ if ((ret = __env_des_get(env, infop, infop, &rp)) != 0) {
+find_err: __db_errx(env, DB_STR_A("1544",
+ "%s: unable to find environment", "%s"), infop->name);
+ if (ret == 0)
+ ret = EINVAL;
+ goto err;
+ }
+ infop->rp = rp;
+ rp->alloc = rp->size = tregion.size;
+ rp->max = tregion.max;
+ rp->segid = tregion.segid;
+
+ /*
+ * !!!
+ * If we create an environment where regions are public and in system
+ * memory, we have to inform processes joining the environment how to
+ * attach to the shared memory segment. So, we write the shared memory
+ * identifier into the file, to be read by those other processes.
+ *
+ * XXX
+ * This is really OS-layer information, but I can't see any easy way
+ * to move it down there without passing down information that it has
+ * no right to know, e.g., that this is the one-and-only REGENV region
+ * and not some other random region.
+ */
+ if (tregion.segid != INVALID_REGION_SEGID) {
+ ref.size = tregion.size;
+ ref.segid = tregion.segid;
+ ref.max = tregion.max;
+ if ((ret = __os_write(
+ env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0) {
+ __db_err(env, ret, DB_STR_A("1545",
+ "%s: unable to write out public environment ID",
+ "%s"), infop->name);
+ goto err;
+ }
+ }
+
+#ifndef HAVE_MUTEX_FCNTL
+ /*
+ * If we're not doing fcntl locking, we can close the file handle. We
+ * no longer need it and the less contact between the buffer cache and
+ * the VM, the better.
+ */
+ if (env->lockfhp != NULL) {
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+ }
+#endif
+
+ /* Everything looks good, we're done. */
+ env->reginfo = infop;
+ return (0);
+
+err:
+retry: /* Close any open file handle. */
+ if (env->lockfhp != NULL) {
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+ }
+
+ /*
+ * If we joined or created the region, detach from it. If we created
+ * it, destroy it. Note, there's a path in the above code where we're
+ * using a temporary REGION structure because we haven't yet allocated
+ * the real one. In that case the region address (addr) will be filled
+ * in, but the REGION pointer (rp) won't. Fix it.
+ */
+ if (infop->addr != NULL) {
+ if (infop->rp == NULL)
+ infop->rp = &tregion;
+
+ (void)__env_sys_detach(env,
+ infop, F_ISSET(infop, REGION_CREATE));
+
+ if (rp != NULL && F_ISSET(env, DB_PRIVATE))
+ __env_alloc_free(infop, rp);
+ }
+
+ /* Free the allocated name and/or REGINFO structure. */
+ if (infop->name != NULL)
+ __os_free(env, infop->name);
+ __os_free(env, infop);
+
+ /* If we had a temporary error, wait awhile and try again. */
+ if (ret == 0) {
+ if (!retry_ok || ++retry_cnt > 3) {
+ __db_errx(env, DB_STR("1546",
+ "unable to join the environment"));
+ ret = EAGAIN;
+ } else {
+ __os_yield(env, retry_cnt * 3, 0);
+ goto loop;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_turn_on --
+ * Turn on the created environment.
+ *
+ * PUBLIC: int __env_turn_on __P((ENV *));
+ */
+int
+__env_turn_on(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /* If we didn't create the region, there's no need for further work. */
+ if (!F_ISSET(infop, REGION_CREATE))
+ return (0);
+
+ /*
+ * Validate the file. All other threads of control are waiting
+ * on this value to be written -- "Let slip the hounds of war!"
+ */
+ renv->magic = DB_REGION_MAGIC;
+
+ return (0);
+}
+
+/*
+ * __env_turn_off --
+ * Turn off the environment.
+ *
+ * PUBLIC: int __env_turn_off __P((ENV *, u_int32_t));
+ */
+int
+__env_turn_off(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret, t_ret;
+
+ ret = 0;
+
+ /*
+ * Connect to the environment: If we can't join the environment, we
+ * guess it's because it doesn't exist and we're done.
+ *
+ * If the environment exists, attach and lock the environment.
+ */
+ if (__env_attach(env, NULL, 0, 1) != 0)
+ return (0);
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+ * If the environment is in use, we're done unless we're forcing the
+ * issue or the environment has panic'd. (If the environment panic'd,
+ * the thread holding the reference count may not have cleaned up, so
+ * we clean up. It's possible the application didn't plan on removing
+ * the environment in this particular call, but panic'd environments
+ * aren't useful to anyone.)
+ *
+ * Otherwise, panic the environment and overwrite the magic number so
+ * any thread of control attempting to connect (or racing with us) will
+ * back off and retry, or just die.
+ */
+ if (renv->refcnt > 0 && !LF_ISSET(DB_FORCE) && !renv->panic)
+ ret = EBUSY;
+ else
+ renv->panic = 1;
+
+ /*
+ * Unlock the environment (nobody should need this lock because
+ * we've poisoned the pool) and detach from the environment.
+ */
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+
+ if ((t_ret = __env_detach(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __env_panic_set --
+ * Set/clear unrecoverable error.
+ *
+ * PUBLIC: void __env_panic_set __P((ENV *, int));
+ */
+void
+__env_panic_set(env, on)
+ ENV *env;
+ int on;
+{
+ if (env != NULL && env->reginfo != NULL)
+ ((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0;
+}
+
+/*
+ * __env_ref_increment --
+ * Increment the environment's reference count.
+ *
+ * PUBLIC: int __env_ref_increment __P((ENV *));
+ */
+int
+__env_ref_increment(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /* If we're creating the primary region, allocate a mutex. */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ if ((ret = __mutex_alloc(
+ env, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0)
+ return (ret);
+ renv->refcnt = 1;
+ } else {
+ /* Lock the environment, increment the reference, unlock. */
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ ++renv->refcnt;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ }
+
+ F_SET(env, ENV_REF_COUNTED);
+ return (0);
+}
+
+/*
+ * __env_ref_decrement --
+ * Decrement the environment's reference count.
+ *
+ * PUBLIC: int __env_ref_decrement __P((ENV *));
+ */
+int
+__env_ref_decrement(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+
+ /* Be cautious -- we may not have an environment. */
+ if ((infop = env->reginfo) == NULL)
+ return (0);
+
+ renv = infop->primary;
+
+	/* Even if we have an environment, we may not have reference counted it. */
+ if (F_ISSET(env, ENV_REF_COUNTED)) {
+ /* Lock the environment, decrement the reference, unlock. */
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if (renv->refcnt == 0)
+ __db_errx(env, DB_STR("1547",
+ "environment reference count went negative"));
+ else
+ --renv->refcnt;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+
+ F_CLR(env, ENV_REF_COUNTED);
+ }
+
+	/* If a private environment, we're done with the mutex; destroy it. */
+ return (F_ISSET(env, ENV_PRIVATE) ?
+ __mutex_free(env, &renv->mtx_regenv) : 0);
+}
+
+/*
+ * __env_ref_get --
+ * Get the number of environment references. This is an unprotected
+ * read of refcnt to simply provide a spot check of the value. It
+ * is only intended for use as an internal utility routine.
+ *
+ * PUBLIC: int __env_ref_get __P((DB_ENV *, u_int32_t *));
+ */
+int
+__env_ref_get(dbenv, countp)
+ DB_ENV *dbenv;
+ u_int32_t *countp;
+{
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+
+ env = dbenv->env;
+ infop = env->reginfo;
+ renv = infop->primary;
+ *countp = renv->refcnt;
+ return (0);
+}
+
+/*
+ * __env_detach --
+ * Detach from the environment.
+ *
+ * PUBLIC: int __env_detach __P((ENV *, int));
+ */
+int
+__env_detach(env, destroy)
+ ENV *env;
+ int destroy;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REGION rp;
+ int ret, t_ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+
+ /* Close the locking file handle. */
+ if (env->lockfhp != NULL) {
+ if ((t_ret =
+ __os_closehandle(env, env->lockfhp)) != 0 && ret == 0)
+ ret = t_ret;
+ env->lockfhp = NULL;
+ }
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+	 * filesystem-backed or system shared memory regions; that memory isn't
+ * owned by any particular process.
+ */
+ if (destroy) {
+ /*
+ * Free the REGION array.
+ *
+ * The actual underlying region structure is allocated from the
+ * primary shared region, and we're about to free it. Save a
+ * copy on our stack for the REGINFO to reference when it calls
+ * down into the OS layer to release the shared memory segment.
+ */
+ rp = *infop->rp;
+ infop->rp = &rp;
+
+ if (renv->region_off != INVALID_ROFF)
+ __env_alloc_free(
+ infop, R_ADDR(infop, renv->region_off));
+ }
+
+ /*
+ * Set the ENV->reginfo field to NULL. BDB uses the ENV->reginfo
+ * field to decide if the underlying region can be accessed or needs
+ * cleanup. We're about to destroy what it references, so it needs to
+ * be cleared.
+ */
+ env->reginfo = NULL;
+ env->thr_hashtab = NULL;
+
+ if ((t_ret = __env_sys_detach(env, infop, destroy)) != 0 && ret == 0)
+ ret = t_ret;
+ if (infop->name != NULL)
+ __os_free(env, infop->name);
+
+ /* Discard the ENV->reginfo field's memory. */
+ __os_free(env, infop);
+
+ return (ret);
+}
+
+/*
+ * __env_remove_env --
+ * Remove an environment.
+ *
+ * PUBLIC: int __env_remove_env __P((ENV *));
+ */
+int
+__env_remove_env(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ REGENV *renv;
+ REGINFO *infop, reginfo;
+ REGION *rp;
+ u_int32_t flags_orig, i;
+
+ dbenv = env->dbenv;
+
+ /*
+ * We do not want to hang on a mutex request, nor do we care about
+ * panics.
+ */
+ flags_orig = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
+ F_SET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
+
+ /*
+ * This routine has to walk a nasty line between not looking into the
+ * environment (which may be corrupted after an app or system crash),
+ * and removing everything that needs removing.
+ *
+ * Connect to the environment: If we can't join the environment, we
+ * guess it's because it doesn't exist. Remove the underlying files,
+ * at least.
+ */
+ if (__env_attach(env, NULL, 0, 0) != 0)
+ goto remfiles;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /*
+ * Kill the environment, if it's not already dead.
+ */
+ renv->panic = 1;
+
+ /*
+ * Walk the array of regions. Connect to each region and disconnect
+ * with the destroy flag set. This shouldn't cause any problems, even
+ * if the region is corrupted, because we never look inside the region
+ * (with the single exception of mutex regions on systems where we have
+ * to return resources to the underlying system).
+ */
+ for (rp = R_ADDR(infop, renv->region_off),
+ i = 0; i < renv->region_cnt; ++i, ++rp) {
+ if (rp->id == INVALID_REGION_ID || rp->type == REGION_TYPE_ENV)
+ continue;
+ /*
+ * !!!
+ * The REGION_CREATE_OK flag is set for Windows/95 -- regions
+ * are zero'd out when the last reference to the region goes
+ * away, in which case the underlying OS region code requires
+ * callers be prepared to create the region in order to join it.
+ */
+ memset(&reginfo, 0, sizeof(reginfo));
+ reginfo.id = rp->id;
+ reginfo.flags = REGION_CREATE_OK;
+
+ /*
+ * If we get here and can't attach and/or detach to the
+ * region, it's a mess. Ignore errors, there's nothing
+ * we can do about them.
+ */
+ if (__env_region_attach(env, &reginfo, 0, 0) != 0)
+ continue;
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+ /*
+ * If destroying the mutex region, return any system
+ * resources to the system.
+ */
+ if (reginfo.type == REGION_TYPE_MUTEX)
+ __mutex_resource_return(env, &reginfo);
+#endif
+ (void)__env_region_detach(env, &reginfo, 1);
+ }
+
+ /* Detach from the environment's primary region. */
+ (void)__env_detach(env, 1);
+
+remfiles:
+ /*
+ * Walk the list of files in the directory, unlinking files in the
+ * Berkeley DB name space.
+ */
+ __env_remove_file(env);
+
+ F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
+ F_SET(dbenv, flags_orig);
+
+ return (0);
+}
+
+/*
+ * __env_remove_file --
+ * Discard any region files in the filesystem.
+ */
+static void
+__env_remove_file(env)
+ ENV *env;
+{
+ int cnt, fcnt, lastrm, ret;
+ const char *dir;
+ char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
+
+ /* Get the full path of a file in the environment. */
+ (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, buf, NULL, &path)) != 0)
+ return;
+
+ /* Get the parent directory for the environment. */
+ if ((p = __db_rpath(path)) == NULL) {
+ p = path;
+ saved_char = *p;
+
+ dir = PATH_DOT;
+ } else {
+ saved_char = *p;
+ *p = '\0';
+
+ dir = path;
+ }
+
+ /* Get the list of file names. */
+ if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0)
+ __db_err(env, ret, "%s", dir);
+
+ /* Restore the path, and free it. */
+ *p = saved_char;
+ __os_free(env, path);
+
+ if (ret != 0)
+ return;
+
+ /*
+ * Remove files from the region directory.
+ */
+ for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
+ /* Skip anything outside our name space. */
+ if (!IS_DB_FILE(names[cnt]))
+ continue;
+
+ /* Skip queue extent files. */
+ if (strncmp(names[cnt], "__dbq.", 6) == 0)
+ continue;
+ if (strncmp(names[cnt], "__dbp.", 6) == 0)
+ continue;
+
+ /* Skip registry files. */
+ if (strncmp(names[cnt], "__db.register", 13) == 0)
+ continue;
+
+ /* Skip replication files. */
+ if (strncmp(names[cnt], "__db.rep", 8) == 0)
+ continue;
+
+ /*
+ * Remove the primary environment region last, because it's
+ * the key to this whole mess.
+ */
+ if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
+ lastrm = cnt;
+ continue;
+ }
+
+ /* Remove the file. */
+ if (__db_appname(env,
+ DB_APP_NONE, names[cnt], NULL, &path) == 0) {
+ /*
+ * Overwrite region files. Temporary files would have
+ * been maintained in encrypted format, so there's no
+ * reason to overwrite them. This is not an exact
+ * check on the file being a region file, but it's
+ * not likely to be wrong, and the worst thing that can
+ * happen is we overwrite a file that didn't need to be
+ * overwritten.
+ */
+ (void)__os_unlink(env, path, 1);
+ __os_free(env, path);
+ }
+ }
+
+ if (lastrm != -1)
+ if (__db_appname(env,
+ DB_APP_NONE, names[lastrm], NULL, &path) == 0) {
+ (void)__os_unlink(env, path, 1);
+ __os_free(env, path);
+ }
+ __os_dirfree(env, names, fcnt);
+}
+
+/*
+ * __env_region_attach
+ * Join/create a region.
+ *
+ * PUBLIC: int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t));
+ */
+int
+__env_region_attach(env, infop, init, max)
+ ENV *env;
+ REGINFO *infop;
+ size_t init, max;
+{
+ REGION *rp;
+ int ret;
+ char buf[sizeof(DB_REGION_FMT) + 20];
+
+ /*
+ * Find or create a REGION structure for this region. If we create
+ * it, the REGION_CREATE flag will be set in the infop structure.
+ */
+ F_CLR(infop, REGION_CREATE);
+ if ((ret = __env_des_get(env, env->reginfo, infop, &rp)) != 0)
+ return (ret);
+ infop->env = env;
+ infop->rp = rp;
+ infop->type = rp->type;
+ infop->id = rp->id;
+
+ /*
+	 * __env_des_get may have created the region and set the create
+	 * flag.  If we're creating the region, set the desired size.
+ */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ rp->alloc = rp->size = (roff_t)init;
+ rp->max = (roff_t)max;
+ }
+
+ /* Join/create the underlying region. */
+ (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, buf, NULL, &infop->name)) != 0)
+ goto err;
+ if ((ret = __env_sys_attach(env, infop, rp)) != 0)
+ goto err;
+
+ /*
+ * Fault the pages into memory. Note, do this BEFORE we initialize
+ * anything because we're writing pages in created regions, not just
+ * reading them.
+ */
+ (void)__env_faultmem(env,
+ infop->addr, rp->size, F_ISSET(infop, REGION_CREATE));
+
+ /*
+ * !!!
+ * The underlying layer may have just decided that we are going
+ * to create the region. There are various system issues that
+ * can result in a useless region that requires re-initialization.
+ *
+ * If we created the region, initialize it for allocation.
+ */
+ if (F_ISSET(infop, REGION_CREATE))
+ __env_alloc_init(infop, rp->size);
+
+ return (0);
+
+err: /* Discard the underlying region. */
+ if (infop->addr != NULL)
+ (void)__env_sys_detach(env,
+ infop, F_ISSET(infop, REGION_CREATE));
+ else if (infop->name != NULL) {
+ __os_free(env, infop->name);
+ infop->name = NULL;
+ }
+ infop->rp = NULL;
+ infop->id = INVALID_REGION_ID;
+
+ /* Discard the REGION structure if we created it. */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ __env_des_destroy(env, rp);
+ F_CLR(infop, REGION_CREATE);
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_region_share
+ * Share the primary region.
+ *
+ * PUBLIC: int __env_region_share __P((ENV *, REGINFO *));
+ */
+int
+__env_region_share(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ REGINFO *envinfo;
+ REGION *rp;
+
+ envinfo = env->reginfo;
+ rp = envinfo->rp;
+ F_SET(infop, F_ISSET(envinfo, REGION_CREATE) | REGION_SHARED);
+ infop->addr = envinfo->addr;
+ infop->head = envinfo->head;
+
+ infop->env = env;
+ infop->rp = rp;
+ infop->name = envinfo->name;
+ infop->fhp = envinfo->fhp;
+ infop->type = rp->type;
+ infop->id = rp->id;
+
+ return (0);
+}
+
+/*
+ * __env_region_detach --
+ * Detach from a region.
+ *
+ * PUBLIC: int __env_region_detach __P((ENV *, REGINFO *, int));
+ */
+int
+__env_region_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ REGION *rp;
+ REGION_MEM *mem, *next;
+ int ret;
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ destroy = 1;
+ else if (F_ISSET(infop, REGION_SHARED))
+ return (0);
+
+ rp = infop->rp;
+
+ /*
+ * When discarding the regions as we shut down a database environment,
+ * discard any allocated shared memory segments. This is the last time
+ * we use them, and db_region_destroy is the last region-specific call
+ * we make.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) && infop->primary != NULL) {
+ for (mem = infop->mem; mem != NULL; mem = next) {
+ next = mem->next;
+ __env_alloc_free(infop, mem);
+ }
+ __env_alloc_free(infop, infop->primary);
+ }
+
+ if (F_ISSET(infop, REGION_SHARED))
+ return (0);
+
+ /* Detach from the underlying OS region. */
+ ret = __env_sys_detach(env, infop, destroy);
+
+ /* If we destroyed the region, discard the REGION structure. */
+ if (destroy)
+ __env_des_destroy(env, rp);
+
+	/* Free the region's name. */
+ if (infop->name != NULL)
+ __os_free(env, infop->name);
+
+ return (ret);
+}
+
+/*
+ * __env_sys_attach --
+ * Prep and call the underlying OS attach function.
+ */
+static int
+__env_sys_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ int ret;
+
+ /*
+ * All regions are created on 8K boundaries out of sheer paranoia,
+ * so we don't make some underlying VM unhappy. Make sure we don't
+ * overflow or underflow.
+ */
+#define OS_VMPAGESIZE (8 * 1024)
+#define OS_VMROUNDOFF(i) { \
+ if ((i) + OS_VMPAGESIZE - 1 > (i)) \
+ (i) += OS_VMPAGESIZE - 1; \
+ (i) -= (i) % OS_VMPAGESIZE; \
+}
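+/*
+ * For example, a 10000-byte request becomes 10000 + 8191 = 18191, then
+ * 18191 - (18191 % 8192) = 16384, i.e., two full 8KB pages; a size that
+ * is already a multiple of 8KB is left unchanged.
+ */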
+ if (F_ISSET(infop, REGION_CREATE)) {
+ OS_VMROUNDOFF(rp->size);
+ OS_VMROUNDOFF(rp->max);
+ }
+
+#ifdef DB_REGIONSIZE_MAX
+ /* Some architectures have hard limits on the maximum region size. */
+ if (rp->size > DB_REGIONSIZE_MAX) {
+ __db_errx(env, DB_STR_A("1548",
+ "region size %lu is too large; maximum is %lu", "%lu %lu"),
+ (u_long)rp->size, (u_long)DB_REGIONSIZE_MAX);
+ return (EINVAL);
+ }
+ if (rp->max > DB_REGIONSIZE_MAX) {
+ __db_errx(env, DB_STR_A("1549",
+ "region max %lu is too large; maximum is %lu", "%lu %lu"),
+ (u_long)rp->max, (u_long)DB_REGIONSIZE_MAX);
+ return (EINVAL);
+ }
+#endif
+
+ /*
+ * If a region is private, malloc the memory.
+ *
+ * !!!
+ * If this fails because the region is too large to malloc, mmap(2)
+ * using the MAP_ANON or MAP_ANONYMOUS flags would be an alternative.
+ * I don't know of any architectures (yet!) where malloc is a problem.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
+ /*
+ * !!!
+ * There exist spinlocks that don't work in malloc memory, e.g.,
+ * the HP/UX msemaphore interface. If we don't have locks that
+ * will work in malloc memory, we better not be private or not
+ * be threaded.
+ */
+ if (F_ISSET(env, ENV_THREAD)) {
+ __db_errx(env, DB_STR("1550",
+"architecture does not support locks inside process-local (malloc) memory"));
+ __db_errx(env, DB_STR("1551",
+ "application may not specify both DB_PRIVATE and DB_THREAD"));
+ return (EINVAL);
+ }
+#endif
+ if ((ret = __os_malloc(
+ env, sizeof(REGENV), &infop->addr)) != 0)
+ return (ret);
+
+ } else {
+#if !defined(HAVE_MMAP_EXTEND)
+ /* Extend any disk file to its full size before mapping it. */
+ rp->size = rp->max;
+#endif
+ if ((ret = __os_attach(env, infop, rp)) != 0)
+ return (ret);
+ }
+
+ /* Set the start of the allocation region. */
+ infop->head = infop->addr;
+
+ /*
+	 * We require that the memory be aligned to the largest integral
+ * type. Otherwise, multiple processes mapping the same shared region
+ * would have to memcpy every value before reading it.
+ */
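+	/*
+	 * For illustration, on a machine where sizeof(uintmax_t) is 8,
+	 * ALIGNP_INC rounds the address 0x1004 up to 0x1008, so the test
+	 * below fails for 0x1004, while an address such as 0x1008 rounds
+	 * to itself and passes.
+	 */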
+ if (infop->addr != ALIGNP_INC(infop->addr, sizeof(uintmax_t))) {
+ __db_errx(env, DB_STR("1552",
+ "region memory was not correctly aligned"));
+ (void)__env_sys_detach(env, infop,
+ F_ISSET(infop, REGION_CREATE));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __env_sys_detach --
+ * Prep and call the underlying OS detach function.
+ */
+static int
+__env_sys_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ /* If a region is private, free the memory. */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ __os_free(env, infop->addr);
+ return (0);
+ }
+
+ return (__os_detach(env, infop, destroy));
+}
+
+/*
+ * __env_des_get --
+ * Return a reference to the shared information for a REGION,
+ * optionally creating a new entry.
+ */
+static int
+__env_des_get(env, env_infop, infop, rpp)
+ ENV *env;
+ REGINFO *env_infop, *infop;
+ REGION **rpp;
+{
+ REGENV *renv;
+ REGION *rp, *empty_slot, *first_type;
+ u_int32_t i, maxid;
+
+ *rpp = NULL;
+ renv = env_infop->primary;
+
+ /*
+ * If the caller wants to join a region, walk through the existing
+ * regions looking for a matching ID (if ID specified) or matching
+	 * type (if type specified).  If we match on type, we return the
+	 * "primary" region, that is, the first region that was
+ * created of this type.
+ *
+ * Track the first empty slot and maximum region ID for new region
+ * allocation.
+ *
+ * MaxID starts at REGION_ID_ENV, the ID of the primary environment.
+ */
+ maxid = REGION_ID_ENV;
+ empty_slot = first_type = NULL;
+ for (rp = R_ADDR(env_infop, renv->region_off),
+ i = 0; i < renv->region_cnt; ++i, ++rp) {
+ if (rp->id == INVALID_REGION_ID) {
+ if (empty_slot == NULL)
+ empty_slot = rp;
+ continue;
+ }
+ if (infop->id != INVALID_REGION_ID) {
+ if (infop->id == rp->id)
+ break;
+ continue;
+ }
+ if (infop->type == rp->type &&
+ F_ISSET(infop, REGION_JOIN_OK) &&
+ (first_type == NULL || first_type->id > rp->id))
+ first_type = rp;
+
+ if (rp->id > maxid)
+ maxid = rp->id;
+ }
+
+ /* If we found a matching ID (or a matching type), return it. */
+ if (i >= renv->region_cnt)
+ rp = first_type;
+ if (rp != NULL) {
+ *rpp = rp;
+ return (0);
+ }
+
+ /*
+ * If we didn't find a region and we don't have permission to create
+ * the region, fail. The caller generates any error message.
+ */
+ if (!F_ISSET(infop, REGION_CREATE_OK))
+ return (ENOENT);
+
+ /*
+	 * If we didn't find a region and don't have room to create one,
+	 * fail with an error message; there's a sizing problem.
+ */
+ if (empty_slot == NULL) {
+ __db_errx(env, DB_STR("1553",
+ "no room remaining for additional REGIONs"));
+ return (ENOENT);
+ }
+
+ /*
+ * Initialize a REGION structure for the caller. If id was set, use
+	 * that value; otherwise, we use the next available ID.
+ */
+ memset(empty_slot, 0, sizeof(REGION));
+ empty_slot->segid = INVALID_REGION_SEGID;
+
+ /*
+ * Set the type and ID; if no region ID was specified,
+ * allocate one.
+ */
+ empty_slot->type = infop->type;
+ empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id;
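+	/*
+	 * For example, if regions with IDs 2, 3, and 5 are already allocated,
+	 * maxid is 5 and a caller that did not request a specific ID gets
+	 * ID 6.
+	 */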
+
+ F_SET(infop, REGION_CREATE);
+
+ *rpp = empty_slot;
+ return (0);
+}
+
+/*
+ * __env_des_destroy --
+ * Destroy a reference to a REGION.
+ */
+static void
+__env_des_destroy(env, rp)
+ ENV *env;
+ REGION *rp;
+{
+ COMPQUIET(env, NULL);
+
+ rp->id = INVALID_REGION_ID;
+}
+
+/*
+ * __env_faultmem --
+ * Fault the region into memory.
+ */
+static int
+__env_faultmem(env, addr, size, created)
+ ENV *env;
+ void *addr;
+ size_t size;
+ int created;
+{
+ int ret;
+ u_int8_t *p, *t;
+
+ /* Ignore heap regions. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ return (0);
+
+ /*
+ * It's sometimes significantly faster to page-fault in all of the
+ * region's pages before we run the application, as we see nasty
+ * side-effects when we page-fault while holding various locks, i.e.,
+ * the lock takes a long time to acquire because of the underlying
+ * page fault, and the other threads convoy behind the lock holder.
+ *
+ * If we created the region, we write a non-zero value so that the
+ * system can't cheat. If we're just joining the region, we can
+ * only read the value and try to confuse the compiler sufficiently
+ * that it doesn't figure out that we're never really using it.
+ *
+ * Touch every page (assuming pages are 512B, the smallest VM page
+	 * size used in any general-purpose processor).
+ */
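+	/*
+	 * As a concrete example, a 16MB region walked at this 512-byte
+	 * stride is touched 32768 times, once per smallest possible page.
+	 */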
+ ret = 0;
+ if (F_ISSET(env->dbenv, DB_ENV_REGION_INIT)) {
+ if (created)
+ for (p = addr,
+ t = (u_int8_t *)addr + size; p < t; p += 512)
+ p[0] = 0xdb;
+ else
+ for (p = addr,
+ t = (u_int8_t *)addr + size; p < t; p += 512)
+ ret |= p[0];
+ }
+
+ return (ret);
+}
diff --git a/src/env/env_register.c b/src/env/env_register.c
new file mode 100644
index 00000000..7475444d
--- /dev/null
+++ b/src/env/env_register.c
@@ -0,0 +1,730 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#define REGISTER_FILE "__db.register"
+
+#define PID_EMPTY "X 0\n" /* Unused PID entry */
+#define PID_FMT "%24lu\n" /* PID entry format */
+ /* Unused PID test */
+#define PID_ISEMPTY(p) (memcmp(p, PID_EMPTY, PID_LEN) == 0)
+#define PID_LEN (25) /* PID entry length */
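+/*
+ * For illustration: with PID_FMT, process ID 12345 is stored as 19 spaces,
+ * the digits "12345" and a newline (25 bytes in all), so slot n always
+ * starts at byte offset n * PID_LEN.
+ */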
+
+#define REGISTRY_LOCK(env, pos, nowait) \
+ __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 1, nowait)
+#define REGISTRY_UNLOCK(env, pos) \
+ __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 0, 0)
+#define REGISTRY_EXCL_LOCK(env, nowait) \
+ REGISTRY_LOCK(env, 1, nowait)
+#define REGISTRY_EXCL_UNLOCK(env) \
+ REGISTRY_UNLOCK(env, 1)
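+/*
+ * The exclusive lock uses byte offset 1, while the per-slot locks use
+ * offsets that are multiples of PID_LEN (0, 25, 50, ...), so the two kinds
+ * of locks never collide.
+ */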
+
+static int __envreg_add __P((ENV *, int *, u_int32_t));
+static int __envreg_pid_compare __P((const void *, const void *));
+static int __envreg_create_active_pid __P((ENV *, char *));
+
+/*
+ * Support for portable, multi-process database environment locking, based on
+ * the Subversion SR (#11511).
+ *
+ * The registry feature is configured by specifying the DB_REGISTER flag to the
+ * DbEnv.open method. If DB_REGISTER is specified, DB opens the registry file
+ * in the database environment home directory. The registry file is formatted
+ * as follows:
+ *
+ * 12345 # process ID slot 1
+ * X # empty slot
+ * 12346 # process ID slot 2
+ * X # empty slot
+ * 12347 # process ID slot 3
+ * 12348 # process ID slot 4
+ * X 12349 # empty slot
+ * X # empty slot
+ *
+ * All lines are fixed-length. All lines are process ID slots. Empty slots
+ * are marked with leading non-digit characters.
+ *
+ * To modify the file, you get an exclusive lock on the first byte of the file.
+ *
+ * While holding any DbEnv handle, each process has an exclusive lock on the
+ * first byte of a process ID slot.  Because Berkeley DB uses per-process
+ * locking to implement this feature, a process may never have more than a
+ * single slot locked, which restricts each process to one open DbEnv handle
+ * at a time.
+ *
+ * This work requires that if a process dies or the system crashes, locks held
+ * by the dying processes will be dropped. (We can't use system shared
+ * memory-backed or filesystem-backed locks because they're persistent when a
+ * process dies.) On POSIX systems, we use fcntl(2) locks; on Win32 we have
+ * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on
+ * LockFile/UnlockFile.
+ *
+ * We could implement the same solution with flock locking instead of fcntl,
+ * but flock would require a separate file for each process (and
+ * probably each DbEnv handle) in the database environment, which is fairly
+ * ugly.
+ *
+ * Whenever a process opens a new DbEnv handle, it walks the registry file and
+ * verifies it CANNOT acquire the lock for any non-empty slot. If a lock for
+ * a non-empty slot is available, we know a process died holding an open handle,
+ * and recovery needs to be run.
+ *
+ * It's possible to get corruption in the registry file. If a write system
+ * call fails after partially completing, there can be corrupted entries in
+ * the registry file, or a partial entry at the end of the file. This is OK.
+ * A corrupted entry will be flagged as a non-empty line during the registry
+ * file walk. Since the line was corrupted by process failure, no process will
+ * hold a lock on the slot, which will lead to recovery being run.
+ *
+ * There can still be processes running in the environment when we recover it,
+ * and, in fact, there can still be processes running in the old environment
+ * after we're up and running in a new one. This is safe because performing
+ * recovery panics (and removes) the existing environment, so the window of
+ * vulnerability is small.  Further, we check the panic flag in the DB API
+ * methods, when waking from spinning on a mutex, and whenever we're about
+ * to write to disk.  The only window of corruption is if the check of the
+ * panic flag before a write were to complete, the region subsequently be
+ * recovered, and the write then continue.  That's very, very unlikely to
+ * happen.  This vulnerability already exists in Berkeley DB anyway; the
+ * registry code doesn't make it any worse than it already is.
+ *
+ * The only way to avoid that window entirely is to ensure that all processes
+ * in the Berkeley DB environment exit before we run recovery. Applications
+ * can do that if they maintain their own process registry outside of Berkeley
+ * DB, but it's a little more difficult to do here. The obvious approach is
+ * to send signals to any process using the database environment as soon as we
+ * decide to run recovery, but there are problems with that approach: we might
+ * not have permission to send signals to the process, the process might have
+ * signal handlers installed, the cookie stored might not be the same as kill's
+ * argument, we may not be able to reliably tell if the process died, and there
+ * are probably other problems. However, if we can send a signal, it reduces
+ * the window, and so we include the code here. To configure it, turn on the
+ * DB_ENVREG_KILL_ALL #define.
+ */
+#define DB_ENVREG_KILL_ALL 0
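+
+/*
+ * As a minimal sketch of the locking primitive (assuming a POSIX system;
+ * the real work is done by the OS layer's __os_fdlock), acquiring or
+ * dropping the lock on one byte of the registry file amounts to:
+ *
+ *	struct flock fl;
+ *	memset(&fl, 0, sizeof(fl));
+ *	fl.l_type = acquire ? F_WRLCK : F_UNLCK;
+ *	fl.l_whence = SEEK_SET;
+ *	fl.l_start = offset;
+ *	fl.l_len = 1;
+ *	ret = fcntl(fd, nowait ? F_SETLK : F_SETLKW, &fl);
+ *
+ * fcntl(2) locks are dropped automatically when the owning process exits,
+ * which is exactly the property the registry depends on.
+ */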
+
+/*
+ * __envreg_register --
+ *	Register an ENV handle.
+ *
+ * PUBLIC: int __envreg_register __P((ENV *, int *, u_int32_t));
+ */
+int
+__envreg_register(env, need_recoveryp, flags)
+ ENV *env;
+ int *need_recoveryp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ pid_t pid;
+ u_int32_t bytes, mbytes;
+ int ret;
+ char *pp;
+
+ *need_recoveryp = 0;
+
+ dbenv = env->dbenv;
+ dbenv->thread_id(dbenv, &pid, NULL);
+ pp = NULL;
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1524",
+ "%lu: register environment", "%lu"), (u_long)pid);
+
+ /* Build the path name and open the registry file. */
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, REGISTER_FILE, NULL, &pp)) != 0)
+ goto err;
+ if ((ret = __os_open(env, pp, 0,
+ DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0)
+ goto err;
+
+ /*
+ * Wait for an exclusive lock on the file.
+ *
+ * !!!
+ * We're locking bytes that don't yet exist, but that's OK as far as
+ * I know.
+ */
+ if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0)
+ goto err;
+
+ /*
+ * If the file size is 0, initialize the file.
+ *
+	 * Run recovery if we created the file; that way, the system can be
+	 * cleaned up by removing the registry file and restarting the
+	 * application.
+ */
+ if ((ret = __os_ioinfo(
+ env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
+ goto err;
+ if (mbytes == 0 && bytes == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1525",
+ "%lu: creating %s", "%lu %s"), (u_long)pid, pp);
+ *need_recoveryp = 1;
+ }
+
+ /* Register this process. */
+ if ((ret = __envreg_add(env, need_recoveryp, flags)) != 0)
+ goto err;
+
+ /*
+ * Release our exclusive lock if we don't need to run recovery. If
+ * we need to run recovery, ENV->open will call back into register
+ * code once recovery has completed.
+ */
+ if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0)
+ goto err;
+
+ if (0) {
+err: *need_recoveryp = 0;
+
+ /*
+ * !!!
+ * Closing the file handle must release all of our locks.
+ */
+ if (dbenv->registry != NULL)
+ (void)__os_closehandle(env, dbenv->registry);
+ dbenv->registry = NULL;
+ }
+
+ if (pp != NULL)
+ __os_free(env, pp);
+
+ return (ret);
+}
+
+/*
+ * __envreg_add --
+ * Add the process' pid to the register.
+ */
+static int
+__envreg_add(env, need_recoveryp, flags)
+ ENV *env;
+ int *need_recoveryp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+	REGENV *renv;
+ REGINFO *infop;
+ pid_t pid;
+ off_t end, pos, dead;
+ size_t nr, nw;
+ u_int lcnt;
+ u_int32_t bytes, mbytes, orig_flags;
+ int need_recovery, ret, t_ret;
+ char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];
+
+ dbenv = env->dbenv;
+ need_recovery = 0;
+ COMPQUIET(dead, 0);
+ COMPQUIET(p, NULL);
+ ip = NULL;
+
+ /* Get a copy of our process ID. */
+ dbenv->thread_id(dbenv, &pid, NULL);
+ snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1526",
+ "%lu: adding self to registry", "%lu"), (u_long)pid);
+
+#if DB_ENVREG_KILL_ALL
+ if (0) {
+kill_all: /*
+ * A second pass through the file, this time killing any
+ * processes still running.
+ */
+ if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ }
+#endif
+
+ /*
+ * Read the file. Skip empty slots, and check that a lock is held
+ * for any allocated slots. An allocated slot which we can lock
+ * indicates a process died holding a handle and recovery needs to
+ * be run.
+ */
+ for (lcnt = 0;; ++lcnt) {
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+ if (nr == 0)
+ break;
+
+ /*
+ * A partial record at the end of the file is possible if a
+ * previously un-registered process was interrupted while
+ * registering.
+ */
+ if (nr != PID_LEN) {
+ need_recovery = 1;
+ break;
+ }
+
+ if (PID_ISEMPTY(buf)) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1527",
+ "%02u: EMPTY", "%02u"), lcnt);
+ continue;
+ }
+
+ /*
+ * !!!
+		 * DB_REGISTER is implemented using per-process locking; only
+ * a single ENV handle may be open per process. Enforce
+ * that restriction.
+ */
+ if (memcmp(buf, pid_buf, PID_LEN) == 0) {
+ __db_errx(env, DB_STR("1528",
+"DB_REGISTER limits processes to one open DB_ENV handle per environment"));
+ return (EINVAL);
+ }
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
+ for (p = buf; *p == ' ';)
+ ++p;
+ buf[nr - 1] = '\0';
+ }
+
+#if DB_ENVREG_KILL_ALL
+ if (need_recovery) {
+ pid = (pid_t)strtoul(buf, NULL, 10);
+ (void)kill(pid, SIGKILL);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1529",
+ "%02u: %s: KILLED", "%02u %s"), lcnt, p);
+ continue;
+ }
+#endif
+ pos = (off_t)lcnt * PID_LEN;
+ if (REGISTRY_LOCK(env, pos, 1) == 0) {
+ if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
+ return (ret);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1530",
+ "%02u: %s: FAILED", "%02u %s"), lcnt, p);
+
+ need_recovery = 1;
+ dead = pos;
+#if DB_ENVREG_KILL_ALL
+ goto kill_all;
+#else
+ break;
+#endif
+ } else
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1531",
+ "%02u: %s: LOCKED", "%02u %s"), lcnt, p);
+ }
+
+ /*
+ * If we have to perform recovery...
+ *
+ * Mark all slots empty. Registry ignores empty slots we can't lock,
+ * so it doesn't matter if any of the processes are in the middle of
+ * exiting Berkeley DB -- they'll discard their lock when they exit.
+ */
+ if (need_recovery) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, "%lu: recovery required", (u_long)pid);
+
+ if (LF_ISSET(DB_FAILCHK) || LF_ISSET(DB_FAILCHK_ISALIVE)) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env,
+ "%lu: performing failchk", (u_long)pid);
+
+ if (LF_ISSET(DB_FAILCHK_ISALIVE))
+ if ((ret = __envreg_create_active_pid(
+ env, pid_buf)) != 0)
+ goto sig_proc;
+
+			/*
+			 * The environment will already exist, so we do not
+			 * want DB_CREATE set, nor do we want any recovery at
+			 * this point.  There is no need to put values back,
+			 * as flags is passed in by value.  Save the original
+			 * dbenv flags in case we need to recover/remove the
+			 * existing environment.  Set DB_ENV_FAILCHK before
+			 * attaching to help ensure we don't block on a mutex
+			 * held by the dead process.
+			 */
+ LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL);
+ orig_flags = dbenv->flags;
+ F_SET(dbenv, DB_ENV_FAILCHK);
+ /* Attach to environment and subsystems. */
+ if ((ret = __env_attach_regions(
+ dbenv, flags, orig_flags, 0)) != 0)
+ goto sig_proc;
+ if ((t_ret =
+ __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret =
+ __env_failchk_int(dbenv)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free active pid array if used. */
+ if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
+ DB_GLOBAL(num_active_pids) = 0;
+ DB_GLOBAL(size_active_pids) = 0;
+				__os_free(env, DB_GLOBAL(active_pids));
+ }
+
+ /* Detach from environment and deregister thread. */
+ if ((t_ret =
+ __env_refresh(dbenv, orig_flags, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == 0) {
+ if ((ret = __os_seek(env, dbenv->registry,
+ 0, 0,(u_int32_t)dead)) != 0 ||
+ (ret = __os_write(env, dbenv->registry,
+ PID_EMPTY, PID_LEN, &nw)) != 0)
+ return (ret);
+ need_recovery = 0;
+ goto add;
+ }
+
+ }
+ /* If we can't attach, then we cannot set DB_REGISTER panic. */
+sig_proc: if (__env_attach(env, NULL, 0, 0) == 0) {
+ infop = env->reginfo;
+ renv = infop->primary;
+			/*
+			 * Indicate a DB_REGISTER panic.  Also set the
+			 * environment panic flag, as that is the panic
+			 * trigger mechanism everything in the code looks for.
+			 */
+ renv->reg_panic = 1;
+ renv->panic = 1;
+ (void)__env_detach(env, 0);
+ }
+
+ /* Wait for processes to see the panic and leave. */
+ __os_yield(env, 0, dbenv->envreg_timeout);
+
+		/* Figure out how big the file is. */
+ if ((ret = __os_ioinfo(
+ env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ end = (off_t)mbytes * MEGABYTE + bytes;
+
+ /*
+ * Seek to the beginning of the file and overwrite slots to
+ * the end of the file.
+ *
+ * It's possible for there to be a partial entry at the end of
+ * the file if a process died when trying to register. If so,
+ * correct for it and overwrite it as well.
+ */
+ if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ for (lcnt = 0; lcnt < ((u_int)end / PID_LEN +
+ ((u_int)end % PID_LEN == 0 ? 0 : 1)); ++lcnt) {
+
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+
+ pos = (off_t)lcnt * PID_LEN;
+			/* Do not notify on the dead process. */
+ if (pos != dead) {
+ pid = (pid_t)strtoul(buf, NULL, 10);
+ DB_EVENT(env, DB_EVENT_REG_ALIVE, &pid);
+ }
+
+ if ((ret = __os_seek(env,
+ dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
+ (ret = __os_write(env,
+ dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
+ return (ret);
+ }
+		/* Wait one last time to get everyone out. */
+ __os_yield(env, 0, dbenv->envreg_timeout);
+ }
+
+ /*
+ * Seek to the first process slot and add ourselves to the first empty
+ * slot we can lock.
+ */
+add: if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ for (lcnt = 0;; ++lcnt) {
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+ if (nr == PID_LEN && !PID_ISEMPTY(buf))
+ continue;
+ pos = (off_t)lcnt * PID_LEN;
+ if (REGISTRY_LOCK(env, pos, 1) == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1532",
+ "%lu: locking slot %02u at offset %lu",
+ "%lu %02u %lu"), (u_long)pid, lcnt,
+ (u_long)pos);
+
+ if ((ret = __os_seek(env,
+ dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
+ (ret = __os_write(env,
+ dbenv->registry, pid_buf, PID_LEN, &nw)) != 0)
+ return (ret);
+ dbenv->registry_off = (u_int32_t)pos;
+ break;
+ }
+ }
+
+ if (need_recovery)
+ *need_recoveryp = 1;
+
+ return (ret);
+}
+
+/*
+ * __envreg_unregister --
+ *	Unregister an ENV handle.
+ *
+ * PUBLIC: int __envreg_unregister __P((ENV *, int));
+ */
+int
+__envreg_unregister(env, recovery_failed)
+ ENV *env;
+ int recovery_failed;
+{
+ DB_ENV *dbenv;
+ size_t nw;
+ int ret, t_ret;
+
+ dbenv = env->dbenv;
+ ret = 0;
+
+ /*
+ * If recovery failed, we want to drop our locks and return, but still
+ * make sure any subsequent process doesn't decide everything is just
+ * fine and try to get into the database environment. In the case of
+ * an error, discard our locks, but leave our slot filled-in.
+ */
+ if (recovery_failed)
+ goto err;
+
+ /*
+	 * Why isn't an exclusive lock necessary to discard an ENV handle?
+ *
+ * We mark our process ID slot empty before we discard the process slot
+ * lock, and threads of control reviewing the register file ignore any
+ * slots which they can't lock.
+ */
+ if ((ret = __os_seek(env,
+ dbenv->registry, 0, 0, dbenv->registry_off)) != 0 ||
+ (ret = __os_write(
+ env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * This code assumes that closing the file descriptor discards all
+ * held locks.
+ *
+ * !!!
+ * There is an ordering problem here -- in the case of a process that
+ * failed in recovery, we're unlocking both the exclusive lock and our
+ * slot lock. If the OS unlocked the exclusive lock and then allowed
+ * another thread of control to acquire the exclusive lock before also
+ * releasing our slot lock, we could race.  That can't happen, I
+ * don't think.
+ */
+err: if ((t_ret =
+ __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbenv->registry = NULL;
+ return (ret);
+}
+
+/*
+ * __envreg_xunlock --
+ * Discard the exclusive lock held by the ENV handle.
+ *
+ * PUBLIC: int __envreg_xunlock __P((ENV *));
+ */
+int
+__envreg_xunlock(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ pid_t pid;
+ int ret;
+
+ dbenv = env->dbenv;
+ dbenv->thread_id(dbenv, &pid, NULL);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1533",
+ "%lu: recovery completed, unlocking", "%lu"), (u_long)pid);
+
+ if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0)
+ return (ret);
+
+ __db_err(env, ret, DB_STR_A("1534",
+ "%s: exclusive file unlock", "%s"), REGISTER_FILE);
+ return (__env_panic(env, ret));
+}
+
+/*
+ * __envreg_pid_compare --
+ * Compare routine for qsort and bsearch calls.
+ *	Returns a negative value if key is less than membr, 0 if they are
+ *	equal, and a positive value if key is greater than membr.
+ */
+static int
+__envreg_pid_compare(key, membr)
+ const void *key;
+ const void *membr;
+{
+	return (*(pid_t *)key - *(pid_t *)membr);
+}
+
+/*
+ * __envreg_isalive --
+ *	Default isalive function.  Uses the contents of an array of active
+ *	pids taken from the db_register file to determine whether a process
+ *	is still alive.
+ *
+ * PUBLIC: int __envreg_isalive
+ * PUBLIC: __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+ */
+int
+__envreg_isalive(dbenv, pid, tid, flags)
+ DB_ENV *dbenv;
+ pid_t pid;
+ db_threadid_t tid;
+ u_int32_t flags;
+{
+	/* We do not use tid; initialize it simply to quiet lint. */
+ DB_THREADID_INIT(tid);
+
+	/* If flags is not an expected value, return early. */
+ if (!((flags == 0) || (flags == DB_MUTEX_PROCESS_ONLY)))
+ return (EINVAL);
+
+ if (DB_GLOBAL(active_pids) == NULL ||
+ DB_GLOBAL(num_active_pids) == 0 || dbenv == NULL)
+ return (0);
+ /*
+ * bsearch returns a pointer to an entry in active_pids if a match
+	 * is found on pid; otherwise, it returns NULL.  This routine
+	 * returns 1 if a match is found, else 0.
+ */
+ if (bsearch(&pid, DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
+ sizeof(pid_t), __envreg_pid_compare))
+		return (1);
+
+ return (0);
+}
+
+/*
+ * __envreg_create_active_pid --
+ *	Create the array of pids, doubling its size whenever more room is
+ *	needed.  Only active pids from the DB_REGISTER file are added.
+ */
+static int
+__envreg_create_active_pid(env, my_pid)
+ ENV *env;
+ char *my_pid;
+{
+ DB_ENV *dbenv;
+ char buf[PID_LEN + 10];
+ int ret;
+ off_t pos;
+ pid_t pid, *tmparray;
+ size_t tmpsize, nr;
+ u_int lcnt;
+
+ dbenv = env->dbenv;
+ pos = 0;
+ ret = 0;
+
+ /*
+	 * Walk through the DB_REGISTER file, grabbing the pid entries that
+	 * are locked, as those represent processes that are still alive.
+	 * Ignore empty slots and those that are unlocked.
+ */
+ if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ for (lcnt = 0;; ++lcnt) {
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+
+		/* All done if we read nothing or get a partial record. */
+ if (nr == 0 || nr != PID_LEN)
+ break;
+ if (PID_ISEMPTY(buf))
+ continue;
+
+ pos = (off_t)lcnt * PID_LEN;
+ if (REGISTRY_LOCK(env, pos, 1) == 0) {
+			/* We got the lock, so the process died; do not add it. */
+ if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
+ return (ret);
+ } else {
+			/* First, check to make sure we have room in the array. */
+ if (DB_GLOBAL(num_active_pids) + 1 >
+ DB_GLOBAL(size_active_pids)) {
+ tmpsize =
+ DB_GLOBAL(size_active_pids) * sizeof(pid_t);
+
+				/* Start with 512 bytes; double each time we must grow. */
+				tmpsize = tmpsize > 0 ? tmpsize * 2 : 512;
+				if ((ret = __os_malloc(
+				    env, tmpsize, &tmparray)) != 0)
+ return (ret);
+
+				/* If an old array exists, copy its contents over and free it. */
+ if (DB_GLOBAL(active_pids)) {
+					memcpy(tmparray,
+ DB_GLOBAL(active_pids),
+ DB_GLOBAL(num_active_pids) *
+ sizeof(pid_t));
+					__os_free(env, DB_GLOBAL(active_pids));
+ }
+
+ DB_GLOBAL(active_pids) = tmparray;
+				/* The capacity is counted in entries, not bytes. */
+				DB_GLOBAL(size_active_pids) =
+				    tmpsize / sizeof(pid_t);
+
+ /*
+ * The process getting here has not been added
+ * to the DB_REGISTER file yet, so include it
+				 * as the first item in the array.
+ */
+ if (DB_GLOBAL(num_active_pids) == 0) {
+ pid = (pid_t)strtoul(my_pid, NULL, 10);
+ DB_GLOBAL(active_pids)
+ [DB_GLOBAL(num_active_pids)++] = pid;
+ }
+ }
+
+ /* insert into array */
+ pid = (pid_t)strtoul(buf, NULL, 10);
+ DB_GLOBAL(active_pids)
+ [DB_GLOBAL(num_active_pids)++] = pid;
+		}
+	}
+
+	/* Sort the array to allow binary search in the isalive function. */
+ qsort(DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
+ sizeof(pid_t), __envreg_pid_compare);
+ return (ret);
+}
diff --git a/src/env/env_sig.c b/src/env/env_sig.c
new file mode 100644
index 00000000..6d127f85
--- /dev/null
+++ b/src/env/env_sig.c
@@ -0,0 +1,201 @@
+/*-
+ * DO NOT EDIT: automatically built by dist/s_sig.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_join.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/log_verify.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * For a pure 32-bit/64-bit environment, we check all structures and
+ * calculate a signature.  For a compatible (mixed-size-addressing)
+ * environment, we only check the structures in shared memory.
+ */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+#define __STRUCTURE_COUNT 41
+#else
+#define __STRUCTURE_COUNT (41 + 104)
+#endif
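+
+/*
+ * The signature is simply a hash over the array of structure sizes built
+ * below: if any listed structure changes size between builds, the two
+ * builds compute different signatures, so a process built against a
+ * different layout can detect the mismatch.
+ */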
+
+/*
+ * __env_struct_sig --
+ * Compute signature of structures.
+ *
+ * PUBLIC: u_int32_t __env_struct_sig __P((void));
+ */
+u_int32_t
+__env_struct_sig()
+{
+ u_short t[__STRUCTURE_COUNT + 5];
+ u_int i;
+
+ i = 0;
+#define __ADD(s) (t[i++] = sizeof(struct s))
+
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutex_stat);
+#endif
+ __ADD(__db_lock_stat);
+ __ADD(__db_lock_hstat);
+ __ADD(__db_lock_pstat);
+ __ADD(__db_ilock);
+ __ADD(__db_lock_u);
+ __ADD(__db_lsn);
+ __ADD(__db_log_stat);
+ __ADD(__db_mpool_stat);
+ __ADD(__db_rep_stat);
+ __ADD(__db_repmgr_stat);
+ __ADD(__db_seq_stat);
+ __ADD(__db_bt_stat);
+ __ADD(__db_h_stat);
+ __ADD(__db_heap_stat);
+ __ADD(__db_qam_stat);
+ __ADD(__db_thread_info);
+ __ADD(__db_lockregion);
+ __ADD(__sh_dbt);
+ __ADD(__db_lockobj);
+ __ADD(__db_locker);
+ __ADD(__db_lockpart);
+ __ADD(__db_lock);
+ __ADD(__log);
+ __ADD(__mpool);
+ __ADD(__db_mpool_fstat_int);
+ __ADD(__mpoolfile);
+ __ADD(__bh);
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutexregion);
+#endif
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutex_t);
+#endif
+ __ADD(__db_reg_env);
+ __ADD(__db_region);
+ __ADD(__rep);
+ __ADD(__db_txn_stat_int);
+ __ADD(__db_txnregion);
+
+#ifndef HAVE_MIXED_SIZE_ADDRESSING
+ __ADD(__db_dbt);
+ __ADD(__db_lockreq);
+ __ADD(__db_log_cursor);
+ __ADD(__log_rec_spec);
+ __ADD(__db_mpoolfile);
+ __ADD(__db_mpool_fstat);
+ __ADD(__db_txn);
+ __ADD(__kids);
+ __ADD(__my_cursors);
+ __ADD(__femfs);
+ __ADD(__db_preplist);
+ __ADD(__db_txn_active);
+ __ADD(__db_txn_stat);
+ __ADD(__db_txn_token);
+ __ADD(__db_repmgr_site);
+ __ADD(__db_repmgr_conn_err);
+ __ADD(__db_seq_record);
+ __ADD(__db_sequence);
+ __ADD(__db);
+ __ADD(__cq_fq);
+ __ADD(__cq_aq);
+ __ADD(__cq_jq);
+ __ADD(__db_heap_rid);
+ __ADD(__dbc);
+ __ADD(__key_range);
+ __ADD(__db_compact);
+ __ADD(__db_env);
+ __ADD(__db_distab);
+ __ADD(__db_logvrfy_config);
+ __ADD(__db_channel);
+ __ADD(__db_site);
+ __ADD(__fn);
+ __ADD(__db_msgbuf);
+ __ADD(__pin_list);
+ __ADD(__env_thread_info);
+ __ADD(__flag_map);
+ __ADD(__db_backup_handle);
+ __ADD(__env);
+ __ADD(__dbc_internal);
+ __ADD(__dbpginfo);
+ __ADD(__epg);
+ __ADD(__cursor);
+ __ADD(__btree);
+ __ADD(__db_cipher);
+ __ADD(__db_foreign_info);
+ __ADD(__db_txnhead);
+ __ADD(__db_txnlist);
+ __ADD(__join_cursor);
+ __ADD(__pg_chksum);
+ __ADD(__pg_crypto);
+ __ADD(__heaphdr);
+ __ADD(__heaphdrsplt);
+ __ADD(__pglist);
+ __ADD(__vrfy_dbinfo);
+ __ADD(__vrfy_pageinfo);
+ __ADD(__vrfy_childinfo);
+ __ADD(__db_globals);
+ __ADD(__envq);
+ __ADD(__heap);
+ __ADD(__heap_cursor);
+ __ADD(__db_locktab);
+ __ADD(__db_entry);
+ __ADD(__fname);
+ __ADD(__db_log);
+ __ADD(__hdr);
+ __ADD(__log_persist);
+ __ADD(__db_commit);
+ __ADD(__db_filestart);
+ __ADD(__log_rec_hdr);
+ __ADD(__db_log_verify_info);
+ __ADD(__txn_verify_info);
+ __ADD(__lv_filereg_info);
+ __ADD(__lv_filelife);
+ __ADD(__lv_ckp_info);
+ __ADD(__lv_timestamp_info);
+ __ADD(__lv_txnrange);
+ __ADD(__add_recycle_params);
+ __ADD(__ckp_verify_params);
+ __ADD(__db_mpool);
+ __ADD(__db_mpreg);
+ __ADD(__db_mpool_hash);
+ __ADD(__bh_frozen_p);
+ __ADD(__bh_frozen_a);
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutexmgr);
+#endif
+ __ADD(__fh_t);
+ __ADD(__db_partition);
+ __ADD(__part_internal);
+ __ADD(__qcursor);
+ __ADD(__mpfarray);
+ __ADD(__qmpf);
+ __ADD(__queue);
+ __ADD(__qam_filelist);
+ __ADD(__db_reg_env_ref);
+ __ADD(__db_region_mem_t);
+ __ADD(__db_reginfo_t);
+ __ADD(__rep_waiter);
+ __ADD(__db_rep);
+ __ADD(__rep_lease_entry);
+ __ADD(__txn_detail);
+ __ADD(__db_txnmgr);
+ __ADD(__db_commit_info);
+ __ADD(__txn_logrec);
+#endif
+
+ return (__ham_func5(NULL, t, i * sizeof(t[0])));
+}
diff --git a/src/env/env_stat.c b/src/env/env_stat.c
new file mode 100644
index 00000000..9bc3fe7e
--- /dev/null
+++ b/src/env/env_stat.c
@@ -0,0 +1,879 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static int __env_print_all __P((ENV *, u_int32_t));
+static int __env_print_dbenv_all __P((ENV *, u_int32_t));
+static int __env_print_env_all __P((ENV *, u_int32_t));
+static int __env_print_fh __P((ENV *));
+static int __env_print_stats __P((ENV *, u_int32_t));
+static int __env_print_thread __P((ENV *));
+static int __env_stat_print __P((ENV *, u_int32_t));
+static char *__env_thread_state_print __P((DB_THREAD_STATE));
+static const char *
+ __reg_type __P((reg_type_t));
+
+/*
+ * __env_stat_print_pp --
+ * ENV->stat_print pre/post processor.
+ *
+ * PUBLIC: int __env_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->stat_print");
+
+ if ((ret = __db_fchk(env, "DB_ENV->stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC |
+ DB_STAT_CLEAR | DB_STAT_SUBSYSTEM)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__env_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __env_stat_print --
+ * ENV->stat_print method.
+ */
+static int
+__env_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ time_t now;
+ int ret;
+ char time_buf[CTIME_BUFLEN];
+
+ (void)time(&now);
+ __db_msg(env, "%.24s\tLocal time", __os_ctime(&now, time_buf));
+
+ if ((ret = __env_print_stats(env, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __env_print_all(env, flags)) != 0)
+ return (ret);
+
+ if ((ret = __env_print_thread(env)) != 0)
+ return (ret);
+
+ if ((ret = __env_print_fh(env)) != 0)
+ return (ret);
+
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ return (0);
+
+ if (LOGGING_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __log_stat_print(env, flags)) != 0)
+ return (ret);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __dbreg_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+ if (LOCKING_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __lock_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+ if (MPOOL_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __memp_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+ if (REP_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __rep_stat_print(env, flags)) != 0)
+ return (ret);
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_stat_print(env, flags)) != 0)
+ return (ret);
+#endif
+ }
+
+ if (TXN_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __txn_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+#ifdef HAVE_MUTEX_SUPPORT
+ /*
+ * Dump the mutexes last. If DB_STAT_CLEAR is set this will
+ * clear out the mutex counters and we want to see them in
+ * the context of the other subsystems first.
+ */
+ if (MUTEX_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __mutex_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * __env_print_stats --
+ * Display the default environment statistics.
+ */
+static int
+__env_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ char time_buf[CTIME_BUFLEN];
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default database environment information:");
+ }
+ STAT_HEX("Magic number", renv->magic);
+ STAT_LONG("Panic value", renv->panic);
+ __db_msg(env, "%d.%d.%d\tEnvironment version",
+ renv->majver, renv->minver, renv->patchver);
+ STAT_LONG("Btree version", DB_BTREEVERSION);
+ STAT_LONG("Hash version", DB_HASHVERSION);
+ STAT_LONG("Lock version", DB_LOCKVERSION);
+ STAT_LONG("Log version", DB_LOGVERSION);
+ STAT_LONG("Queue version", DB_QAMVERSION);
+ STAT_LONG("Sequence version", DB_SEQUENCE_VERSION);
+ STAT_LONG("Txn version", DB_TXNVERSION);
+ __db_msg(env,
+ "%.24s\tCreation time", __os_ctime(&renv->timestamp, time_buf));
+ STAT_HEX("Environment ID", renv->envid);
+ __mutex_print_debug_single(env,
+ "Primary region allocation and reference count mutex",
+ renv->mtx_regenv, flags);
+ STAT_LONG("References", renv->refcnt);
+ __db_dlbytes(env, "Current region size",
+ (u_long)0, (u_long)0, (u_long)infop->rp->size);
+ __db_dlbytes(env, "Maximum region size",
+ (u_long)0, (u_long)0, (u_long)infop->rp->max);
+
+ return (0);
+}
+
+/*
+ * __env_print_all --
+ * Display the debugging environment statistics.
+ */
+static int
+__env_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+
+ /*
+ * There are two structures -- DB_ENV and ENV.
+ */
+ ret = __env_print_dbenv_all(env, flags);
+ if ((t_ret = __env_print_env_all(env, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __env_print_dbenv_all --
+ * Display the debugging environment statistics.
+ */
+static int
+__env_print_dbenv_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN db_env_fn[] = {
+ { DB_ENV_AUTO_COMMIT, "DB_ENV_AUTO_COMMIT" },
+ { DB_ENV_CDB_ALLDB, "DB_ENV_CDB_ALLDB" },
+ { DB_ENV_DIRECT_DB, "DB_ENV_DIRECT_DB" },
+ { DB_ENV_DSYNC_DB, "DB_ENV_DSYNC_DB" },
+ { DB_ENV_MULTIVERSION, "DB_ENV_MULTIVERSION" },
+ { DB_ENV_NOLOCKING, "DB_ENV_NOLOCKING" },
+ { DB_ENV_NOMMAP, "DB_ENV_NOMMAP" },
+ { DB_ENV_NOPANIC, "DB_ENV_NOPANIC" },
+ { DB_ENV_OVERWRITE, "DB_ENV_OVERWRITE" },
+ { DB_ENV_REGION_INIT, "DB_ENV_REGION_INIT" },
+ { DB_ENV_TIME_NOTGRANTED, "DB_ENV_TIME_NOTGRANTED" },
+ { DB_ENV_TXN_NOSYNC, "DB_ENV_TXN_NOSYNC" },
+ { DB_ENV_TXN_NOWAIT, "DB_ENV_TXN_NOWAIT" },
+ { DB_ENV_TXN_SNAPSHOT, "DB_ENV_TXN_SNAPSHOT" },
+ { DB_ENV_TXN_WRITE_NOSYNC, "DB_ENV_TXN_WRITE_NOSYNC" },
+ { DB_ENV_YIELDCPU, "DB_ENV_YIELDCPU" },
+ { 0, NULL }
+ };
+ static const FN vfn[] = {
+ { DB_VERB_DEADLOCK, "DB_VERB_DEADLOCK" },
+ { DB_VERB_FILEOPS, "DB_VERB_FILEOPS" },
+ { DB_VERB_FILEOPS_ALL, "DB_VERB_FILEOPS_ALL" },
+ { DB_VERB_RECOVERY, "DB_VERB_RECOVERY" },
+ { DB_VERB_REGISTER, "DB_VERB_REGISTER" },
+ { DB_VERB_REPLICATION, "DB_VERB_REPLICATION" },
+ { DB_VERB_REP_ELECT, "DB_VERB_REP_ELECT" },
+ { DB_VERB_REP_LEASE, "DB_VERB_REP_LEASE" },
+ { DB_VERB_REP_MISC, "DB_VERB_REP_MISC" },
+ { DB_VERB_REP_MSGS, "DB_VERB_REP_MSGS" },
+ { DB_VERB_REP_SYNC, "DB_VERB_REP_SYNC" },
+ { DB_VERB_REP_SYSTEM, "DB_VERB_REP_SYSTEM" },
+ { DB_VERB_REP_TEST, "DB_VERB_REP_TEST" },
+ { DB_VERB_REPMGR_CONNFAIL, "DB_VERB_REPMGR_CONNFAIL" },
+ { DB_VERB_REPMGR_MISC, "DB_VERB_REPMGR_MISC" },
+ { DB_VERB_WAITSFOR, "DB_VERB_WAITSFOR" },
+ { 0, NULL }
+ };
+ DB_ENV *dbenv;
+ DB_MSGBUF mb;
+ char **p;
+
+ dbenv = env->dbenv;
+ DB_MSGBUF_INIT(&mb);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ STAT_POINTER("ENV", dbenv->env);
+ __mutex_print_debug_single(
+ env, "DB_ENV handle mutex", dbenv->mtx_db_env, flags);
+ STAT_ISSET("Errcall", dbenv->db_errcall);
+ STAT_ISSET("Errfile", dbenv->db_errfile);
+ STAT_STRING("Errpfx", dbenv->db_errpfx);
+ STAT_ISSET("Msgfile", dbenv->db_msgfile);
+ STAT_ISSET("Msgcall", dbenv->db_msgcall);
+
+ STAT_ISSET("AppDispatch", dbenv->app_dispatch);
+ STAT_ISSET("Event", dbenv->db_event_func);
+ STAT_ISSET("Feedback", dbenv->db_feedback);
+ STAT_ISSET("Free", dbenv->db_free);
+ STAT_ISSET("Panic", dbenv->db_paniccall);
+ STAT_ISSET("Malloc", dbenv->db_malloc);
+ STAT_ISSET("Realloc", dbenv->db_realloc);
+ STAT_ISSET("IsAlive", dbenv->is_alive);
+ STAT_ISSET("ThreadId", dbenv->thread_id);
+ STAT_ISSET("ThreadIdString", dbenv->thread_id_string);
+
+ STAT_STRING("Log dir", dbenv->db_log_dir);
+ STAT_STRING("Metadata dir", dbenv->db_md_dir);
+ STAT_STRING("Tmp dir", dbenv->db_tmp_dir);
+ if (dbenv->db_data_dir == NULL)
+ STAT_ISSET("Data dir", dbenv->db_data_dir);
+ else {
+ for (p = dbenv->db_data_dir; *p != NULL; ++p)
+ __db_msgadd(env, &mb, "%s\tData dir", *p);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+
+ STAT_STRING(
+ "Intermediate directory mode", dbenv->intermediate_dir_mode);
+
+ STAT_LONG("Shared memory key", dbenv->shm_key);
+
+ STAT_ISSET("Password", dbenv->passwd);
+
+ STAT_ISSET("App private", dbenv->app_private);
+ STAT_ISSET("Api1 internal", dbenv->api1_internal);
+ STAT_ISSET("Api2 internal", dbenv->api2_internal);
+
+ __db_prflags(env, NULL, dbenv->verbose, vfn, NULL, "\tVerbose flags");
+
+ STAT_ULONG("Mutex align", dbenv->mutex_align);
+ STAT_ULONG("Mutex cnt", dbenv->mutex_cnt);
+ STAT_ULONG("Mutex inc", dbenv->mutex_inc);
+ STAT_ULONG("Mutex tas spins", dbenv->mutex_tas_spins);
+
+ STAT_ISSET("Lock conflicts", dbenv->lk_conflicts);
+ STAT_LONG("Lock modes", dbenv->lk_modes);
+ STAT_ULONG("Lock detect", dbenv->lk_detect);
+ STAT_ULONG("Lock init", dbenv->lk_init);
+ STAT_ULONG("Lock init lockers", dbenv->lk_init_lockers);
+ STAT_ULONG("Lock init objects", dbenv->lk_init_objects);
+ STAT_ULONG("Lock max", dbenv->lk_max);
+ STAT_ULONG("Lock max lockers", dbenv->lk_max_lockers);
+ STAT_ULONG("Lock max objects", dbenv->lk_max_objects);
+ STAT_ULONG("Lock partitions", dbenv->lk_partitions);
+ STAT_ULONG("Lock object hash table size", dbenv->object_t_size);
+ STAT_ULONG("Lock timeout", dbenv->lk_timeout);
+
+ STAT_ULONG("Log bsize", dbenv->lg_bsize);
+ STAT_FMT("Log file mode", "%#o", int, dbenv->lg_filemode);
+ STAT_ULONG("Log region max", dbenv->lg_regionmax);
+ STAT_ULONG("Log size", dbenv->lg_size);
+
+ STAT_ULONG("Cache GB", dbenv->mp_gbytes);
+ STAT_ULONG("Cache B", dbenv->mp_bytes);
+ STAT_ULONG("Cache max GB", dbenv->mp_max_gbytes);
+ STAT_ULONG("Cache max B", dbenv->mp_max_bytes);
+ STAT_ULONG("Cache mmap size", dbenv->mp_mmapsize);
+ STAT_ULONG("Cache max open fd", dbenv->mp_maxopenfd);
+ STAT_ULONG("Cache max write", dbenv->mp_maxwrite);
+ STAT_ULONG("Cache number", dbenv->mp_ncache);
+ STAT_ULONG("Cache max write sleep", dbenv->mp_maxwrite_sleep);
+
+ STAT_ULONG("Txn init", dbenv->tx_init);
+ STAT_ULONG("Txn max", dbenv->tx_max);
+ STAT_ULONG("Txn timestamp", dbenv->tx_timestamp);
+ STAT_ULONG("Txn timeout", dbenv->tx_timeout);
+
+ STAT_ULONG("Thread count", dbenv->thr_max);
+
+ STAT_ISSET("Registry", dbenv->registry);
+ STAT_ULONG("Registry offset", dbenv->registry_off);
+ STAT_ULONG("Registry timeout", dbenv->envreg_timeout);
+
+ __db_prflags(env,
+ NULL, dbenv->flags, db_env_fn, NULL, "\tPublic environment flags");
+
+ return (0);
+}
+
+/*
+ * __env_print_env_all --
+ * Display the debugging environment statistics.
+ */
+static int
+__env_print_env_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN env_fn[] = {
+ { ENV_CDB, "ENV_CDB" },
+ { ENV_DBLOCAL, "ENV_DBLOCAL" },
+ { ENV_LOCKDOWN, "ENV_LOCKDOWN" },
+ { ENV_NO_OUTPUT_SET, "ENV_NO_OUTPUT_SET" },
+ { ENV_OPEN_CALLED, "ENV_OPEN_CALLED" },
+ { ENV_PRIVATE, "ENV_PRIVATE" },
+ { ENV_RECOVER_FATAL, "ENV_RECOVER_FATAL" },
+ { ENV_REF_COUNTED, "ENV_REF_COUNTED" },
+ { ENV_SYSTEM_MEM, "ENV_SYSTEM_MEM" },
+ { ENV_THREAD, "ENV_THREAD" },
+ { 0, NULL }
+ };
+ static const FN ofn[] = {
+ { DB_CREATE, "DB_CREATE" },
+ { DB_FORCE, "DB_FORCE" },
+ { DB_INIT_CDB, "DB_INIT_CDB" },
+ { DB_INIT_LOCK, "DB_INIT_LOCK" },
+ { DB_INIT_LOG, "DB_INIT_LOG" },
+ { DB_INIT_MPOOL, "DB_INIT_MPOOL" },
+ { DB_INIT_REP, "DB_INIT_REP" },
+ { DB_INIT_TXN, "DB_INIT_TXN" },
+ { DB_LOCKDOWN, "DB_LOCKDOWN" },
+ { DB_NOMMAP, "DB_NOMMAP" },
+ { DB_PRIVATE, "DB_PRIVATE" },
+ { DB_RDONLY, "DB_RDONLY" },
+ { DB_RECOVER, "DB_RECOVER" },
+ { DB_RECOVER_FATAL, "DB_RECOVER_FATAL" },
+ { DB_SYSTEM_MEM, "DB_SYSTEM_MEM" },
+ { DB_THREAD, "DB_THREAD" },
+ { DB_TRUNCATE, "DB_TRUNCATE" },
+ { DB_TXN_NOSYNC, "DB_TXN_NOSYNC" },
+ { DB_USE_ENVIRON, "DB_USE_ENVIRON" },
+ { DB_USE_ENVIRON_ROOT, "DB_USE_ENVIRON_ROOT" },
+ { 0, NULL }
+ };
+ static const FN regenvfn[] = {
+ { DB_REGENV_REPLOCKED, "DB_REGENV_REPLOCKED" },
+ { 0, NULL }
+ };
+ REGENV *renv;
+ REGINFO *infop;
+ REGION *rp;
+ u_int32_t i;
+ char time_buf[CTIME_BUFLEN];
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ STAT_POINTER("DB_ENV", env->dbenv);
+ __mutex_print_debug_single(
+ env, "ENV handle mutex", env->mtx_env, flags);
+
+ STAT_STRING("Home", env->db_home);
+ __db_prflags(env, NULL, env->open_flags, ofn, NULL, "\tOpen flags");
+ STAT_FMT("Mode", "%#o", int, env->db_mode);
+
+ STAT_ULONG("Pid cache", env->pid_cache);
+
+ STAT_ISSET("Lockfhp", env->lockfhp);
+
+ STAT_ISSET("Locker", env->env_lref);
+
+ STAT_ISSET("Internal recovery table", env->recover_dtab.int_dispatch);
+ STAT_ULONG("Number of recovery table slots",
+ env->recover_dtab.int_size);
+ STAT_ISSET("External recovery table", env->recover_dtab.ext_dispatch);
+ STAT_ULONG("Number of recovery table slots",
+ env->recover_dtab.ext_size);
+
+ STAT_ULONG("Thread hash buckets", env->thr_nbucket);
+ STAT_ISSET("Thread hash table", env->thr_hashtab);
+
+ __mutex_print_debug_single(
+ env, "ENV list of DB handles mutex", env->mtx_dblist, flags);
+ STAT_LONG("DB reference count", env->db_ref);
+
+ __mutex_print_debug_single(env, "MT mutex", env->mtx_mt, flags);
+
+ STAT_ISSET("Crypto handle", env->crypto_handle);
+ STAT_ISSET("Lock handle", env->lk_handle);
+ STAT_ISSET("Log handle", env->lg_handle);
+ STAT_ISSET("Cache handle", env->mp_handle);
+ STAT_ISSET("Mutex handle", env->mutex_handle);
+ STAT_ISSET("Replication handle", env->rep_handle);
+ STAT_ISSET("Txn handle", env->tx_handle);
+
+ STAT_ISSET("User copy", env->dbt_usercopy);
+
+ STAT_LONG("Test abort", env->test_abort);
+ STAT_LONG("Test check", env->test_check);
+ STAT_LONG("Test copy", env->test_copy);
+
+ __db_prflags(env,
+ NULL, env->flags, env_fn, NULL, "\tPrivate environment flags");
+
+ __db_print_reginfo(env, infop, "Primary", flags);
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Per region database environment information:");
+ for (rp = R_ADDR(infop, renv->region_off),
+ i = 0; i < renv->region_cnt; ++i, ++rp) {
+ if (rp->id == INVALID_REGION_ID)
+ continue;
+ __db_msg(env, "%s Region:", __reg_type(rp->type));
+ STAT_LONG("Region ID", rp->id);
+ STAT_LONG("Segment ID", rp->segid);
+ __db_dlbytes(env,
+ "Size", (u_long)0, (u_long)0, (u_long)rp->size);
+ }
+ __db_prflags(env,
+ NULL, renv->init_flags, ofn, NULL, "\tInitialization flags");
+ STAT_ULONG("Region slots", renv->region_cnt);
+ __db_prflags(env,
+ NULL, renv->flags, regenvfn, NULL, "\tReplication flags");
+ __db_msg(env, "%.24s\tOperation timestamp",
+ renv->op_timestamp == 0 ?
+ "!Set" : __os_ctime(&renv->op_timestamp, time_buf));
+ __db_msg(env, "%.24s\tReplication timestamp",
+ renv->rep_timestamp == 0 ?
+ "!Set" : __os_ctime(&renv->rep_timestamp, time_buf));
+
+ return (0);
+}
+
+static char *
+__env_thread_state_print(state)
+ DB_THREAD_STATE state;
+{
+ switch (state) {
+ case THREAD_ACTIVE:
+ return ("active");
+ case THREAD_BLOCKED:
+ return ("blocked");
+ case THREAD_BLOCKED_DEAD:
+ return ("blocked and dead");
+ case THREAD_OUT:
+ return ("out");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __env_print_thread --
+ * Display the thread block state.
+ */
+static int
+__env_print_thread(env)
+ ENV *env;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ PIN_LIST *list, *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t i;
+ char buf[DB_THREADID_STRLEN];
+
+ dbenv = env->dbenv;
+
+ /* The thread table may not be configured. */
+ if ((htab = env->thr_hashtab) == NULL)
+ return (0);
+
+ dbmp = env->mp_handle;
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Thread tracking information");
+
+ /* Dump out the info we have on thread tracking. */
+ infop = env->reginfo;
+ renv = infop->primary;
+ thread = R_ADDR(infop, renv->thread_off);
+ STAT_ULONG("Thread blocks allocated", thread->thr_count);
+ STAT_ULONG("Thread allocation threshold", thread->thr_max);
+ STAT_ULONG("Thread hash buckets", thread->thr_nbucket);
+
+ /* Dump out the info we have on active threads. */
+ __db_msg(env, "Thread status blocks:");
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE)
+ continue;
+ __db_msg(env, "\tprocess/thread %s: %s",
+ dbenv->thread_id_string(
+ dbenv, ip->dbth_pid, ip->dbth_tid, buf),
+ __env_thread_state_print(ip->dbth_state));
+ list = R_ADDR(env->reginfo, ip->dbth_pinlist);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
+ if (lp->b_ref == INVALID_ROFF)
+ continue;
+ bhp = R_ADDR(
+ &dbmp->reginfo[lp->region], lp->b_ref);
+ __db_msg(env,
+ "\t\tpins: %lu", (u_long)bhp->pgno);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __env_print_fh --
+ * Display statistics for all handles open in this environment.
+ */
+static int
+__env_print_fh(env)
+ ENV *env;
+{
+ DB_FH *fhp;
+
+ if (TAILQ_FIRST(&env->fdlist) == NULL)
+ return (0);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Environment file handle information");
+
+ MUTEX_LOCK(env, env->mtx_env);
+
+ TAILQ_FOREACH(fhp, &env->fdlist, q)
+ __db_print_fh(env, NULL, fhp, 0);
+
+ MUTEX_UNLOCK(env, env->mtx_env);
+
+ return (0);
+}
+
+/*
+ * __db_print_fh --
+ * Print out a file handle.
+ *
+ * PUBLIC: void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t));
+ */
+void
+__db_print_fh(env, tag, fh, flags)
+ ENV *env;
+ const char *tag;
+ DB_FH *fh;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_FH_NOSYNC, "DB_FH_NOSYNC" },
+ { DB_FH_OPENED, "DB_FH_OPENED" },
+ { DB_FH_UNLINK, "DB_FH_UNLINK" },
+ { 0, NULL }
+ };
+
+ if (fh == NULL) {
+ STAT_ISSET(tag, fh);
+ return;
+ }
+
+ STAT_STRING("file-handle.file name", fh->name);
+
+ __mutex_print_debug_single(
+ env, "file-handle.mutex", fh->mtx_fh, flags);
+
+ STAT_LONG("file-handle.reference count", fh->ref);
+ STAT_LONG("file-handle.file descriptor", fh->fd);
+
+ STAT_ULONG("file-handle.page number", fh->pgno);
+ STAT_ULONG("file-handle.page size", fh->pgsize);
+ STAT_ULONG("file-handle.page offset", fh->offset);
+
+ STAT_ULONG("file-handle.seek count", fh->seek_count);
+ STAT_ULONG("file-handle.read count", fh->read_count);
+ STAT_ULONG("file-handle.write count", fh->write_count);
+
+ __db_prflags(env, NULL, fh->flags, fn, NULL, "\tfile-handle.flags");
+}
+
+/*
+ * __db_print_fileid --
+ * Print out a file ID.
+ *
+ * PUBLIC: void __db_print_fileid __P((ENV *, u_int8_t *, const char *));
+ */
+void
+__db_print_fileid(env, id, suffix)
+ ENV *env;
+ u_int8_t *id;
+ const char *suffix;
+{
+ DB_MSGBUF mb;
+ int i;
+
+ if (id == NULL) {
+ STAT_ISSET("ID", id);
+ return;
+ }
+
+ DB_MSGBUF_INIT(&mb);
+ for (i = 0; i < DB_FILE_ID_LEN; ++i, ++id) {
+ __db_msgadd(env, &mb, "%x", (u_int)*id);
+ if (i < DB_FILE_ID_LEN - 1)
+ __db_msgadd(env, &mb, " ");
+ }
+ if (suffix != NULL)
+ __db_msgadd(env, &mb, "%s", suffix);
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
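+/*
+ * For example (illustrative values): an ID whose first bytes are
+ * 0x01 0x23 0xab is emitted as "1 23 ab ..." -- each byte in unpadded
+ * hex, space-separated, with any suffix appended to the same line.
+ */
+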
+/*
+ * __db_dl --
+ * Display a big value.
+ *
+ * PUBLIC: void __db_dl __P((ENV *, const char *, u_long));
+ */
+void
+__db_dl(env, msg, value)
+ ENV *env;
+ const char *msg;
+ u_long value;
+{
+ /*
+ * Two formats: if less than 10 million, display as the number, if
+ * greater than 10 million display as ###M.
+ */
+ if (value < 10000000)
+ __db_msg(env, "%lu\t%s", value, msg);
+ else
+ __db_msg(env, "%luM\t%s (%lu)", value / 1000000, msg, value);
+}
+
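+/*
+ * Worked example (assumed values): __db_dl(env, "Pages requested",
+ * 12345678UL) prints "12M\tPages requested (12345678)", while a value
+ * of 9999999 or less prints the exact number.
+ */
+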
+/*
+ * __db_dl_pct --
+ * Display a big value, and related percentage.
+ *
+ * PUBLIC: void __db_dl_pct
+ * PUBLIC: __P((ENV *, const char *, u_long, int, const char *));
+ */
+void
+__db_dl_pct(env, msg, value, pct, tag)
+ ENV *env;
+ const char *msg, *tag;
+ u_long value;
+ int pct;
+{
+ DB_MSGBUF mb;
+
+ DB_MSGBUF_INIT(&mb);
+
+ /*
+ * Two formats: if less than 10 million, display as the number, if
+ * greater than 10 million, round it off and display as ###M.
+ */
+ if (value < 10000000)
+ __db_msgadd(env, &mb, "%lu\t%s", value, msg);
+ else
+ __db_msgadd(env,
+ &mb, "%luM\t%s", (value + 500000) / 1000000, msg);
+ if (tag == NULL)
+ __db_msgadd(env, &mb, " (%d%%)", pct);
+ else
+ __db_msgadd(env, &mb, " (%d%% %s)", pct, tag);
+
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
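+/*
+ * Worked example (assumed values): __db_dl_pct(env, "Pages found in
+ * cache", 12600000UL, 87, NULL) rounds to (12600000 + 500000) / 1000000
+ * = 13 and prints "13M\tPages found in cache (87%)"; a non-NULL tag is
+ * appended inside the parentheses.
+ */
+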
+/*
+ * __db_dlbytes --
+ * Display a big number of bytes.
+ *
+ * PUBLIC: void __db_dlbytes
+ * PUBLIC: __P((ENV *, const char *, u_long, u_long, u_long));
+ */
+void
+__db_dlbytes(env, msg, gbytes, mbytes, bytes)
+ ENV *env;
+ const char *msg;
+ u_long gbytes, mbytes, bytes;
+{
+ DB_MSGBUF mb;
+ const char *sep;
+
+ DB_MSGBUF_INIT(&mb);
+
+ /* Normalize the values. */
+ while (bytes >= MEGABYTE) {
+ ++mbytes;
+ bytes -= MEGABYTE;
+ }
+ while (mbytes >= GIGABYTE / MEGABYTE) {
+ ++gbytes;
+ mbytes -= GIGABYTE / MEGABYTE;
+ }
+
+ if (gbytes == 0 && mbytes == 0 && bytes == 0)
+ __db_msgadd(env, &mb, "0");
+ else {
+ sep = "";
+ if (gbytes > 0) {
+ __db_msgadd(env, &mb, "%luGB", gbytes);
+ sep = " ";
+ }
+ if (mbytes > 0) {
+ __db_msgadd(env, &mb, "%s%luMB", sep, mbytes);
+ sep = " ";
+ }
+ if (bytes >= 1024) {
+ __db_msgadd(env, &mb, "%s%luKB", sep, bytes / 1024);
+ bytes %= 1024;
+ sep = " ";
+ }
+ if (bytes > 0)
+ __db_msgadd(env, &mb, "%s%luB", sep, bytes);
+ }
+
+ __db_msgadd(env, &mb, "\t%s", msg);
+
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
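+/*
+ * Worked example (assumed values): a call with gbytes = 0, mbytes =
+ * 1536 and bytes = 0 normalizes to 1GB 512MB (GIGABYTE / MEGABYTE is
+ * 1024) and prints "1GB 512MB\t<msg>"; all-zero input prints "0".
+ */
+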
+/*
+ * __db_print_reginfo --
+ * Print out underlying shared region information.
+ *
+ * PUBLIC: void __db_print_reginfo
+ * PUBLIC: __P((ENV *, REGINFO *, const char *, u_int32_t));
+ */
+void
+__db_print_reginfo(env, infop, s, flags)
+ ENV *env;
+ REGINFO *infop;
+ const char *s;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { REGION_CREATE, "REGION_CREATE" },
+ { REGION_CREATE_OK, "REGION_CREATE_OK" },
+ { REGION_JOIN_OK, "REGION_JOIN_OK" },
+ { REGION_SHARED, "REGION_SHARED" },
+ { 0, NULL }
+ };
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "%s REGINFO information:", s);
+ STAT_STRING("Region type", __reg_type(infop->type));
+ STAT_ULONG("Region ID", infop->id);
+ STAT_STRING("Region name", infop->name);
+ STAT_POINTER("Region address", infop->addr);
+ STAT_POINTER("Region allocation head", infop->head);
+ STAT_POINTER("Region primary address", infop->primary);
+ STAT_ULONG("Region maximum allocation", infop->max_alloc);
+ STAT_ULONG("Region allocated", infop->allocated);
+ __env_alloc_print(infop, flags);
+
+ __db_prflags(env, NULL, infop->flags, fn, NULL, "\tRegion flags");
+}
+
+/*
+ * __reg_type --
+ * Return the region type string.
+ */
+static const char *
+__reg_type(t)
+ reg_type_t t;
+{
+ switch (t) {
+ case REGION_TYPE_ENV:
+ return ("Environment");
+ case REGION_TYPE_LOCK:
+ return ("Lock");
+ case REGION_TYPE_LOG:
+ return ("Log");
+ case REGION_TYPE_MPOOL:
+ return ("Mpool");
+ case REGION_TYPE_MUTEX:
+ return ("Mutex");
+ case REGION_TYPE_TXN:
+ return ("Transaction");
+ case INVALID_REGION_TYPE:
+ return ("Invalid");
+ }
+ return ("Unknown");
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * __db_stat_not_built --
+ * Common error routine when library not built with statistics.
+ *
+ * PUBLIC: int __db_stat_not_built __P((ENV *));
+ */
+int
+__db_stat_not_built(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("1554",
+ "Library build did not include statistics support"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__env_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/fileops/fileops.src b/src/fileops/fileops.src
new file mode 100644
index 00000000..cdb6af27
--- /dev/null
+++ b/src/fileops/fileops.src
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __fop
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE #include "dbinc/fop.h"
+INCLUDE
+
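+/*
+ * A note on the record description format (inferred from the generated
+ * fileops_auto.c/fileops_autop.c): BEGIN <name> <version> <record-id>
+ * opens a record type, where the version is the release that introduced
+ * the layout (42 = 4.2, 48 = 4.8) and the id becomes DB___fop_<name>.
+ * DBT fields hold variable-length byte strings; ARG fields hold scalar
+ * values whose trailing token (lu, o, lx) is the printf conversion used
+ * by the generated print routine. BEGIN_COMPAT describes an older
+ * layout kept readable for recovering logs written by earlier releases,
+ * and DUPLICATE emits a second record type sharing the same structure.
+ */
+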
+/*
+ * create -- create a file system object.
+ *
+ * name: name in the file system
+ * appname: indicates if the name needs to go through __db_appname
+ * mode: file system mode
+ */
+BEGIN_COMPAT create 42 143
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG mode u_int32_t o
+END
+
+BEGIN create 48 143
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG mode u_int32_t o
+END
+
+/*
+ * remove -- remove a file system object.
+ *
+ * name: name in the file system
+ * appname: indicates if the name needs to go through __db_appname
+ */
+BEGIN remove 42 144
+DBT name DBT s
+DBT fid DBT s
+ARG appname u_int32_t lu
+END
+
+/*
+ * write: log the writing of data into an object.
+ *
+ * name: file containing the page.
+ * appname: indicates if the name needs to go through __db_appname
+ * pgsize: page size.
+ * pageno: page number in the file.
+ * offset: offset on the page.
+ * page: the actual meta-data page.
+ * flag: non-0 indicates that this is a tempfile, so we needn't undo
+ * these modifications (we'll toss the file).
+ */
+BEGIN_COMPAT write 42 145
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG pgsize u_int32_t lu
+ARG pageno db_pgno_t lu
+ARG offset u_int32_t lu
+DBT page DBT s
+ARG flag u_int32_t lu
+END
+
+BEGIN write 48 145
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG pgsize u_int32_t lu
+ARG pageno db_pgno_t lu
+ARG offset u_int32_t lu
+DBT page DBT s
+ARG flag u_int32_t lu
+END
+
+/*
+ * rename: move a file from one name to another.
+ * The appname value indicates if this is a path name that should be used
+ * directly (i.e., no interpretation) or if it is a pathname that should
+ * be interpreted via calls to __db_appname. The fileid is the 20-byte
+ * DB fileid of the file being renamed. We need to check it on recovery
+ * so that we don't inadvertently overwrite good files.
+ *
+ * There are two variants of this log record: one that must be both done
+ * and undone and one that is not undone (used for renaming tmp files, see
+ * SR #15119)
+ *
+ * These two record types use the same structure, read, and print functions,
+ * but have different recovery functions.
+ */
+BEGIN_COMPAT rename 42 146
+DUPLICATE rename_noundo 46 150
+DBT oldname DBT s
+DBT newname DBT s
+DBT fileid DBT s
+ARG appname u_int32_t lu
+END
+
+BEGIN rename 48 146
+DUPLICATE rename_noundo 46 150
+DBT oldname DBT s
+DBT newname DBT s
+DBT dirname DBT s
+DBT fileid DBT s
+ARG appname u_int32_t lu
+END
+
+/*
+ * File removal record. This is a DB-level log record that indicates
+ * we've just completed some form of file removal. The purpose of this
+ * log record is to logically identify the particular instance of the
+ * named file so that during recovery, in deciding if we should roll-forward
+ * a remove or a rename, we can make sure that we don't roll one forward and
+ * delete or overwrite the wrong file.
+ * real_fid: The 20-byte unique file identifier of the original file being
+ * removed.
+ * tmp_fid: The unique fid of the tmp file that is removed.
+ * name: The pre-__db_appname name of the file.
+ * child: The transaction that removed or renamed the file.
+ */
+BEGIN file_remove 42 141
+DBT real_fid DBT s
+DBT tmp_fid DBT s
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG child u_int32_t lx
+END
diff --git a/src/fileops/fileops_auto.c b/src/fileops/fileops_auto.c
new file mode 100644
index 00000000..0db619a5
--- /dev/null
+++ b/src/fileops/fileops_auto.c
@@ -0,0 +1,118 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+
+DB_LOG_RECSPEC __fop_create_42_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_create_42_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_create_42_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_create_42_args, mode), "mode", "%o"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_create_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_create_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_create_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_create_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_create_args, mode), "mode", "%o"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_remove_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_remove_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_remove_args, fid), "fid", ""},
+ {LOGREC_ARG, SSZ(__fop_remove_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_write_42_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_write_42_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, pageno), "pageno", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, offset), "offset", "%lu"},
+ {LOGREC_DBT, SSZ(__fop_write_42_args, page), "page", ""},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_write_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_write_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_write_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_write_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_args, pageno), "pageno", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_args, offset), "offset", "%lu"},
+ {LOGREC_DBT, SSZ(__fop_write_args, page), "page", ""},
+ {LOGREC_ARG, SSZ(__fop_write_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_42_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_42_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_noundo_46_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_42_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, dirname), "dirname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_noundo_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, dirname), "dirname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_file_remove_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_file_remove_args, real_fid), "real_fid", ""},
+ {LOGREC_DBT, SSZ(__fop_file_remove_args, tmp_fid), "tmp_fid", ""},
+ {LOGREC_DBT, SSZ(__fop_file_remove_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_file_remove_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_file_remove_args, child), "child", "%lx"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __fop_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__fop_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_create_recover, DB___fop_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_remove_recover, DB___fop_remove)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_recover, DB___fop_write)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_recover, DB___fop_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_noundo_recover, DB___fop_rename_noundo)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_file_remove_recover, DB___fop_file_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/fileops/fileops_autop.c b/src/fileops/fileops_autop.c
new file mode 100644
index 00000000..6e271a17
--- /dev/null
+++ b/src/fileops/fileops_autop.c
@@ -0,0 +1,177 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+
+/*
+ * PUBLIC: int __fop_create_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_create_42", __fop_create_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_create_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_create", __fop_create_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_remove_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_remove_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_remove", __fop_remove_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_write_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_write_42", __fop_write_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_write_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_write", __fop_write_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_rename_42", __fop_rename_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_rename_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_rename", __fop_rename_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_file_remove_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_file_remove", __fop_file_remove_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__fop_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_create_print, DB___fop_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_remove_print, DB___fop_remove)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_print, DB___fop_write)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_print, DB___fop_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_print, DB___fop_rename_noundo)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_file_remove_print, DB___fop_file_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/fileops/fop_basic.c b/src/fileops/fop_basic.c
new file mode 100644
index 00000000..d6c707f2
--- /dev/null
+++ b/src/fileops/fop_basic.c
@@ -0,0 +1,318 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+/*
+ * The transactional guarantees Berkeley DB provides for file
+ * system level operations (database physical file create, delete,
+ * rename) are based on our understanding of current file system
+ * semantics; a system that does not provide these semantics and
+ * guarantees could be in danger.
+ *
+ * First, as in standard database changes, fsync and fdatasync must
+ * work: when applied to the log file, the records written into the
+ * log must be transferred to stable storage.
+ *
+ * Second, it must not be possible for the log file to be removed
+ * without previous file system level operations being flushed to
+ * stable storage. Berkeley DB applications write log records
+ * describing file system operations into the log, then perform the
+ * file system operation, then commit the enclosing transaction
+ * (which flushes the log file to stable storage). Subsequently,
+ * a database environment checkpoint may make it possible for the
+ * application to remove the log file containing the record of the
+ * file system operation. DB's transactional guarantees for file
+ * system operations require the log file removal not succeed until
+ * all previous filesystem operations have been flushed to stable
+ * storage. In other words, the flush of the log file, or the
+ * removal of the log file, must block until all previous
+ * filesystem operations have been flushed to stable storage. This
+ * semantic is not, as far as we know, required by any existing
+ * standards document, but we have never seen a filesystem where
+ * it does not apply.
+ */
+
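+/*
+ * In sketch form, the routines below all follow the same ordering
+ * (simplified; see each function for the real sequence):
+ *
+ *	1. __fop_xxx_log(env, txn, ...)   write, and often flush, the record
+ *	2. __os_open/__os_write/rename    perform the file system operation
+ *	3. transaction commit             flush the log to stable storage
+ *
+ * Recovery undoes or redoes step 2 using the record from step 1.
+ */
+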
+/*
+ * __fop_create --
+ * Create a (transactionally protected) file system object. This is used
+ * to create DB files now, potentially blobs, queue extents and anything
+ * else you wish to store in a file system object.
+ *
+ * PUBLIC: int __fop_create __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_FH **, const char *, const char **, APPNAME, int, u_int32_t));
+ */
+int
+__fop_create(env, txn, fhpp, name, dirp, appname, mode, flags)
+ ENV *env;
+ DB_TXN *txn;
+ DB_FH **fhpp;
+ const char *name, **dirp;
+ APPNAME appname;
+ int mode;
+ u_int32_t flags;
+{
+ DBT data, dirdata;
+ DB_FH *fhp;
+ DB_LSN lsn;
+ int ret;
+ char *real_name;
+
+ real_name = NULL;
+ fhp = NULL;
+
+ if ((ret = __db_appname(env, appname, name, dirp, &real_name)) != 0)
+ return (ret);
+
+ if (mode == 0)
+ mode = DB_MODE_600;
+
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ DB_INIT_DBT(data, name, strlen(name) + 1);
+ if (dirp != NULL && *dirp != NULL)
+ DB_INIT_DBT(dirdata, *dirp, strlen(*dirp) + 1);
+ else
+ memset(&dirdata, 0, sizeof(dirdata));
+ if ((ret = __fop_create_log(env, txn, &lsn,
+ flags | DB_FLUSH,
+ &data, &dirdata, (u_int32_t)appname, (u_int32_t)mode)) != 0)
+ goto err;
+ }
+
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_POSTLOG, ret, name);
+
+ if (fhpp == NULL)
+ fhpp = &fhp;
+ ret = __os_open(
+ env, real_name, 0, DB_OSO_CREATE | DB_OSO_EXCL, mode, fhpp);
+
+err:
+DB_TEST_RECOVERY_LABEL
+ if (fhpp == &fhp && fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
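+/*
+ * Illustrative call (a sketch with assumed arguments, not code from
+ * this tree):
+ *
+ *	DB_FH *fhp = NULL;
+ *	if ((ret = __fop_create(env, txn, &fhp, "a.db",
+ *	    NULL, DB_APP_DATA, DB_MODE_600, 0)) == 0)
+ *		(void)__os_closehandle(env, fhp);
+ *
+ * The create log record is written (with DB_FLUSH) before __os_open is
+ * attempted, so a file whose create was never logged cannot survive a
+ * crash unexplained.
+ */
+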
+/*
+ * __fop_remove --
+ * Remove a file system object.
+ *
+ * PUBLIC: int __fop_remove __P((ENV *, DB_TXN *,
+ * PUBLIC: u_int8_t *, const char *, const char **, APPNAME, u_int32_t));
+ */
+int
+__fop_remove(env, txn, fileid, name, dirp, appname, flags)
+ ENV *env;
+ DB_TXN *txn;
+ u_int8_t *fileid;
+ const char *name, **dirp;
+ APPNAME appname;
+ u_int32_t flags;
+{
+ DBT fdbt, ndbt;
+ DB_LSN lsn;
+ char *real_name;
+ int ret;
+
+ real_name = NULL;
+
+ if ((ret = __db_appname(env, appname, name, dirp, &real_name)) != 0)
+ goto err;
+
+ if (!IS_REAL_TXN(txn)) {
+ if (fileid != NULL && (ret = __memp_nameop(
+ env, fileid, NULL, real_name, NULL, 0)) != 0)
+ goto err;
+ } else {
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ memset(&fdbt, 0, sizeof(fdbt));
+ fdbt.data = fileid;
+ fdbt.size = fileid == NULL ? 0 : DB_FILE_ID_LEN;
+ DB_INIT_DBT(ndbt, name, strlen(name) + 1);
+ if ((ret = __fop_remove_log(env, txn, &lsn,
+ flags, &ndbt, &fdbt, (u_int32_t)appname)) != 0)
+ goto err;
+ }
+ ret = __txn_remevent(env, txn, real_name, fileid, 0);
+ }
+
+err: if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
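+/*
+ * Note the split above (a reading aid): without a real transaction the
+ * name is removed immediately through __memp_nameop, while inside a
+ * transaction the unlink is deferred to commit by registering a remove
+ * event with __txn_remevent, so an abort simply never performs it.
+ */
+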
+/*
+ * __fop_write
+ *
+ * Write "size" bytes from "buf" to file "name" beginning at offset "off."
+ * If the file is open, supply a handle in fhp. Istmp indicates whether this is
+ * an operation that needs to be undone in the face of failure (i.e., if
+ * this is a write to a temporary file, we're simply going to remove the
+ * file, so don't worry about undoing the write).
+ *
+ * Currently, we *only* use this with istmp true. If we need more general
+ * handling, then we'll have to zero out regions on abort (and possibly
+ * log the before image of the data in the log record).
+ *
+ * PUBLIC: int __fop_write __P((ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, APPNAME, DB_FH *, u_int32_t,
+ * PUBLIC: db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__fop_write(env, txn,
+ name, dirname, appname, fhp, pgsize, pageno, off, buf, size, istmp, flags)
+ ENV *env;
+ DB_TXN *txn;
+ const char *name, *dirname;
+ APPNAME appname;
+ DB_FH *fhp;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t off;
+ void *buf;
+ u_int32_t size, istmp, flags;
+{
+ DBT data, namedbt, dirdbt;
+ DB_LSN lsn;
+ size_t nbytes;
+ int local_open, ret, t_ret;
+ char *real_name;
+
+ DB_ASSERT(env, istmp != 0);
+
+ ret = local_open = 0;
+ real_name = NULL;
+
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ memset(&data, 0, sizeof(data));
+ data.data = buf;
+ data.size = size;
+ DB_INIT_DBT(namedbt, name, strlen(name) + 1);
+ if (dirname != NULL)
+ DB_INIT_DBT(dirdbt, dirname, strlen(dirname) + 1);
+ else
+ memset(&dirdbt, 0, sizeof(dirdbt));
+ if ((ret = __fop_write_log(env, txn,
+ &lsn, flags, &namedbt, &dirdbt, (u_int32_t)appname,
+ pgsize, pageno, off, &data, istmp)) != 0)
+ goto err;
+ }
+
+ if (fhp == NULL) {
+ /* File isn't open; we need to reopen it. */
+ if ((ret = __db_appname(env,
+ appname, name, &dirname, &real_name)) != 0)
+ return (ret);
+
+ if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0)
+ goto err;
+ local_open = 1;
+ }
+
+ /* Seek to offset. */
+ if ((ret = __os_seek(env, fhp, pageno, pgsize, off)) != 0)
+ goto err;
+
+ /* Now do the write. */
+ if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0)
+ goto err;
+
+err: if (local_open &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
+/*
+ * __fop_rename --
+ * Change a file's name.
+ *
+ * PUBLIC: int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *,
+ * PUBLIC: const char **, u_int8_t *, APPNAME, int, u_int32_t));
+ */
+int
+__fop_rename(env, txn, oldname, newname, dirp, fid, appname, with_undo, flags)
+ ENV *env;
+ DB_TXN *txn;
+ const char *oldname;
+ const char *newname;
+ const char **dirp;
+ u_int8_t *fid;
+ APPNAME appname;
+ int with_undo;
+ u_int32_t flags;
+{
+ DBT fiddbt, dir, new, old;
+ DB_LSN lsn;
+ int ret;
+ char *n, *o;
+
+ o = n = NULL;
+ if ((ret = __db_appname(env, appname, oldname, dirp, &o)) != 0)
+ goto err;
+ if ((ret = __db_appname(env, appname, newname, dirp, &n)) != 0)
+ goto err;
+
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ DB_INIT_DBT(old, oldname, strlen(oldname) + 1);
+ DB_INIT_DBT(new, newname, strlen(newname) + 1);
+ if (dirp != NULL && *dirp != NULL)
+ DB_INIT_DBT(dir, *dirp, strlen(*dirp) + 1);
+ else
+ memset(&dir, 0, sizeof(dir));
+ memset(&fiddbt, 0, sizeof(fiddbt));
+ fiddbt.data = fid;
+ fiddbt.size = DB_FILE_ID_LEN;
+ if (with_undo)
+ ret = __fop_rename_log(env,
+ txn, &lsn, flags | DB_FLUSH,
+ &old, &new, &dir, &fiddbt, (u_int32_t)appname);
+ else
+ ret = __fop_rename_noundo_log(env,
+ txn, &lsn, flags | DB_FLUSH,
+ &old, &new, &dir, &fiddbt, (u_int32_t)appname);
+ if (ret != 0)
+ goto err;
+ }
+
+ ret = __memp_nameop(env, fid, newname, o, n, 0);
+
+err: if (o != NULL)
+ __os_free(env, o);
+ if (n != NULL)
+ __os_free(env, n);
+ return (ret);
+}
diff --git a/src/fileops/fop_rec.c b/src/fileops/fop_rec.c
new file mode 100644
index 00000000..52d6175d
--- /dev/null
+++ b/src/fileops/fop_rec.c
@@ -0,0 +1,697 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __fop_rename_recover_int
+ __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
+static int __fop_rename_42_recover_int
+ __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
+
+/*
+ * The transactional guarantees Berkeley DB provides for file
+ * system level operations (database physical file create, delete,
+ * rename) are based on our understanding of current file system
+ * semantics; a system that does not provide these semantics and
+ * guarantees could be in danger.
+ *
+ * First, as in standard database changes, fsync and fdatasync must
+ * work: when applied to the log file, the records written into the
+ * log must be transferred to stable storage.
+ *
+ * Second, it must not be possible for the log file to be removed
+ * without previous file system level operations being flushed to
+ * stable storage. Berkeley DB applications write log records
+ * describing file system operations into the log, then perform the
+ * file system operation, then commit the enclosing transaction
+ * (which flushes the log file to stable storage). Subsequently,
+ * a database environment checkpoint may make it possible for the
+ * application to remove the log file containing the record of the
+ * file system operation. DB's transactional guarantees for file
+ * system operations require the log file removal not succeed until
+ * all previous filesystem operations have been flushed to stable
+ * storage. In other words, the flush of the log file, or the
+ * removal of the log file, must block until all previous
+ * filesystem operations have been flushed to stable storage. This
+ * semantic is not, as far as we know, required by any existing
+ * standards document, but we have never seen a filesystem where
+ * it does not apply.
+ */
+
+/*
+ * __fop_create_recover --
+ * Recovery function for create.
+ *
+ * PUBLIC: int __fop_create_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_create_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_create_args *argp;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+ char *real_name;
+ const char *dirname;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_create_print);
+ REC_NOOP_INTRO(__fop_create_read);
+ meta = (DBMETA *)mbuf;
+
+ if (argp->dirname.size == 0)
+ dirname = NULL;
+ else
+ dirname = (const char *)argp->dirname.data;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname == DB_APP_DATA ?
+ DB_APP_RECOVER : (APPNAME)argp->appname,
+ (const char *)argp->name.data, &dirname, &real_name)) != 0)
+ goto out;
+
+ if (DB_UNDO(op)) {
+ /*
+ * If the file was opened in mpool, we must mark it as
+ * dead via nameop, which will also unlink the file.
+ */
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
+ if (__fop_read_meta(env,
+ real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0) {
+ if ((ret = __memp_nameop(env,
+ meta->uid, NULL, real_name, NULL, 0)) != 0)
+ goto out;
+ } else {
+ (void)__os_closehandle(env, fhp);
+ goto do_unlink;
+ }
+ (void)__os_closehandle(env, fhp);
+ } else
+do_unlink: (void)__os_unlink(env, real_name, 0);
+ } else if (DB_REDO(op)) {
+ if ((ret = __os_open(env, real_name, 0,
+ DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
+ (void)__os_closehandle(env, fhp);
+ else
+ goto out;
+ }
+
+ *lsnp = argp->prev_lsn;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_create_42_recover --
+ * Recovery function for create.
+ *
+ * PUBLIC: int __fop_create_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_create_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_create_args *argp;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+ char *real_name;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_create_print);
+ REC_NOOP_INTRO(__fop_create_read);
+ meta = (DBMETA *)mbuf;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ if (DB_UNDO(op)) {
+ /*
+ * If the file was opened in mpool, we must mark it as
+ * dead via nameop, which will also unlink the file.
+ */
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
+ if (__fop_read_meta(env,
+ real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0) {
+ if ((ret = __memp_nameop(env,
+ meta->uid, NULL, real_name, NULL, 0)) != 0)
+ goto out;
+ } else
+ goto do_unlink;
+ (void)__os_closehandle(env, fhp);
+ } else
+do_unlink: (void)__os_unlink(env, real_name, 0);
+ } else if (DB_REDO(op)) {
+ if ((ret = __os_open(env, real_name, 0,
+ DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
+ (void)__os_closehandle(env, fhp);
+ else
+ goto out;
+ }
+
+ *lsnp = argp->prev_lsn;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_remove_recover --
+ * Recovery function for remove.
+ *
+ * PUBLIC: int __fop_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_remove_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_remove_args *argp;
+ int ret;
+ char *real_name;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_remove_print);
+ REC_NOOP_INTRO(__fop_remove_read);
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ /* It's OK if the file is not there. */
+ if (DB_REDO(op))
+ (void)__memp_nameop(env,
+ (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0);
+
+ *lsnp = argp->prev_lsn;
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_write_recover --
+ * Recovery function for writechunk.
+ *
+ * PUBLIC: int __fop_write_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_write_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__fop_write_print);
+ REC_NOOP_INTRO(__fop_write_read);
+
+ ret = 0;
+ if (DB_UNDO(op))
+ DB_ASSERT(env, argp->flag != 0);
+ else if (DB_REDO(op))
+ ret = __fop_write(env,
+ argp->txnp, argp->name.data,
+ argp->dirname.size == 0 ? NULL : argp->dirname.data,
+ (APPNAME)argp->appname == DB_APP_DATA ? DB_APP_RECOVER :
+ (APPNAME)argp->appname,
+ NULL, argp->pgsize, argp->pageno, argp->offset,
+ argp->page.data, argp->page.size, argp->flag, 0);
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_write_42_recover --
+ * Recovery function for writechunk.
+ *
+ * PUBLIC: int __fop_write_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_write_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__fop_write_print);
+ REC_NOOP_INTRO(__fop_write_read);
+
+ ret = 0;
+ if (DB_UNDO(op))
+ DB_ASSERT(env, argp->flag != 0);
+ else if (DB_REDO(op))
+ ret = __fop_write(env,
+ argp->txnp, argp->name.data, NULL, (APPNAME)argp->appname,
+ NULL, argp->pgsize, argp->pageno, argp->offset,
+ argp->page.data, argp->page.size, argp->flag, 0);
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_rename_recover --
+ * Recovery functions for rename. There are two variants that
+ * both use the same utility function. Had we known about this on day
+ * one, we would have simply added a parameter. However, since we need
+ * to retain old records for backward compatibility (online-upgrade)
+ * wrapping the two seems like the right solution.
+ *
+ * PUBLIC: int __fop_rename_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * PUBLIC: int __fop_rename_noundo_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_rename_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1));
+}
+
+int
+__fop_rename_noundo_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0));
+}
+
+static int
+__fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+ int undo;
+{
+ __fop_rename_args *argp;
+ APPNAME appname;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t *fileid, mbuf[DBMETASIZE];
+ int ret;
+ char *real_new, *real_old, *src;
+ const char *dirname;
+
+ COMPQUIET(info, NULL);
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ ret = 0;
+ real_new = real_old = NULL;
+
+ REC_PRINT(__fop_rename_print);
+ REC_NOOP_INTRO(__fop_rename_read);
+ fileid = argp->fileid.data;
+
+ if (argp->dirname.size == 0)
+ dirname = NULL;
+ else
+ dirname = (const char *)argp->dirname.data;
+
+ if ((APPNAME)argp->appname == DB_APP_DATA)
+ appname = DB_APP_RECOVER;
+ else
+ appname = (APPNAME)argp->appname;
+
+ if ((ret = __db_appname(env, appname, (const char *)argp->newname.data,
+ &dirname, &real_new)) != 0)
+ goto out;
+ if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data,
+ &dirname, &real_old)) != 0)
+ goto out;
+
+ /*
+ * Verify that we are manipulating the correct file. We should always
+ * be OK on an ABORT or an APPLY, but during recovery, we have to
+ * check.
+ */
+ if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
+ src = DB_UNDO(op) ? real_new : real_old;
+ /*
+ * Interpret any error as meaning that the file either doesn't
+ * exist, doesn't have a meta-data page, or is in some other
+ * way, shape or form, incorrect, so that we should not restore
+ * it.
+ */
+ if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
+ goto done;
+ if (__fop_read_meta(env,
+ src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
+ goto done;
+ if (__db_chk_meta(env, NULL, meta, 1) != 0)
+ goto done;
+ if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
+ goto done;
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ if (DB_REDO(op)) {
+ /*
+ * Check to see if the target file exists. If it
+ * does and it does not have the proper id then
+ * it is a later version. We just remove the source
+ * file since the state of the world is beyond this
+ * point.
+ */
+ if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
+ __fop_read_meta(env, src, mbuf,
+ DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0 &&
+ memcmp(argp->fileid.data,
+ meta->uid, DB_FILE_ID_LEN) != 0) {
+ (void)__memp_nameop(env,
+ fileid, NULL, real_old, NULL, 0);
+ goto done;
+ }
+ }
+ }
+
+ if (undo && DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data, real_new, real_old, 0);
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data, real_old, real_new, 0);
+
+done: *lsnp = argp->prev_lsn;
+out: if (real_new != NULL)
+ __os_free(env, real_new);
+ if (real_old != NULL)
+ __os_free(env, real_old);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+
+ REC_NOOP_CLOSE;
+}
+/*
+ * __fop_rename_42_recover --
+ * Recovery functions for rename. There are two variants that
+ * both use the same utility function. Had we known about this on day
+ * one, we would have simply added a parameter. However, since we need
+ * to retain old records for backward compatibility (online-upgrade),
+ * wrapping the two seems like the right solution.
+ *
+ * PUBLIC: int __fop_rename_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * PUBLIC: int __fop_rename_noundo_46_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_rename_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 1));
+}
+
+int
+__fop_rename_noundo_46_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 0));
+}
+
+static int
+__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+ int undo;
+{
+ __fop_rename_args *argp;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t *fileid, mbuf[DBMETASIZE];
+ int ret;
+ char *real_new, *real_old, *src;
+
+ COMPQUIET(info, NULL);
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ ret = 0;
+ real_new = real_old = NULL;
+
+ REC_PRINT(__fop_rename_print);
+ REC_NOOP_INTRO(__fop_rename_read);
+ fileid = argp->fileid.data;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->newname.data, NULL, &real_new)) != 0)
+ goto out;
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->oldname.data, NULL, &real_old)) != 0)
+ goto out;
+
+ /*
+ * Verify that we are manipulating the correct file. We should always
+ * be OK on an ABORT or an APPLY, but during recovery, we have to
+ * check.
+ */
+ if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
+ src = DB_UNDO(op) ? real_new : real_old;
+ /*
+ * Interpret any error as meaning that the file either doesn't
+ * exist, doesn't have a meta-data page, or is in some other
+ * way, shape or form, incorrect, so that we should not restore
+ * it.
+ */
+ if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
+ goto done;
+ if (__fop_read_meta(env,
+ src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
+ goto done;
+ if (__db_chk_meta(env, NULL, meta, 1) != 0)
+ goto done;
+ if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
+ goto done;
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ if (DB_REDO(op)) {
+ /*
+ * Check to see if the target file exists. If it
+ * does and it does not have the proper id then
+ * it is a later version. We just remove the source
+ * file since the state of the world is beyond this
+ * point.
+ */
+ if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
+ __fop_read_meta(env, src, mbuf,
+ DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0 &&
+ memcmp(argp->fileid.data,
+ meta->uid, DB_FILE_ID_LEN) != 0) {
+ (void)__memp_nameop(env,
+ fileid, NULL, real_old, NULL, 0);
+ goto done;
+ }
+ }
+ }
+
+ if (undo && DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data, real_new, real_old, 0);
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data, real_old, real_new, 0);
+
+done: *lsnp = argp->prev_lsn;
+out: if (real_new != NULL)
+ __os_free(env, real_new);
+ if (real_old != NULL)
+ __os_free(env, real_old);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_file_remove_recover --
+ * Recovery function for file_remove. On the REDO pass, we need to
+ * make sure no one recreated the file while we weren't looking.  On the
+ * UNDO pass, we must check whether the file we are interested in is the
+ * one that exists, and then set the status of the child transaction
+ * depending on what we find out.
+ *
+ * PUBLIC: int __fop_file_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_file_remove_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_file_remove_args *argp;
+ DBMETA *meta;
+ DB_FH *fhp;
+ size_t len;
+ u_int8_t mbuf[DBMETASIZE];
+ u_int32_t cstat, ret_stat;
+ int is_real, is_tmp, ret;
+ char *real_name;
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ is_real = is_tmp = 0;
+ real_name = NULL;
+ REC_PRINT(__fop_file_remove_print);
+ REC_NOOP_INTRO(__fop_file_remove_read);
+
+ /*
+ * This record is only interesting on the backward, forward, and
+ * apply phases.
+ */
+ if (op != DB_TXN_BACKWARD_ROLL &&
+ op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
+ goto done;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ /* Verify that we are manipulating the correct file. */
+ len = 0;
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
+ (ret = __fop_read_meta(env, real_name,
+ mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
+ /*
+ * If len is non-zero, then the file exists and has something
+ * in it, but that something isn't a full meta-data page, so
+ * this is very bad. Bail out!
+ */
+ if (len != 0)
+ goto out;
+
+ /* File does not exist. */
+ cstat = TXN_EXPECTED;
+ } else {
+ /*
+ * We can ignore errors here since we'll simply fail the
+ * checks below and assume this is the wrong file.
+ */
+ (void)__db_chk_meta(env, NULL, meta, 1);
+ is_real =
+ memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
+ is_tmp =
+ memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
+
+ if (!is_real && !is_tmp)
+ /* File exists, but isn't what we were removing. */
+ cstat = TXN_IGNORE;
+ else
+ /* File exists and is the one that we were removing. */
+ cstat = TXN_COMMIT;
+ }
+ if (fhp != NULL) {
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ }
+
+ if (DB_UNDO(op)) {
+ /* On the backward pass, we leave a note for the child txn. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
+ goto out;
+ } else if (DB_REDO(op)) {
+ /*
+ * On the forward pass, check if someone recreated the
+ * file while we weren't looking.
+ */
+ if (cstat == TXN_COMMIT)
+ (void)__memp_nameop(env,
+ is_real ? argp->real_fid.data : argp->tmp_fid.data,
+ NULL, real_name, NULL, 0);
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ REC_NOOP_CLOSE;
+}
diff --git a/src/fileops/fop_util.c b/src/fileops/fop_util.c
new file mode 100644
index 00000000..1925ffd1
--- /dev/null
+++ b/src/fileops/fop_util.c
@@ -0,0 +1,1841 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
+static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
+static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t,
+ u_int32_t));
+static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
+ const char *, const char *, const char *, DB_LOCKER *));
+static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
+ const char *, const char *, const char *, DB_LOCKER *));
+
+/*
+ * Acquire the environment meta-data lock. The parameters are the
+ * environment (ENV), the locker id to use in acquiring the lock (ID)
+ * and a pointer to a DB_LOCK.
+ *
+ * !!!
+ * Turn off locking for Critical Path. The application must do its own
+ * synchronization of open/create. Two threads creating and opening a
+ * file at the same time may have unpredictable results.
+ */
+#ifdef CRITICALPATH_10266
+#define GET_ENVLOCK(ENV, ID, L) (0)
+#else
+#define GET_ENVLOCK(ENV, ID, L) do { \
+ DBT __dbt; \
+ u_int32_t __lockval; \
+ \
+ if (LOCKING_ON((ENV))) { \
+ __lockval = 1; \
+ __dbt.data = &__lockval; \
+ __dbt.size = sizeof(__lockval); \
+ if ((ret = __lock_get((ENV), (ID), \
+ 0, &__dbt, DB_LOCK_WRITE, (L))) != 0) \
+ goto err; \
+ } \
+} while (0)
+#endif
+
+#define RESET_MPF(D, F) do { \
+ (void)__memp_fclose((D)->mpf, (F)); \
+ (D)->mpf = NULL; \
+ F_CLR((D), DB_AM_OPEN_CALLED); \
+ if ((ret = __memp_fcreate((D)->env, &(D)->mpf)) != 0) \
+ goto err; \
+} while (0)
+
+/*
+ * If we open a file handle and our caller is doing fcntl(2) locking,
+ * we can't close the handle because that would discard the caller's
+ * lock. Save it until we close or refresh the DB handle.
+ */
+#define CLOSE_HANDLE(D, F) { \
+ if ((F) != NULL) { \
+ if (LF_ISSET(DB_FCNTL_LOCKING)) \
+ (D)->saved_open_fhp = (F); \
+ else if ((t_ret = \
+ __os_closehandle((D)->env, (F))) != 0) { \
+ if (ret == 0) \
+ ret = t_ret; \
+ goto err; \
+ } \
+ (F) = NULL; \
+ } \
+}
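+
+/*
+ * A minimal usage sketch for the two macros above (illustrative only):
+ * both expand into code that depends on surrounding context -- a local
+ * "ret" (and "t_ret" for CLOSE_HANDLE), an "err" label to jump to on
+ * failure and, for CLOSE_HANDLE, a local "flags" variable tested by
+ * LF_ISSET.  A caller therefore looks roughly like:
+ *
+ *	int ret, t_ret;
+ *	DB_LOCK elock;
+ *
+ *	LOCK_INIT(elock);
+ *	GET_ENVLOCK(env, locker, &elock);	(jumps to err on failure)
+ *	...
+ *	CLOSE_HANDLE(dbp, fhp);
+ *
+ *err:	(void)__ENV_LPUT(env, elock);
+ */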
+
+/*
+ * __fop_lock_handle --
+ *
+ * Get the handle lock for a database. If the envlock is specified, do this
+ * as a lock_vec call that releases the environment lock before acquiring the
+ * handle lock.
+ *
+ * PUBLIC: int __fop_lock_handle __P((ENV *,
+ * PUBLIC: DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
+ *
+ */
+int
+__fop_lock_handle(env, dbp, locker, mode, elockp, flags)
+ ENV *env;
+ DB *dbp;
+ DB_LOCKER *locker;
+ db_lockmode_t mode;
+ DB_LOCK *elockp;
+ u_int32_t flags;
+{
+ DBT fileobj;
+ DB_LOCKREQ reqs[2], *ereq;
+ DB_LOCK_ILOCK lock_desc;
+ int ret;
+
+ if (!LOCKING_ON(env) ||
+ F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
+ return (0);
+
+ /*
+ * If we are in recovery, the only locking we should be
+ * doing is on the global environment. The one exception
+ * is if we are opening an exclusive database on a client
+ * syncing with the master.
+ */
+ if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL))
+ return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp));
+
+ memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
+ lock_desc.pgno = dbp->meta_pgno;
+ lock_desc.type = DB_HANDLE_LOCK;
+
+ memset(&fileobj, 0, sizeof(fileobj));
+ fileobj.data = &lock_desc;
+ fileobj.size = sizeof(lock_desc);
+ DB_TEST_SUBLOCKS(env, flags);
+ if (F2_ISSET(dbp, DB2_AM_INTEXCL))
+ flags |= DB_LOCK_IGNORE_REC;
+ if (elockp == NULL)
+ ret = __lock_get(env, locker,
+ flags, &fileobj, mode, &dbp->handle_lock);
+ else {
+ reqs[0].op = DB_LOCK_PUT;
+ reqs[0].lock = *elockp;
+ reqs[1].op = DB_LOCK_GET;
+ reqs[1].mode = mode;
+ reqs[1].obj = &fileobj;
+ reqs[1].timeout = 0;
+ if ((ret = __lock_vec(env,
+ locker, flags, reqs, 2, &ereq)) == 0) {
+ dbp->handle_lock = reqs[1].lock;
+ if (elockp != &dbp->handle_lock)
+ LOCK_INIT(*elockp);
+ } else if (ereq != reqs)
+ LOCK_INIT(*elockp);
+ }
+
+ dbp->cur_locker = locker;
+ return (ret);
+}
+
+/*
+ * __fop_file_setup --
+ *
+ * Perform all the needed checking and locking to open up or create a
+ * file.
+ *
+ * There's a reason we don't push this code down into the buffer cache.
+ * The problem is that there's no information external to the file that
+ * we can use as a unique ID. UNIX has dev/inode pairs, but they are
+ * not necessarily unique after reboot, if the file was mounted via NFS.
+ * Windows has similar problems, as the FAT filesystem doesn't maintain
+ * dev/inode numbers across reboot. So, we must get something from the
+ * file we can use to ensure that, even after a reboot, the file we're
+ * joining in the cache is the right file for us to join. The solution
+ * we use is to maintain a file ID that's stored in the database, and
+ * that's why we have to open and read the file before calling into the
+ * buffer cache or obtaining a lock (we use this unique fileid to lock
+ * as well as to identify like files in the cache).
+ *
+ * There are a couple of idiosyncrasies that this code must support, in
+ * particular, DB_TRUNCATE and DB_FCNTL_LOCKING. First, we disallow
+ * DB_TRUNCATE in the presence of transactions, since opening a file with
+ * O_TRUNC will result in data being lost in an unrecoverable fashion.
+ * We also disallow DB_TRUNCATE if locking is enabled, because even in
+ * the presence of locking, we cannot avoid race conditions, so allowing
+ * DB_TRUNCATE with locking would be misleading. See SR [#7345] for more
+ * details.
+ *
+ * However, if you are running with neither locking nor transactions, then
+ * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
+ * regardless of its contents.
+ *
+ * FCNTL locking introduces another set of complications. First, the only
+ * reason we support the DB_FCNTL_LOCKING flag is for historical compatibility
+ * with programs like Sendmail and Postfix. In these cases, the caller may
+ * already have a lock on the file; we need to make sure that any file handles
+ * we open remain open, because if we were to close them, the lock held by the
+ * caller would go away. Furthermore, Sendmail and/or Postfix need the ability
+ * to create databases in empty files. So, when you're doing FCNTL locking,
+ * it's reasonable that you are trying to create a database into a 0-length
+ * file and we allow it, while under normal conditions, we do not create
+ * databases if the files already exist and are not Berkeley DB files.
+ *
+ * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
+ * PUBLIC: DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
+ */
+int
+__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ int mode;
+ u_int32_t flags, *retidp;
+{
+ DBTYPE save_type;
+ DB_FH *fhp;
+ DB_LOCK elock;
+ DB_LOCKER *locker;
+ DB_TXN *stxn;
+ ENV *env;
+ size_t len;
+ APPNAME aflags;
+ u_int32_t dflags, oflags;
+ u_int8_t mbuf[DBMETASIZE];
+ int created_locker, create_ok, ret, retries, t_ret, tmp_created;
+ int truncating, was_inval;
+ char *real_name, *real_tmpname, *tmpname;
+ db_lockmode_t lockmode;
+
+ *retidp = TXN_INVALID;
+
+ env = dbp->env;
+ fhp = NULL;
+ LOCK_INIT(elock);
+ stxn = NULL;
+ created_locker = tmp_created = truncating = was_inval = 0;
+ real_name = real_tmpname = tmpname = NULL;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+ aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META :
+ (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA);
+ LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB);
+
+ ret = 0;
+ retries = 0;
+ save_type = dbp->type;
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ lockmode = DB_LOCK_WRITE;
+ else
+ lockmode = DB_LOCK_READ;
+
+ /*
+ * Get a lockerid for this handle. There are paths through queue
+ * rename and remove where this dbp already has a locker, so make
+ * sure we don't clobber it and conflict.
+ */
+ if (LOCKING_ON(env) &&
+ !F_ISSET(dbp, DB_AM_COMPENSATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ dbp->locker == DB_LOCK_INVALIDID) {
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ created_locker = 1;
+ }
+ LOCK_INIT(dbp->handle_lock);
+
+ if (txn != NULL && dbp->locker != NULL && F_ISSET(txn, TXN_INFAMILY)) {
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ goto err;
+ txn = NULL;
+ }
+
+ locker = txn == NULL ? dbp->locker : txn->locker;
+
+ oflags = 0;
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ real_name = (char *)name;
+ else {
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ aflags, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /* Fill in the default file mode. */
+ if (mode == 0)
+ mode = DB_MODE_660;
+
+ if (LF_ISSET(DB_RDONLY))
+ oflags |= DB_OSO_RDONLY;
+ if (LF_ISSET(DB_TRUNCATE))
+ oflags |= DB_OSO_TRUNC;
+ }
+
+ retries = 0;
+ create_ok = LF_ISSET(DB_CREATE);
+ LF_CLR(DB_CREATE);
+
+retry:
+ /*
+ * If we cannot create the file, only retry a few times. We
+ * think we might be in a race with another create, but it could
+ * be that the backup filename exists (that is, is left over from
+	 * be that the backup filename exists (that is, was left over from
+ * page while it is being written and fail the checksum.
+ */
+ if (++retries > DB_RETRY) {
+ __db_errx(env, DB_STR_A("0002",
+ "__fop_file_setup: Retry limit (%d) exceeded", "%d"),
+ DB_RETRY);
+ goto err;
+ }
+ if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
+ GET_ENVLOCK(env, locker, &elock);
+ if (name == NULL)
+ ret = ENOENT;
+ else if (F_ISSET(dbp, DB_AM_INMEM)) {
+ ret = __env_mpool(dbp, name, flags);
+ /*
+		 * We are using __env_mpool as a check for existence.
+ * However, __env_mpool does an actual open and there
+ * are scenarios where the object exists, but cannot be
+ * opened, because our settings don't match those internally.
+ * We need to check for that explicitly. We'll need the
+ * mpool open to read the meta-data page, so we're going to
+ * have to temporarily turn this dbp into an UNKNOWN one.
+ */
+ if (ret == EINVAL) {
+ was_inval = 1;
+ save_type = dbp->type;
+ dbp->type = DB_UNKNOWN;
+ ret = __env_mpool(dbp, name, flags);
+ dbp->type = save_type;
+ }
+ } else
+ ret = __os_exists(env, real_name, NULL);
+
+ if (ret == 0) {
+ /*
+ * If the file exists, there are 5 possible cases:
+ * 1. DB_EXCL was specified so this is an error, unless
+ * this is a file left around after a rename and we
+ * are in the same transaction. This gets decomposed
+ * into several subcases, because we check for various
+ * errors before we know we're in rename.
+ * 2. We are truncating, and it doesn't matter what kind
+ * of file it is, we should open/create it.
+		 * 3. It is 0-length and we are not doing transactions (i.e.,
+		 *    we are sendmail); we should open/create into it.
+		 *    -- on-disk files only!
+		 * 4. It is a Berkeley DB file and we should simply open it.
+ * 5. It is not a BDB file and we should return an error.
+ */
+
+ /* Open file (if there is one). */
+reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
+ __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
+ goto err;
+
+ /* Case 2: DB_TRUNCATE: we must do the creation in place. */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ if (LF_ISSET(DB_EXCL)) {
+ /* Case 1a: DB_EXCL and DB_TRUNCATE. */
+ ret = EEXIST;
+ goto err;
+ }
+ tmpname = (char *)name;
+ goto creat2;
+ }
+
+ /* Cases 1,3-5: we need to read the meta-data page. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (LOGGING_ON(env) && (ret = __env_dbreg_setup(dbp,
+ txn, NULL, name, TXN_INVALID)) != 0)
+ return (ret);
+ ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_CHK_META|DB_CHK_ONLY);
+ } else {
+ ret = __fop_read_meta(env, real_name, mbuf,
+ sizeof(mbuf), fhp,
+ LF_ISSET(DB_NOERROR) ||
+ (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0,
+ &len);
+
+ /* Case 3: 0-length, no txns. */
+ if (ret != 0 && len == 0 && txn == NULL) {
+ if (LF_ISSET(DB_EXCL)) {
+ /*
+ * Case 1b: DB_EXCL and
+ * 0-length file exists.
+ */
+ ret = EEXIST;
+ goto err;
+ }
+ tmpname = (char *)name;
+ if (create_ok)
+ goto creat2;
+ goto done;
+ }
+
+ /*
+ * Case 4: This is a valid file. Now check the
+ * checksum and decrypt the file so the file
+ * id can be obtained for the handle lock. Note that
+ * the checksum can fail if the database is being
+ * written (possible because the handle lock has
+			 * not been obtained yet).  So on a checksum failure,
+			 * retry until the checksum succeeds or the number of
+			 * retries is exhausted, and then return an error.
+ */
+ if (ret == 0 && (ret = __db_chk_meta(env, dbp,
+ (DBMETA *)mbuf, DB_CHK_META)) == DB_CHKSUM_FAIL) {
+ if ((t_ret = __ENV_LPUT(env, elock)) != 0) {
+ ret = t_ret;
+ goto err;
+ }
+ /*
+ * Retry unless the number of retries is
+ * exhausted.
+ */
+				if (retries >= DB_RETRY) {
+ __db_errx(env, DB_STR_A("0210",
+ "%s: metadata page checksum error", "%s"), real_name);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ ret = ENOENT;
+ else
+ ret = EINVAL;
+ goto err;
+ }
+ if ((ret = __os_closehandle(env, fhp)) != 0)
+ goto err;
+ goto retry;
+ }
+ /* Get the file id for the handle lock. */
+ if (ret == 0)
+ memcpy(dbp->fileid,
+ ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
+ }
+
+ /* Case 5: Invalid file. */
+ if (ret != 0)
+ goto err;
+
+ /* Now, get our handle lock. */
+ if ((ret = __fop_lock_handle(env,
+ dbp, locker, lockmode, NULL, DB_LOCK_NOWAIT)) == 0) {
+ if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+ } else if (ret != DB_LOCK_NOTGRANTED ||
+ ((txn != NULL && (F_ISSET(txn, TXN_NOWAIT))) ||
+ F2_ISSET(dbp, DB2_AM_NOWAIT)))
+ goto err;
+ else {
+ PERFMON3(env,
+ race, fop_file_setup, (char *) name, ret, flags);
+ /*
+ * We were unable to acquire the handle lock without
+ * blocking. The fact that we are blocking might mean
+ * that someone else is trying to delete the file.
+ * Since some platforms cannot delete files while they
+ * are open (Windows), we are going to have to close
+ * the file. This would be a problem if we were doing
+ * FCNTL locking, because our closing the handle would
+ * release the FCNTL locks. Fortunately, if we are
+ * doing FCNTL locking, then we should never fail to
+ * acquire our handle lock, so we should never get here.
+ * We assert it here to make sure we aren't destroying
+ * any application level FCNTL semantics.
+ */
+ DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __os_closehandle(env, fhp)) != 0)
+ goto err;
+ fhp = NULL;
+ }
+ if ((ret = __fop_lock_handle(env,
+ dbp, locker, lockmode, &elock, 0)) != 0) {
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ RESET_MPF(dbp, 0);
+ goto err;
+ }
+
+ /*
+ * If we had to wait, we might be waiting on a
+ * dummy file used in create/destroy of a database.
+ * To be sure we have the correct information we
+			 * To be sure we have the correct information, we
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ RESET_MPF(dbp, 0);
+ MAKE_INMEM(dbp);
+ }
+ if ((ret =
+ __ENV_LPUT(env, dbp->handle_lock)) != 0) {
+ LOCK_INIT(dbp->handle_lock);
+ goto err;
+ }
+ goto retry;
+
+ }
+
+ /*
+		 * If we got here, then we have the handle lock; it is now
+		 * safe to check the rest of the meta-data, since the file
+ * will not be deleted out from under the handle.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_SKIP_CHK)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_meta_setup(env, dbp, real_name,
+ (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0)
+ goto err;
+ }
+
+ /*
+ * Check for a file in the midst of a rename. If we find that
+ * the file is in the midst of a rename, it must be the case
+ * that it is in our current transaction (else we would still
+ * be blocking), so we can continue along and create a new file
+ * with the same name. In that case, we have to close the file
+ * handle because we reuse it below. This is a case where
+ * a 'was_inval' above is OK.
+ */
+ if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
+ was_inval = 0;
+ if (create_ok) {
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ RESET_MPF(dbp, DB_MPOOL_DISCARD);
+ } else if ((ret =
+ __os_closehandle(env, fhp)) != 0)
+ goto err;
+ LF_SET(DB_CREATE);
+ goto create;
+ } else {
+ ret = ENOENT;
+ goto err;
+ }
+ }
+
+ /* If we get here, a was_inval is bad. */
+ if (was_inval) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Now, case 1: check for DB_EXCL, because the file that exists
+ * is not in the middle of a rename, so we have an error. This
+ * is a weird case, but we need to make sure that we don't
+ * continue to hold the handle lock, since technically, we
+ * should not have been allowed to open it.
+ */
+ if (LF_ISSET(DB_EXCL)) {
+ ret = __ENV_LPUT(env, dbp->handle_lock);
+ LOCK_INIT(dbp->handle_lock);
+ if (ret == 0)
+ ret = EEXIST;
+ goto err;
+ }
+ goto done;
+ }
+
+ /* File does not exist. */
+#ifdef HAVE_VXWORKS
+ /*
+	 * VxWorks can return file-system-specific error codes, rather
+	 * than ENOENT, if the file does not exist.
+ */
+ if (!create_ok)
+#else
+ if (!create_ok || ret != ENOENT)
+#endif
+ goto err;
+ LF_SET(DB_CREATE);
+ /*
+ * If we were trying to open a non-existent master database
+	 * read-only, clear that here.
+ */
+ LF_CLR(DB_RDONLY);
+ F_CLR(dbp, DB_AM_RDONLY);
+ ret = 0;
+
+ /*
+	 * We need to create the file, which means that we need to set up the file,
+ * the fileid and the locks. Then we need to call the appropriate
+ * routines to create meta-data pages. For in-memory files, we retain
+ * the environment lock, while for on-disk files, we drop the env lock
+ * and create into a temporary.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) &&
+ (ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+
+create: if (txn != NULL && IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0003",
+ "Transactional create on replication client disallowed"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (LOGGING_ON(env) && (ret =
+ __env_dbreg_setup(dbp, txn, NULL, name, TXN_INVALID)) != 0)
+ return (ret);
+ if ((ret = __fop_inmem_create(dbp, name, txn, flags)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
+ goto err;
+ if (TXN_ON(env) && txn != NULL &&
+ (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
+ goto err;
+ if ((ret = __fop_create(env, stxn, &fhp,
+ tmpname, &dbp->dirname, aflags, mode, dflags)) != 0) {
+ /*
+ * If no transactions, there is a race on creating the
+ * backup file, as the backup file name is the same for
+ * all processes. Wait for the other process to finish
+ * with the name.
+ */
+ if (!TXN_ON(env) && ret == EEXIST) {
+ PERFMON3(env,
+ race, fop_file_setup, tmpname, ret, flags);
+ __os_free(env, tmpname);
+ tmpname = NULL;
+ __os_yield(env, 1, 0);
+ goto retry;
+ }
+ goto err;
+ }
+ tmp_created = 1;
+ }
+
+creat2: if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __db_appname(env,
+ aflags, tmpname, &dbp->dirname, &real_tmpname)) != 0)
+ goto err;
+
+ /* Set the pagesize if it isn't yet set. */
+ if (dbp->pgsize == 0 &&
+ (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
+ goto errmsg;
+
+ /* Construct a file_id. */
+ if ((ret =
+ __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
+ goto errmsg;
+ }
+
+ if ((ret = __db_new_file(dbp, ip,
+ F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
+ goto err;
+
+ /* Output the REOPEN record after we create. */
+ if (F_ISSET(dbp, DB_AM_INMEM) && dbp->log_filename != NULL && (ret =
+ __dbreg_log_id(dbp, txn, dbp->log_filename->id, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to close the handle here on platforms where remove and
+ * rename fail if a handle is open (including Windows).
+ */
+ CLOSE_HANDLE(dbp, fhp);
+
+ /*
+ * Now move the file into place unless we are creating in place (because
+ * we created a database in a file that started out 0-length). If
+ * this is an in-memory file, we may or may not hold the environment
+ * lock depending on how we got here.
+ */
+ if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
+ GET_ENVLOCK(env, locker, &elock);
+
+ if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
+ F_CLR(dbp, DB_AM_IN_RENAME);
+ __txn_remrem(env, txn, real_name);
+ } else if (name == tmpname) {
+ /* We created it in place. */
+ } else if (!F_ISSET(dbp, DB_AM_INMEM) &&
+ __os_exists(env, real_name, NULL) == 0) {
+ /*
+ * Someone managed to create the file; remove our temp
+ * and try to open the file that now exists.
+ */
+ (void)__fop_remove(env, NULL,
+ dbp->fileid, tmpname, &dbp->dirname, aflags, dflags);
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ LOCK_INIT(dbp->handle_lock);
+
+ if (stxn != NULL) {
+ ret = __txn_abort(stxn);
+ stxn = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ goto reopen;
+ }
+
+ if (name != NULL && (ret = __fop_lock_handle(env,
+ dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn)|
+ (F2_ISSET(dbp,DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
+ goto err;
+ if (tmpname != NULL &&
+ tmpname != name && (ret = __fop_rename(env, stxn, tmpname,
+ name, &dbp->dirname, dbp->fileid, aflags, 1, dflags)) != 0)
+ goto err;
+ if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+
+ if (stxn != NULL) {
+ *retidp = stxn->txnid;
+ ret = __txn_commit(stxn, 0);
+ stxn = NULL;
+ } else
+ *retidp = TXN_INVALID;
+
+ if (ret != 0)
+ goto err;
+
+ F_SET(dbp, DB_AM_CREATED);
+
+ if (0) {
+errmsg: __db_err(env, ret, "%s", name);
+
+err: CLOSE_HANDLE(dbp, fhp);
+ if (stxn != NULL)
+ (void)__txn_abort(stxn);
+ if (tmp_created && txn == NULL)
+ (void)__fop_remove(env,
+ NULL, NULL, tmpname, NULL, aflags, dflags);
+ if (txn == NULL)
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ (void)__ENV_LPUT(env, elock);
+ if (created_locker) {
+ (void)__lock_id_free(env, dbp->locker);
+ dbp->locker = NULL;
+ }
+ }
+
+done: /*
+	 * There are cases where real_name and tmpname refer to the
+	 * exact same string, so we need to make sure that we do not
+	 * free it twice.
+ */
+ if (!truncating && tmpname != NULL && tmpname != name)
+ __os_free(env, tmpname);
+ if (real_name != name && real_name != NULL)
+ __os_free(env, real_name);
+ if (real_tmpname != NULL)
+ __os_free(env, real_tmpname);
+ CLOSE_HANDLE(dbp, fhp);
+
+ return (ret);
+}
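+
+/*
+ * Illustrative sketch (not part of the original source): how the cases
+ * above surface through the public DB->open interface.  The environment
+ * handle and file name here are hypothetical.
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *		return (ret);
+ *	ret = dbp->open(dbp, NULL,
+ *	    "example.db", NULL, DB_BTREE, DB_CREATE | DB_TRUNCATE, 0);
+ *
+ * As the comment above __fop_file_setup notes, DB_TRUNCATE is disallowed
+ * when locking or transactions are enabled; with neither subsystem
+ * configured, the open truncates the file in place (case 2 above).
+ */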
+
+/*
+ * __fop_set_pgsize --
+ * Set the page size based on file information.
+ */
+static int
+__fop_set_pgsize(dbp, fhp, name)
+ DB *dbp;
+ DB_FH *fhp;
+ const char *name;
+{
+ ENV *env;
+ u_int32_t iopsize;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+	 * Use the filesystem's optimum I/O size as the pagesize if a pagesize
+	 * was not specified.  Some filesystems have 64K as their optimum I/O size,
+ * but as that results in fairly large default caches, we limit the
+ * default pagesize to 16K.
+ */
+ if ((ret = __os_ioinfo(env, name, fhp, NULL, NULL, &iopsize)) != 0) {
+ __db_err(env, ret, "%s", name);
+ return (ret);
+ }
+ if (iopsize < 512)
+ iopsize = 512;
+ if (iopsize > 16 * 1024)
+ iopsize = 16 * 1024;
+
+ /*
+ * Sheer paranoia, but we don't want anything that's not a power-of-2
+ * (we rely on that for alignment of various types on the pages), and
+ * we want a multiple of the sector size as well. If the value
+ * we got out of __os_ioinfo looks bad, use a default instead.
+ */
+ if (!IS_VALID_PAGESIZE(iopsize))
+ iopsize = DB_DEF_IOSIZE;
+
+ dbp->pgsize = iopsize;
+ F_SET(dbp, DB_AM_PGDEF);
+
+ return (0);
+}
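+
+/*
+ * Worked example of the clamping above (illustrative only): a reported
+ * optimum I/O size of 4K is used as-is; 64K is clamped down to 16K;
+ * anything under 512 bytes is raised to 512; and a value that is not a
+ * valid pagesize (e.g., not a power of two) falls back to DB_DEF_IOSIZE.
+ */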
+
+/*
+ * __fop_subdb_setup --
+ *
+ * Subdb setup is significantly simpler than file setup. In terms of
+ * locking, for the duration of the operation/transaction, the locks on
+ * the meta-data page will suffice to protect us from simultaneous operations
+ * on the sub-database. Before we complete the operation though, we'll get a
+ * handle lock on the subdatabase so that no one else can try to remove it
+ * while we've got it open.  We use an object that looks like the meta-data
+ * page lock with a different type (DB_HANDLE_LOCK) for the long-term
+ * handle locks.
+ *
+ * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, int, u_int32_t));
+ */
+int
+__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *mname, *name;
+ int mode;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ ENV *env;
+ db_lockmode_t lkmode;
+ u_int32_t mflags;
+ int ret, t_ret;
+
+ mdbp = NULL;
+ env = dbp->env;
+
+ mflags = flags | DB_RDONLY;
+retry: if ((ret = __db_master_open(dbp,
+ ip, txn, mname, mflags, mode, &mdbp)) != 0)
+ return (ret);
+ /*
+ * If we created this file, then we need to set the DISCARD flag so
+ * that if we fail in the middle of this routine, we discard from the
+ * mpool any pages that we just created.
+ */
+ if (F_ISSET(mdbp, DB_AM_CREATED))
+ F_SET(mdbp, DB_AM_DISCARD);
+
+ /*
+ * We are going to close this instance of the master, so we can
+ * steal its handle instead of reopening a handle on the database.
+ */
+ if (LF_ISSET(DB_FCNTL_LOCKING)) {
+ dbp->saved_open_fhp = mdbp->saved_open_fhp;
+ mdbp->saved_open_fhp = NULL;
+ }
+
+ /* Copy the pagesize and set the sub-database flag. */
+ dbp->pgsize = mdbp->pgsize;
+ F_SET(dbp, DB_AM_SUBDB);
+
+ if (name != NULL && (ret = __db_master_update(mdbp, dbp,
+ ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) {
+ if (ret == EBADF && F_ISSET(mdbp, DB_AM_RDONLY)) {
+ /* We need to reopen the master R/W to do the create. */
+ if ((ret = __db_close(mdbp, txn, 0)) != 0)
+ goto err;
+ FLD_CLR(mflags, DB_RDONLY);
+ goto retry;
+ }
+ goto err;
+ }
+
+ /*
+ * Hijack the master's locker ID as well, so that our locks don't
+ * conflict with the master's. Since we're closing the master,
+ * that locker would just have been freed anyway. Once we've gotten
+ * the locker id, we need to acquire the handle lock for this
+ * subdatabase.
+ */
+ dbp->locker = mdbp->locker;
+ mdbp->locker = NULL;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);
+
+ /*
+ * We copy our fileid from our master so that we all open
+ * the same file in mpool. We'll use the meta-pgno to lock
+ * so that we end up with different handle locks.
+ */
+
+ memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
+ lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ||
+ F2_ISSET(dbp, DB2_AM_EXCL) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __fop_lock_handle(env, dbp,
+ txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
+ NOWAIT_FLAG(txn) |
+ (F2_ISSET(dbp, DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
+ goto err;
+
+ if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
+ /*
+ * If there was no transaction and we created this database,
+ * then we need to undo the update of the master database.
+ */
+ if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
+ (void)__db_master_update(mdbp, dbp,
+ ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
+ F_CLR(dbp, DB_AM_CREATED);
+ goto err;
+ }
+
+ /*
+ * XXX
+ * This should have been done at the top of this routine. The problem
+ * is that __db_init_subdb() uses "standard" routines to process the
+ * meta-data page and set information in the DB handle based on it.
+ * Those routines have to deal with swapped pages and will normally set
+ * the DB_AM_SWAP flag. However, we use the master's metadata page and
+ * that has already been swapped, so they get the is-swapped test wrong.
+ */
+ F_CLR(dbp, DB_AM_SWAP);
+ F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));
+
+ /*
+ * In the file create case, these happen in separate places so we have
+ * two different tests. They end up in the same place for subdbs, but
+ * for compatibility with file testing, we put them both here anyway.
+ */
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);
+
+ /*
+ * File exists and we have the appropriate locks; we should now
+ * process a normal open.
+ */
+ if (F_ISSET(mdbp, DB_AM_CREATED)) {
+ F_SET(dbp, DB_AM_CREATED_MSTR);
+ F_CLR(mdbp, DB_AM_DISCARD);
+ }
+
+ if (0) {
+err:
+DB_TEST_RECOVERY_LABEL
+ if (txn == NULL)
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ }
+
+ /*
+ * The master's handle lock is under the control of the
+ * subdb (it acquired the master's locker). We want to
+ * keep the master's handle lock so that no one can remove
+ * the file while the subdb is open. If we register the
+ * trade event and then invalidate the copy of the lock
+ * in the master's handle, that will accomplish this. However,
+ * before we register this event, we'd better remove any
+ * events that we've already registered for the master.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
+ /* Unregister old master events. */
+ __txn_remlock(env,
+ txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ /* Now register the new event. */
+ if ((t_ret = __txn_lockevent(env, txn, dbp,
+ &mdbp->handle_lock, dbp->locker == NULL ?
+ mdbp->locker : dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ LOCK_INIT(mdbp->handle_lock);
+
+ /*
+ * If the master was created, we need to sync so that the metadata
+ * page is correct on disk for recovery, since it isn't read through
+ * mpool. If we're opening a subdb in an existing file, we can skip
+ * the sync.
+ */
+ if ((t_ret = __db_close(mdbp, txn,
+ F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
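+
+/*
+ * Illustrative sketch (not part of the original source): the kind of
+ * public sub-database open that drives __fop_subdb_setup.  The file and
+ * database names are hypothetical.
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *		return (ret);
+ *	ret = dbp->open(dbp,
+ *	    txn, "master.db", "subdb", DB_BTREE, DB_CREATE, 0);
+ */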
+
+/*
+ * __fop_remove_setup --
+ * Open handle appropriately and lock for removal of a database file.
+ *
+ * PUBLIC: int __fop_remove_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t));
+ */
+int
+__fop_remove_setup(dbp, txn, name, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ DB_LOCK elock;
+ ENV *env;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+
+ COMPQUIET(flags, 0);
+
+ env = dbp->env;
+
+ LOCK_INIT(elock);
+ fhp = NULL;
+ ret = 0;
+
+ /* Create locker if necessary. */
+retry: if (LOCKING_ON(env)) {
+ if (IS_REAL_TXN(txn))
+ dbp->locker = txn->locker;
+ else if (dbp->locker == DB_LOCK_INVALIDID) {
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ if (txn != NULL && F_ISSET(txn, TXN_INFAMILY) &&
+ (ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * We are about to open a file handle and then possibly close it.
+ * We cannot close handles if we are doing FCNTL locking. However,
+ * there is no way to pass the FCNTL flag into this routine via the
+ * user API. The only way we can get in here and be doing FCNTL
+ * locking is if we are trying to clean up an open that was called
+	 * with FCNTL locking.  In that case, the saved_open_fhp should already
+	 * be set.  So, we use that field to tell us whether we must avoid
+	 * closing the handle.
+ */
+ fhp = dbp->saved_open_fhp;
+ DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);
+
+ /*
+ * Lock environment to protect file open. That will enable us to
+ * read the meta-data page and get the fileid so that we can lock
+ * the handle.
+ */
+ GET_ENVLOCK(env, dbp->locker, &elock);
+
+ /* Open database. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __env_mpool(dbp, name, flags)) == 0)
+ ret = __os_strdup(env, name, &dbp->dname);
+ } else if (fhp == NULL)
+ ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
+ if (ret != 0)
+ goto err;
+
+ /* Get meta-data */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_CHK_META);
+ else if ((ret = __fop_read_meta(env,
+ name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
+ ret = __db_meta_setup(env, dbp,
+ name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now, get the handle lock. We first try with NOWAIT, because if
+ * we have to wait, we're going to have to close the file and reopen
+ * it, so that if there is someone else removing it, our open doesn't
+ * prevent that.
+ */
+ if ((ret = __fop_lock_handle(env,
+ dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+ * Close the file, block on the lock, clean up the dbp, and
+ * then start all over again.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ }
+ if (ret != DB_LOCK_NOTGRANTED ||
+ (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
+ goto err;
+ else if ((ret = __fop_lock_handle(env,
+ dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ (void)__lock_put(env, &dbp->handle_lock);
+ (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
+ } else {
+ if (txn != NULL)
+ dbp->locker = NULL;
+ (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
+ }
+ goto retry;
+ } else if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+ else if (F_ISSET(dbp, DB_AM_IN_RENAME))
+ ret = ENOENT;
+
+ if (0) {
+err: (void)__ENV_LPUT(env, elock);
+ }
+ if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
+ (void)__os_closehandle(env, fhp);
+ /*
+ * If this is a real file and we are going to proceed with the removal,
+ * then we need to make sure that we don't leave any pages around in the
+	 * mpool, since the file is closed and will be reopened before
+ * access. However, this might be an in-memory file, in which case
+ * we will handle the discard from the mpool later as it's the "real"
+ * removal of the database.
+ */
+ if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
+ F_SET(dbp, DB_AM_DISCARD);
+ return (ret);
+}
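+
+/*
+ * Illustrative sketch (not part of the original source): the public
+ * removal call that runs above this routine.  The names are
+ * hypothetical.
+ *
+ *	if ((ret = dbenv->dbremove(dbenv, txn, "example.db", NULL, 0)) != 0)
+ *		goto err;
+ */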
+
+/*
+ * __fop_read_meta --
+ * Read the meta-data page from a file and return it in buf.
+ *
+ * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
+ * PUBLIC: u_int8_t *, size_t, DB_FH *, int, size_t *));
+ */
+int
+__fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
+ ENV *env;
+ const char *name;
+ u_int8_t *buf;
+ size_t size;
+ DB_FH *fhp;
+ int errok;
+ size_t *nbytesp;
+{
+ size_t nr;
+ int ret;
+
+ /*
+ * Our caller wants to know the number of bytes read, even if we
+ * return an error.
+ */
+ if (nbytesp != NULL)
+ *nbytesp = 0;
+
+ nr = 0;
+ ret = __os_read(env, fhp, buf, size, &nr);
+ if (nbytesp != NULL)
+ *nbytesp = nr;
+
+ if (ret != 0) {
+ if (!errok)
+ __db_err(env, ret, "%s", name);
+ goto err;
+ }
+
+ if (nr != size) {
+ if (!errok)
+ __db_errx(env, DB_STR_A("0004",
+ "fop_read_meta: %s: unexpected file type or format",
+ "%s"), name);
+ ret = EINVAL;
+ }
+
+err:
+ return (ret);
+}
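+
+/*
+ * A minimal caller sketch (illustrative only), mirroring the recovery
+ * code earlier in this diff: errok suppresses the error message when a
+ * missing or short file is an expected outcome, and nbytesp lets the
+ * caller distinguish an empty file from a damaged meta-data page.
+ *
+ *	size_t len;
+ *
+ *	len = 0;
+ *	if (__fop_read_meta(env,
+ *	    real_name, mbuf, DBMETASIZE, fhp, 1, &len) != 0) {
+ *		if (len != 0)
+ *			goto out;	(partial meta page: hard failure)
+ *		cstat = TXN_EXPECTED;	(no file at all: may be benign)
+ *	}
+ */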
+
+/*
+ * __fop_dummy --
+ * This implements the creation and name swapping of dummy files that
+ * we use for remove and rename (remove is simply a rename with a delayed
+ * remove).
+ *
+ * PUBLIC: int __fop_dummy __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *));
+ */
+int
+__fop_dummy(dbp, txn, old, new)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *old, *new;
+{
+ DB *tmpdbp;
+ DB_TXN *stxn;
+ ENV *env;
+ char *back;
+ int ret, t_ret;
+ u_int8_t mbuf[DBMETASIZE];
+
+ env = dbp->env;
+ back = NULL;
+ stxn = NULL;
+ tmpdbp = NULL;
+
+ DB_ASSERT(env, txn != NULL);
+
+ /*
+	 * Begin a subtransaction to encapsulate the rename.  Note that we
+ * expect the inmem_swap calls to complete the sub-transaction,
+ * aborting on error and committing on success.
+ */
+ if (TXN_ON(env) &&
+ (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
+ goto err;
+
+	/* We need to create a dummy file as a placeholder. */
+ if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
+ goto err;
+ /* Create a dummy dbp handle. */
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+ (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ memset(mbuf, 0, sizeof(mbuf));
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
+ __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
+
+ if (ret != 0)
+ goto err;
+
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
+ __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker);
+ stxn = NULL;
+ if (ret != 0)
+ goto err;
+
+err: if (stxn != NULL)
+ (void)__txn_abort(stxn);
+ if (tmpdbp != NULL &&
+ (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (back != NULL)
+ __os_free(env, back);
+ return (ret);
+}
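+
+/*
+ * A sketch of the resulting name dance (derived from the swap routines
+ * below), for a rename of OLD to NEW with backup name BACK:
+ *
+ *	1. create a dummy placeholder at BACK (magic DB_RENAMEMAGIC);
+ *	2. rename OLD to NEW, moving the real database into place;
+ *	3. rename BACK to OLD, so the dummy now holds the old name;
+ *	4. schedule a delayed remove of OLD at parent-txn commit.
+ *
+ * If the enclosing transaction aborts, the logged renames are undone
+ * and the dummy placeholder is discarded.
+ */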
+
+/*
+ * __fop_dbrename --
+ * Do the appropriate file locking and file system operations
+ * to effect a dbrename in the absence of transactions (__fop_dummy
+ * and the subsequent calls in __db_rename do the work for the
+ * transactional case).
+ *
+ * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
+ */
+int
+__fop_dbrename(dbp, old, new)
+ DB *dbp;
+ const char *old, *new;
+{
+ DB_LOCK elock;
+ ENV *env;
+ char *real_new, *real_old;
+ int ret, t_ret;
+
+ env = dbp->env;
+ real_new = NULL;
+ real_old = NULL;
+ LOCK_INIT(elock);
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ real_new = (char *)new;
+ real_old = (char *)old;
+ } else {
+ /* Get full names. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
+ goto err;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
+ goto err;
+ }
+
+ /*
+ * It is an error to rename a file over one that already exists,
+ * as that wouldn't be transaction-safe. We check explicitly
+	 * for on-disk files, but it's done by __memp_nameop for in-memory ones.
+ */
+ GET_ENVLOCK(env, dbp->locker, &elock);
+ ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
+ __os_exists(env, real_new, NULL);
+
+ if (ret == 0) {
+ ret = EEXIST;
+ __db_errx(env, DB_STR_A("0005",
+ "rename: file %s exists", "%s"), real_new);
+ goto err;
+ }
+
+ ret = __memp_nameop(env,
+ dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
+
+err: if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
+ __os_free(env, real_old);
+ if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
+ __os_free(env, real_new);
+ return (ret);
+}
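+
+/*
+ * Illustrative sketch (not part of the original source): the
+ * non-transactional public rename that reaches this routine.  The names
+ * are hypothetical.
+ *
+ *	if ((ret =
+ *	    dbenv->dbrename(dbenv, NULL, "old.db", NULL, "new.db", 0)) != 0)
+ *		goto err;
+ */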
+
+static int
+__fop_inmem_create(dbp, name, txn, flags)
+ DB *dbp;
+ const char *name;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBT fid_dbt, name_dbt;
+ DB_LSN lsn;
+ ENV *env;
+ int ret;
+ int32_t lfid;
+ u_int32_t dflags, *p32;
+
+ env = dbp->env;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+ MAKE_INMEM(dbp);
+
+ /* Set the pagesize if it isn't yet set. */
+ if (dbp->pgsize == 0)
+ dbp->pgsize = DB_DEF_IOSIZE;
+
+ /*
+ * Construct a file_id.
+ *
+ * If this file has no name, then we only need a fileid for locking.
+ * If this file has a name, we need the fileid both for locking and
+ * matching in the memory pool. So, with unnamed in-memory databases,
+ * use a lock_id. For named in-memory files, we need to find a value
+ * that we can use to uniquely identify a name/fid pair. We use a
+	 * combination of a unique id (__os_unique_id) and a prefix of the
+	 * original name.
+ */
+ if (name == NULL) {
+ if (LOCKING_ON(env) && (ret =
+ __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
+ goto err;
+ } else {
+ p32 = (u_int32_t *)(&dbp->fileid[0]);
+ __os_unique_id(env, p32);
+ p32++;
+ (void)strncpy(
+ (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t));
+ dbp->preserve_fid = 1;
+
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+ txn != NULL &&
+#endif
+ dbp->log_filename != NULL)
+ memcpy(dbp->log_filename->ufid,
+ dbp->fileid, DB_FILE_ID_LEN);
+ }
+
+ /* Now, set the fileid. */
+ if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+ goto err;
+
+ if ((ret = __env_mpool(dbp, name, flags)) != 0)
+ goto err;
+
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP)
+ txn != NULL &&
+#endif
+ name != NULL) {
+ DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ fid_dbt.data = dbp->fileid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+ lfid = dbp->log_filename == NULL ?
+ DB_LOGFILEID_INVALID : dbp->log_filename->id;
+ if ((ret = __crdel_inmem_create_log(env, txn,
+ &lsn, dflags, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
+ goto err;
+ }
+
+ F_SET(dbp, DB_AM_CREATED);
+
+err:
+ return (ret);
+}
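+
+/*
+ * Layout of the fileid constructed above for named in-memory databases
+ * (a descriptive sketch of the code above, assuming DB_FILE_ID_LEN is
+ * 20 bytes as in this source):
+ *
+ *	bytes 0-3  : result of __os_unique_id()
+ *	bytes 4-19 : leading bytes of the database name, NUL-padded by
+ *		     strncpy when the name is shorter than the field
+ */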
+
+static int
+__fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+ u_int32_t chkflags;
+{
+ DBMETA *metap;
+ DB_THREAD_INFO *ip;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ if (txn == NULL)
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ else
+ ip = txn->thread_info;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
+ return (ret);
+ if (FLD_ISSET(chkflags, DB_CHK_ONLY)) {
+ if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0)
+ memcpy(dbp->fileid,
+ ((DBMETA *)metap)->uid, DB_FILE_ID_LEN);
+ } else
+ ret = __db_meta_setup(
+ dbp->env, dbp, name, metap, flags, chkflags);
+
+ if ((t_ret =
+ __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+static int
+__fop_ondisk_dummy(dbp, txn, name, mbuf)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int8_t *mbuf;
+{
+ ENV *env;
+ int ret;
+ char *realname;
+ u_int32_t dflags;
+
+ realname = NULL;
+ env = dbp->env;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &realname)) != 0)
+ goto err;
+
+ if ((ret = __fop_create(env,
+ txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0)
+ goto err;
+
+ if ((ret =
+ __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0)
+ goto err;
+
+ ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
+ goto err;
+
+ memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
+
+err: if (realname != NULL)
+ __os_free(env, realname);
+
+ return (ret);
+}
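+
+/*
+ * Descriptive note (based on the surrounding code): the DB_RENAMEMAGIC
+ * written into the dummy's meta-data page is what later lets an open
+ * recognize the file as a rename placeholder and set DB_AM_IN_RENAME on
+ * the handle, which __fop_file_setup and the swap routines test for
+ * explicitly.
+ */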
+
+static int
+__fop_inmem_dummy(dbp, txn, name, mbuf)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int8_t *mbuf;
+{
+ DBMETA *metap;
+ DB_THREAD_INFO *ip;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
+ return (ret);
+ if (txn == NULL)
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ else
+ ip = txn->thread_info;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
+ return (ret);
+	/* Check whether the file already existed. */
+ if (metap->magic != 0)
+ ret = EEXIST;
+ else
+ metap->magic = DB_RENAMEMAGIC;
+
+ /* Copy the fileid onto the meta-data page. */
+ memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ if ((t_ret = __memp_fput(dbp->mpf, ip, metap,
+ ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+
+ ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
+
+err: return (ret);
+}
+
+static int
+__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
+ DB *dbp, *tmpdbp;
+ DB_TXN *txn;
+ const char *old, *new, *back;
+ DB_LOCKER *locker;
+{
+ DBT fiddbt, namedbt, tmpdbt;
+ DB_FH *fhp;
+ DB_LOCK elock;
+ DB_LSN lsn;
+ DB_TXN *parent;
+ ENV *env;
+ u_int8_t mbuf[DBMETASIZE];
+ u_int32_t child_txnid, dflags;
+ int ret, t_ret;
+ char *realold, *realnew;
+
+ env = dbp->env;
+ DB_ASSERT(env, txn != NULL);
+ DB_ASSERT(env, old != NULL);
+
+ realold = realnew = NULL;
+ LOCK_INIT(elock);
+ fhp = NULL;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
+ goto err;
+
+ /* Now, lock the name space while we initialize this file. */
+retry: GET_ENVLOCK(env, locker, &elock);
+ if (__os_exists(env, realnew, NULL) == 0) {
+ /*
+ * It is possible that the only reason this file exists is
+ * because we've done a previous rename of it and we have
+ * left a placeholder here. We need to check for that case
+ * and allow this rename to succeed if that's the case.
+ */
+ if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
+ goto err;
+ if ((ret = __fop_read_meta(env,
+ realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
+ (ret = __db_meta_setup(env,
+ tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) {
+ ret = EEXIST;
+ goto err;
+ }
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now, try to acquire the handle lock. If the handle is locked
+	 * by our current transaction, then we'll get it and life is
+ * good.
+ *
+	 * Alternatively, if it's not locked at all, we'll get the lock,
+	 * but we will then realize the file exists and consider this an error.
+ *
+ * However, if it's held by another transaction, then there
+ * could be two different scenarios: 1) the file is in the
+ * midst of being created or deleted and when that transaction
+ * is over, we might be able to proceed. 2) the file is open
+ * and exists and we should report an error. In order to
+ * distinguish these two cases, we do the following. First, we
+ * try to acquire a READLOCK. If the handle is in the midst of
+ * being created, then we'll block because a writelock is held.
+ * In that case, we should request a blocking write, and when we
+ * get the lock, we should then go back and check to see if the
+ * object exists and start all over again.
+ *
+ * If we got the READLOCK, then either no one is holding the
+ * lock or someone has an open handle and the fact that the file
+ * exists is problematic. So, in this case, we request the
+ * WRITELOCK non-blocking -- if it succeeds, we're golden. If
+ * it fails, then the file exists and we return EEXIST.
+ */
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+ * Someone holds a write-lock. Wait for the write-lock
+ * and after we get it, release it and start over.
+ */
+ if ((ret = __fop_lock_handle(env, tmpdbp,
+ locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+ if ((ret =
+ __lock_put(env, &tmpdbp->handle_lock)) != 0)
+ goto err;
+ if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
+ goto err;
+ goto retry;
+ }
+
+ /* We got the read lock; try to upgrade it. */
+ ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_WRITE,
+ NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
+ if (ret != 0) {
+ /*
+ * We did not get the writelock, so someone
+ * has the handle open. This is an error.
+ */
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ ret = EEXIST;
+ } else if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+ /* We got the lock and are renaming it. */
+ ret = 0;
+ else { /* We got the lock, but the file exists. */
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ ret = EEXIST;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * While we have the namespace locked, do the renames and then
+ * swap for the handle lock.
+ */
+ if ((ret = __fop_rename(env, txn,
+ old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
+ goto err;
+ if ((ret = __fop_rename(env, txn, back, old,
+ &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
+ goto err;
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
+ goto err;
+
+ /*
+ * We just acquired a transactional lock on the tmp handle.
+ * We need to null out the tmp handle's lock so that it
+ * doesn't create problems for us in the close path.
+ */
+ LOCK_INIT(tmpdbp->handle_lock);
+
+ /* Commit the child. */
+ child_txnid = txn->txnid;
+ parent = txn->parent;
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+
+ /*
+	 * If the new name is available because it was previously renamed,
+ * remove it from the remove list.
+ */
+ if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+ __txn_remrem(env, parent, realnew);
+
+ /* Now log the child information in the parent. */
+ memset(&fiddbt, 0, sizeof(fiddbt));
+ fiddbt.data = dbp->fileid;
+ fiddbt.size = DB_FILE_ID_LEN;
+	memset(&tmpdbt, 0, sizeof(tmpdbt));
+ tmpdbt.data = tmpdbp->fileid;
+ tmpdbt.size = DB_FILE_ID_LEN;
+ DB_INIT_DBT(namedbt, old, strlen(old) + 1);
+ if ((t_ret = __fop_file_remove_log(env,
+ parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
+ (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* This is a delayed delete of the dummy file. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, old, &dbp->dirname, &realold)) != 0)
+ goto err;
+
+ if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
+ goto err;
+
+err: if (txn != NULL) /* Ret must already be set, so void abort. */
+ (void)__txn_abort(txn);
+
+ (void)__ENV_LPUT(env, elock);
+
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (realnew != NULL)
+ __os_free(env, realnew);
+ if (realold != NULL)
+ __os_free(env, realold);
+ return (ret);
+}
+
+static int
+__fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
+ DB *olddbp, *backdbp;
+ DB_TXN *txn;
+ const char *old, *new, *back;
+ DB_LOCKER *locker;
+{
+ DB *tmpdbp;
+ DBT fid_dbt, n1_dbt, n2_dbt;
+ DB_LOCK elock;
+ DB_LSN lsn;
+ DB_TXN *parent;
+ ENV *env;
+ int ret, t_ret;
+
+ env = olddbp->env;
+ parent = txn->parent;
+retry: LOCK_INIT(elock);
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+ MAKE_INMEM(tmpdbp);
+
+ GET_ENVLOCK(env, locker, &elock);
+ if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
+ /*
+ * It is possible that the only reason this database exists is
+ * because we've done a previous rename of it and we have
+ * left a placeholder here. We need to check for that case
+ * and allow this rename to succeed if that's the case.
+ */
+
+ if ((ret = __fop_inmem_read_meta(
+ tmpdbp, txn, new, 0, DB_CHK_META)) != 0) {
+ ret = EEXIST;
+ goto err;
+ }
+
+ /*
+ * Now, try to acquire the handle lock. If it's from our txn,
+ * then we'll get the lock. If it's not, then someone else has
+ * it locked. See the comments in __fop_ondisk_swap for
+ * details.
+ */
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+			 * Someone holds a write lock.  Wait for the write
+			 * lock and, after we get it, retry.
+ */
+ if ((ret = __fop_lock_handle(env, tmpdbp,
+ locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ /* We have the write lock; release it and start over. */
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ (void)__db_close(tmpdbp, NULL, DB_NOSYNC);
+ (void)__ENV_LPUT(env, elock);
+ goto retry;
+ } else {
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+ ret = EEXIST;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Log the renames. */
+ if (LOGGING_ON(env)
+#ifndef DEBUG_WOP
+ && txn != NULL
+#endif
+ ) {
+ /* Rename old to new. */
+ DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
+ DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
+ DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
+ if ((ret = __crdel_inmem_rename_log(
+ env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
+ goto err;
+
+ /* Rename back to old */
+ fid_dbt.data = backdbp->fileid;
+ DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
+ if ((ret = __crdel_inmem_rename_log(
+ env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
+ goto err;
+ }
+
+ /*
+ * While we have the namespace locked, do the renames and then
+ * swap for the handle lock. If we ran into a file in the midst
+ * of rename, then we need to delete it first, else nameop is
+ * going to consider it an error.
+ */
+ if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
+ if ((ret = __memp_nameop(env,
+ tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
+ goto err;
+ __txn_remrem(env, parent, new);
+ }
+
+ if ((ret = __memp_nameop(
+ env, olddbp->fileid, new, old, new, 1)) != 0)
+ goto err;
+ if ((ret = __memp_nameop(
+ env, backdbp->fileid, old, back, old, 1)) != 0)
+ goto err;
+
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ /*
+ * We just acquired a transactional lock on the tmp handle.
+ * We need to null out the tmp handle's lock so that it
+ * doesn't create problems for us in the close path.
+ */
+ LOCK_INIT(tmpdbp->handle_lock);
+
+ DB_ASSERT(env, txn != NULL);
+
+ /* Commit the child. */
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+
+ if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0)
+ goto err;
+
+err: (void)__ENV_LPUT(env, elock);
+
+ if (txn != NULL)
+ (void)__txn_abort(txn);
+
+ if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/hash/hash.c b/src/hash/hash.c
new file mode 100644
index 00000000..ae5736e7
--- /dev/null
+++ b/src/hash/hash.c
@@ -0,0 +1,2340 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+static int __ham_bulk __P((DBC *, DBT *, u_int32_t));
+static int __hamc_close __P((DBC *, db_pgno_t, int *));
+static int __hamc_del __P((DBC *, u_int32_t));
+static int __hamc_destroy __P((DBC *));
+static int __hamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __hamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __hamc_writelock __P((DBC *));
+static int __ham_dup_return __P((DBC *, DBT *, u_int32_t));
+static int __ham_expand_table __P((DBC *));
+static int __hamc_update_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __hamc_update_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ham_get_clist_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * __ham_quick_delete --
+ * This function is called by __db_del when the appropriate conditions
+ * are met, and it performs the delete in the optimized way.
+ *
+ * PUBLIC: int __ham_quick_delete __P((DBC *));
+ */
+int
+__ham_quick_delete(dbc)
+ DBC *dbc;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ /*
+ * When performing a DB->del operation not involving secondary indices
+ * and not removing an off-page duplicate tree, we can speed things up
+ * substantially by removing the entire duplicate set, if any is
+ * present, in one operation, rather than by conjuring up and deleting
+ * each of the items individually. (All are stored in one big HKEYDATA
+ * structure.) We don't bother to distinguish on-page duplicate sets
+ * from single, non-dup items; they're deleted in exactly the same way.
+ *
+ * The cursor should be set to the first item in the duplicate set, or
+ * to the sole key/data pair when the key does not have a duplicate set,
+ * before the function is called.
+ *
+	 * We do not need to call CDB_LOCKING_INIT; __db_del calls here with
+ * a write cursor.
+ *
+ * Assert we're initialized, but not to an off-page duplicate.
+ * Assert we're not using secondary indices.
+ */
+ DB_ASSERT(dbc->env, IS_INITIALIZED(dbc));
+ DB_ASSERT(dbc->env, dbc->internal->opd == NULL);
+ DB_ASSERT(dbc->env, !F_ISSET(dbc->dbp, DB_AM_SECONDARY));
+ DB_ASSERT(dbc->env, !DB_IS_PRIMARY(dbc->dbp));
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ mpf = dbc->dbp->mpf;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ if ((ret = __hamc_writelock(dbc)) == 0) {
+ ret = __ham_del_pair(dbc, 0, NULL);
+ /*
+ * If a page was retrieved during the delete, put it now. We
+		 * can't rely on the caller's cursor close to do that, since bulk
+ * delete operations keep the cursor open across deletes.
+ */
+ if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ }
+ }
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
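+
+/*
+ * A minimal usage sketch, illustrative only and not part of the original
+ * call path: __db_del positions a write cursor on the key's first item
+ * (the sole pair, or the head of the on-page duplicate set) and then
+ * calls in, roughly:
+ *
+ *	if ((ret = __dbc_get(dbc, &key, &data, DB_SET)) == 0)
+ *		ret = __ham_quick_delete(dbc);
+ *
+ * The __dbc_get call here stands in for however the caller actually
+ * positions the cursor.
+ */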
+
+/* ****************** CURSORS ********************************** */
+/*
+ * __hamc_init --
+ * Initialize the hash-specific portion of a cursor.
+ *
+ * PUBLIC: int __hamc_init __P((DBC *));
+ */
+int
+__hamc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ HASH_CURSOR *new_curs;
+ int ret;
+
+ env = dbc->env;
+ if ((ret = __os_calloc(env,
+ 1, sizeof(struct cursor_t), &new_curs)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env,
+ dbc->dbp->pgsize, &new_curs->split_buf)) != 0) {
+ __os_free(env, new_curs);
+ return (ret);
+ }
+
+ dbc->internal = (DBC_INTERNAL *) new_curs;
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __ham_bulk;
+ dbc->am_close = __hamc_close;
+ dbc->am_del = __hamc_del;
+ dbc->am_destroy = __hamc_destroy;
+ dbc->am_get = __hamc_get;
+ dbc->am_put = __hamc_put;
+ dbc->am_writelock = __hamc_writelock;
+
+ return (__ham_item_init(dbc));
+}
+
+/*
+ * __hamc_close --
+ * Close down the cursor from a single use.
+ */
+static int
+__hamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HKEYDATA *dp;
+ db_lockmode_t lock_mode;
+ int doroot, gotmeta, ret, t_ret;
+
+ COMPQUIET(rmroot, 0);
+ mpf = dbc->dbp->mpf;
+ doroot = gotmeta = ret = 0;
+ hcp = (HASH_CURSOR *) dbc->internal;
+
+ /* Check for off page dups. */
+ if (dbc->internal->opd != NULL) {
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto done;
+ gotmeta = 1;
+ lock_mode = DB_LOCK_READ;
+
+ /* To support dirty reads we must reget the write lock. */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ lock_mode = DB_LOCK_WRITE;
+
+ if ((ret = __ham_get_cpage(dbc, lock_mode)) != 0)
+ goto out;
+ dp = (HKEYDATA *)H_PAIRDATA(dbc->dbp, hcp->page, hcp->indx);
+
+ /* If it's not a dup we aborted before we changed it. */
+ if (HPAGE_PTYPE(dp) == H_OFFDUP)
+ memcpy(&root_pgno,
+ HOFFPAGE_PGNO(dp), sizeof(db_pgno_t));
+ else
+ root_pgno = PGNO_INVALID;
+
+ if ((ret =
+ hcp->opd->am_close(hcp->opd, root_pgno, &doroot)) != 0)
+ goto out;
+ if (doroot != 0) {
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto out;
+ if ((ret = __ham_del_pair(dbc, 0, NULL)) != 0)
+ goto out;
+ }
+ }
+
+out: if (ret != 0)
+ F_SET(dbc, DBC_ERROR);
+ if (hcp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+done: if ((t_ret = __ham_item_init(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __hamc_destroy --
+ * Cleanup the access method private part of a cursor.
+ */
+static int
+__hamc_destroy(dbc)
+ DBC *dbc;
+{
+ HASH_CURSOR *hcp;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->split_buf != NULL)
+ __os_free(dbc->env, hcp->split_buf);
+ __os_free(dbc->env, hcp);
+
+ return (0);
+}
+
+/*
+ * __hamc_count --
+ * Return a count of on-page duplicates.
+ *
+ * PUBLIC: int __hamc_count __P((DBC *, db_recno_t *));
+ */
+int
+__hamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ db_indx_t len;
+ db_recno_t recno;
+ int ret, t_ret;
+ u_int8_t *p, *pend;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ recno = 0;
+
+ if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
+ return (ret);
+ if (hcp->indx >= NUM_ENT(hcp->page)) {
+ *recnop = 0;
+ goto err;
+ }
+
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
+ case H_KEYDATA:
+ case H_OFFPAGE:
+ recno = 1;
+ break;
+ case H_DUPLICATE:
+ p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ pend = p +
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ for (; p < pend; recno++) {
+			/* p may be odd, so copy rather than just dereferencing. */
+ memcpy(&len, p, sizeof(db_indx_t));
+ p += 2 * sizeof(db_indx_t) + len;
+ }
+
+ break;
+ default:
+ ret = __db_pgfmt(dbp->env, hcp->pgno);
+ goto err;
+ }
+
+ *recnop = recno;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ return (ret);
+}
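+
+/*
+ * Layout assumed by the H_DUPLICATE walk above: each element of an on-page
+ * duplicate set stores its length both before and after the data, so one
+ * element occupies 2 * sizeof(db_indx_t) + len bytes:
+ *
+ *	[len1][data1][len1][len2][data2][len2]...
+ */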
+
+/*
+ * __hamc_cmp --
+ * Compare two hash cursors for equality.
+ *
+ * This function is only called with two cursors that point to the same item.
+ * It distinguishes two cases:
+ * * Cursors pointing to different items in the same on-page duplicate set.
+ * * Cursors pointing to the same item, with different DELETED flags.
+ *
+ * PUBLIC: int __hamc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__hamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ ENV *env;
+ HASH_CURSOR *hcp, *ohcp;
+
+ env = dbc->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ohcp = (HASH_CURSOR *)other_dbc->internal;
+
+ DB_ASSERT (env, hcp->pgno == ohcp->pgno);
+ DB_ASSERT (env, hcp->indx == ohcp->indx);
+
+ /* Only compare the duplicate offsets if this is a duplicate item. */
+ if ((F_ISSET(hcp, H_ISDUP) && hcp->dup_off != ohcp->dup_off) ||
+ F_ISSET(hcp, H_DELETED) != F_ISSET(ohcp, H_DELETED))
+ *result = 1;
+ else
+ *result = 0;
+ return (0);
+}
+
+static int
+__hamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT repldbt;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED))
+ return (DB_NOTFOUND);
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+
+ if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
+ goto out;
+
+ /* Off-page duplicates. */
+ if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP)
+ goto out;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(hcp->page));
+
+ if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. */
+ if (hcp->dup_off == 0 &&
+ DUP_SIZE(hcp->dup_len) == LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx))
+ ret = __ham_del_pair(dbc, 0, NULL);
+ else {
+ repldbt.flags = 0;
+ F_SET(&repldbt, DB_DBT_PARTIAL);
+ repldbt.doff = hcp->dup_off;
+ repldbt.dlen = DUP_SIZE(hcp->dup_len);
+ repldbt.size = 0;
+ repldbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
+ hcp->indx));
+ if ((ret =
+ __ham_replpair(dbc, &repldbt, H_DUPLICATE)) == 0) {
+ hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
+ F_SET(hcp, H_DELETED);
+ /*
+ * Clear any cached streaming information.
+ */
+ hcp->stream_start_pgno = PGNO_INVALID;
+ ret = __hamc_update(dbc, DUP_SIZE(hcp->dup_len),
+ DB_HAM_CURADJ_DEL, 1);
+ }
+ }
+ } else /* Not a duplicate */
+ ret = __ham_del_pair(dbc, 0, NULL);
+
+out: if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ }
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __hamc_dup --
+ * Duplicate a hash cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __hamc_dup __P((DBC *, DBC *));
+ */
+int
+__hamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ HASH_CURSOR *orig, *new;
+
+ orig = (HASH_CURSOR *)orig_dbc->internal;
+ new = (HASH_CURSOR *)new_dbc->internal;
+
+ new->bucket = orig->bucket;
+ new->lbucket = orig->lbucket;
+ new->dup_off = orig->dup_off;
+ new->dup_len = orig->dup_len;
+ new->dup_tlen = orig->dup_tlen;
+
+ if (F_ISSET(orig, H_DELETED))
+ F_SET(new, H_DELETED);
+ if (F_ISSET(orig, H_ISDUP))
+ F_SET(new, H_ISDUP);
+
+ return (0);
+}
+
+static int
+__hamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ db_lockmode_t lock_type;
+ int ret, t_ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+ hcp->seek_size = 0;
+
+ ret = 0;
+ switch (flags) {
+ case DB_PREV_DUP:
+ F_SET(hcp, H_DUPONLY);
+ goto prev;
+ case DB_PREV_NODUP:
+ F_SET(hcp, H_NEXT_NODUP);
+ /* FALLTHROUGH */
+ case DB_PREV:
+ if (IS_INITIALIZED(dbc)) {
+prev: ret = __ham_item_prev(dbc, lock_type, pgnop);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ ret = __ham_item_last(dbc, lock_type, pgnop);
+ break;
+ case DB_NEXT_DUP:
+ case DB_GET_BOTHC:
+ /* cgetchk has already determined that the cursor is set. */
+ F_SET(hcp, H_DUPONLY);
+ goto next;
+ case DB_NEXT_NODUP:
+ F_SET(hcp, H_NEXT_NODUP);
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ if (IS_INITIALIZED(dbc)) {
+next: ret = __ham_item_next(dbc, lock_type, pgnop);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ ret = __ham_item_first(dbc, lock_type, pgnop);
+ break;
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ ret = __ham_lookup(dbc, key, 0, lock_type, pgnop);
+ break;
+ case DB_CURRENT:
+ /* cgetchk has already determined that the cursor is set. */
+ if (F_ISSET(hcp, H_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ ret = __ham_item(dbc, lock_type, pgnop);
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__hamc_get", flags);
+ break;
+ }
+
+ /*
+ * Must always enter this loop to do error handling and
+	 * check for a big key/data pair.
+ */
+ for (;;) {
+ if (ret != 0 && ret != DB_NOTFOUND)
+ goto err;
+ else if (F_ISSET(hcp, H_OK)) {
+ if (*pgnop == PGNO_INVALID)
+ ret = __ham_dup_return(dbc, data, flags);
+ break;
+ } else if (!F_ISSET(hcp, H_NOMORE)) {
+ __db_errx(env, DB_STR("1130",
+ "H_NOMORE returned to __hamc_get"));
+ ret = EINVAL;
+ break;
+ }
+
+ /*
+ * Ran out of entries in a bucket; change buckets.
+ */
+ switch (flags) {
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ if (hcp->bucket == 0) {
+ ret = DB_NOTFOUND;
+ hcp->pgno = PGNO_INVALID;
+ goto err;
+ }
+ F_CLR(hcp, H_ISDUP);
+ hcp->bucket--;
+ hcp->indx = NDX_INVALID;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (ret == 0)
+ ret = __ham_item_prev(dbc, lock_type, pgnop);
+ break;
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ hcp->indx = NDX_INVALID;
+ hcp->bucket++;
+ F_CLR(hcp, H_ISDUP);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (hcp->bucket > hcp->hdr->max_bucket) {
+ ret = DB_NOTFOUND;
+ hcp->pgno = PGNO_INVALID;
+ goto err;
+ }
+ if (ret == 0)
+ ret = __ham_item_next(dbc, lock_type, pgnop);
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ case DB_NEXT_DUP:
+ case DB_SET:
+ case DB_SET_RANGE:
+ /* Key not found. */
+ ret = DB_NOTFOUND;
+ goto err;
+ case DB_CURRENT:
+ /*
+ * This should only happen if you are doing deletes and
+ * reading with concurrent threads and not doing proper
+ * locking. We return the same error code as we would
+ * if the cursor were deleted.
+ */
+ ret = DB_KEYEMPTY;
+ goto err;
+ default:
+ DB_ASSERT(env, 0);
+ }
+ }
+
+err: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(hcp, H_DUPONLY);
+ F_CLR(hcp, H_NEXT_NODUP);
+
+ return (ret);
+}
+
+/*
+ * __ham_bulk -- Return bulk data from a hash table.
+ */
+static int
+__ham_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *cp;
+ PAGE *pg;
+ db_indx_t dup_len, dup_off, dup_tlen, indx, *inp;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int32_t *endp, *offp, *saveoff;
+ u_int32_t key_off, key_size, pagesize, size, space;
+ u_int8_t *dbuf, *dp, *hk, *np, *tmp;
+ int is_dup, is_key;
+ int need_pg, next_key, no_dup, ret, t_ret;
+
+ ret = 0;
+ key_off = 0;
+ dup_len = dup_off = dup_tlen = 0;
+ size = 0;
+ dbp = dbc->dbp;
+ pagesize = dbp->pgsize;
+ mpf = dbp->mpf;
+ cp = (HASH_CURSOR *)dbc->internal;
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
+ dbuf = data->data;
+ np = dp = dbuf;
+
+	/* Keep track of space that is left. There is a termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (int32_t *) ((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+
+ key_size = 0;
+ lock_mode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE: DB_LOCK_READ;
+
+next_pg:
+ need_pg = 1;
+ indx = cp->indx;
+ pg = cp->page;
+ inp = P_INP(dbp, pg);
+
+ do {
+ if (is_key) {
+ hk = H_PAIRKEY(dbp, pg, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+ memcpy(&key_size,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ size = key_size;
+ if (key_size > space)
+ goto get_key_space;
+ if ((ret = __bam_bulk_overflow(
+ dbc, key_size, pgno, np)) != 0)
+ return (ret);
+ space -= key_size;
+ key_off = (u_int32_t)(np - dbuf);
+ np += key_size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+get_key_space:
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ pagesize, 1024);
+ return
+ (DB_BUFFER_SMALL);
+ }
+ goto back_up;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ key_size = LEN_HKEY(dbp, pg, pagesize, indx);
+ key_off = ((inp[indx] - HOFFSET(pg)) +
+ (u_int32_t)(dp - dbuf)) +
+ SSZA(HKEYDATA, data);
+ }
+ }
+
+ hk = H_PAIRDATA(dbp, pg, indx);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_DUPLICATE:
+ case H_KEYDATA:
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+back_up:
+ if (indx != 0) {
+ indx -= 2;
+ /* XXX
+ * It's not clear that this is
+ * the right way to fix this,
+ * but here goes.
+ * If we are backing up onto a
+ * duplicate, then we need to
+ * position ourselves at the
+ * end of the duplicate set.
+ * We probably need to make
+ * this work for H_OFFDUP too.
+ * It might be worth making a
+ * dummy cursor and calling
+ * __ham_item_prev.
+ */
+ tmp = H_PAIRDATA(dbp, pg, indx);
+ if (HPAGE_PTYPE(tmp) ==
+ H_DUPLICATE) {
+ dup_off = dup_tlen =
+ LEN_HDATA(dbp, pg,
+ pagesize, indx + 1);
+ memcpy(&dup_len,
+ HKEYDATA_DATA(tmp),
+ sizeof(db_indx_t));
+ } else {
+ is_dup = 0;
+ dup_len = 0;
+ dup_off = 0;
+ dup_tlen = 0;
+ F_CLR(cp, H_ISDUP);
+ }
+ goto get_space;
+ }
+ /* indx == 0 */
+ cp->dup_len = dup_len;
+ cp->dup_off = dup_off;
+ cp->dup_tlen = dup_tlen;
+ if ((ret = __ham_item_prev(dbc,
+ lock_mode, &pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page,
+ dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+ if (cp->bucket == 0) {
+ cp->indx = indx =
+ NDX_INVALID;
+ goto get_space;
+ }
+ if ((ret =
+ __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ cp->bucket--;
+ cp->pgno = BUCKET_TO_PAGE(cp,
+ cp->bucket);
+ cp->indx = NDX_INVALID;
+ if ((ret = __ham_release_meta(
+ dbc)) != 0)
+ return (ret);
+ /*
+					 * It is not an error to get
+					 * DB_NOTFOUND; we're just at
+					 * the beginning of the db.
+ */
+ if ((ret = __ham_item_prev(dbc,
+ lock_mode, &pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ else
+ ret = 0;
+ }
+ }
+ indx = cp->indx;
+get_space:
+ /*
+ * See if we put any data in the buffer.
+ */
+ if (offp >= endp ||
+ F_ISSET(dbc, DBC_TRANSIENT)) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ /*
+ * Don't continue; we're all out
+ * of space, even though we're
+ * returning success.
+ */
+ next_key = 0;
+ break;
+ }
+ memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+
+ /*
+ * We're about to crack the offset(s) and length(s)
+ * out of an H_KEYDATA or H_DUPLICATE item.
+ * There are three cases:
+ * 1. We were moved into a duplicate set by
+ * the standard hash cursor code. Respect
+ * the dup_off and dup_tlen we were given.
+ * 2. We stumbled upon a duplicate set while
+ * walking the page on our own. We need to
+ * recognize it as a dup and set dup_off and
+ * dup_tlen.
+ * 3. The current item is not a dup.
+ */
+ if (F_ISSET(cp, H_ISDUP)) {
+ /* Case 1 */
+ is_dup = 1;
+ dup_len = cp->dup_len;
+ dup_off = cp->dup_off;
+ dup_tlen = cp->dup_tlen;
+ } else if (HPAGE_PTYPE(hk) == H_DUPLICATE) {
+ /* Case 2 */
+ is_dup = 1;
+ /*
+ * If we run out of memory and bail,
+ * make sure the fact we're in a dup set
+ * isn't ignored later.
+ */
+ F_SET(cp, H_ISDUP);
+ dup_off = 0;
+ memcpy(&dup_len,
+ HKEYDATA_DATA(hk), sizeof(db_indx_t));
+ dup_tlen = LEN_HDATA(dbp, pg, pagesize, indx);
+ } else {
+ /* Case 3 */
+ is_dup = 0;
+ dup_len = 0;
+ dup_off = 0;
+ dup_tlen = 0;
+ }
+
+ do {
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ size += (is_key ? 4 : 2) * sizeof(*offp);
+ /*
+			 * Since space is unsigned, if we happen to wrap
+			 * then this comparison will turn out to be true.
+			 * XXX Wouldn't it be better to simply check above
+			 * that space is greater than the value we're about
+			 * to subtract?
+ */
+ if (space > data->ulen) {
+ if (!is_dup || dup_off == 0)
+ goto back_up;
+ dup_off -= (db_indx_t)
+ DUP_SIZE((u_int32_t)offp[1]);
+ goto get_space;
+ }
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ if (is_dup) {
+ *offp-- = (int32_t)(
+ ((inp[indx + 1] - HOFFSET(pg)) +
+ dp - dbuf) + SSZA(HKEYDATA, data) +
+ dup_off + sizeof(db_indx_t));
+ memcpy(&dup_len,
+ HKEYDATA_DATA(hk) + dup_off,
+ sizeof(db_indx_t));
+ dup_off += DUP_SIZE(dup_len);
+ *offp-- = dup_len;
+ } else {
+ *offp-- = (int32_t)(
+ ((inp[indx + 1] - HOFFSET(pg)) +
+ dp - dbuf) + SSZA(HKEYDATA, data));
+ *offp-- = LEN_HDATA(dbp, pg,
+ pagesize, indx);
+ }
+ } while (is_dup && dup_off < dup_tlen && no_dup == 0);
+ F_CLR(cp, H_ISDUP);
+ break;
+ case H_OFFDUP:
+ memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ space -= 2 * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ if (is_key) {
+ space -= 2 * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ saveoff = offp;
+ if ((ret = __bam_bulk_duplicates(dbc,
+ pgno, dbuf, is_key ? offp + 2 : NULL,
+ &offp, &np, &space, no_dup)) != 0) {
+ if (ret == DB_BUFFER_SMALL) {
+ size = space;
+ space = 0;
+ if (is_key && saveoff == offp) {
+ offp += 2;
+ goto back_up;
+ }
+ goto get_space;
+ }
+ return (ret);
+ }
+ break;
+ case H_OFFPAGE:
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ memcpy(&size, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if (size > space)
+ goto back_up;
+
+ if ((ret =
+ __bam_bulk_overflow(dbc, size, pgno, np)) != 0)
+ return (ret);
+
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+
+ *offp-- = (int32_t)(np - dbuf);
+ *offp-- = (int32_t)size;
+
+ np += size;
+ space -= size;
+ break;
+ default:
+ /* Do nothing. */
+ break;
+ }
+ } while (next_key && (indx += 2) < NUM_ENT(pg));
+
+ cp->indx = indx;
+ cp->dup_len = dup_len;
+ cp->dup_off = dup_off;
+ cp->dup_tlen = dup_tlen;
+
+	/* If we are off the page then try the next page. */
+ if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
+ if ((ret = __ham_item_next(dbc, lock_mode, &pgno)) == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __memp_fput(dbc->dbp->mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ cp->bucket++;
+ if (cp->bucket > cp->hdr->max_bucket) {
+ /*
+ * Restore cursor to its previous state. We're past
+ * the last item in the last bucket, so the next
+ * DBC->get(DB_NEXT) will return DB_NOTFOUND.
+ */
+ cp->bucket--;
+ ret = DB_NOTFOUND;
+ } else {
+ /*
+ * Start on the next bucket.
+ *
+ * Note that if this new bucket happens to be empty,
+ * but there's another non-empty bucket after it,
+ * we'll return early. This is a rare case, and we
+ * don't guarantee any particular number of keys
+ * returned on each call, so just let the next call
+ * to bulk get move forward by yet another bucket.
+ */
+ cp->pgno = BUCKET_TO_PAGE(cp, cp->bucket);
+ cp->indx = NDX_INVALID;
+ F_CLR(cp, H_ISDUP);
+ ret = __ham_item_next(dbc, lock_mode, &pgno);
+ }
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0)
+ return (t_ret);
+ if (ret == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ }
+ *offp = -1;
+ return (0);
+}
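+
+/*
+ * Shape of the bulk buffer filled in above: page images and overflow data
+ * grow up from the start of the user's buffer while the int32_t
+ * offset/length table grows down from the end, terminated by a -1 offset.
+ * A caller would typically walk the result with the documented bulk
+ * macros; a hedged sketch (use() is hypothetical):
+ *
+ *	void *p, *kp, *dp;
+ *	u_int32_t klen, dlen;
+ *
+ *	for (DB_MULTIPLE_INIT(p, &data);;) {
+ *		DB_MULTIPLE_KEY_NEXT(p, &data, kp, klen, dp, dlen);
+ *		if (p == NULL)
+ *			break;
+ *		use(kp, klen, dp, dlen);
+ *	}
+ */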
+
+static int
+__hamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT tmp_val, *myval;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ u_int32_t nbytes;
+ int ret, t_ret;
+
+ /*
+	 * The compiler doesn't realize that we only use myval when ret is
+	 * equal to 0, and that in that case we must have set it. So, we
+	 * initialize it here to quiet the compiler.
+ */
+ COMPQUIET(myval, NULL);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED) && flags != DB_KEYFIRST &&
+ flags != DB_KEYLAST && flags != DB_OVERWRITE_DUP)
+ return (DB_NOTFOUND);
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err1;
+
+ switch (flags) {
+ case DB_KEYLAST:
+ case DB_KEYFIRST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
+ HKEYDATA_PSIZE(key->size)) +
+ (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
+ HKEYDATA_PSIZE(data->size));
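+		/*
+		 * Illustration (assumed sizes only): a small key paired with
+		 * a data item big enough to be pushed off-page contributes
+		 * HKEYDATA_PSIZE(key->size) + HOFFPAGE_PSIZE -- the pair's
+		 * on-page footprint, not its raw byte count.
+		 */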
+ if ((ret = __ham_lookup(dbc,
+ key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
+ if (hcp->seek_found_page != PGNO_INVALID &&
+ hcp->seek_found_page != hcp->pgno) {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0)
+ goto err2;
+ hcp->page = NULL;
+ hcp->pgno = hcp->seek_found_page;
+ hcp->indx = NDX_INVALID;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
+ /*
+ * A partial put, but the key does not exist
+ * and we are not beginning the write at 0.
+ * We must create a data item padded up to doff
+ * and then write the new bytes represented by
+ * val.
+ */
+ if ((ret = __ham_init_dbt(dbp->env, &tmp_val,
+ data->size + data->doff,
+ &dbc->my_rdata.data,
+ &dbc->my_rdata.ulen)) != 0)
+ goto err2;
+
+ memset(tmp_val.data, 0, data->doff);
+ memcpy((u_int8_t *)tmp_val.data +
+ data->doff, data->data, data->size);
+ myval = &tmp_val;
+ } else
+ myval = (DBT *)data;
+
+ ret = __ham_add_el(dbc, key, myval, H_KEYDATA);
+ goto done;
+ } else if (ret == 0 && flags == DB_NOOVERWRITE &&
+ !F_ISSET(hcp, H_DELETED)) {
+ if (*pgnop == PGNO_INVALID)
+ ret = DB_KEYEXIST;
+ else
+ ret = __bam_opd_exists(dbc, *pgnop);
+ if (ret != 0)
+ goto done;
+ }
+ break;
+ case DB_BEFORE:
+ case DB_AFTER:
+ case DB_CURRENT:
+ ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__hamc_put", flags);
+ break;
+ }
+
+ /*
+	 * Invalidate any insert index found so it is not reused
+	 * by future inserts.
+ */
+ hcp->seek_found_page = PGNO_INVALID;
+ hcp->seek_found_indx = NDX_INVALID;
+
+ if (*pgnop == PGNO_INVALID && ret == 0) {
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto done;
+ if (flags == DB_CURRENT ||
+ (!(F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK)) &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NODUPDATA || flags == DB_OVERWRITE_DUP)))
+ ret = __ham_overwrite(dbc, data, flags);
+ else
+ ret = __ham_add_dup(dbc, data, flags, pgnop);
+ }
+
+done: if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0)
+ hcp->page = NULL;
+ }
+
+ if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
+ ret = __ham_expand_table(dbc);
+ F_CLR(hcp, H_EXPAND);
+ /* If we are out of space, ignore the error. */
+ if (ret == ENOSPC && dbc->txn == NULL)
+ ret = 0;
+ } else if (ret == 0 && F_ISSET(hcp, H_CONTRACT)) {
+ if (!F_ISSET(dbp, DB_AM_REVSPLITOFF))
+ ret = __ham_contract_table(dbc, NULL);
+ F_CLR(hcp, H_CONTRACT);
+ }
+
+err2: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+err1: return (ret);
+}
+
+/********************************* UTILITIES ************************/
+
+/*
+ * __ham_contract_table -- remove the last bucket.
+ * PUBLIC: int __ham_contract_table __P((DBC *, DB_COMPACT *));
+ */
+int
+__ham_contract_table(dbc, c_data)
+ DBC *dbc;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HMETA *hdr;
+ PAGE *h;
+ db_pgno_t maxpgno, stoppgno;
+ int drop_segment, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = NULL;
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hdr = hcp->hdr;
+
+ if ((ret = __ham_merge_pages(dbc,
+ hdr->max_bucket & hdr->low_mask, hdr->max_bucket, c_data)) != 0)
+ return (ret);
+
+ maxpgno = BUCKET_TO_PAGE(hcp, hdr->max_bucket);
+ drop_segment = hdr->max_bucket == (hdr->low_mask + 1);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_contract_log(dbp, dbc->txn, &LSN(hdr),
+ 0, PGNO(hdr), &LSN(hdr), hdr->max_bucket, maxpgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(hdr));
+
+ hdr->max_bucket--;
+ /*
+ * If we are dropping a segment then adjust the spares table and masks
+ * and free the pages in that segment.
+ */
+ if (drop_segment) {
+ LOCK_CHECK_OFF(dbc->thread_info);
+ hdr->spares[__db_log2(hdr->max_bucket + 1) + 1] = PGNO_INVALID;
+ hdr->high_mask = hdr->low_mask;
+ hdr->low_mask >>= 1;
+ stoppgno = maxpgno + hdr->max_bucket + 1;
+ do {
+ if ((ret = __memp_fget(mpf, &maxpgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ break;
+ if ((ret = __db_free(dbc, h, 0)) != 0)
+ break;
+ ret = 0;
+ } while (++maxpgno < stoppgno);
+ LOCK_CHECK_ON(dbc->thread_info);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __ham_expand_table --
+ */
+static int
+__ham_expand_table(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBMETA *mmeta;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *h;
+ db_pgno_t pgno, mpgno;
+ u_int32_t logn, newalloc, new_bucket, old_bucket;
+ int got_meta, new_double, ret, t_ret;
+
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+
+ LOCK_INIT(metalock);
+ mmeta = (DBMETA *) hcp->hdr;
+ mpgno = mmeta->pgno;
+ h = NULL;
+ newalloc = 0;
+ got_meta = 0;
+
+ /*
+ * If the split point is about to increase, make sure that we
+ * have enough extra pages. The calculation here is weird.
+ * We'd like to do this after we've upped max_bucket, but it's
+ * too late then because we've logged the meta-data split. What
+ * we'll do between then and now is increment max bucket and then
+ * see what the log of one greater than that is; here we have to
+ * look at the log of max + 2. VERY NASTY STUFF.
+ *
+ * We figure out what we need to do, then we log it, then request
+ * the pages from mpool. We don't want to fail after extending
+ * the file.
+ *
+ * If the page we are about to split into has already been allocated,
+ * then we simply need to get it to get its LSN. If it hasn't yet
+	 * been allocated, then we know its LSN is (0,0).
+ */
+
+ new_bucket = hcp->hdr->max_bucket + 1;
+ old_bucket = new_bucket & hcp->hdr->low_mask;
+
+ new_double = hcp->hdr->max_bucket == hcp->hdr->high_mask;
+ logn = __db_log2(new_bucket);
+
+ if (!new_double || hcp->hdr->spares[logn + 1] != PGNO_INVALID) {
+ /* Page exists; get it so we can get its LSN */
+ pgno = BUCKET_TO_PAGE(hcp, new_bucket);
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+ lsn = h->lsn;
+ } else {
+ /* Get the master meta-data page to do allocation. */
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ mpgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &mpgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &mmeta)) != 0)
+ goto err;
+ got_meta = 1;
+ }
+ pgno = mmeta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ newalloc = 1;
+ }
+
+ /* Log the meta-data split first. */
+ if (DBC_LOGGING(dbc)) {
+ /*
+ * We always log the page number of the first page of
+ * the allocation group. However, the LSN that we log
+ * is either the LSN on the first page (if we did not
+ * do the actual allocation here) or the LSN on the last
+ * page of the unit (if we did do the allocation here).
+ */
+ if ((ret = __ham_metagroup_log(dbp, dbc->txn,
+ &lsn, 0, hcp->hdr->max_bucket, mpgno, &mmeta->lsn,
+ hcp->hdr->dbmeta.pgno, &hcp->hdr->dbmeta.lsn,
+ pgno, &lsn, newalloc, mmeta->last_pgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(lsn);
+
+ hcp->hdr->dbmeta.lsn = lsn;
+
+ if (new_double && hcp->hdr->spares[logn + 1] == PGNO_INVALID) {
+ /*
+ * We need to begin a new doubling and we have not allocated
+ * any pages yet. Read the last page in and initialize it to
+ * make the allocation contiguous. The pgno we calculated
+ * above is the first page allocated. The entry in spares is
+ * that page number minus any buckets already allocated (it
+	 * simplifies bucket-to-page translation). After we've set
+ * that, we calculate the last pgno.
+ */
+
+ pgno += hcp->hdr->max_bucket;
+
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ hcp->hdr->spares[logn + 1] =
+ (pgno - new_bucket) - hcp->hdr->max_bucket;
+ mmeta->last_pgno = pgno;
+ mmeta->lsn = lsn;
+
+ P_INIT(h, dbp->pgsize,
+ pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ }
+
+ /* Write out whatever page we ended up modifying. */
+ h->lsn = lsn;
+ if ((ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ /*
+ * Update the meta-data page of this hash database.
+ */
+ hcp->hdr->max_bucket = new_bucket;
+ if (new_double) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
+ }
+
+err: if (got_meta)
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, mmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL)
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Relocate records to the new bucket -- after releasing metapage. */
+ if (ret == 0)
+ ret = __ham_split_page(dbc, old_bucket, new_bucket);
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
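+
+/*
+ * Illustration of the doubling bookkeeping above (hypothetical numbers):
+ * with max_bucket == 3, low_mask == 0x1 and high_mask == 0x3, adding
+ * bucket 4 starts a new doubling, so afterwards low_mask == 0x3 and
+ * high_mask == (new_bucket | low_mask) == 0x7.
+ */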
+
+/*
+ * PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, u_int32_t));
+ */
+u_int32_t
+__ham_call_hash(dbc, k, len)
+ DBC *dbc;
+ u_int8_t *k;
+ u_int32_t len;
+{
+ DB *dbp;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ u_int32_t n, bucket;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hashp = dbp->h_internal;
+
+ n = (u_int32_t)(hashp->h_hash(dbp, k, len));
+
+ bucket = n & hcp->hdr->high_mask;
+ if (bucket > hcp->hdr->max_bucket)
+ bucket = bucket & hcp->hdr->low_mask;
+ return (bucket);
+}
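+
+/*
+ * Worked example of the masking above (hypothetical values): with
+ * max_bucket == 5, high_mask == 0x7 and low_mask == 0x3, a hash of 6
+ * masks to bucket 6, which does not exist yet, so it is re-masked with
+ * low_mask to bucket 2 -- the bucket that has not yet split in this
+ * doubling. A hash of 4 masks to bucket 4 and is used directly.
+ */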
+
+/*
+ * Check for duplicates, and call __db_ret appropriately. Release
+ * everything held by the cursor.
+ */
+static int
+__ham_dup_return(dbc, val, flags)
+ DBC *dbc;
+ DBT *val;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *myval, tmp_val;
+ HASH_CURSOR *hcp;
+ PAGE *pp;
+ db_indx_t ndx;
+ db_pgno_t pgno;
+ u_int32_t off, tlen;
+ u_int8_t *hk, type;
+ int cmp, ret;
+ db_indx_t len;
+
+ /* Check for duplicate and return the first one. */
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ndx = H_DATAINDEX(hcp->indx);
+ type = HPAGE_TYPE(dbp, hcp->page, ndx);
+ pp = hcp->page;
+ myval = val;
+
+ /*
+ * There are 4 cases:
+	 * 1. We are not in a duplicate set: simply return; the upper layer
+ * will do the right thing.
+ * 2. We are looking at keys and stumbled onto a duplicate.
+ * 3. We are in the middle of a duplicate set. (ISDUP set)
+ * 4. We need to check for particular data match.
+ */
+
+ /* We should never get here with off-page dups. */
+ DB_ASSERT(dbp->env, type != H_OFFDUP);
+
+ /* Case 1 */
+ if (type != H_DUPLICATE && flags != DB_GET_BOTH &&
+ flags != DB_GET_BOTHC && flags != DB_GET_BOTH_RANGE)
+ return (0);
+
+ /*
+ * Here we check for the case where we just stumbled onto a
+ * duplicate. In this case, we do initialization and then
+ * let the normal duplicate code handle it. (Case 2)
+ */
+ if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) {
+ F_SET(hcp, H_ISDUP);
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx);
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (flags == DB_LAST ||
+ flags == DB_PREV || flags == DB_PREV_NODUP) {
+ hcp->dup_off = 0;
+ do {
+ memcpy(&len,
+ HKEYDATA_DATA(hk) + hcp->dup_off,
+ sizeof(db_indx_t));
+ hcp->dup_off += DUP_SIZE(len);
+ } while (hcp->dup_off < hcp->dup_tlen);
+ hcp->dup_off -= DUP_SIZE(len);
+ } else {
+ memcpy(&len,
+ HKEYDATA_DATA(hk), sizeof(db_indx_t));
+ hcp->dup_off = 0;
+ }
+ hcp->dup_len = len;
+ }
+
+ /*
+ * If we are retrieving a specific key/data pair, then we
+ * may need to adjust the cursor before returning data.
+ * Case 4
+ */
+ if (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
+ if (F_ISSET(hcp, H_ISDUP)) {
+ /*
+ * If we're doing a join, search forward from the
+ * current position, not the beginning of the dup set.
+ */
+ if (flags == DB_GET_BOTHC)
+ F_SET(hcp, H_CONTINUE);
+
+ __ham_dsearch(dbc, val, &off, &cmp, flags);
+
+ /*
+ * This flag is set nowhere else and is safe to
+ * clear unconditionally.
+ */
+ F_CLR(hcp, H_CONTINUE);
+ hcp->dup_off = off;
+ } else {
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
+ memcpy(&tlen,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, val, pgno, tlen,
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ cmp = -cmp;
+ } else {
+ /*
+ * We do not zero tmp_val since the comparison
+ * routines may only look at data and size.
+ */
+ tmp_val.data = HKEYDATA_DATA(hk);
+ tmp_val.size = LEN_HDATA(dbp, hcp->page,
+ dbp->pgsize, hcp->indx);
+ cmp = dbp->dup_compare == NULL ?
+ __bam_defcmp(dbp, &tmp_val, val) :
+ dbp->dup_compare(dbp, &tmp_val, val);
+ }
+
+ if (cmp > 0 && flags == DB_GET_BOTH_RANGE &&
+ F_ISSET(dbp, DB_AM_DUPSORT))
+ cmp = 0;
+ }
+
+ if (cmp != 0)
+ return (DB_NOTFOUND);
+ }
+
+ /*
+ * If we've already got the data for this value, or we're doing a bulk
+ * get, we don't want to return the data.
+ */
+ if (F_ISSET(dbc, DBC_MULTIPLE | DBC_MULTIPLE_KEY) ||
+ F_ISSET(val, DB_DBT_ISSET))
+ return (0);
+
+ /*
+	 * Now that everything is initialized, grab a duplicate if
+ * necessary.
+ */
+ if (F_ISSET(hcp, H_ISDUP)) { /* Case 3 */
+ /*
+ * Copy the DBT in case we are retrieving into user
+ * memory and we need the parameters for it. If the
+ * user requested a partial, then we need to adjust
+ * the user's parameters to get the partial of the
+ * duplicate which is itself a partial.
+ */
+ memcpy(&tmp_val, val, sizeof(*val));
+
+ if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
+ /*
+ * Take the user's length unless it would go
+ * beyond the end of the duplicate.
+ */
+ if (tmp_val.doff > hcp->dup_len)
+ tmp_val.dlen = 0;
+ else if (tmp_val.dlen + tmp_val.doff > hcp->dup_len)
+ tmp_val.dlen = hcp->dup_len - tmp_val.doff;
+
+ } else {
+ F_SET(&tmp_val, DB_DBT_PARTIAL);
+ tmp_val.dlen = hcp->dup_len;
+ tmp_val.doff = 0;
+ }
+
+ /*
+ * Set offset to the appropriate place within the
+ * current duplicate -- need to take into account
+ * both the dup_off and the current duplicate's
+ * length.
+ */
+ tmp_val.doff += hcp->dup_off + sizeof(db_indx_t);
+
+ myval = &tmp_val;
+ }
+
+ /*
+ * Finally, if we had a duplicate, pp, ndx, and myval should be
+ * set appropriately.
+ */
+ if ((ret = __db_ret(dbc, pp, ndx, myval,
+ &dbc->rdata->data, &dbc->rdata->ulen)) != 0) {
+ if (ret == DB_BUFFER_SMALL)
+ val->size = myval->size;
+ return (ret);
+ }
+
+ /*
+	 * In case we sent a temporary DBT off to __db_ret, set the real
+ * return values.
+ */
+ val->data = myval->data;
+ val->size = myval->size;
+
+ F_SET(val, DB_DBT_ISSET);
+
+ return (0);
+}
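+
+/*
+ * Example of the partial-read arithmetic above (illustrative numbers): for
+ * a duplicate at dup_off 20 with dup_len 7 and a caller's doff of 3, the
+ * adjusted doff is 20 + sizeof(db_indx_t) + 3 -- skip to the element, skip
+ * its leading length word, then seek 3 bytes into the duplicate itself.
+ */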
+
+/*
+ * Overwrite a record.
+ *
+ * PUBLIC: int __ham_overwrite __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__ham_overwrite(dbc, nval, flags)
+ DBC *dbc;
+ DBT *nval;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *myval, tmp_val, tmp_val2;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ void *newrec;
+ u_int8_t *hk, *p;
+ u_int32_t len, nondup_size;
+ db_indx_t newsize;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (F_ISSET(hcp, H_ISDUP)) {
+ /*
+ * This is an overwrite of a duplicate. We should never
+ * be off-page at this point.
+ */
+ DB_ASSERT(env, hcp->opd == NULL);
+ /* On page dups */
+ if (F_ISSET(nval, DB_DBT_PARTIAL)) {
+ /*
+ * We're going to have to get the current item, then
+ * construct the record, do any padding and do a
+ * replace.
+ */
+ memset(&tmp_val, 0, sizeof(tmp_val));
+ if ((ret =
+ __ham_dup_return(dbc, &tmp_val, DB_CURRENT)) != 0)
+ return (ret);
+
+ /* Figure out new size. */
+ nondup_size = tmp_val.size;
+ newsize = nondup_size;
+
+ /*
+ * Three cases:
+ * 1. strictly append (may need to allocate space
+ * for pad bytes; really gross).
+ * 2. overwrite some and append.
+ * 3. strictly overwrite.
+ */
+ if (nval->doff > nondup_size)
+ newsize +=
+ ((nval->doff - nondup_size) + nval->size);
+ else if (nval->doff + nval->dlen > nondup_size)
+ newsize += nval->size -
+ (nondup_size - nval->doff);
+ else
+ newsize += nval->size - nval->dlen;
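+
+			/*
+			 * Worked example (illustrative numbers): with
+			 * nondup_size == 10, doff 12/size 4 is a strict
+			 * append and yields newsize 16 (two pad bytes);
+			 * doff 8/dlen 4/size 6 overwrites the tail and
+			 * appends, yielding 14; doff 2/dlen 4/size 4 is a
+			 * strict overwrite and leaves newsize at 10.
+			 */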
+
+ /*
+ * Make sure that the new size doesn't put us over
+ * the onpage duplicate size in which case we need
+ * to convert to off-page duplicates.
+ */
+ if (ISBIG(hcp,
+ (hcp->dup_tlen - nondup_size) + newsize)) {
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ if ((ret = __os_malloc(dbp->env,
+ DUP_SIZE(newsize), &newrec)) != 0)
+ return (ret);
+ memset(&tmp_val2, 0, sizeof(tmp_val2));
+ F_SET(&tmp_val2, DB_DBT_PARTIAL);
+
+ /* Construct the record. */
+ p = newrec;
+ /* Initial size. */
+ memcpy(p, &newsize, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+
+ /* First part of original record. */
+ len = nval->doff > tmp_val.size
+ ? tmp_val.size : nval->doff;
+ memcpy(p, tmp_val.data, len);
+ p += len;
+
+ if (nval->doff > tmp_val.size) {
+ /* Padding */
+ memset(p, 0, nval->doff - tmp_val.size);
+ p += nval->doff - tmp_val.size;
+ }
+
+ /* New bytes */
+ memcpy(p, nval->data, nval->size);
+ p += nval->size;
+
+ /* End of original record (if there is any) */
+ if (nval->doff + nval->dlen < tmp_val.size) {
+ len = (tmp_val.size - nval->doff) - nval->dlen;
+ memcpy(p, (u_int8_t *)tmp_val.data +
+ nval->doff + nval->dlen, len);
+ p += len;
+ }
+
+ /* Final size. */
+ memcpy(p, &newsize, sizeof(db_indx_t));
+
+ /*
+ * Make sure that the caller isn't corrupting
+ * the sort order.
+ */
+ if (dbp->dup_compare != NULL) {
+ tmp_val2.data =
+ (u_int8_t *)newrec + sizeof(db_indx_t);
+ tmp_val2.size = newsize;
+ if (dbp->dup_compare(
+ dbp, &tmp_val, &tmp_val2) != 0) {
+ __os_free(env, newrec);
+ return (__db_duperr(dbp, flags));
+ }
+ }
+
+ tmp_val2.data = newrec;
+ tmp_val2.size = DUP_SIZE(newsize);
+ tmp_val2.doff = hcp->dup_off;
+ tmp_val2.dlen = DUP_SIZE(hcp->dup_len);
+
+ ret = __ham_replpair(dbc, &tmp_val2, H_DUPLICATE);
+ __os_free(env, newrec);
+
+ /* Update cursor */
+ if (ret != 0)
+ return (ret);
+
+ if (newsize > nondup_size) {
+ if ((ret = __hamc_update(dbc,
+ (newsize - nondup_size),
+ DB_HAM_CURADJ_ADDMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen += (newsize - nondup_size);
+ } else {
+ if ((ret = __hamc_update(dbc,
+ (nondup_size - newsize),
+ DB_HAM_CURADJ_DELMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen -= (nondup_size - newsize);
+ }
+ hcp->dup_len = newsize;
+ return (0);
+ } else {
+ /* Check whether we need to convert to off page. */
+ if (ISBIG(hcp,
+ (hcp->dup_tlen - hcp->dup_len) + nval->size)) {
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ /* Make sure we maintain sort order. */
+ if (dbp->dup_compare != NULL) {
+ tmp_val2.data =
+ HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
+ hcp->indx)) + hcp->dup_off +
+ sizeof(db_indx_t);
+ tmp_val2.size = hcp->dup_len;
+ if (dbp->dup_compare(
+ dbp, nval, &tmp_val2) != 0) {
+ __db_errx(env, DB_STR("1131",
+ "Existing data sorts differently from put data"));
+ return (EINVAL);
+ }
+ }
+ /* Overwriting a complete duplicate. */
+ if ((ret =
+ __ham_make_dup(dbp->env, nval, &tmp_val,
+ &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ return (ret);
+ /* Now fix what we are replacing. */
+ tmp_val.doff = hcp->dup_off;
+ tmp_val.dlen = DUP_SIZE(hcp->dup_len);
+
+ /* Update cursor */
+ if (nval->size > hcp->dup_len) {
+ if ((ret = __hamc_update(dbc,
+ (nval->size - hcp->dup_len),
+ DB_HAM_CURADJ_ADDMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen += (nval->size - hcp->dup_len);
+ } else {
+ if ((ret = __hamc_update(dbc,
+ (hcp->dup_len - nval->size),
+ DB_HAM_CURADJ_DELMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen -= (hcp->dup_len - nval->size);
+ }
+ hcp->dup_len = (db_indx_t)nval->size;
+ }
+ myval = &tmp_val;
+ } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
+ /* Put/overwrite */
+ memcpy(&tmp_val, nval, sizeof(*nval));
+ F_SET(&tmp_val, DB_DBT_PARTIAL);
+ tmp_val.doff = 0;
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE)
+ memcpy(&tmp_val.dlen,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ else
+ tmp_val.dlen = LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx);
+ myval = &tmp_val;
+ } else
+ /* Regular partial put */
+ myval = nval;
+
+ return (__ham_replpair(dbc, myval,
+ F_ISSET(hcp, H_ISDUP) ? H_DUPLICATE : H_KEYDATA));
+}
+
+/*
+ * Given a key and a cursor, sets the cursor to the page/ndx on which
+ * the key resides. If the key is found, the cursor H_OK flag is set
+ * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set.
+ * If the key is not found, the H_OK flag is not set. If the sought
+ * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields
+ * are set indicating where an add might take place. If it is 0,
+ * none of the cursor pointer fields are valid.
+ * PUBLIC: int __ham_lookup __P((DBC *,
+ * PUBLIC: const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_lookup(dbc, key, sought, mode, pgnop)
+ DBC *dbc;
+ const DBT *key;
+ u_int32_t sought;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int match, ret;
+ u_int8_t *dk;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Set up cursor so that we're looking for space to add an item
+ * as we cycle through the pages looking for the key.
+ */
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+ hcp->seek_size = sought;
+
+ hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+	/* Look through all pages in the bucket for the key. */
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ *pgnop = PGNO_INVALID;
+ if (hcp->indx == NDX_INVALID) {
+ hcp->indx = 0;
+ F_CLR(hcp, H_ISDUP);
+ }
+ while (hcp->pgno != PGNO_INVALID) {
+		/* Are we looking for space to insert an item? */
+ if (hcp->seek_size != 0 &&
+ hcp->seek_found_page == PGNO_INVALID &&
+ hcp->seek_size < P_FREESPACE(dbp, hcp->page)) {
+ hcp->seek_found_page = hcp->pgno;
+ hcp->seek_found_indx = NDX_INVALID;
+ }
+
+ if ((ret = __ham_getindex(dbc, hcp->page, key,
+ H_KEYDATA, &match, &hcp->indx)) != 0)
+ return (ret);
+
+ /*
+		 * If this is the first page in the bucket with space for
+		 * inserting the requested item, store the insert index to
+		 * save having to look it up again later.
+ */
+ if (hcp->seek_found_page == hcp->pgno)
+ hcp->seek_found_indx = hcp->indx;
+
+ if (match == 0) {
+ F_SET(hcp, H_OK);
+ dk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(dk) == H_OFFDUP)
+ memcpy(pgnop, HOFFDUP_PGNO(dk),
+ sizeof(db_pgno_t));
+ return (0);
+ }
+
+		/* Move the cursor to the next page. */
+ if (NEXT_PGNO(hcp->page) == PGNO_INVALID)
+ break;
+ next_pgno = NEXT_PGNO(hcp->page);
+ hcp->indx = 0;
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ }
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+}
+
+/*
+ * __ham_init_dbt --
+ * Initialize a dbt using some possibly already allocated storage
+ * for items.
+ *
+ * PUBLIC: int __ham_init_dbt __P((ENV *,
+ * PUBLIC: DBT *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__ham_init_dbt(env, dbt, size, bufp, sizep)
+ ENV *env;
+ DBT *dbt;
+ u_int32_t size;
+ void **bufp;
+ u_int32_t *sizep;
+{
+ int ret;
+
+ memset(dbt, 0, sizeof(*dbt));
+ if (*sizep < size) {
+ if ((ret = __os_realloc(env, size, bufp)) != 0) {
+ *sizep = 0;
+ return (ret);
+ }
+ *sizep = size;
+ }
+ dbt->data = *bufp;
+ dbt->size = size;
+ return (0);
+}
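+
+/*
+ * Typical call pattern (sketch only; "needed" is a hypothetical size):
+ * grow and reuse a cursor-owned buffer, as the partial-put path above
+ * does with dbc->my_rdata:
+ *
+ *	if ((ret = __ham_init_dbt(env, &tmp_val, needed,
+ *	    &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ *		return (ret);
+ */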
+
+/*
+ * Adjust the cursor after an insert or delete. The cursor passed is
+ * the one that was operated upon; we just need to check any of the
+ * others.
+ *
+ * len indicates the length of the item added/deleted
+ * add indicates if the item indicated by the cursor has just been
+ * added (add == 1) or deleted (add == 0).
+ * dup indicates if the addition occurred into a duplicate set.
+ *
+ * PUBLIC: int __hamc_update
+ * PUBLIC: __P((DBC *, u_int32_t, db_ham_curadj, int));
+ */
+static int
+__hamc_update_getorder(cp, dbc, orderp, pgno, is_dup, args)
+ DBC *dbc, *cp;
+ u_int32_t *orderp;
+ db_pgno_t pgno;
+ u_int32_t is_dup;
+ void *args;
+{
+ HASH_CURSOR *hcp, *lcp;
+
+ COMPQUIET(args, NULL);
+ COMPQUIET(pgno, 0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (cp == dbc || cp->dbtype != DB_HASH)
+ return (0);
+ lcp = (HASH_CURSOR *)cp->internal;
+ if (F_ISSET(lcp, H_DELETED) &&
+ hcp->pgno == lcp->pgno &&
+ hcp->indx == lcp->indx &&
+ *orderp < lcp->order &&
+ (!is_dup || hcp->dup_off == lcp->dup_off) &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ *orderp = lcp->order;
+ return (0);
+}
+struct __hamc_update_setorder_args {
+ int was_mod, was_add;
+ u_int32_t len, order;
+ DB_TXN *my_txn;
+};
+
+static int
+__hamc_update_setorder(cp, dbc, foundp, pgno, is_dup, vargs)
+ DBC *dbc, *cp;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t is_dup;
+ void *vargs;
+{
+ HASH_CURSOR *hcp, *lcp;
+ struct __hamc_update_setorder_args *args;
+
+ COMPQUIET(pgno, 0);
+
+ if (cp == dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ lcp = (HASH_CURSOR *)cp->internal;
+
+ if (lcp->pgno != hcp->pgno ||
+ lcp->indx == NDX_INVALID ||
+ MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ return (0);
+
+ args = vargs;
+ /*
+ * We're about to move things out from under this
+ * cursor. Clear any cached streaming information.
+ */
+ lcp->stream_start_pgno = PGNO_INVALID;
+
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+
+ if (!is_dup) {
+ if (args->was_add == 1) {
+ /*
+ * This routine is not called to add
+ * non-dup records which are always put
+ * at the end. It is only called from
+ * recovery in this case and the
+ * cursor will be marked deleted.
+ * We are "undeleting" so unmark all
+ * cursors with the same order.
+ */
+ if (lcp->indx == hcp->indx &&
+ F_ISSET(lcp, H_DELETED)) {
+ if (lcp->order == hcp->order)
+ F_CLR(lcp, H_DELETED);
+ else if (lcp->order >
+ hcp->order) {
+
+ /*
+ * If we've moved this cursor's
+ * index, split its order
+ * number--i.e., decrement it by
+ * enough so that the lowest
+ * cursor moved has order 1.
+					 * hcp->order is the split
+					 * point, so decrement by it.
+ */
+ lcp->order -=
+ hcp->order;
+ lcp->indx += 2;
+ }
+ } else if (lcp->indx >= hcp->indx)
+ lcp->indx += 2;
+ } else {
+ if (lcp->indx > hcp->indx) {
+ lcp->indx -= 2;
+ if (lcp->indx == hcp->indx &&
+ F_ISSET(lcp, H_DELETED))
+ lcp->order += args->order;
+ } else if (lcp->indx == hcp->indx &&
+ !F_ISSET(lcp, H_DELETED)) {
+ F_SET(lcp, H_DELETED);
+ F_CLR(lcp, H_ISDUP);
+ lcp->order = args->order;
+ }
+ }
+ } else if (lcp->indx == hcp->indx) {
+ /*
+ * Handle duplicates. This routine is only
+ * called for on page dups. Off page dups are
+ * handled by btree/rtree code.
+ */
+ if (args->was_add == 1) {
+ lcp->dup_tlen += args->len;
+ if (lcp->dup_off == hcp->dup_off &&
+ F_ISSET(hcp, H_DELETED) &&
+ F_ISSET(lcp, H_DELETED)) {
+ /* Abort of a delete. */
+ if (lcp->order == hcp->order)
+ F_CLR(lcp, H_DELETED);
+ else if (lcp->order >
+ hcp->order) {
+ lcp->order -=
+						    (hcp->order - 1);
+ lcp->dup_off += args->len;
+ }
+ } else if (lcp->dup_off >
+ hcp->dup_off || (!args->was_mod &&
+ lcp->dup_off == hcp->dup_off))
+ lcp->dup_off += args->len;
+ } else {
+ lcp->dup_tlen -= args->len;
+ if (lcp->dup_off > hcp->dup_off) {
+ lcp->dup_off -= args->len;
+ if (lcp->dup_off ==
+ hcp->dup_off &&
+ F_ISSET(lcp, H_DELETED))
+ lcp->order += args->order;
+ } else if (!args->was_mod &&
+ lcp->dup_off == hcp->dup_off &&
+ !F_ISSET(lcp, H_DELETED)) {
+ F_SET(lcp, H_DELETED);
+ lcp->order = args->order;
+ }
+ }
+ }
+ return (0);
+}
+
+int
+__hamc_update(dbc, len, operation, is_dup)
+ DBC *dbc;
+ u_int32_t len;
+ db_ham_curadj operation;
+ int is_dup;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ HASH_CURSOR *hcp;
+ int ret;
+ u_int32_t found;
+ struct __hamc_update_setorder_args args;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Adjustment will only be logged if this is a subtransaction.
+	 * Only subtransactions can abort and affect their parent
+	 * transaction's cursors.
+ */
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.len = len;
+
+ switch (operation) {
+ case DB_HAM_CURADJ_DEL:
+ args.was_mod = 0;
+ args.was_add = 0;
+ break;
+ case DB_HAM_CURADJ_ADD:
+ args.was_mod = 0;
+ args.was_add = 1;
+ break;
+ case DB_HAM_CURADJ_DELMOD:
+ args.was_mod = 1;
+ args.was_add = 0;
+ break;
+ case DB_HAM_CURADJ_ADDMOD:
+ args.was_mod = 1;
+ args.was_add = 1;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Calculate the order of this deleted record.
+	 * This will be one greater than the order of any cursor that is
+	 * pointing at this record and is already marked as deleted.
+ */
+ if (args.was_add == 0) {
+ if ((ret = __db_walk_cursors(dbp, dbc, __hamc_update_getorder,
+ &args.order, 0, (u_int32_t)is_dup, NULL)) != 0)
+ return (ret);
+ args.order++;
+ hcp->order = args.order;
+ }
+
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_update_setorder, &found, 0, (u_int32_t)is_dup, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_curadj_log(dbp, args.my_txn, &lsn, 0,
+ hcp->pgno, hcp->indx, len, hcp->dup_off,
+ (int)operation, is_dup, args.order)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
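+
+/*
+ * Example of the order calculation above (hypothetical): if two cursors
+ * already sit deleted on the same (pgno, indx) with orders 1 and 2, the
+ * walk in __hamc_update_getorder reports 2, so the cursor doing the new
+ * delete records order 3, keeping multiple deleted cursors at the same
+ * position distinguishable.
+ */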
+
+struct __ham_get_clist_args {
+ u_int nalloc, nused;
+ DBC **listp;
+};
+
+static int
+__ham_get_clist_func(dbc, my_dbc, countp, pgno, indx, vargs)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ int ret;
+ struct __ham_get_clist_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ args = vargs;
+ /*
+ * We match if dbc->pgno matches the specified
+ * pgno, and if either the dbc->indx matches
+ * or we weren't given an index.
+ */
+ if (dbc->internal->pgno == pgno &&
+ (indx == NDX_INVALID ||
+ dbc->internal->indx == indx) &&
+ !MVCC_SKIP_CURADJ(dbc, pgno)) {
+ if (args->nused >= args->nalloc) {
+ args->nalloc += 10;
+ if ((ret = __os_realloc(dbc->dbp->env,
+ args->nalloc * sizeof(HASH_CURSOR *),
+ &args->listp)) != 0)
+ return (ret);
+ }
+ args->listp[args->nused++] = dbc;
+ }
+ return (0);
+}
+/*
+ * __ham_get_clist --
+ *
+ * Get a list of cursors either on a particular bucket or on a particular
+ * page and index combination. The former is so that we can update
+ * cursors on a split. The latter is so we can update cursors when we
+ * move items off page.
+ *
+ * PUBLIC: int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***));
+ */
+int
+__ham_get_clist(dbp, pgno, indx, listp)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ DBC ***listp;
+{
+ ENV *env;
+ int ret;
+ u_int32_t count;
+ struct __ham_get_clist_args args;
+
+ env = dbp->env;
+ args.listp = NULL;
+ args.nalloc = args.nused = 0;
+
+ if ((ret = __db_walk_cursors(dbp, NULL,
+ __ham_get_clist_func, &count, pgno, indx, &args)) != 0)
+ return (ret);
+ if (args.listp != NULL) {
+ if (args.nused >= args.nalloc) {
+ args.nalloc++;
+ if ((ret = __os_realloc(env,
+ args.nalloc * sizeof(HASH_CURSOR *),
+ &args.listp)) != 0)
+ return (ret);
+ }
+ args.listp[args.nused] = NULL;
+ }
+ *listp = args.listp;
+ return (0);
+}
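+
+/*
+ * For illustration, a typical use of the returned list (a sketch that
+ * follows the pattern in __ham_dup_convert() in hash_dup.c): the array
+ * is NULL-terminated and the caller must free it.
+ *
+ *	DBC **hcs;
+ *	int c, ret;
+ *
+ *	if ((ret = __ham_get_clist(dbp, pgno, indx, &hcs)) != 0)
+ *		return (ret);
+ *	for (c = 0; hcs != NULL && hcs[c] != NULL; c++)
+ *		adjust(hcs[c]);
+ *	if (hcs != NULL)
+ *		__os_free(dbp->env, hcs);
+ *
+ * where adjust() stands in for whatever per-cursor fixup is needed.
+ */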
+
+static int
+__hamc_writelock(dbc)
+ DBC *dbc;
+{
+ DB_LOCK tmp_lock;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ /*
+ * All we need do is acquire the lock and let the off-page
+ * dup tree do its thing.
+ */
+ if (!STD_LOCKING(dbc))
+ return (0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+	if (!LOCK_ISSET(hcp->lock) || hcp->lock_mode != DB_LOCK_WRITE) {
+ tmp_lock = hcp->lock;
+ if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) == 0 &&
+ tmp_lock.mode != DB_LOCK_WWRITE)
+ ret = __LPUT(dbc, tmp_lock);
+ }
+ return (ret);
+}
diff --git a/src/hash/hash.src b/src/hash/hash.src
new file mode 100644
index 00000000..e544c6f3
--- /dev/null
+++ b/src/hash/hash.src
@@ -0,0 +1,328 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+DBPRIVATE
+PREFIX __ham
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/hash.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
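+/*
+ * NOTE: this description language is processed by gen_rec.awk; the
+ * semantics sketched here are an assumption based on the generated
+ * hash_auto.c and hash_autop.c. Each BEGIN name version id ... END
+ * block is expanded into a __ham_<name>_args structure, a
+ * __ham_<name>_log() routine and a __ham_<name>_desc[] table. ARG
+ * lines are plain arguments with a printf format, DB is the database's
+ * log file id, POINTER logs the value a pointer refers to (here, page
+ * LSNs), DBT/HDR/PGDBT lines are byte strings, OP is an operation/type
+ * code, and BEGIN_COMPAT blocks describe older record versions kept so
+ * that old log files remain readable.
+ */
+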
+/*
+ * HASH-insdel: used for hash to insert/delete a pair of entries on a master
+ * page. The pair might be a regular key/data pair, or it might be the
+ * structures that refer to off-page items, duplicates or off-page duplicates.
+ * opcode - PUTPAIR/DELPAIR + big masks
+ * fileid - identifies the file referenced
+ * pgno - page within file
+ * ndx - index on the page of the item being added (item index)
+ * pagelsn - lsn on the page before the update
+ * key - the key being inserted
+ * data - the data being inserted
+ */
+BEGIN insdel 50 21
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+OP keytype u_int32_t lu
+HDR key DBT s
+OP datatype u_int32_t lu
+HDR data DBT s
+END
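+
+/*
+ * For illustration (a sketch assuming the usual gen_rec.awk expansion;
+ * the field list follows __ham_insdel_desc[] in hash_auto.c): after the
+ * common log-record header fields, the argument structure for the
+ * record above looks roughly like
+ *
+ *	typedef struct ___ham_insdel_args {
+ *		...header fields...
+ *		u_int32_t opcode;
+ *		int32_t fileid;
+ *		db_pgno_t pgno;
+ *		u_int32_t ndx;
+ *		DB_LSN pagelsn;
+ *		u_int32_t keytype;
+ *		DBT key;
+ *		u_int32_t datatype;
+ *		DBT data;
+ *	} __ham_insdel_args;
+ */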
+
+BEGIN_COMPAT insdel 42 21
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+DBT key DBT s
+DBT data DBT s
+END
+
+/*
+ * Used to add and remove overflow pages.
+ * prev_pgno is the previous page that is going to get modified to
+ * point to this one. If this is the first page in a chain
+ * then prev_pgno should be PGNO_INVALID.
+ * new_pgno is the page being allocated.
+ * next_pgno is the page that follows this one. On allocation,
+ * this should be PGNO_INVALID. For deletes, it may exist.
+ * pagelsn is the old lsn on the page.
+ */
+BEGIN newpage 42 22
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG prev_pgno db_pgno_t lu
+POINTER prevlsn DB_LSN * lu
+ARG new_pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG next_pgno db_pgno_t lu
+POINTER nextlsn DB_LSN * lu
+END
+
+/*
+ * Splitting requires two types of log messages; this one logs the
+ * data on the original page. To redo the split, we have to visit the
+ * new page (or pages) and add the items back on the page if they are not
+ * yet there.
+ */
+BEGIN splitdata 42 24
+DB fileid int32_t ld
+ARG opcode u_int32_t lu
+ARG pgno db_pgno_t lu
+PGDBT pageimage DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * HASH-replace: is used for hash to handle partial puts that only
+ * affect a single master page.
+ * fileid - identifies the file referenced
+ * pgno - page within file
+ * ndx - index on the page of the item being modified (item index)
+ * pagelsn - lsn on the page before the update
+ * off - offset in the old item where the new item is going.
+ * olditem - DBT that describes the part of the item being replaced.
+ * newitem - DBT of the new item.
+ * makedup - this was a replacement that made an item a duplicate.
+ */
+BEGIN replace 50 25
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+ARG off int32_t ld
+OP oldtype u_int32_t lu
+HDR olditem DBT s
+OP newtype u_int32_t lu
+HDR newitem DBT s
+END
+
+BEGIN_COMPAT replace 42 25
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+ARG off int32_t ld
+DBT olditem DBT s
+DBT newitem DBT s
+ARG makedup u_int32_t lu
+END
+
+/*
+ * Used when we empty the first page in a bucket and there are pages after
+ * it. The page after it gets copied into the bucket page (since bucket
+ * pages have to be in fixed locations).
+ * pgno: the bucket page
+ * pagelsn: the old LSN on the bucket page
+ * next_pgno: the page number of the next page
+ * nnext_pgno: page after next_pgno (may need to change its prev)
+ * nnextlsn: the LSN of nnext_pgno.
+ */
+BEGIN copypage 42 28
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG next_pgno db_pgno_t lu
+POINTER nextlsn DB_LSN * lu
+ARG nnext_pgno db_pgno_t lu
+POINTER nnextlsn DB_LSN * lu
+PGDBT page DBT s
+END
+
+/*
+ * This record logs the meta-data aspects of a split operation. It has enough
+ * information to record both an individual page allocation and a group
+ * allocation, which we do because in sub databases the pages in a hash
+ * doubling must be contiguous. If we do a group allocation, the number of
+ * pages allocated is bucket + 1, and pgno is the page number of the first
+ * newly allocated bucket.
+ *
+ * bucket: Old maximum bucket number.
+ * mmpgno: Master meta-data page number (0 if same as mpgno).
+ * mmetalsn: Lsn of the master meta-data page.
+ * mpgno: Meta-data page number.
+ * metalsn: Lsn of the meta-data page.
+ * pgno: Page allocated to bucket + 1 (first newly allocated page)
+ * pagelsn: Lsn of either the first page allocated (if newalloc == 0) or
+ * the last page allocated (if newalloc == 1).
+ * newalloc: 1 indicates that this record did the actual allocation;
+ * 0 indicates that the pages were already allocated from a
+ * previous (failed) allocation.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT metagroup 42 29
+DB fileid int32_t ld
+ARG bucket u_int32_t lu
+ARG mmpgno db_pgno_t lu
+POINTER mmetalsn DB_LSN * lu
+ARG mpgno db_pgno_t lu
+POINTER metalsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG newalloc u_int32_t lu
+END
+
+BEGIN metagroup 43 29
+DB fileid int32_t ld
+ARG bucket u_int32_t lu
+ARG mmpgno db_pgno_t lu
+POINTER mmetalsn DB_LSN * lu
+ARG mpgno db_pgno_t lu
+POINTER metalsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG newalloc u_int32_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * groupalloc
+ *
+ * This is used in conjunction with MPOOL_NEW_GROUP when we are creating
+ * a new database to make sure that we recreate or reclaim free pages
+ * when we allocate a chunk of contiguous ones during database creation.
+ *
+ * meta_lsn: meta-data lsn
+ * start_pgno: starting page number
+ * num: number of allocated pages
+ * unused: unused, historically the meta-data free list page number
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT groupalloc 42 32
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG start_pgno db_pgno_t lu
+ARG num u_int32_t lu
+ARG free db_pgno_t lu
+END
+
+BEGIN groupalloc 43 32
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG start_pgno db_pgno_t lu
+ARG num u_int32_t lu
+ARG unused db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * Changeslot
+ * Change the entry in a spares table slot from the "old" page to the "new"
+ * page.
+ */
+BEGIN changeslot 50 35
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG slot u_int32_t lu
+ARG old db_pgno_t lu
+ARG new db_pgno_t lu
+END
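+
+/*
+ * An illustrative note (an assumption, inferred from __ham_compact_hash()
+ * in hash_compact.c): slot i of the spares table is the base for one
+ * doubling of the table; slot 0 covers bucket 0 and slot i > 0 covers
+ * buckets [2^(i-1), 2^i), so a bucket's page number is found roughly as
+ *
+ *	pgno = bucket + spares[__db_log2(bucket + 1)];
+ */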
+
+/*
+ * Contract
+ * Contract the hash table by removing the last "bucket". "pgno" is the
+ * page number for that bucket.
+ */
+BEGIN contract 50 37
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG bucket u_int32_t lu
+ARG pgno db_pgno_t lu
+END
+
+/*
+ * Records for backing out cursor adjustment.
+ * curadj - added or deleted a record or a dup
+ * within a record.
+ * pgno - page that was affected
+ * indx - index of the record affected.
+ * len - if a dup, its length.
+ * dup_off - if a dup, its offset.
+ * add - 1 if add, 0 if delete.
+ * is_dup - 1 if dup, 0 otherwise.
+ * order - order assigned to this deleted record or dup.
+ *
+ * chgpg - removed a page; the records moved to a new page.
+ * mode - CHGPG page was deleted or records moved to a new page.
+ *	- SPLIT we split a bucket.
+ *	- DUP we converted to off-page duplicates.
+ * old_pgno, new_pgno - old and new page numbers.
+ * old_indx, new_indx - old and new index numbers, NDX_INVALID if
+ *	it affects all records on the page.
+ * For three opcodes new in 3.3 (DB_HAM_DELFIRSTPG, DELMIDPG,
+ * and DELLASTPG), we overload old_indx and new_indx to avoid
+ * needing a new log record type: old_indx stores the only
+ * indx of interest to these records, and new_indx stores the
+ * order that's assigned to the lowest deleted record we're
+ * moving.
+ */
+BEGIN curadj 42 33
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG len u_int32_t lu
+ARG dup_off u_int32_t lu
+ARG add int ld
+ARG is_dup int ld
+ARG order u_int32_t lu
+END
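+
+/*
+ * For illustration, the logging routine generated for this record is
+ * invoked from __hamc_update() roughly as (a sketch of the call that
+ * appears there; my_txn, operation and order are that function's
+ * locals):
+ *
+ *	ret = __ham_curadj_log(dbp, my_txn, &lsn, 0, hcp->pgno,
+ *	    hcp->indx, len, hcp->dup_off, (int)operation, is_dup, order);
+ */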
+
+BEGIN chgpg 42 34
+DB fileid int32_t ld
+ARG mode db_ham_mode ld
+ARG old_pgno db_pgno_t lu
+ARG new_pgno db_pgno_t lu
+ARG old_indx u_int32_t lu
+ARG new_indx u_int32_t lu
+END
+
diff --git a/src/hash/hash_auto.c b/src/hash/hash_auto.c
new file mode 100644
index 00000000..4adb6cd9
--- /dev/null
+++ b/src/hash/hash_auto.c
@@ -0,0 +1,209 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __ham_insdel_desc[] = {
+ {LOGREC_ARG, SSZ(__ham_insdel_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__ham_insdel_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_insdel_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_insdel_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_insdel_args, pagelsn), "pagelsn", ""},
+ {LOGREC_OP, SSZ(__ham_insdel_args, keytype), "keytype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_insdel_args, key), "key", ""},
+ {LOGREC_OP, SSZ(__ham_insdel_args, datatype), "datatype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_insdel_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_insdel_42_desc[] = {
+ {LOGREC_ARG, SSZ(__ham_insdel_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__ham_insdel_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_insdel_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_insdel_42_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_insdel_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_DBT, SSZ(__ham_insdel_42_args, key), "key", ""},
+ {LOGREC_DBT, SSZ(__ham_insdel_42_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_newpage_desc[] = {
+ {LOGREC_ARG, SSZ(__ham_newpage_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__ham_newpage_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_newpage_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_newpage_args, prevlsn), "prevlsn", ""},
+ {LOGREC_ARG, SSZ(__ham_newpage_args, new_pgno), "new_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_newpage_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_newpage_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_newpage_args, nextlsn), "nextlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_splitdata_desc[] = {
+ {LOGREC_DB, SSZ(__ham_splitdata_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_splitdata_args, opcode), "opcode", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_splitdata_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__ham_splitdata_args, pageimage), "pageimage", ""},
+ {LOGREC_POINTER, SSZ(__ham_splitdata_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_replace_desc[] = {
+ {LOGREC_DB, SSZ(__ham_replace_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_replace_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_replace_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_args, off), "off", "%ld"},
+ {LOGREC_OP, SSZ(__ham_replace_args, oldtype), "oldtype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_replace_args, olditem), "olditem", ""},
+ {LOGREC_OP, SSZ(__ham_replace_args, newtype), "newtype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_replace_args, newitem), "newitem", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_replace_42_desc[] = {
+ {LOGREC_DB, SSZ(__ham_replace_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_replace_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, off), "off", "%ld"},
+ {LOGREC_DBT, SSZ(__ham_replace_42_args, olditem), "olditem", ""},
+ {LOGREC_DBT, SSZ(__ham_replace_42_args, newitem), "newitem", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, makedup), "makedup", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_copypage_desc[] = {
+ {LOGREC_DB, SSZ(__ham_copypage_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_copypage_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_copypage_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_copypage_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_copypage_args, nextlsn), "nextlsn", ""},
+ {LOGREC_ARG, SSZ(__ham_copypage_args, nnext_pgno), "nnext_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_copypage_args, nnextlsn), "nnextlsn", ""},
+ {LOGREC_PGDBT, SSZ(__ham_copypage_args, page), "page", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_metagroup_42_desc[] = {
+ {LOGREC_DB, SSZ(__ham_metagroup_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, bucket), "bucket", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, mmpgno), "mmpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_42_args, mmetalsn), "mmetalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, mpgno), "mpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_42_args, metalsn), "metalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, newalloc), "newalloc", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_metagroup_desc[] = {
+ {LOGREC_DB, SSZ(__ham_metagroup_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, bucket), "bucket", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, mmpgno), "mmpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_args, mmetalsn), "mmetalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, mpgno), "mpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_args, metalsn), "metalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, newalloc), "newalloc", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_groupalloc_42_desc[] = {
+ {LOGREC_DB, SSZ(__ham_groupalloc_42_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__ham_groupalloc_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_42_args, start_pgno), "start_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_42_args, num), "num", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_42_args, free), "free", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_groupalloc_desc[] = {
+ {LOGREC_DB, SSZ(__ham_groupalloc_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__ham_groupalloc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, start_pgno), "start_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, num), "num", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, unused), "unused", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_changeslot_desc[] = {
+ {LOGREC_DB, SSZ(__ham_changeslot_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__ham_changeslot_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_changeslot_args, slot), "slot", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_changeslot_args, old), "old", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_changeslot_args, new), "new", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_contract_desc[] = {
+ {LOGREC_DB, SSZ(__ham_contract_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_contract_args, meta), "meta", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_contract_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_contract_args, bucket), "bucket", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_contract_args, pgno), "pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_curadj_desc[] = {
+ {LOGREC_DB, SSZ(__ham_curadj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, len), "len", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, dup_off), "dup_off", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, add), "add", "%ld"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, is_dup), "is_dup", "%ld"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, order), "order", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_chgpg_desc[] = {
+ {LOGREC_DB, SSZ(__ham_chgpg_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, mode), "mode", "%ld"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, old_pgno), "old_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, new_pgno), "new_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, old_indx), "old_indx", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, new_indx), "new_indx", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __ham_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__ham_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_insdel_recover, DB___ham_insdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_newpage_recover, DB___ham_newpage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_splitdata_recover, DB___ham_splitdata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_replace_recover, DB___ham_replace)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_copypage_recover, DB___ham_copypage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_metagroup_recover, DB___ham_metagroup)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_groupalloc_recover, DB___ham_groupalloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_changeslot_recover, DB___ham_changeslot)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_contract_recover, DB___ham_contract)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_curadj_recover, DB___ham_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_chgpg_recover, DB___ham_chgpg)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/hash/hash_autop.c b/src/hash/hash_autop.c
new file mode 100644
index 00000000..f1ef0042
--- /dev/null
+++ b/src/hash/hash_autop.c
@@ -0,0 +1,314 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_HASH
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __ham_insdel_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_insdel_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_insdel", __ham_insdel_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_insdel_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_insdel_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_insdel_42", __ham_insdel_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_newpage_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_newpage_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_newpage", __ham_newpage_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_splitdata_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_splitdata_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_splitdata", __ham_splitdata_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_replace_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_replace_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_replace", __ham_replace_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_replace_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_replace_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_replace_42", __ham_replace_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_copypage_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_copypage_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_copypage", __ham_copypage_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_metagroup_42", __ham_metagroup_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_metagroup", __ham_metagroup_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_groupalloc_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_groupalloc_42", __ham_groupalloc_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_groupalloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_groupalloc", __ham_groupalloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_changeslot_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_changeslot_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_changeslot", __ham_changeslot_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_contract_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_contract_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_contract", __ham_contract_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_curadj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_curadj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_curadj", __ham_curadj_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_chgpg_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_chgpg_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_chgpg", __ham_chgpg_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__ham_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_insdel_print, DB___ham_insdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_newpage_print, DB___ham_newpage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_splitdata_print, DB___ham_splitdata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_replace_print, DB___ham_replace)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_copypage_print, DB___ham_copypage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_metagroup_print, DB___ham_metagroup)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_groupalloc_print, DB___ham_groupalloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_changeslot_print, DB___ham_changeslot)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_contract_print, DB___ham_contract)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_curadj_print, DB___ham_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_chgpg_print, DB___ham_chgpg)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_HASH */
diff --git a/src/hash/hash_compact.c b/src/hash/hash_compact.c
new file mode 100644
index 00000000..83b5ffb1
--- /dev/null
+++ b/src/hash/hash_compact.c
@@ -0,0 +1,549 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc/mp.h"
+
+static int __ham_copy_data __P((DBC *, PAGE *, DB_COMPACT *, int *));
+static int __ham_truncate_overflow __P((DBC *, u_int32_t, DB_COMPACT *, int *));
+
+/*
+ * __ham_compact_int -- internal HASH compaction routine.
+ *
+ * PUBLIC: int __ham_compact_int __P((DBC *,
+ * PUBLIC: DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
+ */
+int
+__ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
+ DBC *dbc;
+ DBT *start, *stop;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+ int *donep;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ db_pgno_t origpgno, pgno;
+ int check_trunc, pgs_done, ret, t_ret;
+ u_int32_t empty_buckets, i, stop_bucket;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ pgs_done = 0;
+ empty_buckets = 0;
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ if (stop != NULL && stop->size != 0)
+ stop_bucket = *(u_int32_t *)stop->data;
+ else
+ stop_bucket = hcp->hdr->max_bucket;
+
+ if (start != NULL && start->size != 0)
+ hcp->bucket = *(u_int32_t *)start->data;
+ else
+ hcp->bucket = 0;
+
+ for (; hcp->bucket <= stop_bucket && ret == 0; hcp->bucket++) {
+ /*
+ * For each bucket first move records toward the head of
+ * the bucket.
+ */
+ hcp->indx = NDX_INVALID;
+ F_CLR(hcp, H_ISDUP);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ pgno = PGNO_INVALID;
+ ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
+
+ /*
+ * If the bucket is empty, just note it, otherwise process it.
+ * If there are any records there must be some in the head
+ * of the bucket.
+ */
+		if (ret == DB_NOTFOUND) {
+ empty_buckets++;
+ c_data->compact_pages_examine++;
+ DB_ASSERT(dbp->env,
+ PREV_PGNO(hcp->page) == PGNO_INVALID &&
+ NEXT_PGNO(hcp->page) == PGNO_INVALID);
+ goto err;
+ } else if (ret != 0)
+ break;
+ c_data->compact_pages_examine++;
+
+ if (NEXT_PGNO(hcp->page) != PGNO_INVALID) {
+ if ((ret =
+ __ham_compact_bucket(dbc, c_data, &pgs_done)) != 0)
+ goto err;
+ pgno = PGNO_INVALID;
+ if ((ret = __ham_item(dbc, DB_LOCK_WRITE, &pgno)) != 0)
+ goto err;
+ }
+
+ /*
+		 * Loop through the items on this page of the bucket and
+		 * process overflow records and off-page duplicate sets.
+ */
+ while (ret == 0) {
+ /* Handle off page duplicate trees. */
+ if (pgno == PGNO_INVALID)
+ goto no_opd;
+ if (check_trunc &&
+ pgno > c_data->compact_truncate) {
+ c_data->compact_pages_examine++;
+ /*
+ * Truncate this page if possible.
+ * We must update the parent here
+ * because the page number is
+ * not aligned.
+ */
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0)
+ break;
+ origpgno = pgno;
+ if ((ret = __db_truncate_root(dbc, hcp->page,
+ H_DATAINDEX(hcp->indx), &pgno, 0)) != 0)
+ break;
+ if (pgno != origpgno) {
+ memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
+ hcp->page, hcp->indx)),
+ &pgno, sizeof(db_pgno_t));
+ pgs_done++;
+ c_data->compact_pages--;
+ }
+ }
+ /*
+ * Compact the off page duplicate tree.
+ */
+ if ((ret = __bam_compact_opd(dbc,
+ pgno, NULL, factor, c_data, &pgs_done)) != 0)
+ break;
+
+no_opd: if (check_trunc && HPAGE_PTYPE(H_PAIRDATA(
+ dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+ /* This is an overflow chain. */
+ if ((ret = __ham_truncate_overflow(dbc,
+ H_DATAINDEX(hcp->indx),
+ c_data, &pgs_done)) != 0)
+ break;
+ }
+
+ /* Check for an overflow key. */
+ if (check_trunc && HPAGE_PTYPE(H_PAIRKEY(
+ dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+ /* This is an overflow chain. */
+ if ((ret = __ham_truncate_overflow(dbc,
+ H_KEYINDEX(hcp->indx),
+ c_data, &pgs_done)) != 0)
+ break;
+ }
+
+ pgno = PGNO_INVALID;
+ ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
+ }
+
+err: if (hcp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ hcp->page = NULL;
+ hcp->pgno = pgno = PGNO_INVALID;
+ /*
+		 * If we are in an auto-transaction and we updated something,
+		 * return to the caller to commit this transaction and
+		 * avoid holding locks. Otherwise process the next bucket.
+		 * We can drop the lock if we did not do anything.
+		 * We must always commit the txn if we are in MVCC,
+		 * as we have dirtied the hash buckets.
+ */
+ if (ret == 0 &&
+ atomic_read(&dbp->mpf->mfp->multiversion) == 0 &&
+ (pgs_done == 0 || dbc->txn == NULL))
+ ret = __LPUT(dbc, hcp->lock);
+ else if (LF_ISSET(DB_AUTO_COMMIT)) {
+ if (ret == 0)
+ hcp->bucket++;
+ break;
+ }
+ }
+ /*
+	 * If we saw any empty buckets and we are freeing space, we
+	 * want to contract the table before dropping the metadata
+	 * page. Wait until we are done with everything else, as we
+	 * need an exclusive lock on the metadata page.
+ */
+ if (ret == 0 && empty_buckets != 0 && LF_ISSET(DB_FREE_SPACE)) {
+ for (i = 0; i < empty_buckets && hcp->hdr->max_bucket > 2; i++)
+ if ((ret = __ham_contract_table(dbc, c_data)) != 0)
+ break;
+ }
+
+ if (ret == 0)
+ ret = __db_retcopy(dbp->env, start, &hcp->bucket,
+ sizeof(hcp->bucket), &start->data, &start->ulen);
+ (void)__ham_release_meta(dbc);
+ c_data->compact_empty_buckets += empty_buckets;
+ if (hcp->bucket > stop_bucket)
+ *donep = 1;
+ return (ret);
+}
+
+/*
+ * __ham_compact_bucket -- move data to as few pages as possible.
+ *
+ * PUBLIC: int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
+ */
+int
+__ham_compact_bucket(dbc, c_data, pgs_donep)
+ DBC *dbc;
+ DB_COMPACT *c_data;
+ int *pgs_donep;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *pg;
+ db_pgno_t pgno;
+ int check_trunc, ret, t_ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ pg = hcp->page;
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ ret = 0;
+
+ pgno = hcp->pgno;
+ do {
+ if (pg == NULL && (ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ break;
+		/* Sort any unsorted pages before adding records to the page. */
+ if (TYPE(pg) == P_HASH_UNSORTED) {
+ if ((ret = __ham_sort_page_cursor(dbc, pg)) != 0)
+ break;
+ (*pgs_donep)++;
+ }
+
+ /* If this is not the head try to move it to a lower page. */
+ if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID &&
+ PGNO(pg) > c_data->compact_truncate &&
+ (ret = __db_exchange_page(dbc, &pg,
+ hcp->page, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ break;
+ if (pgno != PGNO(pg))
+ (*pgs_donep)++;
+
+ if (NEXT_PGNO(pg) == PGNO_INVALID)
+ break;
+ if ((ret = __ham_copy_data(dbc, pg, c_data, pgs_donep)) != 0)
+ break;
+ pgno = NEXT_PGNO(pg);
+ if (pg != hcp->page && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ break;
+ pg = NULL;
+ } while (pgno != PGNO_INVALID);
+
+ if (pg != NULL && pg != hcp->page &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info, pg, dbc->priority)) &&
+ ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __ham_copy_data -- copy as many records as possible from next page
+ */
+static int
+__ham_copy_data(dbc, pg, c_data, pgs_donep)
+ DBC *dbc;
+ PAGE *pg;
+ DB_COMPACT *c_data;
+ int *pgs_donep;
+{
+ DB *dbp;
+ DBC *newdbc;
+ DBT data, key;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp, *ncp;
+ PAGE *nextpage;
+ db_pgno_t origpgno;
+ int i, nument, records, ret, t_ret;
+ u_int32_t len;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ records = 0;
+
+ if ((ret = __dbc_dup(dbc, &newdbc, 0)) != 0)
+ return (ret);
+ ncp = (HASH_CURSOR *)newdbc->internal;
+ ncp->hdr = hcp->hdr;
+
+ /*
+	 * Copy data to the front of the bucket. Loop until either the
+	 * next page is left in place or there is no next page.
+	 * If the next page was not removed, then it still has data
+	 * on it.
+ */
+ origpgno = PGNO_INVALID;
+ while (origpgno != NEXT_PGNO(pg) &&
+ (origpgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
+
+ if ((ret = __memp_fget(mpf, &NEXT_PGNO(pg), dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &nextpage)) != 0)
+ break;
+
+ c_data->compact_pages_examine++;
+ ncp->page = nextpage;
+ ncp->pgno = PGNO(nextpage);
+ ncp->indx = 0;
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ nument = NUM_ENT(nextpage);
+ DB_ASSERT(dbp->env, nument != 0);
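+		/*
+		 * Each successful copy below deletes the pair at index 0
+		 * of nextpage, shifting the remaining entries down, so the
+		 * pair at indices 0/1 is the one examined on every pass.
+		 */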
+ for (i = 0; i < nument; i += 2) {
+ len = LEN_HITEM(dbp, nextpage, dbp->pgsize, 0) +
+ LEN_HITEM(dbp, nextpage, dbp->pgsize, 1) +
+ 2 * sizeof(db_indx_t);
+ if (P_FREESPACE(dbp, pg) < len)
+ continue;
+
+ if ((ret =
+ __ham_copypair(dbc, nextpage, 0, pg, NULL, 1)) != 0)
+ break;
+
+ records++;
+ if ((ret = __ham_del_pair(newdbc,
+ HAM_DEL_IGNORE_OFFPAGE, pg)) != 0)
+ break;
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ ++hcp->hdr->nelem;
+ }
+ }
+ /*
+ * If we moved all the records then __ham_del_pair will
+ * have deleted the nextpage.
+ */
+ if (records >= nument/2) {
+ c_data->compact_pages_examine++;
+ c_data->compact_pages_free++;
+ COMPACT_TRUNCATE(c_data);
+ }
+ if (ncp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ ncp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ ncp->page = NULL;
+ ncp->pgno = PGNO_INVALID;
+ }
+
+ /*
+	 * If __ham_del_pair freed a page, then we needed to dirty the
+	 * metapage; it could have changed, so copy it back to hcp.
+ */
+ hcp->hdr = ncp->hdr;
+ ncp->hdr = NULL;
+ if ((t_ret = __ham_release_meta(newdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(newdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (records != 0)
+ (*pgs_donep)++;
+ return (ret);
+}
+
+/*
+ * __ham_truncate_overflow -- try to truncate pages from an overflow chain.
+ */
+static int
+__ham_truncate_overflow(dbc, indx, c_data, pgs_done)
+ DBC *dbc;
+ u_int32_t indx;
+ DB_COMPACT *c_data;
+ int *pgs_done;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t origpgno, pgno;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), sizeof(db_pgno_t));
+ if (pgno > c_data->compact_truncate) {
+ c_data->compact_pages_examine++;
+ origpgno = pgno;
+ if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ if ((ret =
+ __db_truncate_root(dbc, hcp->page, indx, &pgno, 0)) != 0)
+ return (ret);
+ if (pgno != origpgno) {
+ memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
+ &pgno, sizeof(db_pgno_t));
+ (*pgs_done)++;
+ c_data->compact_pages--;
+ }
+ }
+ if ((ret = __db_truncate_overflow(dbc, pgno, NULL, c_data)) != 0)
+ return (ret);
+ return (0);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __ham_compact_hash -- compact the hash table.
+ * PUBLIC: int __ham_compact_hash __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ */
+int
+__ham_compact_hash(dbp, ip, txn, c_data)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_COMPACT *c_data;
+{
+ DBC *dbc;
+ DB_LOCK lock;
+ HASH_CURSOR *hcp;
+ HMETA *meta;
+ PAGE *oldpage;
+ db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
+ int flags, local_txn, ret, t_ret;
+ u_int32_t bucket, i, size;
+
+ local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
+ oldpage = NULL;
+ dbc = NULL;
+ LOCK_INIT(lock);
+
+ if (local_txn &&
+ (ret = __txn_begin(dbp->env, ip, txn, &txn, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err1;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_meta(dbc)) != 0 ||
+ (ret = __ham_dirty_meta(dbc, 0)) != 0)
+ goto err1;
+
+ meta = hcp->hdr;
+
+ LOCK_CHECK_OFF(ip);
+
+ /*
+ * Find contiguous lower numbered pages for each hash table segment.
+ */
+ for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++) {
+ if (i == 0) {
+ bucket = 0;
+ size = 1;
+ } else {
+ bucket = 1 << (i - 1);
+ size = bucket;
+ }
+ start_pgno = meta->spares[i] + bucket;
+ if ((ret = __db_find_free(dbc, P_HASH,
+ size, start_pgno, &free_pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ break;
+ ret = 0;
+ continue;
+ }
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_changeslot_log(dbp,
+ dbc->txn, &LSN(meta),
+ 0, &LSN(meta), i, start_pgno, free_pgno)) != 0)
+ break;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ last_pgno = free_pgno + bucket;
+ /*
+		 * March through the list swapping pages. If the page is
+		 * empty, we just need to free it. If we are just sliding
+		 * things down, don't free the pages that will be reused.
+		 * Note that __db_exchange_page returns the new page, so
+		 * we must put it.
+ */
+ for (pgno = start_pgno;
+ pgno < start_pgno + size; pgno++, free_pgno++) {
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &oldpage)) != 0)
+ goto err;
+ if (NUM_ENT(oldpage) != 0) {
+ if (pgno < last_pgno)
+ flags = 0;
+ else
+ flags = DB_EXCH_FREE;
+ if ((ret = __db_exchange_page(dbc,
+ &oldpage, NULL, free_pgno, flags)) != 0)
+ goto err;
+ } else if (pgno >= last_pgno) {
+ if ((ret = __db_free(dbc, oldpage, 0)) != 0)
+ goto err;
+ COMPACT_TRUNCATE(c_data);
+ oldpage = NULL;
+ }
+ if (oldpage != NULL && (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, oldpage, dbc->priority)) != 0)
+ goto err;
+ ret = 0;
+ oldpage = NULL;
+ c_data->compact_pages_examine++;
+ }
+ meta->spares[i] = free_pgno - (size + bucket);
+ }
+ if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
+ PGNO(hcp->hdr) > c_data->compact_truncate)
+ ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr, c_data);
+
+err: if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ LOCK_CHECK_ON(ip);
+err1: if (dbc != NULL) {
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (local_txn && (t_ret = (ret == 0 ?
+ __txn_commit(txn, 0) : __txn_abort(txn))) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+#endif
diff --git a/src/hash/hash_conv.c b/src/hash/hash_conv.c
new file mode 100644
index 00000000..fa084f2a
--- /dev/null
+++ b/src/hash/hash_conv.c
@@ -0,0 +1,110 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/hash.h"
+
+/*
+ * __ham_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __ham_pgin __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__ham_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ h = pp;
+ pginfo = (DB_PGINFO *)cookie->data;
+
+ /*
+ * The hash access method does blind reads of pages, causing them
+	 * to be created. If the type field isn't set, it's one of them;
+	 * initialize the rest of the page and return.
+ */
+ if (h->type != P_HASHMETA && h->pgno == PGNO_INVALID) {
+ P_INIT(pp, (db_indx_t)pginfo->db_pagesize,
+ pg, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ return (0);
+ }
+
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ return (h->type == P_HASHMETA ? __ham_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __ham_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __ham_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__ham_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (h->type == P_HASHMETA ? __ham_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __ham_mswap --
+ * Swap the bytes on the hash metadata page.
+ *
+ * PUBLIC: int __ham_mswap __P((ENV *, void *));
+ */
+int
+__ham_mswap(env, pg)
+ ENV *env;
+ void *pg;
+{
+ u_int8_t *p;
+ int i;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* max_bucket */
+ SWAP32(p); /* high_mask */
+ SWAP32(p); /* low_mask */
+ SWAP32(p); /* ffactor */
+ SWAP32(p); /* nelem */
+ SWAP32(p); /* h_charkey */
+ for (i = 0; i < NCACHED; ++i)
+ SWAP32(p); /* spares */
+ p += 59 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+ return (0);
+}
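+
+/*
+ * For illustration (a sketch of the assumed behavior of SWAP32 from
+ * db_swap.h, not its actual definition): it byte-reverses the word at p
+ * in place and advances p, roughly
+ *
+ *	u_int8_t t;
+ *	t = p[0]; p[0] = p[3]; p[3] = t;
+ *	t = p[1]; p[1] = p[2]; p[2] = t;
+ *	p += sizeof(u_int32_t);
+ *
+ * which is why the loop above can walk the metadata fields in order and
+ * then skip the 59 unused words with an explicit increment.
+ */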
diff --git a/src/hash/hash_dup.c b/src/hash/hash_dup.c
new file mode 100644
index 00000000..879c33d7
--- /dev/null
+++ b/src/hash/hash_dup.c
@@ -0,0 +1,943 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * PACKAGE: hashing
+ *
+ * DESCRIPTION:
+ * Manipulation of duplicates for the hash package.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+static int __hamc_chgpg __P((DBC *,
+ db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+static int __ham_check_move __P((DBC *, u_int32_t));
+static int __ham_dcursor __P((DBC *, db_pgno_t, u_int32_t));
+static int __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t));
+static int __hamc_chgpg_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * Called from hash_access to add a duplicate key. nval is the new
+ * value that we want to add. The flags correspond to the flag values
+ * to cursor_put indicating where to add the new element.
+ * There are 4 cases.
+ * Case 1: The existing duplicate set already resides on a separate page.
+ * We return and let the common code handle this.
+ * Case 2: The element is small enough to just be added to the existing set.
+ * Case 3: The element is large enough to be a big item, so we're going to
+ * have to push the set onto a new page.
+ * Case 4: The element is large enough to push the duplicate set onto a
+ * separate page.
+ *
+ * PUBLIC: int __ham_add_dup __P((DBC *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ham_add_dup(dbc, nval, flags, pgnop)
+ DBC *dbc;
+ DBT *nval;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT pval, tmp_val;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ u_int32_t add_bytes, new_size;
+ int cmp, ret;
+ u_int8_t *hk;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ DB_ASSERT(env, flags != DB_CURRENT);
+
+ add_bytes = nval->size +
+ (F_ISSET(nval, DB_DBT_PARTIAL) ? nval->doff : 0);
+ add_bytes = DUP_SIZE(add_bytes);
+
+ if ((ret = __ham_check_move(dbc, add_bytes)) != 0)
+ return (ret);
+
+ /*
+	 * Check if the resulting duplicate set is going to need to go
+ * onto a separate duplicate page. If so, convert the
+ * duplicate set and add the new one. After conversion,
+ * hcp->dndx is the first free ndx or the index of the
+ * current pointer into the duplicate set.
+ */
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ /* Add the len bytes to the current singleton. */
+ if (HPAGE_PTYPE(hk) != H_DUPLICATE)
+ add_bytes += DUP_SIZE(0);
+ new_size =
+ LEN_HKEYDATA(dbp, hcp->page, dbp->pgsize, H_DATAINDEX(hcp->indx)) +
+ add_bytes;
+
+ /*
+	 * We convert to off-page duplicates if the item is a big item,
+	 * if the addition of the new item will make the set large, or
+	 * if there isn't enough room on this page to add the new item.
+ */
+ if (HPAGE_PTYPE(hk) != H_OFFDUP &&
+ (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hcp, new_size) ||
+ add_bytes > P_FREESPACE(dbp, hcp->page))) {
+
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ /* There are two separate cases here: on page and off page. */
+ if (HPAGE_PTYPE(hk) != H_OFFDUP) {
+ if (HPAGE_PTYPE(hk) != H_DUPLICATE) {
+ pval.flags = 0;
+ pval.data = HKEYDATA_DATA(hk);
+ pval.size = LEN_HDATA(dbp, hcp->page, dbp->pgsize,
+ hcp->indx);
+ if ((ret = __ham_make_dup(env,
+ &pval, &tmp_val, &dbc->my_rdata.data,
+ &dbc->my_rdata.ulen)) != 0 || (ret =
+ __ham_replpair(dbc, &tmp_val, H_DUPLICATE)) != 0)
+ return (ret);
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ HPAGE_PTYPE(hk) = H_DUPLICATE;
+
+ /*
+ * Update the cursor position since we now are in
+ * duplicates.
+ */
+ F_SET(hcp, H_ISDUP);
+ hcp->dup_off = 0;
+ hcp->dup_len = pval.size;
+ hcp->dup_tlen = DUP_SIZE(hcp->dup_len);
+ }
+
+ /* Now make the new entry a duplicate. */
+ if ((ret = __ham_make_dup(env, nval,
+ &tmp_val, &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ return (ret);
+
+ tmp_val.dlen = 0;
+ switch (flags) { /* On page. */
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ if (dbp->dup_compare != NULL) {
+ __ham_dsearch(dbc,
+ nval, &tmp_val.doff, &cmp, flags);
+
+ /*
+ * Duplicate duplicates are not supported w/
+ * sorted dups. We can either overwrite or
+ * return DB_KEYEXIST.
+ */
+ if (cmp == 0) {
+ if (flags == DB_OVERWRITE_DUP)
+ return (__ham_overwrite(dbc,
+ nval, flags));
+ return (__db_duperr(dbp, flags));
+ }
+ } else {
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
+ dbp->pgsize, hcp->indx);
+ hcp->dup_len = nval->size;
+ F_SET(hcp, H_ISDUP);
+ if (flags == DB_KEYFIRST)
+ hcp->dup_off = tmp_val.doff = 0;
+ else
+ hcp->dup_off =
+ tmp_val.doff = hcp->dup_tlen;
+ }
+ break;
+ case DB_BEFORE:
+ tmp_val.doff = hcp->dup_off;
+ break;
+ case DB_AFTER:
+ tmp_val.doff = hcp->dup_off + DUP_SIZE(hcp->dup_len);
+ break;
+ default:
+ return (__db_unknown_path(env, "__ham_add_dup"));
+ }
+
+ /* Add the duplicate. */
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0 ||
+ (ret = __ham_replpair(dbc, &tmp_val, H_DUPLICATE)) != 0)
+ return (ret);
+
+ /* Now, update the cursor if necessary. */
+ switch (flags) {
+ case DB_AFTER:
+ hcp->dup_off += DUP_SIZE(hcp->dup_len);
+ hcp->dup_len = nval->size;
+ hcp->dup_tlen += (db_indx_t)DUP_SIZE(nval->size);
+ break;
+ case DB_BEFORE:
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ hcp->dup_tlen += (db_indx_t)DUP_SIZE(nval->size);
+ hcp->dup_len = nval->size;
+ break;
+ default:
+ return (__db_unknown_path(env, "__ham_add_dup"));
+ }
+ ret = __hamc_update(dbc, tmp_val.size, DB_HAM_CURADJ_ADD, 1);
+ return (ret);
+ }
+
+ /*
+ * If we get here, then we're on duplicate pages; set pgnop and
+ * return so the common code can handle it.
+ */
+ memcpy(pgnop, HOFFDUP_PGNO(H_PAIRDATA(dbp, hcp->page, hcp->indx)),
+ sizeof(db_pgno_t));
+
+ return (ret);
+}
+
+/*
+ * Convert an on-page set of duplicates to an offpage set of duplicates.
+ *
+ * PUBLIC: int __ham_dup_convert __P((DBC *));
+ */
+int
+__ham_dup_convert(dbc)
+ DBC *dbc;
+{
+ BOVERFLOW bo;
+ DB *dbp;
+ DBC **hcs;
+ DBT dbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ HOFFPAGE ho;
+ PAGE *dp;
+ db_indx_t i, len, off;
+ int c, ret, t_ret;
+ u_int8_t *p, *pend;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Create a new page for the duplicates.
+ */
+ if ((ret = __db_new(dbc,
+ dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, NULL, &dp)) != 0)
+ return (ret);
+ P_INIT(dp, dbp->pgsize,
+ dp->pgno, PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+ /*
+ * Get the list of cursors that may need to be updated.
+ */
+ if ((ret = __ham_get_clist(dbp,
+ PGNO(hcp->page), (u_int32_t)hcp->indx, &hcs)) != 0)
+ goto err;
+
+ /*
+ * Now put the duplicates onto the new page.
+ */
+ dbt.flags = 0;
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
+ case H_KEYDATA:
+ /* Simple case, one key on page; move it to dup page. */
+ dbt.size = LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ dbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ ret = __db_pitem(dbc,
+ dp, 0, BKEYDATA_SIZE(dbt.size), NULL, &dbt);
+ goto finish;
+ case H_OFFPAGE:
+ /* Simple case, one key on page; move it to dup page. */
+ memcpy(&ho, P_ENTRY(dbp, hcp->page, H_DATAINDEX(hcp->indx)),
+ HOFFPAGE_SIZE);
+ UMRW_SET(bo.unused1);
+ B_TSET(bo.type, ho.type);
+ UMRW_SET(bo.unused2);
+ bo.pgno = ho.pgno;
+ bo.tlen = ho.tlen;
+ dbt.size = BOVERFLOW_SIZE;
+ dbt.data = &bo;
+
+ ret = __db_pitem(dbc, dp, 0, dbt.size, &dbt, NULL);
+finish: if (ret == 0) {
+ /* Update any other cursors. */
+ if (hcs != NULL && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret = __ham_chgpg_log(dbp, dbc->txn,
+ &lsn, 0, DB_HAM_DUP, PGNO(hcp->page),
+ PGNO(dp), hcp->indx, 0)) != 0)
+ break;
+ }
+ for (c = 0; hcs != NULL && hcs[c] != NULL; c++)
+ if ((ret = __ham_dcursor(hcs[c],
+ PGNO(dp), 0)) != 0)
+ break;
+ }
+ break;
+ case H_DUPLICATE:
+ p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ pend = p +
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+
+ /*
+ * We need to maintain the duplicate cursor position.
+ * Keep track of where we are in the duplicate set via
+ * the offset, and when it matches the one in the cursor,
+ * set the off-page duplicate cursor index to the current
+ * index.
+ */
+ for (off = 0, i = 0; p < pend; i++) {
+ memcpy(&len, p, sizeof(db_indx_t));
+ dbt.size = len;
+ p += sizeof(db_indx_t);
+ dbt.data = p;
+ p += len + sizeof(db_indx_t);
+ if ((ret = __db_pitem(dbc, dp,
+ i, BKEYDATA_SIZE(dbt.size), NULL, &dbt)) != 0)
+ break;
+
+ /* Update any other cursors */
+ if (hcs != NULL && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret = __ham_chgpg_log(dbp, dbc->txn,
+ &lsn, 0, DB_HAM_DUP, PGNO(hcp->page),
+ PGNO(dp), hcp->indx, i)) != 0)
+ break;
+ }
+ for (c = 0; hcs != NULL && hcs[c] != NULL; c++)
+ if (((HASH_CURSOR *)(hcs[c]->internal))->dup_off
+ == off && (ret = __ham_dcursor(hcs[c],
+ PGNO(dp), i)) != 0)
+ goto err;
+ off += len + 2 * sizeof(db_indx_t);
+ }
+ break;
+ default:
+ ret = __db_pgfmt(env, hcp->pgno);
+ break;
+ }
+
+ /*
+ * Now attach this to the source page in place of the old duplicate
+ * item.
+ */
+ if (ret == 0)
+ ret = __memp_dirty(mpf,
+ &hcp->page, dbc->thread_info, dbc->txn, dbc->priority, 0);
+
+ if (ret == 0)
+ ret = __ham_move_offpage(dbc, hcp->page,
+ (u_int32_t)H_DATAINDEX(hcp->indx), PGNO(dp));
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, dp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0)
+ hcp->dup_tlen = hcp->dup_off = hcp->dup_len = 0;
+
+ if (hcs != NULL)
+ __os_free(env, hcs);
+
+ return (ret);
+}
+
+/*
+ * __ham_make_dup
+ *
+ * Take a regular dbt and make it into a duplicate item with all the partial
+ * information set appropriately. If the incoming dbt is a partial, assume
+ * we are creating a new entry and make sure that we do any initial padding.
+ *
+ * PUBLIC: int __ham_make_dup __P((ENV *,
+ * PUBLIC: const DBT *, DBT *d, void **, u_int32_t *));
+ */
+int
+__ham_make_dup(env, notdup, duplicate, bufp, sizep)
+ ENV *env;
+ const DBT *notdup;
+ DBT *duplicate;
+ void **bufp;
+ u_int32_t *sizep;
+{
+ db_indx_t tsize, item_size;
+ int ret;
+ u_int8_t *p;
+
+ item_size = (db_indx_t)notdup->size;
+ if (F_ISSET(notdup, DB_DBT_PARTIAL))
+ item_size += notdup->doff;
+
+ tsize = DUP_SIZE(item_size);
+ if ((ret = __ham_init_dbt(env, duplicate, tsize, bufp, sizep)) != 0)
+ return (ret);
+
+ duplicate->dlen = 0;
+ duplicate->flags = notdup->flags;
+ F_SET(duplicate, DB_DBT_PARTIAL);
+
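+ /*
+ * On-page duplicate elements are length-prefixed and length-
+ * suffixed: [len][data][len]. The trailing copy of the length
+ * is what lets __ham_item_prev walk a duplicate set backward.
+ */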
+ p = duplicate->data;
+ memcpy(p, &item_size, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ if (F_ISSET(notdup, DB_DBT_PARTIAL)) {
+ memset(p, 0, notdup->doff);
+ p += notdup->doff;
+ }
+ memcpy(p, notdup->data, notdup->size);
+ p += notdup->size;
+ memcpy(p, &item_size, sizeof(db_indx_t));
+
+ duplicate->doff = 0;
+ duplicate->dlen = notdup->size;
+
+ return (0);
+}
+
+/*
+ * __ham_check_move --
+ *
+ * Check if we can do whatever we need to on this page. If not,
+ * then we'll have to move the current element to a new page.
+ */
+static int
+__ham_check_move(dbc, add_len)
+ DBC *dbc;
+ u_int32_t add_len;
+{
+ DB *dbp;
+ DBT k, d;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *new_pagep, *next_pagep;
+ db_pgno_t next_pgno;
+ u_int32_t data_type, key_type, new_datalen, old_len;
+ db_indx_t new_indx;
+ u_int8_t *hk;
+ int found, match, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ found = 0;
+
+ /*
+ * If the item is already a set of off-page duplicates or an
+ * off-page item, then we know we can do whatever we need to
+ * do in place.
+ */
+ if (HPAGE_PTYPE(hk) == H_OFFDUP || HPAGE_PTYPE(hk) == H_OFFPAGE)
+ return (0);
+
+ old_len =
+ LEN_HITEM(dbp, hcp->page, dbp->pgsize, H_DATAINDEX(hcp->indx));
+ new_datalen = (old_len - HKEYDATA_SIZE(0)) + add_len;
+ if (HPAGE_PTYPE(hk) != H_DUPLICATE)
+ new_datalen += DUP_SIZE(0);
+
+ /*
+ * We need to add a new page under two conditions:
+ * 1. The addition makes the total data length cross the BIG
+ * threshold and the OFFDUP structure won't fit on this page.
+ * 2. The addition does not make the total data cross the
+ * threshold, but the new data won't fit on the page.
+ * If neither of these is true, then we can return.
+ */
+ if (ISBIG(hcp, new_datalen) && (old_len > HOFFDUP_SIZE ||
+ HOFFDUP_SIZE - old_len <= P_FREESPACE(dbp, hcp->page)))
+ return (0);
+
+ if (!ISBIG(hcp, new_datalen) &&
+ (new_datalen - old_len) <= P_FREESPACE(dbp, hcp->page))
+ return (0);
+
+ /*
+ * If we get here, then we need to move the item to a new page.
+ * Check if there are more pages in the chain. We now need to
+ * update new_datalen to include the size of both the key and
+ * the data that we need to move.
+ */
+
+ new_datalen = ISBIG(hcp, new_datalen) ?
+ HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen);
+ new_datalen +=
+ LEN_HITEM(dbp, hcp->page, dbp->pgsize, H_KEYINDEX(hcp->indx));
+
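+ /*
+ * Walk the bucket's overflow chain looking for the first page
+ * with enough free space for the relocated pair, releasing each
+ * visited page before fetching the next.
+ */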
+ new_pagep = NULL;
+ next_pagep = hcp->page;
+ for (next_pgno = NEXT_PGNO(hcp->page); next_pgno != PGNO_INVALID;
+ next_pgno = NEXT_PGNO(next_pagep)) {
+ if (next_pagep != hcp->page && (ret = __memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf,
+ &next_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE, &next_pagep)) != 0)
+ return (ret);
+
+ if (P_FREESPACE(dbp, next_pagep) >= new_datalen) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found != 0) {
+ /* Found a page with space, dirty it and the original. */
+ new_pagep = next_pagep;
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if ((ret = __memp_dirty(mpf, &new_pagep,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __memp_dirty(mpf, &next_pagep,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Add new page at the end of the chain. */
+ new_pagep = next_pagep;
+ if ((ret = __ham_add_ovflpage(dbc, &new_pagep)) != 0)
+ goto err;
+
+ if (next_pagep != hcp->page) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority)) != 0)
+ goto err;
+ next_pagep = NULL;
+ /* Dirty the original page to update it. */
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ }
+ }
+
+ /* Copy the item to the new page. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&k, 0, sizeof(DBT));
+ d.flags = 0;
+ if (HPAGE_PTYPE(
+ H_PAIRKEY(dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+ k.data = H_PAIRKEY(dbp, hcp->page, hcp->indx);
+ k.size = HOFFPAGE_SIZE;
+ key_type = H_OFFPAGE;
+ } else {
+ k.data =
+ HKEYDATA_DATA(H_PAIRKEY(dbp, hcp->page, hcp->indx));
+ k.size =
+ LEN_HKEY(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ key_type = H_KEYDATA;
+ }
+
+ /* Resolve the insert index so it can be written to the log. */
+ if ((ret = __ham_getindex(dbc, new_pagep, &k,
+ key_type, &match, &new_indx)) != 0)
+ return (ret);
+
+ if ((data_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ d.data = hk;
+ d.size = HOFFPAGE_SIZE;
+ } else if (data_type == H_OFFDUP) {
+ d.data = hk;
+ d.size = HOFFDUP_SIZE;
+ } else {
+ d.data = HKEYDATA_DATA(hk);
+ d.size = LEN_HDATA(dbp,
+ hcp->page, dbp->pgsize, hcp->indx);
+ }
+
+ if ((ret = __ham_insdel_log(dbp, dbc->txn, &new_lsn,
+ 0, PUTPAIR, PGNO(new_pagep), (u_int32_t)new_indx,
+ &LSN(new_pagep), OP_SET(key_type, new_pagep), &k,
+ OP_SET(data_type, new_pagep), &d)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ return (ret);
+ }
+ } else {
+ LSN_NOT_LOGGED(new_lsn);
+ /*
+ * Ensure that an invalid index is passed to __ham_copypair, so
+ * it knows to resolve the index. Resolving the insert index
+ * here would require creating a temporary DBT with the key,
+ * and calling __ham_getindex. Let __ham_copypair do the
+ * resolution using the final key DBT.
+ */
+ new_indx = NDX_INVALID;
+ }
+
+ /* Move lsn onto page. */
+ LSN(new_pagep) = new_lsn; /* Structure assignment. */
+
+ if ((ret = __ham_copypair(dbc, hcp->page,
+ H_KEYINDEX(hcp->indx), new_pagep, &new_indx, 0)) != 0)
+ goto err;
+
+ /* Update all cursors that used to point to this item. */
+ if ((ret = __hamc_chgpg(dbc, PGNO(hcp->page), H_KEYINDEX(hcp->indx),
+ PGNO(new_pagep), new_indx)) != 0)
+ goto err;
+
+ /* Now delete the pair from the current page. */
+ if ((ret = __ham_del_pair(dbc, HAM_DEL_NO_RECLAIM, NULL)) != 0)
+ goto err;
+
+ /*
+ * __ham_del_pair decremented nelem. This is incorrect; we
+ * manually copied the element elsewhere, so the total number
+ * of elements hasn't changed. Increment it again.
+ *
+ * !!!
+ * Note that we still have the metadata page pinned, and
+ * __ham_del_pair dirtied it, so we don't need to set the dirty
+ * flag again.
+ */
+ if (!STD_LOCKING(dbc))
+ hcp->hdr->nelem++;
+
+ ret = __memp_fput(mpf, dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = new_pagep;
+ hcp->pgno = PGNO(hcp->page);
+ hcp->indx = new_indx;
+ F_SET(hcp, H_EXPAND);
+ F_CLR(hcp, H_DELETED);
+
+ return (ret);
+
+err: if (new_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ if (next_pagep != NULL &&
+ next_pagep != hcp->page && next_pagep != new_pagep)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority);
+ return (ret);
+}
+
+/*
+ * __ham_move_offpage --
+ * Replace an onpage set of duplicates with the OFFDUP structure
+ * that references the duplicate page.
+ *
+ * XXX
+ * This is really just a special case of __onpage_replace; we should
+ * probably combine them.
+ *
+ */
+static int
+__ham_move_offpage(dbc, pagep, ndx, pgno)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t ndx;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT new_dbt;
+ DBT old_dbt;
+ HOFFDUP od;
+ db_indx_t i, *inp;
+ int32_t difflen;
+ u_int8_t *src;
+ int ret;
+
+ dbp = dbc->dbp;
+ od.type = H_OFFDUP;
+ UMRW_SET(od.unused[0]);
+ UMRW_SET(od.unused[1]);
+ UMRW_SET(od.unused[2]);
+ od.pgno = pgno;
+ ret = 0;
+
+ if (DBC_LOGGING(dbc)) {
+ HKEYDATA *hk;
+ new_dbt.data = &od;
+ new_dbt.size = HOFFDUP_SIZE;
+ hk = (HKEYDATA *)P_ENTRY(dbp, pagep, ndx);
+ if (hk->type == H_KEYDATA || hk->type == H_DUPLICATE) {
+ old_dbt.data = hk->data;
+ old_dbt.size = LEN_HITEM(dbp, pagep, dbp->pgsize, ndx) -
+ SSZA(HKEYDATA, data);
+ } else {
+ old_dbt.data = hk;
+ old_dbt.size = LEN_HITEM(dbp, pagep, dbp->pgsize, ndx);
+ }
+ if ((ret = __ham_replace_log(dbp, dbc->txn, &LSN(pagep), 0,
+ PGNO(pagep), (u_int32_t)ndx, &LSN(pagep), -1,
+ OP_SET(hk->type, pagep), &old_dbt,
+ OP_SET(H_OFFDUP, pagep), &new_dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ /*
+ * difflen is the difference in the lengths, and so may be negative.
+ * We know that the difference between two unsigned lengths from a
+ * database page will fit into an int32_t.
+ */
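+ /*
+ * Items are packed at the high end of the page, growing down
+ * toward the index table; HOFFSET marks the low edge of the
+ * packed region. Shift the region by difflen and adjust the
+ * affected index entries to keep the packing dense.
+ */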
+ difflen =
+ (int32_t)LEN_HITEM(dbp, pagep, dbp->pgsize, ndx) -
+ (int32_t)HOFFDUP_SIZE;
+ if (difflen != 0) {
+ /* Copy data. */
+ inp = P_INP(dbp, pagep);
+ src = (u_int8_t *)(pagep) + HOFFSET(pagep);
+ memmove(src + difflen, src, inp[ndx] - HOFFSET(pagep));
+ HOFFSET(pagep) += difflen;
+
+ /* Update index table. */
+ for (i = ndx; i < NUM_ENT(pagep); i++)
+ inp[i] += difflen;
+ }
+
+ /* Now copy the offdup entry onto the page. */
+ memcpy(P_ENTRY(dbp, pagep, ndx), &od, HOFFDUP_SIZE);
+ return (ret);
+}
+
+/*
+ * __ham_dsearch:
+ * Locate a particular duplicate in a duplicate set. Make sure that
+ * we exit with the cursor set appropriately.
+ *
+ * PUBLIC: void __ham_dsearch
+ * PUBLIC: __P((DBC *, DBT *, u_int32_t *, int *, u_int32_t));
+ */
+void
+__ham_dsearch(dbc, dbt, offp, cmpp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t *offp, flags;
+ int *cmpp;
+{
+ DB *dbp;
+ DBT cur;
+ HASH_CURSOR *hcp;
+ db_indx_t i, len;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int8_t *data;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ func = dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare;
+
+ i = F_ISSET(hcp, H_CONTINUE) ? hcp->dup_off: 0;
+ data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)) + i;
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ len = hcp->dup_len;
+ while (i < hcp->dup_tlen) {
+ memcpy(&len, data, sizeof(db_indx_t));
+ data += sizeof(db_indx_t);
+ DB_SET_DBT(cur, data, len);
+
+ /*
+ * If we find an exact match, we're done. If in a sorted
+ * duplicate set and the item is larger than our test item,
+ * we're done. In the latter case, if permitting partial
+ * matches, it's not a failure.
+ */
+ *cmpp = func(dbp, dbt, &cur);
+ if (*cmpp == 0)
+ break;
+ if (*cmpp < 0 && dbp->dup_compare != NULL) {
+ if (flags == DB_GET_BOTH_RANGE)
+ *cmpp = 0;
+ break;
+ }
+
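+ /*
+ * i is the offset of the next element's leading length field,
+ * so it advances by the full DUP_SIZE(len); data has already
+ * been moved past the leading length above, so it skips only
+ * the data bytes and the trailing length.
+ */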
+ i += len + 2 * sizeof(db_indx_t);
+ data += len + sizeof(db_indx_t);
+ }
+
+ *offp = i;
+ hcp->dup_off = i;
+ hcp->dup_len = len;
+ F_SET(hcp, H_ISDUP);
+}
+
+/*
+ * __ham_dcursor --
+ *
+ * Create an off page duplicate cursor for this cursor.
+ */
+static int
+__ham_dcursor(dbc, pgno, indx)
+ DBC *dbc;
+ db_pgno_t pgno;
+ u_int32_t indx;
+{
+ BTREE_CURSOR *dcp;
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __dbc_newopd(dbc, pgno, hcp->opd, &hcp->opd)) != 0)
+ return (ret);
+
+ dcp = (BTREE_CURSOR *)hcp->opd->internal;
+ dcp->pgno = pgno;
+ dcp->indx = indx;
+
+ if (dbp->dup_compare == NULL) {
+ /*
+ * Converting to off-page Recno trees is tricky. The
+ * record number for the cursor is the index + 1 (to
+ * convert to 1-based record numbers).
+ */
+ dcp->recno = indx + 1;
+ }
+
+ /*
+ * Transfer the deleted flag from the top-level cursor to the
+ * created one.
+ */
+ if (F_ISSET(hcp, H_DELETED)) {
+ F_SET(dcp, C_DELETED);
+ F_CLR(hcp, H_DELETED);
+ }
+
+ return (0);
+}
+
+struct __hamc_chgpg_args {
+ db_pgno_t new_pgno;
+ db_indx_t new_index;
+ DB_TXN *my_txn;
+};
+
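+/*
+ * Callback for __db_walk_cursors: invoked once per open cursor on the
+ * database, with vargs pointing at a __hamc_chgpg_args carrying the
+ * item's new location.
+ */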
+static int
+__hamc_chgpg_func(cp, my_dbc, foundp, old_pgno, old_index, vargs)
+ DBC *cp, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t old_pgno;
+ u_int32_t old_index;
+ void *vargs;
+{
+ HASH_CURSOR *hcp;
+ struct __hamc_chgpg_args *args;
+
+ if (cp == my_dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)cp->internal;
+
+ /*
+ * If a cursor has been marked deleted, it doesn't refer to this
+ * item--it just happens to have the same indx, but it points to
+ * a former neighbor. Don't move it.
+ */
+ if (F_ISSET(hcp, H_DELETED))
+ return (0);
+
+ args = vargs;
+
+ if (hcp->pgno == old_pgno &&
+ hcp->indx == old_index &&
+ !MVCC_SKIP_CURADJ(cp, old_pgno)) {
+ hcp->pgno = args->new_pgno;
+ hcp->indx = args->new_index;
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+
+/*
+ * __hamc_chgpg --
+ * Adjust the cursors after moving an item to a new page. We only
+ * move cursors that are pointing at this one item and are not
+ * deleted; since we only touch non-deleted cursors, and since
+ * (by definition) no item existed at the pgno/indx we're moving the
+ * item to, we're guaranteed that all the cursors we affect here or
+ * on abort really do refer to this one item.
+ */
+static int
+__hamc_chgpg(dbc, old_pgno, old_index, new_pgno, new_index)
+ DBC *dbc;
+ db_pgno_t old_pgno, new_pgno;
+ u_int32_t old_index, new_index;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+ struct __hamc_chgpg_args args;
+
+ dbp = dbc->dbp;
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.new_pgno = new_pgno;
+ args.new_index = new_index;
+
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_chgpg_func, &found, old_pgno, old_index, &args)) != 0)
+ return (ret);
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_chgpg_log(dbp,
+ args.my_txn, &lsn, 0, DB_HAM_CHGPG,
+ old_pgno, new_pgno, old_index, new_index)) != 0)
+ return (ret);
+ }
+ return (0);
+}
diff --git a/src/hash/hash_func.c b/src/hash/hash_func.c
new file mode 100644
index 00000000..baf6061c
--- /dev/null
+++ b/src/hash/hash_func.c
@@ -0,0 +1,240 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * __ham_func2 --
+ * Phong Vo's linear congruential hash.
+ *
+ * PUBLIC: u_int32_t __ham_func2 __P((DB *, const void *, u_int32_t));
+ */
+#define DCHARHASH(h, c) ((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c))
+
+u_int32_t
+__ham_func2(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *e, *k;
+ u_int32_t h;
+ u_int8_t c;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ k = key;
+ e = k + len;
+ for (h = 0; k != e;) {
+ c = *k++;
+ if (!c && k > e)
+ break;
+ DCHARHASH(h, c);
+ }
+ return (h);
+}
+
+/*
+ * __ham_func3 --
+ * Ozan Yigit's original sdbm hash.
+ *
+ * Ugly, but fast. Break the string up into 8 byte units. On the first time
+ * through the loop get the "leftover bytes" (strlen % 8). On every other
+ * iteration, perform 8 HASHC's so we handle all 8 bytes. Essentially, this
+ * saves us 7 cmp & branch instructions.
+ *
+ * PUBLIC: u_int32_t __ham_func3 __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_func3(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *k;
+ u_int32_t n, loop;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ if (len == 0)
+ return (0);
+
+#define HASHC n = *k++ + 65599 * n
+ n = 0;
+ k = key;
+
+ loop = (len + 8 - 1) >> 3;
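+ /*
+ * The switch that falls into the middle of the do/while loop is
+ * a Duff's device: the first pass consumes the len % 8 leftover
+ * bytes, every later pass consumes a full stride of 8.
+ */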
+ switch (len & (8 - 1)) {
+ case 0:
+ do {
+ HASHC;
+ case 7:
+ HASHC;
+ case 6:
+ HASHC;
+ case 5:
+ HASHC;
+ case 4:
+ HASHC;
+ case 3:
+ HASHC;
+ case 2:
+ HASHC;
+ case 1:
+ HASHC;
+ } while (--loop);
+ }
+ return (n);
+}
+
+/*
+ * __ham_func4 --
+ * Chris Torek's hash function. Although this function performs only
+ * slightly worse than __ham_func5 on strings, it performs horribly on
+ * numbers.
+ *
+ * PUBLIC: u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_func4(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *k;
+ u_int32_t h, loop;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ if (len == 0)
+ return (0);
+
+#define HASH4a h = (h << 5) - h + *k++;
+#define HASH4b h = (h << 5) + h + *k++;
+#define HASH4 HASH4b
+ h = 0;
+ k = key;
+
+ loop = (len + 8 - 1) >> 3;
+ switch (len & (8 - 1)) {
+ case 0:
+ do {
+ HASH4;
+ case 7:
+ HASH4;
+ case 6:
+ HASH4;
+ case 5:
+ HASH4;
+ case 4:
+ HASH4;
+ case 3:
+ HASH4;
+ case 2:
+ HASH4;
+ case 1:
+ HASH4;
+ } while (--loop);
+ }
+ return (h);
+}
+
+/*
+ * Fowler/Noll/Vo hash
+ *
+ * The basis of the hash algorithm was taken from an idea sent by email to the
+ * IEEE Posix P1003.2 mailing list from Phong Vo (kpv@research.att.com) and
+ * Glenn Fowler (gsf@research.att.com). Landon Curt Noll (chongo@toad.com)
+ * later improved on their algorithm.
+ *
+ * The magic is in the interesting relationship between the special prime
+ * 16777619 (2^24 + 403) and 2^32 and 2^8.
+ *
+ * This hash produces the fewest collisions of any function that we've seen so
+ * far, and works well on both numbers and strings.
+ *
+ * PUBLIC: u_int32_t __ham_func5 __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_func5(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *k, *e;
+ u_int32_t h;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ k = key;
+ e = k + len;
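+ /*
+ * Multiply-then-xor is the FNV-1 ordering; note the hash is
+ * seeded with 0 here rather than the standard FNV offset basis.
+ */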
+ for (h = 0; k < e; ++k) {
+ h *= 16777619;
+ h ^= *k;
+ }
+ return (h);
+}
+
+/*
+ * __ham_test --
+ *
+ * PUBLIC: u_int32_t __ham_test __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_test(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(len, 0);
+ return ((u_int32_t)*(char *)key);
+}
diff --git a/src/hash/hash_meta.c b/src/hash/hash_meta.c
new file mode 100644
index 00000000..d9a35cb4
--- /dev/null
+++ b/src/hash/hash_meta.c
@@ -0,0 +1,170 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * Acquire the meta-data page.
+ *
+ * PUBLIC: int __ham_get_meta __P((DBC *));
+ */
+int
+__ham_get_meta(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ u_int32_t revision;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hashp = dbp->h_internal;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+again:
+ revision = hashp->revision;
+ if ((ret = __db_lget(dbc, 0,
+ hashp->meta_pgno, DB_LOCK_READ, 0, &hcp->hlock)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf, &hashp->meta_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_CREATE, &hcp->hdr)) != 0) {
+ (void)__LPUT(dbc, hcp->hlock);
+ return (ret);
+ }
+
+ if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ (revision != dbp->mpf->mfp->revision ||
+ (TYPE(hcp->hdr) != P_HASHMETA &&
+ !IS_RECOVERING(dbp->env) && !F_ISSET(dbp, DB_AM_RECOVER)))) {
+ ret = __LPUT(dbc, hcp->hlock);
+ t_ret =
+ __memp_fput(mpf, dbc->thread_info, hcp->hdr, dbc->priority);
+ hcp->hdr = NULL;
+ if (ret != 0)
+ return (ret);
+ if (t_ret != 0)
+ return (t_ret);
+ if ((ret = __db_reopen(dbc)) != 0)
+ return (ret);
+ goto again;
+ }
+
+ return (ret);
+}
+
+/*
+ * Release the meta-data page.
+ *
+ * PUBLIC: int __ham_release_meta __P((DBC *));
+ */
+int
+__ham_release_meta(dbc)
+ DBC *dbc;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ mpf = dbc->dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (hcp->hdr != NULL) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->hdr, dbc->priority)) != 0)
+ return (ret);
+ hcp->hdr = NULL;
+ }
+
+ ret = __TLPUT(dbc, hcp->hlock);
+ hcp->hlock.mode = DB_LOCK_NG;
+ return (ret);
+}
+
+/*
+ * Mark the meta-data page dirty.
+ *
+ * PUBLIC: int __ham_dirty_meta __P((DBC *, u_int32_t));
+ */
+int
+__ham_dirty_meta(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ dbc = dbc->internal->pdbc;
+ hashp = dbc->dbp->h_internal;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->hlock.mode == DB_LOCK_WRITE)
+ return (0);
+
+ mpf = dbc->dbp->mpf;
+
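+ /*
+ * Try to upgrade to the write lock without blocking while we
+ * still hold the meta page pinned. If the lock isn't
+ * immediately available, drop the page first, wait for the
+ * write lock, and then re-fetch the page dirty.
+ */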
+ if ((ret = __db_lget(dbc, LCK_COUPLE, hashp->meta_pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &hcp->hlock)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED && ret != DB_LOCK_DEADLOCK)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->hdr, dbc->priority)) != 0)
+ return (ret);
+ hcp->hdr = NULL;
+ if ((ret = __db_lget(dbc, LCK_COUPLE, hashp->meta_pgno,
+ DB_LOCK_WRITE, 0, &hcp->hlock)) != 0)
+ return (ret);
+ ret = __memp_fget(mpf, &hashp->meta_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &hcp->hdr);
+ return (ret);
+ }
+
+ return (__memp_dirty(mpf,
+ &hcp->hdr, dbc->thread_info, dbc->txn, dbc->priority, flags));
+}
+
+/*
+ * Return the meta data page if it is saved in the cursor.
+ *
+ * PUBLIC: int __ham_return_meta __P((DBC *, u_int32_t, DBMETA **));
+ */
+int
+__ham_return_meta(dbc, flags, metap)
+ DBC *dbc;
+ u_int32_t flags;
+ DBMETA **metap;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ *metap = NULL;
+ if (F_ISSET(dbc, DBC_OPD))
+ dbc = dbc->internal->pdbc;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->hdr == NULL || PGNO(hcp->hdr) != PGNO_BASE_MD)
+ return (0);
+
+ if (LF_ISSET(DB_MPOOL_DIRTY) &&
+ (ret = __ham_dirty_meta(dbc, flags)) != 0)
+ return (ret);
+
+ *metap = (DBMETA *)hcp->hdr;
+ return (0);
+}
diff --git a/src/hash/hash_method.c b/src/hash/hash_method.c
new file mode 100644
index 00000000..1da81e70
--- /dev/null
+++ b/src/hash/hash_method.c
@@ -0,0 +1,250 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+static int __ham_set_h_ffactor __P((DB *, u_int32_t));
+static int __ham_get_h_hash
+ __P((DB *, u_int32_t(**)(DB *, const void *, u_int32_t)));
+static int __ham_set_h_hash
+ __P((DB *, u_int32_t(*)(DB *, const void *, u_int32_t)));
+static int __ham_set_h_nelem __P((DB *, u_int32_t));
+
+static int __ham_get_h_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+
+/*
+ * __ham_db_create --
+ * Hash specific initialization of the DB structure.
+ *
+ * PUBLIC: int __ham_db_create __P((DB *));
+ */
+int
+__ham_db_create(dbp)
+ DB *dbp;
+{
+ HASH *hashp;
+ int ret;
+
+ if ((ret = __os_malloc(dbp->env,
+ sizeof(HASH), &dbp->h_internal)) != 0)
+ return (ret);
+
+ hashp = dbp->h_internal;
+
+ hashp->h_nelem = 0; /* Defaults. */
+ hashp->h_ffactor = 0;
+ hashp->h_hash = NULL;
+ hashp->h_compare = NULL;
+
+ dbp->get_h_ffactor = __ham_get_h_ffactor;
+ dbp->set_h_ffactor = __ham_set_h_ffactor;
+ dbp->get_h_hash = __ham_get_h_hash;
+ dbp->set_h_hash = __ham_set_h_hash;
+ dbp->get_h_compare = __ham_get_h_compare;
+ dbp->set_h_compare = __ham_set_h_compare;
+ dbp->get_h_nelem = __ham_get_h_nelem;
+ dbp->set_h_nelem = __ham_set_h_nelem;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_db_close __P((DB *));
+ */
+int
+__ham_db_close(dbp)
+ DB *dbp;
+{
+ if (dbp->h_internal == NULL)
+ return (0);
+ __os_free(dbp->env, dbp->h_internal);
+ dbp->h_internal = NULL;
+ return (0);
+}
+
+/*
+ * __ham_get_h_ffactor --
+ *
+ * PUBLIC: int __ham_get_h_ffactor __P((DB *, u_int32_t *));
+ */
+int
+__ham_get_h_ffactor(dbp, h_ffactorp)
+ DB *dbp;
+ u_int32_t *h_ffactorp;
+{
+ HASH *hashp;
+
+ hashp = dbp->h_internal;
+ *h_ffactorp = hashp->h_ffactor;
+ return (0);
+}
+
+/*
+ * __ham_set_h_ffactor --
+ * Set the fill factor.
+ */
+static int
+__ham_set_h_ffactor(dbp, h_ffactor)
+ DB *dbp;
+ u_int32_t h_ffactor;
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_ffactor");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ hashp->h_ffactor = h_ffactor;
+ return (0);
+}
+
+/*
+ * __ham_get_h_hash --
+ * Get the hash function.
+ */
+static int
+__ham_get_h_hash(dbp, funcp)
+ DB *dbp;
+ u_int32_t (**funcp) __P((DB *, const void *, u_int32_t));
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ if (funcp != NULL)
+ *funcp = hashp->h_hash;
+ return (0);
+}
+
+/*
+ * __ham_set_h_hash --
+ * Set the hash function.
+ */
+static int
+__ham_set_h_hash(dbp, func)
+ DB *dbp;
+ u_int32_t (*func) __P((DB *, const void *, u_int32_t));
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_hash");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ hashp->h_hash = func;
+ return (0);
+}
+
+/*
+ * __ham_get_h_compare --
+ * Get the comparison function.
+ */
+static int
+__ham_get_h_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ HASH *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ t = dbp->h_internal;
+ if (funcp != NULL)
+ *funcp = t->h_compare;
+
+ return (0);
+}
+
+/*
+ * __ham_set_h_compare --
+ * Set the comparison function.
+ *
+ * PUBLIC: int __ham_set_h_compare
+ * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ */
+int
+__ham_set_h_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ HASH *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ t = dbp->h_internal;
+
+ t->h_compare = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_h_nelem --
+ *
+ * PUBLIC: int __ham_get_h_nelem __P((DB *, u_int32_t *));
+ */
+int
+__ham_get_h_nelem(dbp, h_nelemp)
+ DB *dbp;
+ u_int32_t *h_nelemp;
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ *h_nelemp = hashp->h_nelem;
+ return (0);
+}
+
+/*
+ * __ham_set_h_nelem --
+ * Set the table size.
+ */
+static int
+__ham_set_h_nelem(dbp, h_nelem)
+ DB *dbp;
+ u_int32_t h_nelem;
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_nelem");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ hashp->h_nelem = h_nelem;
+ return (0);
+}
+
+/*
+ * __ham_copy_config
+ * Copy the configuration of one DB handle to another.
+ * PUBLIC: void __ham_copy_config __P((DB *, DB*, u_int32_t));
+ */
+void
+__ham_copy_config(src, dst, nparts)
+ DB *src, *dst;
+ u_int32_t nparts;
+{
+ HASH *s, *d;
+
+ s = src->h_internal;
+ d = dst->h_internal;
+
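+ /* Each partition expects an equal share of the element estimate. */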
+ d->h_ffactor = s->h_ffactor;
+ d->h_nelem = s->h_nelem / nparts;
+ d->h_hash = s->h_hash;
+ d->h_compare = s->h_compare;
+}
diff --git a/src/hash/hash_open.c b/src/hash/hash_open.c
new file mode 100644
index 00000000..3d0bb220
--- /dev/null
+++ b/src/hash/hash_open.c
@@ -0,0 +1,584 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+
+static db_pgno_t __ham_init_meta __P((DB *, HMETA *, db_pgno_t, DB_LSN *));
+
+/*
+ * __ham_open --
+ *
+ * PUBLIC: int __ham_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char * name, db_pgno_t, u_int32_t));
+ */
+int
+__ham_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DBMETA *dbmeta;
+ ENV *env;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ env = dbp->env;
+ dbc = NULL;
+
+ /*
+ * Get a cursor. If DB_CREATE is specified, we may be creating
+ * pages, and to do that safely in CDB we need a write cursor.
+ * In STD_LOCKING mode, we'll synchronize using the meta page
+ * lock instead.
+ */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc,
+ (LF_ISSET(DB_CREATE) && CDB_LOCKING(env) ? DB_WRITECURSOR : 0) |
+ (F_ISSET(dbp, DB_AM_RECOVER) ? DB_RECOVER : 0))) != 0)
+ return (ret);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hashp = dbp->h_internal;
+ hashp->meta_pgno = base_pgno;
+ hashp->revision = dbp->mpf->mfp->revision;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err;
+
+ /* Initialize the hdr structure. */
+ dbmeta = &hcp->hdr->dbmeta;
+ if (dbmeta->magic == DB_HASHMAGIC) {
+ /* File exists, verify the data in the header. */
+ if (hashp->h_hash == NULL)
+ hashp->h_hash = dbmeta->version < 5
+ ? __ham_func4 : __ham_func5;
+ hashp->h_nelem = hcp->hdr->nelem;
+ if (F_ISSET(dbmeta, DB_HASH_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ if (F_ISSET(dbmeta, DB_HASH_DUPSORT))
+ F_SET(dbp, DB_AM_DUPSORT);
+ if (F_ISSET(dbmeta, DB_HASH_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ if (PGNO(hcp->hdr) == PGNO_BASE_MD &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ (txn == NULL || !F_ISSET(txn, TXN_SNAPSHOT)) && (ret =
+ __memp_set_last_pgno(dbp->mpf, dbmeta->last_pgno)) != 0)
+ goto err;
+ } else if (!IS_RECOVERING(env) && !F_ISSET(dbp, DB_AM_RECOVER)) {
+ __db_errx(env, DB_STR_A("1124",
+ "%s: Invalid hash meta page %lu", "%s %lu"),
+ name, (u_long)base_pgno);
+ ret = EINVAL;
+ }
+
+ /* Release the meta data page */
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ham_metachk --
+ *
+ * PUBLIC: int __ham_metachk __P((DB *, const char *, HMETA *));
+ */
+int
+__ham_metachk(dbp, name, hashm)
+ DB *dbp;
+ const char *name;
+ HMETA *hashm;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * At this point, all we know is that the magic number is for a Hash.
+ * Check the version; the database may be out of date.
+ */
+ vers = hashm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 4:
+ case 5:
+ case 6:
+ __db_errx(env, DB_STR_A("1125",
+ "%s: hash version %lu requires a version upgrade",
+ "%s %lu"), name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 7:
+ case 8:
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1126",
+ "%s: unsupported hash version: %lu", "%s %lu"),
+ name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __ham_mswap(env, (PAGE *)hashm)) != 0)
+ return (ret);
+
+ /* Check the type. */
+ if (dbp->type != DB_HASH && dbp->type != DB_UNKNOWN)
+ return (EINVAL);
+ dbp->type = DB_HASH;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ /*
+ * Check application info against metadata info, and set info, flags,
+ * and type based on metadata info.
+ */
+ if ((ret = __db_fchk(env,
+ "DB->open", hashm->dbmeta.flags,
+ DB_HASH_DUP | DB_HASH_SUBDB | DB_HASH_DUPSORT)) != 0)
+ return (ret);
+
+ if (F_ISSET(&hashm->dbmeta, DB_HASH_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ else
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR_A("1127",
+ "%s: DB_DUP specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&hashm->dbmeta, DB_HASH_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ else
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env, DB_STR_A("1128",
+ "%s: multiple databases specified but not supported in file",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&hashm->dbmeta, DB_HASH_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ } else
+ if (dbp->dup_compare != NULL) {
+ __db_errx(env, DB_STR_A("1129",
+ "%s: duplicate sort function specified but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ /* Set the page size. */
+ dbp->pgsize = hashm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+}
+
+/*
+ * __ham_init_meta --
+ *
+ * Initialize a hash meta-data page. We assume that the meta-data page is
+ * contiguous with the initial buckets that we create. If that turns out
+ * to be false, we'll fix it up later. Return the initial number of buckets
+ * allocated.
+ */
+static db_pgno_t
+__ham_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ HMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+#ifdef HAVE_PARTITION
+ DB_PARTITION *part;
+#endif
+ ENV *env;
+ HASH *hashp;
+ db_pgno_t nbuckets;
+ u_int i, l2;
+
+ env = dbp->env;
+ hashp = dbp->h_internal;
+
+ if (hashp->h_hash == NULL)
+ hashp->h_hash = DB_HASHVERSION < 5 ? __ham_func4 : __ham_func5;
+
+ if (hashp->h_nelem != 0 && hashp->h_ffactor != 0) {
+ nbuckets = (hashp->h_nelem - 1) / hashp->h_ffactor + 1;
+ l2 = __db_log2(nbuckets > 2 ? nbuckets : 2);
+ } else
+ l2 = 1;
+
+ /* Now make number of buckets a power of two. */
+ nbuckets = (db_pgno_t)(1 << l2);
+
+ memset(meta, 0, sizeof(HMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_HASHMAGIC;
+ meta->dbmeta.version = DB_HASHVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_HASHMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = pgno;
+ meta->max_bucket = nbuckets - 1;
+ meta->high_mask = nbuckets - 1;
+ meta->low_mask = (nbuckets >> 1) - 1;
+ meta->ffactor = hashp->h_ffactor;
+ meta->nelem = hashp->h_nelem;
+ meta->h_charkey = hashp->h_hash(dbp, CHARKEY, sizeof(CHARKEY));
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, DB_HASH_DUP);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, DB_HASH_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, DB_HASH_DUPSORT);
+
+#ifdef HAVE_PARTITION
+ if ((part = dbp->p_internal) != NULL) {
+ meta->dbmeta.nparts = part->nparts;
+ if (F_ISSET(part, PART_CALLBACK))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK);
+ if (F_ISSET(part, PART_RANGE))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE);
+ }
+#endif
+
+ /*
+ * Create the first and second bucket pages so that we have their
+ * page numbers and can store the starting page number in the
+ * meta-data header (spares[0]).
+ */
+ meta->spares[0] = pgno + 1;
+
+ /* Fill in the last fields of the meta data page. */
+ for (i = 1; i <= l2; i++)
+ meta->spares[i] = meta->spares[0];
+ for (; i < NCACHED; i++)
+ meta->spares[i] = PGNO_INVALID;
+
+ return (nbuckets);
+}
+
+/*
+ * __ham_new_file --
+ * Create the necessary pages to begin a new database file. If name
+ * is NULL, then this is an unnamed file, the mpf has been set in the dbp
+ * and we simply create the pages using mpool. In this case, we don't log
+ * because we never have to redo an unnamed create and the undo simply
+ * frees resources.
+ *
+ * This code appears more complex than it is because of the two cases (named
+ * and unnamed). The way to read the code is that for each page being created,
+ * there are three parts: 1) a "get page" chunk (which either uses malloc'd
+ * memory or calls __memp_fget), 2) the initialization, and 3) the "put page"
+ * chunk which either does a fop write or an __memp_fput.
+ *
+ * PUBLIC: int __ham_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__ham_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ HMETA *meta;
+ PAGE *page;
+ int ret;
+ db_pgno_t lpgno;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ meta = NULL;
+ page = NULL;
+ buf = NULL;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build meta-data page. */
+ lpgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &lpgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ lpgno = __ham_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->dbmeta.last_pgno = lpgno;
+ if ((ret = __db_log_page(dbp,
+ txn, &lsn, meta->dbmeta.pgno, (PAGE *)meta)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Allocate the final hash bucket. */
+ if ((ret = __memp_fget(mpf, &lpgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &page)) != 0)
+ goto err;
+ P_INIT(page,
+ dbp->pgsize, lpgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN_NOT_LOGGED(page->lsn);
+ if ((ret =
+ __db_log_page(dbp, txn, &page->lsn, lpgno, page)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, page, dbp->priority);
+ page = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.type = dbp->type;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(dbp->env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (HMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ lpgno = __ham_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->dbmeta.last_pgno = lpgno;
+ if ((ret =
+ __db_pgout(env->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Allocate the final hash bucket. */
+#ifdef DIAGNOSTIC
+ memset(buf, 0, dbp->pgsize);
+#endif
+ page = (PAGE *)buf;
+ P_INIT(page,
+ dbp->pgsize, lpgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN_NOT_LOGGED(page->lsn);
+ if ((ret = __db_pgout(env->dbenv, lpgno, buf, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, lpgno, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ page = NULL;
+ }
+
+err: if (buf != NULL)
+ __os_free(env, buf);
+ else {
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, dbp->priority);
+ if (page != NULL)
+ (void)__memp_fput(mpf, ip, page, dbp->priority);
+ }
+ return (ret);
+}
+
+/*
+ * __ham_new_subdb --
+ * Create the necessary pages to begin a new subdatabase.
+ *
+ * PUBLIC: int __ham_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__ham_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ DBMETA *mmeta;
+ DB_LOCK lock, metalock, mmlock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HMETA *meta;
+ PAGE *h;
+ int i, ret, t_ret;
+ db_pgno_t lpgno, mpgno;
+
+ env = mdbp->env;
+ mpf = mdbp->mpf;
+ dbc = NULL;
+ meta = NULL;
+ mmeta = NULL;
+ LOCK_INIT(lock);
+ LOCK_INIT(metalock);
+ LOCK_INIT(mmlock);
+
+ if ((ret = __db_cursor(mdbp, ip, txn,
+ &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /* Get and lock the new meta data page. */
+ if ((ret = __db_lget(dbc,
+ 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno, ip, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ /* Initialize the new meta-data page. */
+ lsn = meta->dbmeta.lsn;
+ lpgno = __ham_init_meta(dbp, meta, dbp->meta_pgno, &lsn);
+
+ /*
+ * We are about to allocate a set of contiguous buckets (lpgno
+ * worth). We need to get the master meta-data page to figure
+ * out where these pages are and to allocate them. So, lock and
+ * get the master meta data page.
+ */
+ mpgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, 0, mpgno, DB_LOCK_WRITE, 0, &mmlock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &mpgno, ip, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &mmeta)) != 0)
+ goto err;
+
+ /*
+ * Now update the hash meta-data page to reflect where the first
+ * set of buckets are actually located.
+ */
+ meta->spares[0] = mmeta->last_pgno + 1;
+ for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++)
+ meta->spares[i] = meta->spares[0];
+
+ /* The new meta data page is now complete; log it. */
+ if ((ret = __db_log_page(mdbp,
+ txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0)
+ goto err;
+
+ /* Reflect the group allocation. */
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ )
+ if ((ret = __ham_groupalloc_log(mdbp, txn,
+ &LSN(mmeta), 0, &LSN(mmeta), meta->spares[0],
+ meta->max_bucket + 1, 0, mmeta->last_pgno)) != 0)
+ goto err;
+
+ /* Release the new meta-data page. */
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
+ lpgno += mmeta->last_pgno;
+
+ /* Now allocate the final hash bucket. */
+ if ((ret = __db_lget(dbc, 0, lpgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &lpgno, ip, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ mmeta->last_pgno = lpgno;
+ P_INIT(h, dbp->pgsize, lpgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN(h) = LSN(mmeta);
+ if ((ret = __memp_fput(mpf, ip, h, dbc->priority)) != 0)
+ goto err;
+
+err: /* Now put the master-metadata page back. */
+ if (mmeta != NULL && (t_ret = __memp_fput(mpf,
+ ip, mmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, mmlock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/hash/hash_page.c b/src/hash/hash_page.c
new file mode 100644
index 00000000..7576fe61
--- /dev/null
+++ b/src/hash/hash_page.c
@@ -0,0 +1,3182 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * PACKAGE: hashing
+ *
+ * DESCRIPTION:
+ * Page manipulation for hashing package.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __hamc_delpg
+ __P((DBC *, db_pgno_t, db_pgno_t, u_int32_t, db_ham_mode, u_int32_t *));
+static int __ham_getindex_sorted
+ __P((DBC *, PAGE *, const DBT *, u_int32_t, int *, db_indx_t *));
+static int __ham_getindex_unsorted
+ __P((DBC *, PAGE *, const DBT *, int *, db_indx_t *));
+static int __hamc_delpg_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __hamc_delpg_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED)) {
+ __db_errx(dbp->env, DB_STR("1132",
+ "Attempt to return a deleted item"));
+ return (EINVAL);
+ }
+ F_CLR(hcp, H_OK | H_NOMORE);
+
+ /* Check if we need to get a page for this cursor. */
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+recheck:
+ /* Check if we are looking for space in which to insert an item. */
+ if (hcp->seek_size != 0 && hcp->seek_found_page == PGNO_INVALID &&
+ hcp->seek_size < P_FREESPACE(dbp, hcp->page)) {
+ hcp->seek_found_page = hcp->pgno;
+ hcp->seek_found_indx = NDX_INVALID;
+ }
+
+ /* Check for off-page duplicates. */
+ if (hcp->indx < NUM_ENT(hcp->page) &&
+ HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP) {
+ memcpy(pgnop,
+ HOFFDUP_PGNO(H_PAIRDATA(dbp, hcp->page, hcp->indx)),
+ sizeof(db_pgno_t));
+ F_SET(hcp, H_OK);
+ return (0);
+ }
+
+ /* Check if we need to go on to the next page. */
+ if (F_ISSET(hcp, H_ISDUP))
+ /*
+ * ISDUP is set and dup_off is at the beginning of the current
+ * element, so grab the element's length from its leading
+ * length field.
+ */
+ memcpy(&hcp->dup_len,
+ HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)) +
+ hcp->dup_off, sizeof(db_indx_t));
+
+ if (hcp->indx >= (db_indx_t)NUM_ENT(hcp->page)) {
+ /* Fetch next page. */
+ if (NEXT_PGNO(hcp->page) == PGNO_INVALID) {
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+ }
+ next_pgno = NEXT_PGNO(hcp->page);
+ hcp->indx = 0;
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ goto recheck;
+ }
+
+ F_SET(hcp, H_OK);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_item_reset __P((DBC *));
+ */
+int
+__ham_item_reset(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ ret = 0;
+ if (hcp->page != NULL) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ }
+
+ if ((t_ret = __ham_item_init(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_item_init __P((DBC *));
+ */
+int
+__ham_item_init(dbc)
+ DBC *dbc;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * If this cursor still holds any locks, we must release them if
+ * we are not running with transactions.
+ */
+ ret = __TLPUT(dbc, hcp->lock);
+
+ /*
+ * The following fields must *not* be initialized here because they
+ * may have meaning across inits.
+ * hlock, hdr, split_buf, stats
+ */
+ hcp->bucket = BUCKET_INVALID;
+ hcp->lbucket = BUCKET_INVALID;
+ LOCK_INIT(hcp->lock);
+ hcp->lock_mode = DB_LOCK_NG;
+ hcp->dup_off = 0;
+ hcp->dup_len = 0;
+ hcp->dup_tlen = 0;
+ hcp->seek_size = 0;
+ hcp->seek_found_page = PGNO_INVALID;
+ hcp->seek_found_indx = NDX_INVALID;
+ hcp->flags = 0;
+
+ hcp->pgno = PGNO_INVALID;
+ hcp->indx = NDX_INVALID;
+ hcp->page = NULL;
+
+ return (ret);
+}
+
+/*
+ * Returns the last item in a bucket.
+ *
+ * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_last(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+
+ hcp->bucket = hcp->hdr->max_bucket;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ F_SET(hcp, H_OK);
+ return (__ham_item_prev(dbc, mode, pgnop));
+}
+
+/*
+ * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_first(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+ F_SET(hcp, H_OK);
+ hcp->bucket = 0;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ hcp->dup_off = 0;
+ return (__ham_item_next(dbc, mode, pgnop));
+}
+
+/*
+ * __ham_item_prev --
+ * Returns a pointer to key/data pair on a page. In the case of
+ * bigkeys, just returns the page number and index of the bigkey
+ * pointer pair.
+ *
+ * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_prev(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ /*
+ * There are 5 cases for backing up in a hash file.
+ * Case 1: In the middle of a page, no duplicates, just dec the index.
+ * Case 2: In the middle of a duplicate set, back up one.
+ * Case 3: At the beginning of a duplicate set, get out of set and
+ * back up to next key.
+ * Case 4: At the beginning of a page; go to previous page.
+ * Case 5: At the beginning of a bucket; go to prev bucket.
+ */
+ F_CLR(hcp, H_OK | H_NOMORE | H_DELETED);
+
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ /*
+ * First handle the duplicates. Either you'll get the key here
+ * or you'll exit the duplicate set and drop into the code below
+ * to handle backing up through keys.
+ */
+ if (!F_ISSET(hcp, H_NEXT_NODUP) && F_ISSET(hcp, H_ISDUP)) {
+ if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) ==
+ H_OFFDUP) {
+ memcpy(pgnop,
+ HOFFDUP_PGNO(H_PAIRDATA(dbp, hcp->page, hcp->indx)),
+ sizeof(db_pgno_t));
+ F_SET(hcp, H_OK);
+ return (0);
+ }
+
+ /* Duplicates are on-page. */
+ if (hcp->dup_off != 0) {
+ memcpy(&hcp->dup_len, HKEYDATA_DATA(
+ H_PAIRDATA(dbp, hcp->page, hcp->indx))
+ + hcp->dup_off - sizeof(db_indx_t),
+ sizeof(db_indx_t));
+ hcp->dup_off -=
+ DUP_SIZE(hcp->dup_len);
+ return (__ham_item(dbc, mode, pgnop));
+ }
+ }
+
+ /*
+ * If we get here, we are not in a duplicate set, and just need
+ * to back up the cursor. There are still three cases:
+ * midpage, beginning of page, beginning of bucket.
+ */
+
+ if (F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else
+ /*
+ * We are no longer in a dup set; flag this so the dup code
+ * will reinitialize should we stumble upon another one.
+ */
+ F_CLR(hcp, H_ISDUP);
+
+ if (hcp->indx == 0) { /* Beginning of page. */
+ hcp->pgno = PREV_PGNO(hcp->page);
+ if (hcp->pgno == PGNO_INVALID) {
+ /* Beginning of bucket. */
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+ } else if ((ret =
+ __ham_next_cpage(dbc, hcp->pgno)) != 0)
+ return (ret);
+ else
+ hcp->indx = NUM_ENT(hcp->page);
+ }
+
+ /*
+ * Either we've got the cursor set up to be decremented, or we
+ * have to find the end of a bucket.
+ */
+ if (hcp->indx == NDX_INVALID) {
+ DB_ASSERT(dbp->env, hcp->page != NULL);
+
+ hcp->indx = NUM_ENT(hcp->page);
+ for (next_pgno = NEXT_PGNO(hcp->page);
+ next_pgno != PGNO_INVALID;
+ next_pgno = NEXT_PGNO(hcp->page)) {
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ hcp->indx = NUM_ENT(hcp->page);
+ }
+
+ if (hcp->indx == 0) {
+ /* Bucket was empty. */
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+ }
+ }
+
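+ /* Entries are key/data pairs, so one step back is two index slots. */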
+ hcp->indx -= 2;
+
+ return (__ham_item(dbc, mode, pgnop));
+}
+
+/*
+ * Sets the cursor to the next key/data pair on a page.
+ *
+ * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_next(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ /*
+ * Deleted on-page duplicates are a weird case. If we delete the last
+ * one, then our cursor is at the very end of a duplicate set and
+ * we actually need to go on to the next key.
+ */
+ if (F_ISSET(hcp, H_DELETED)) {
+ if (hcp->indx != NDX_INVALID &&
+ F_ISSET(hcp, H_ISDUP) &&
+ HPAGE_TYPE(dbc->dbp, hcp->page, H_DATAINDEX(hcp->indx))
+ == H_DUPLICATE && hcp->dup_tlen == hcp->dup_off) {
+ if (F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else {
+ F_CLR(hcp, H_ISDUP);
+ hcp->indx += 2;
+ }
+ } else if (!F_ISSET(hcp, H_ISDUP) && F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else if (F_ISSET(hcp, H_ISDUP) &&
+ F_ISSET(hcp, H_NEXT_NODUP)) {
+ F_CLR(hcp, H_ISDUP);
+ hcp->indx += 2;
+ }
+ F_CLR(hcp, H_DELETED);
+ } else if (hcp->indx == NDX_INVALID) {
+ hcp->indx = 0;
+ F_CLR(hcp, H_ISDUP);
+ } else if (F_ISSET(hcp, H_NEXT_NODUP)) {
+ hcp->indx += 2;
+ F_CLR(hcp, H_ISDUP);
+ } else if (F_ISSET(hcp, H_ISDUP) && hcp->dup_tlen != 0) {
+ if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >=
+ hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ }
+ hcp->dup_off += DUP_SIZE(hcp->dup_len);
+ if (hcp->dup_off >= hcp->dup_tlen) {
+ F_CLR(hcp, H_ISDUP);
+ hcp->indx += 2;
+ }
+ } else if (F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else {
+ hcp->indx += 2;
+ F_CLR(hcp, H_ISDUP);
+ }
+
+ ret = __ham_item(dbc, mode, pgnop);
+ return (ret);
+}
+
+/*
+ * __ham_insertpair --
+ *
+ * Used for adding a pair of elements to a sorted page. We are guaranteed that
+ * the pair will fit on this page.
+ *
+ * indexp will return the point at which we inserted the pair.
+ *
+ * We're overloading the meaning of the H_OFFPAGE type here, which is a little
+ * bit sleazy. When we recover deletes, we have the entire entry instead of
+ * having only the DBT, so we'll pass type H_OFFPAGE to mean "copy the whole
+ * entry" as opposed to constructing an H_KEYDATA around it. In the recovery
+ * case it is assumed that a valid index is passed in, since a lookup using
+ * the overloaded H_OFFPAGE key will be incorrect.
+ *
+ * PUBLIC: int __ham_insertpair __P((DBC *, PAGE *p,
+ * PUBLIC: db_indx_t *indxp, const DBT *,
+ * PUBLIC: const DBT *, u_int32_t, u_int32_t));
+ */
+int
+__ham_insertpair(dbc, p, indxp, key_dbt, data_dbt, key_type, data_type)
+ DBC *dbc;
+ PAGE *p;
+ db_indx_t *indxp;
+ const DBT *key_dbt, *data_dbt;
+ u_int32_t key_type, data_type;
+{
+ DB *dbp;
+ u_int16_t n, indx;
+ db_indx_t *inp;
+ u_int32_t ksize, dsize, increase, distance;
+ u_int8_t *offset;
+ int i;
+
+ dbp = dbc->dbp;
+ n = NUM_ENT(p);
+ inp = P_INP(dbp, p);
+ ksize = (key_type == H_OFFPAGE) ?
+ key_dbt->size : HKEYDATA_SIZE(key_dbt->size);
+ dsize = (data_type == H_OFFPAGE || data_type == H_OFFDUP) ?
+ data_dbt->size : HKEYDATA_SIZE(data_dbt->size);
+ increase = ksize + dsize;
+
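+	/* The pair needs its item bytes plus two index-table slots. */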
+ DB_ASSERT(dbp->env, indxp != NULL && *indxp != NDX_INVALID);
+ DB_ASSERT(dbp->env,
+ P_FREESPACE(dbp, p) >= dsize + ksize + 2 * sizeof(db_indx_t));
+ indx = *indxp;
+
+	/* Special case if the page is empty or inserting at end of page. */
+ if (n == 0 || indx == n) {
+ inp[indx] = HOFFSET(p) - ksize;
+ inp[indx+1] = HOFFSET(p) - increase;
+ } else {
+ /*
+ * Shuffle the data elements.
+ *
+ * For example, inserting an element that sorts between items
+ * 2 and 3 on a page:
+ * The copy starts from the beginning of the second item.
+ *
+ * ---------------------------
+ * |pgheader..
+ * |__________________________
+ * ||1|2|3|4|...
+ * |--------------------------
+ * |
+ * |__________________________
+ * | ...|4|3|2|1|
+ * |--------------------------
+ * ---------------------------
+ *
+ * Becomes:
+ *
+ * ---------------------------
+ * |pgheader..
+ * |__________________________
+ * ||1|2|2a|3|4|...
+ * |--------------------------
+ * |
+ * |__________________________
+ * | ...|4|3|2a|2|1|
+ * |--------------------------
+ * ---------------------------
+ *
+		 * Indexes 3, 4, etc. move down the page.
+		 * The data for 3, 4, etc. moves up the page by sizeof(2a).
+		 * The index pointers in 3, 4, etc. are updated to point at
+		 * the relocated data.
+ * It is necessary to move the data (not just adjust the index)
+ * since the hash format uses consecutive data items to
+ * dynamically calculate the item size.
+ * An item in this example is a key/data pair.
+ */
+ offset = (u_int8_t *)p + HOFFSET(p);
+ if (indx == 0)
+ distance = dbp->pgsize - HOFFSET(p);
+ else
+ distance = (u_int32_t)
+ (P_ENTRY(dbp, p, indx - 1) - offset);
+ memmove(offset - increase, offset, distance);
+
+ /* Shuffle the index array */
+ memmove(&inp[indx + 2], &inp[indx],
+ (n - indx) * sizeof(db_indx_t));
+
+		/* Update the index array. */
+ for (i = indx + 2; i < n + 2; i++)
+ inp[i] -= increase;
+
+		/* Set the new index elements. */
+ inp[indx] = (HOFFSET(p) - increase) + distance + dsize;
+ inp[indx + 1] = (HOFFSET(p) - increase) + distance;
+ }
+
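+	/* Claim the space for the new pair from the free region. */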
+ HOFFSET(p) -= increase;
+	/* Insert the new elements. */
+ if (key_type == H_OFFPAGE)
+ memcpy(P_ENTRY(dbp, p, indx), key_dbt->data, key_dbt->size);
+ else
+ PUT_HKEYDATA(P_ENTRY(dbp, p, indx), key_dbt->data,
+ key_dbt->size, key_type);
+ if (data_type == H_OFFPAGE || data_type == H_OFFDUP)
+ memcpy(P_ENTRY(dbp, p, indx+1), data_dbt->data,
+ data_dbt->size);
+ else
+ PUT_HKEYDATA(P_ENTRY(dbp, p, indx+1), data_dbt->data,
+ data_dbt->size, data_type);
+ NUM_ENT(p) += 2;
+
+ /*
+ * If debugging a sorted hash page problem, this is a good place to
+ * insert a call to __ham_verify_sorted_page.
+ * It used to be called when diagnostic mode was enabled, but that
+ * causes problems in recovery if a custom comparator was used.
+ */
+ return (0);
+}
+
+/*
+ * __ham_getindex --
+ *
+ * The key_type parameter overloads the entry type to allow for comparison of
+ * a key DBT that contains off-page data. A key that is not of type H_OFFPAGE
+ * might contain data larger than the page size, since this routine can be
+ * called with user-provided DBTs.
+ *
+ * PUBLIC: int __ham_getindex __P((DBC *,
+ * PUBLIC: PAGE *, const DBT *, u_int32_t, int *, db_indx_t *));
+ */
+int
+__ham_getindex(dbc, p, key, key_type, match, indx)
+ DBC *dbc;
+ PAGE *p;
+ const DBT *key;
+ u_int32_t key_type;
+ int *match;
+ db_indx_t *indx;
+{
+	/* All entries are key/data pairs, so the entry count must be even. */
+	DB_ASSERT(dbc->env, NUM_ENT(p) % 2 == 0);
+
+ /* Support pre 4.6 unsorted hash pages. */
+ if (p->type == P_HASH_UNSORTED)
+ return (__ham_getindex_unsorted(dbc, p, key, match, indx));
+ else
+ return (__ham_getindex_sorted(dbc,
+ p, key, key_type, match, indx));
+}
+
+#undef min
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+/*
+ * Perform a linear search of an unsorted (pre 4.6 format) hash page.
+ *
+ * This routine is never used to generate an index for insertion, because any
+ * unsorted page is sorted before we insert.
+ *
+ * Sets *match to 0 if an exact match is found, with indx set to the
+ * matching element.  Sets *match to 1 if the item does not exist; indx is
+ * then set just past the last pair on the page.  The return value is 0
+ * unless an error occurs.
+ */
+static int
+__ham_getindex_unsorted(dbc, p, key, match, indx)
+ DBC *dbc;
+ PAGE *p;
+ const DBT *key;
+ int *match;
+ db_indx_t *indx;
+{
+ DB *dbp;
+ DBT pg_dbt;
+ HASH *t;
+ db_pgno_t pgno;
+ int i, n, res, ret;
+ u_int32_t tlen;
+ u_int8_t *hk;
+
+ dbp = dbc->dbp;
+ n = NUM_ENT(p);
+ t = dbp->h_internal;
+ res = 1;
+
+ /* Do a linear search over the page looking for an exact match */
+	for (i = 0; i < n; i += 2) {
+ hk = H_PAIRKEY(dbp, p, i);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_OFFPAGE:
+ /* extract item length from possibly unaligned DBT */
+ memcpy(&tlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ if (tlen == key->size) {
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, key, pgno, tlen,
+ t->h_compare, &res)) != 0)
+ return (ret);
+ }
+ break;
+ case H_KEYDATA:
+ if (t->h_compare != NULL) {
+ DB_INIT_DBT(pg_dbt,
+ HKEYDATA_DATA(hk), key->size);
+ if (t->h_compare(
+ dbp, key, &pg_dbt) != 0)
+ break;
+ } else if (key->size ==
+ LEN_HKEY(dbp, p, dbp->pgsize, i))
+ res = memcmp(key->data, HKEYDATA_DATA(hk),
+ key->size);
+ break;
+ case H_DUPLICATE:
+ case H_OFFDUP:
+ /*
+ * These are errors because keys are never duplicated.
+ */
+ /* FALLTHROUGH */
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(p)));
+ }
+ if (res == 0)
+ break;
+ }
+ *indx = i;
+ *match = (res == 0 ? 0 : 1);
+ return (0);
+}
+
+/*
+ * Perform a binary search of a sorted hash page for a key.
+ * Sets *match to 0 if an exact match is found, with indx set to the
+ * matching element.  Sets *match to 1 if the item does not exist; indx is
+ * then set to the first element greater than the requested item.  The
+ * return value is 0 unless an error occurs.
+ */
+static int
+__ham_getindex_sorted(dbc, p, key, key_type, match, indxp)
+ DBC *dbc;
+ PAGE *p;
+ const DBT *key;
+ u_int32_t key_type;
+ int *match;
+ db_indx_t *indxp;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ HASH *t;
+ HOFFPAGE *offp;
+ db_indx_t indx;
+ db_pgno_t off_pgno, koff_pgno;
+ u_int32_t base, itemlen, lim, off_len;
+ u_int8_t *entry;
+ int res, ret;
+ void *data;
+
+ dbp = dbc->dbp;
+	DB_ASSERT(dbp->env, p->type == P_HASH);
+
+ t = dbp->h_internal;
+ /* Initialize so the return params are correct for empty pages. */
+ res = indx = 0;
+
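+	/*
+	 * The binary search macros step by 2 because each key/data pair
+	 * occupies two index entries; indx always lands on the key half
+	 * of a pair.
+	 */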
+ /* Do a binary search for the element. */
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(p), 2) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, 2);
+ data = HKEYDATA_DATA(H_PAIRKEY(dbp, p, indx));
+ /*
+ * There are 4 cases here:
+ * 1) Off page key, off page match
+ * 2) Off page key, on page match
+ * 3) On page key, off page match
+ * 4) On page key, on page match
+ */
+ entry = P_ENTRY(dbp, p, indx);
+ if (*entry == H_OFFPAGE) {
+ offp = (HOFFPAGE*)P_ENTRY(dbp, p, indx);
+ (void)__ua_memcpy(&itemlen, HOFFPAGE_TLEN(offp),
+ sizeof(u_int32_t));
+ if (key_type == H_OFFPAGE) {
+ /*
+ * Case 1.
+ *
+ * If both key and cmp DBTs refer to different
+ * offpage items, it is necessary to compare
+ * the content of the entries, in order to be
+ * able to maintain a valid lexicographic sort
+ * order.
+ */
+ (void)__ua_memcpy(&koff_pgno,
+ HOFFPAGE_PGNO(key->data),
+ sizeof(db_pgno_t));
+ (void)__ua_memcpy(&off_pgno,
+ HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
+ if (koff_pgno == off_pgno)
+ res = 0;
+ else {
+ memset(&tmp_dbt, 0, sizeof(tmp_dbt));
+ tmp_dbt.size = HOFFPAGE_SIZE;
+ tmp_dbt.data = offp;
+ if ((ret = __db_coff(dbc, key, &tmp_dbt,
+ t->h_compare, &res)) != 0)
+ return (ret);
+ }
+ } else {
+ /* Case 2 */
+ (void)__ua_memcpy(&off_pgno,
+ HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, key, off_pgno,
+ itemlen, t->h_compare, &res)) != 0)
+ return (ret);
+ }
+ } else {
+ itemlen = LEN_HKEYDATA(dbp, p, dbp->pgsize, indx);
+ if (key_type == H_OFFPAGE) {
+ /* Case 3 */
+ tmp_dbt.data = data;
+ tmp_dbt.size = itemlen;
+ offp = (HOFFPAGE *)key->data;
+ (void)__ua_memcpy(&off_pgno,
+ HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
+ (void)__ua_memcpy(&off_len, HOFFPAGE_TLEN(offp),
+ sizeof(u_int32_t));
+ if ((ret = __db_moff(dbc, &tmp_dbt, off_pgno,
+ off_len, t->h_compare, &res)) != 0)
+ return (ret);
+ /*
+ * Since we switched the key/match parameters
+ * in the __db_moff call, the result needs to
+ * be inverted.
+ */
+ res = -res;
+ } else if (t->h_compare != NULL) {
+ /* Case 4, with a user comparison func */
+ DB_INIT_DBT(tmp_dbt, data, itemlen);
+ res = t->h_compare(dbp, key, &tmp_dbt);
+ } else {
+ /* Case 4, without a user comparison func */
+ if ((res = memcmp(key->data, data,
+ min(key->size, itemlen))) == 0)
+ res = itemlen > key->size ? 1 :
+ (itemlen < key->size ? -1 : 0);
+ }
+ }
+ if (res == 0) {
+ /* Found a match */
+ *indxp = indx;
+ *match = 0;
+ return (0);
+ } else if (res > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base, lim, 2);
+ }
+	/*
+	 * If no match was found and the comparison indicates that the
+	 * closest match was lexicographically less than the input key,
+	 * adjust the insertion index to be after the index of the
+	 * closest match.
+	 */
+ if (res > 0)
+ indx += 2;
+ *indxp = indx;
+ *match = 1;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_verify_sorted_page __P((DBC *, PAGE *));
+ *
+ * The __ham_verify_sorted_page function is used to determine the correctness
+ * of sorted hash pages. The checks are used by verification; they are
+ * implemented in the hash code because they are also useful debugging aids.
+ */
+int
+__ham_verify_sorted_page(dbc, p)
+ DBC *dbc;
+ PAGE *p;
+{
+ DB *dbp;
+ DBT prev_dbt, curr_dbt;
+ ENV *env;
+ HASH *t;
+ db_pgno_t tpgno;
+ u_int32_t curr_len, prev_len, tlen;
+ u_int16_t *indxp;
+ db_indx_t i, n;
+ int res, ret;
+ char *prev, *curr;
+
+	/* All entries are key/data pairs, so the entry count must be even. */
+	n = NUM_ENT(p);
+	dbp = dbc->dbp;
+	DB_ASSERT(dbp->env, n % 2 == 0);
+
+ env = dbp->env;
+ t = dbp->h_internal;
+
+ /* Disable verification if a custom comparator is supplied */
+ if (t->h_compare != NULL)
+ return (0);
+
+ /* Iterate through page, ensuring order */
+ prev = (char *)HKEYDATA_DATA(H_PAIRKEY(dbp, p, 0));
+ prev_len = LEN_HKEYDATA(dbp, p, dbp->pgsize, 0);
+	for (i = 2; i < n; i += 2) {
+ curr = (char *)HKEYDATA_DATA(H_PAIRKEY(dbp, p, i));
+ curr_len = LEN_HKEYDATA(dbp, p, dbp->pgsize, i);
+
+ if (HPAGE_TYPE(dbp, p, i-2) == H_OFFPAGE &&
+ HPAGE_TYPE(dbp, p, i) == H_OFFPAGE) {
+ memset(&prev_dbt, 0, sizeof(prev_dbt));
+ memset(&curr_dbt, 0, sizeof(curr_dbt));
+ prev_dbt.size = curr_dbt.size = HOFFPAGE_SIZE;
+ prev_dbt.data = H_PAIRKEY(dbp, p, i-2);
+ curr_dbt.data = H_PAIRKEY(dbp, p, i);
+ if ((ret = __db_coff(dbc,
+ &prev_dbt, &curr_dbt, t->h_compare, &res)) != 0)
+ return (ret);
+ } else if (HPAGE_TYPE(dbp, p, i-2) == H_OFFPAGE) {
+			memset(&curr_dbt, 0, sizeof(curr_dbt));
+			curr_dbt.size = curr_len;
+			curr_dbt.data = curr;
+			memcpy(&tlen, HOFFPAGE_TLEN(H_PAIRKEY(dbp, p, i-2)),
+			    sizeof(u_int32_t));
+			memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i-2)),
+			    sizeof(db_pgno_t));
+			if ((ret = __db_moff(dbc,
+			    &curr_dbt, tpgno, tlen, t->h_compare, &res)) != 0)
+				return (ret);
+			/*
+			 * __db_moff compared curr against the off-page prev
+			 * item; invert the result so that res remains
+			 * cmp(prev, curr).
+			 */
+			res = -res;
+ } else if (HPAGE_TYPE(dbp, p, i) == H_OFFPAGE) {
+ memset(&prev_dbt, 0, sizeof(prev_dbt));
+ prev_dbt.size = prev_len;
+			prev_dbt.data = prev;
+ memcpy(&tlen, HOFFPAGE_TLEN(H_PAIRKEY(dbp, p, i)),
+ sizeof(u_int32_t));
+ memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i)),
+ sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc,
+ &prev_dbt, tpgno, tlen, t->h_compare, &res)) != 0)
+ return (ret);
+ } else
+ res = memcmp(prev, curr, min(curr_len, prev_len));
+
+		/* Equal prefixes: the shorter key sorts first. */
+		if (res == 0 && curr_len > prev_len)
+			res = -1;
+		else if (res == 0 && curr_len < prev_len)
+			res = 1;
+
+ if (res >= 0) {
+ __db_msg(env, "key1: %s, key2: %s, len: %lu\n",
+ (char *)prev, (char *)curr,
+ (u_long)min(curr_len, prev_len));
+ __db_msg(env, "curroffset %lu\n", (u_long)i);
+ __db_msg(env, "indexes: ");
+ for (i = 0; i < n; i++) {
+ indxp = P_INP(dbp, p) + i;
+ __db_msg(env, "%04X, ", *indxp);
+ }
+ __db_msg(env, "\n");
+#ifdef HAVE_STATISTICS
+ if ((ret = __db_prpage(dbp, p, DB_PR_PAGE)) != 0)
+ return (ret);
+#endif
+ DB_ASSERT(dbp->env, res < 0);
+ }
+
+ prev = curr;
+ prev_len = curr_len;
+ }
+ return (0);
+}
+
+/*
+ * A wrapper for the __ham_sort_page function. Implements logging and cursor
+ * adjustments associated with sorting a page outside of recovery/upgrade.
+ * PUBLIC: int __ham_sort_page_cursor __P((DBC *, PAGE *));
+ */
+int
+__ham_sort_page_cursor(dbc, page)
+ DBC *dbc;
+ PAGE *page;
+{
+ DB *dbp;
+ DBT page_dbt;
+ DB_LSN new_lsn;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = page;
+ if ((ret = __ham_splitdata_log(dbp, dbc->txn,
+ &new_lsn, 0, SORTPAGE, PGNO(page),
+ &page_dbt, &LSN(page))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+ /* Move lsn onto page. */
+ LSN(page) = new_lsn; /* Structure assignment. */
+
+ /*
+ * Invalidate the saved index, it needs to be retrieved
+ * again once the page is sorted.
+ */
+ hcp->seek_found_indx = NDX_INVALID;
+ hcp->seek_found_page = PGNO_INVALID;
+
+ return (__ham_sort_page(dbc, &hcp->split_buf, page));
+}
+
+/*
+ * PUBLIC: int __ham_sort_page __P((DBC *, PAGE **, PAGE *));
+ *
+ * Convert a page from P_HASH_UNSORTED into the sorted format P_HASH.
+ *
+ * All locking and logging is carried out by the caller. A user buffer can
+ * optionally be passed in to save allocating a page-size buffer for sorting.
+ * This allows callers to re-use the buffer pre-allocated for page splits
+ * in the hash cursor. The buffer is optional since no cursor exists when in
+ * the recovery or upgrade code paths.
+ */
+int
+__ham_sort_page(dbc, tmp_buf, page)
+ DBC *dbc;
+ PAGE **tmp_buf;
+ PAGE *page;
+{
+ DB *dbp;
+ PAGE *temp_pagep;
+ db_indx_t i;
+ int ret;
+
+ dbp = dbc->dbp;
+ DB_ASSERT(dbp->env, page->type == P_HASH_UNSORTED);
+
+ ret = 0;
+ if (tmp_buf != NULL)
+ temp_pagep = *tmp_buf;
+ else if ((ret = __os_malloc(dbp->env, dbp->pgsize, &temp_pagep)) != 0)
+ return (ret);
+
+ memcpy(temp_pagep, page, dbp->pgsize);
+
+ /* Re-initialize the page. */
+ P_INIT(page, dbp->pgsize,
+ page->pgno, page->prev_pgno, page->next_pgno, 0, P_HASH);
+
+ for (i = 0; i < NUM_ENT(temp_pagep); i += 2)
+ if ((ret =
+ __ham_copypair(dbc, temp_pagep, i, page, NULL, 0)) != 0)
+ break;
+
+ if (tmp_buf == NULL)
+ __os_free(dbp->env, temp_pagep);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_del_pair __P((DBC *, int, PAGE *));
+ */
+int
+__ham_del_pair(dbc, flags, ppg)
+ DBC *dbc;
+ int flags;
+ PAGE *ppg;
+{
+ DB *dbp;
+ DBT data_dbt, key_dbt;
+ DB_LSN new_lsn, *n_lsn, tmp_lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *n_pagep, *nn_pagep, *p, *p_pagep;
+ db_ham_mode op;
+ db_indx_t ndx;
+ db_pgno_t chg_pgno, pgno, tmp_pgno;
+ u_int32_t data_type, key_type, order;
+ int ret, t_ret;
+ u_int8_t *hk;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ n_pagep = p_pagep = nn_pagep = NULL;
+ ndx = hcp->indx;
+
+ if (hcp->page == NULL &&
+ (ret = __memp_fget(mpf, &hcp->pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &hcp->page)) != 0)
+ return (ret);
+ p = hcp->page;
+
+ /*
+	 * We optimize for the normal case, which is when neither the key
+	 * nor the data is large.  In this case, we write a single log record
+	 * and do the delete.  If either is large, we call __db_doff
+ * to remove the big item and then update the page to remove the
+ * entry referring to the big item.
+ */
+ if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) &&
+ HPAGE_PTYPE(H_PAIRKEY(dbp, p, ndx)) == H_OFFPAGE) {
+ memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_KEYINDEX(ndx))),
+ sizeof(db_pgno_t));
+ ret = __db_doff(dbc, pgno);
+ } else
+ ret = 0;
+
+ if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && ret == 0)
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, p, ndx))) {
+ case H_OFFPAGE:
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_DATAINDEX(ndx))),
+ sizeof(db_pgno_t));
+ ret = __db_doff(dbc, pgno);
+ break;
+ case H_OFFDUP:
+ case H_DUPLICATE:
+ /*
+ * If we delete a pair that is/was a duplicate, then
+ * we had better clear the flag so that we update the
+ * cursor appropriately.
+ */
+ F_CLR(hcp, H_ISDUP);
+ break;
+ default:
+ /* No-op */
+ break;
+ }
+
+ if (ret)
+ return (ret);
+
+ /* Now log the delete off this page. */
+ if (DBC_LOGGING(dbc)) {
+ hk = H_PAIRKEY(dbp, hcp->page, ndx);
+ if ((key_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ key_dbt.data = hk;
+ key_dbt.size = HOFFPAGE_SIZE;
+ } else {
+ key_dbt.data = HKEYDATA_DATA(hk);
+ key_dbt.size =
+ LEN_HKEY(dbp, hcp->page, dbp->pgsize, ndx);
+ }
+ hk = H_PAIRDATA(dbp, hcp->page, ndx);
+ if ((data_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ data_dbt.data = hk;
+ data_dbt.size = HOFFPAGE_SIZE;
+ } else if (data_type == H_OFFDUP) {
+ data_dbt.data = hk;
+ data_dbt.size = HOFFDUP_SIZE;
+ } else {
+ data_dbt.data = HKEYDATA_DATA(hk);
+ data_dbt.size =
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, ndx);
+ }
+
+ if ((ret = __ham_insdel_log(dbp, dbc->txn, &new_lsn, 0,
+ DELPAIR, PGNO(p), (u_int32_t)ndx, &LSN(p),
+ OP_SET(key_type, p), &key_dbt,
+ OP_SET(data_type, p), &data_dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(p) = new_lsn;
+ /* Do the delete. */
+ __ham_dpair(dbp, p, ndx);
+
+ /*
+ * Mark item deleted so that we don't try to return it, and
+ * so that we update the cursor correctly on the next call
+ * to next.
+ */
+ F_SET(hcp, H_DELETED);
+ F_CLR(hcp, H_OK);
+
+ /* Clear any cache streaming information. */
+ hcp->stream_start_pgno = PGNO_INVALID;
+
+ /*
+ * If we are locking, we will not maintain this, because it is
+ * a hot spot.
+ *
+ * XXX
+ * Perhaps we can retain incremental numbers and apply them later.
+ */
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ --hcp->hdr->nelem;
+ }
+
+ /* The HAM_DEL_NO_CURSOR flag implies HAM_DEL_NO_RECLAIM. */
+ if (LF_ISSET(HAM_DEL_NO_CURSOR))
+ return (0);
+ /*
+ * Update cursors that are on the page where the delete happened.
+ */
+ if ((ret = __hamc_update(dbc, 0, DB_HAM_CURADJ_DEL, 0)) != 0)
+ return (ret);
+
+ /*
+ * If we need to reclaim the page, then check if the page is empty.
+	 * There are three cases.  If it's empty and it's not the first page
+	 * in the bucket (i.e., the bucket page), then we can simply remove
+	 * it.  If it is the first page in the bucket, then we need to copy
+	 * the second page into it and remove the second page.
+	 * If it's the only page in the bucket, we leave it alone.
+ */
+ if (LF_ISSET(HAM_DEL_NO_RECLAIM) ||
+ NUM_ENT(p) != 0 ||
+ (PREV_PGNO(p) == PGNO_INVALID && NEXT_PGNO(p) == PGNO_INVALID)) {
+ if (NUM_ENT(p) == 0)
+ F_SET(hcp, H_CONTRACT);
+ return (0);
+ }
+
+ if (PREV_PGNO(p) == PGNO_INVALID) {
+ /*
+ * First page in chain is empty and we know that there
+ * are more pages in the chain.
+ */
+ if ((ret = __memp_fget(mpf,
+ &NEXT_PGNO(p), dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &n_pagep)) != 0)
+ return (ret);
+
+ if (NEXT_PGNO(n_pagep) != PGNO_INVALID &&
+ (ret = __memp_fget(mpf, &NEXT_PGNO(n_pagep),
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &nn_pagep)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ key_dbt.data = n_pagep;
+ key_dbt.size = dbp->pgsize;
+ if ((ret = __ham_copypage_log(dbp,
+ dbc->txn, &new_lsn, 0, PGNO(p),
+ &LSN(p), PGNO(n_pagep), &LSN(n_pagep),
+ NEXT_PGNO(n_pagep),
+ nn_pagep == NULL ? NULL : &LSN(nn_pagep),
+ &key_dbt)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(p) = new_lsn; /* Structure assignment. */
+ LSN(n_pagep) = new_lsn;
+ if (NEXT_PGNO(n_pagep) != PGNO_INVALID)
+ LSN(nn_pagep) = new_lsn;
+
+ if (nn_pagep != NULL) {
+ PREV_PGNO(nn_pagep) = PGNO(p);
+ ret = __memp_fput(mpf,
+ dbc->thread_info, nn_pagep, dbc->priority);
+ nn_pagep = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
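+		/*
+		 * Copy the second page over the now-empty bucket page,
+		 * keeping the bucket page's number and LSN.
+		 */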
+ tmp_pgno = PGNO(p);
+ tmp_lsn = LSN(p);
+ memcpy(p, n_pagep, dbp->pgsize);
+ PGNO(p) = tmp_pgno;
+ LSN(p) = tmp_lsn;
+ PREV_PGNO(p) = PGNO_INVALID;
+
+ /*
+ * Update cursors to reflect the fact that records
+ * on the second page have moved to the first page.
+ */
+ if ((ret = __hamc_delpg(dbc, PGNO(n_pagep),
+ PGNO(p), 0, DB_HAM_DELFIRSTPG, &order)) != 0)
+ goto err;
+
+ /*
+ * Update the cursor to reflect its new position.
+ */
+ hcp->indx = 0;
+ hcp->pgno = PGNO(p);
+ hcp->order += order;
+
+ if ((ret = __db_free(dbc, n_pagep, 0)) != 0) {
+ n_pagep = NULL;
+ goto err;
+ }
+ } else {
+ if ((p_pagep = ppg) == NULL && (ret = __memp_fget(mpf,
+ &PREV_PGNO(p), dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &p_pagep)) != 0)
+ goto err;
+
+ if (NEXT_PGNO(p) != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &NEXT_PGNO(p),
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &n_pagep)) != 0)
+ goto err;
+ n_lsn = &LSN(n_pagep);
+ } else {
+ n_pagep = NULL;
+ n_lsn = NULL;
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_newpage_log(dbp, dbc->txn,
+ &new_lsn, 0, DELOVFL, PREV_PGNO(p), &LSN(p_pagep),
+ PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(p_pagep) = new_lsn; /* Structure assignment. */
+ if (n_pagep)
+ LSN(n_pagep) = new_lsn;
+ LSN(p) = new_lsn;
+
+ NEXT_PGNO(p_pagep) = NEXT_PGNO(p);
+ if (n_pagep != NULL)
+ PREV_PGNO(n_pagep) = PGNO(p_pagep);
+
+ if (NEXT_PGNO(p) == PGNO_INVALID) {
+ /*
+ * There is no next page; put the cursor on the
+ * previous page as if we'd deleted the last item
+ * on that page, with index after the last valid
+ * entry.
+ *
+ * The deleted flag was set up above.
+ */
+ hcp->pgno = PGNO(p_pagep);
+ hcp->indx = NUM_ENT(p_pagep);
+ op = DB_HAM_DELLASTPG;
+ } else {
+ /*
+ * There is a next page, so put the cursor at
+ * the beginning of it.
+ */
+ hcp->pgno = NEXT_PGNO(p);
+ hcp->indx = 0;
+ op = DB_HAM_DELMIDPG;
+ }
+
+ /*
+ * Since we are about to delete the cursor page and we have
+ * just moved the cursor, we need to make sure that the
+ * old page pointer isn't left hanging around in the cursor.
+ */
+ hcp->page = NULL;
+ chg_pgno = PGNO(p);
+ ret = __db_free(dbc, p, 0);
+ if (ppg == NULL && (t_ret = __memp_fput(mpf, dbc->thread_info,
+ p_pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (n_pagep != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, n_pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ if ((ret = __hamc_delpg(dbc,
+ chg_pgno, hcp->pgno, hcp->indx, op, &order)) != 0)
+ return (ret);
+ hcp->order += order;
+ }
+ return (ret);
+
+err: /* Clean up any pages. */
+ if (n_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, n_pagep, dbc->priority);
+ if (nn_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, nn_pagep, dbc->priority);
+ if (ppg == NULL && p_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, p_pagep, dbc->priority);
+ return (ret);
+}
+
+/*
+ * __ham_replpair --
+ * Given the key data indicated by the cursor, replace part/all of it
+ * according to the fields in the dbt.
+ *
+ * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__ham_replpair(dbc, dbt, newtype)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t newtype;
+{
+ DB *dbp;
+ DBC **carray, *dbc_n;
+ DBT old_dbt, tdata, tmp, *new_dbt;
+ DB_LSN new_lsn;
+ ENV *env;
+ HASH_CURSOR *hcp, *cp;
+ db_indx_t orig_indx;
+ db_pgno_t off_pgno, orig_pgno;
+ u_int32_t change;
+ u_int32_t dup_flag, len, memsize, newlen, oldtype, type;
+ char tmp_ch;
+ int beyond_eor, is_big, is_plus, ret, i, found, t_ret;
+ u_int8_t *beg, *dest, *end, *hk, *src;
+ void *memp;
+
+ /*
+ * Most items that were already offpage (ISBIG) were handled before
+ * we get in here. So, we need only handle cases where the old
+	 * key is on a regular page.  That leaves us 7 cases:
+ * 1. Original data onpage; new data is smaller
+ * 2. Original data onpage; new data is the same size
+ * 3. Original data onpage; new data is bigger, but not ISBIG,
+ * fits on page
+ * 4. Original data onpage; new data is bigger, but not ISBIG,
+ * does not fit on page
+	 * 5. Original data onpage; new data is an off-page item.
+ * 6. Original data was offpage; new item is smaller.
+ * 7. Original data was offpage; new item is supplied as a partial.
+ *
+ * Cases 1-3 are essentially the same (and should be the common case).
+	 * We handle 4-6 as a delete and an add.  7 is generally a delete
+	 * and an add as well, unless it is an append, in which case we
+	 * extend the offpage item and update the HOFFPAGE entry on the
+	 * current page to reflect the new size via a delete/add.
+ */
+ dbp = dbc->dbp;
+ env = dbp->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ carray = NULL;
+ dbc_n = memp = NULL;
+ found = 0;
+ new_dbt = NULL;
+ off_pgno = PGNO_INVALID;
+ type = 0;
+
+ /*
+ * We need to compute the number of bytes that we are adding or
+ * removing from the entry. Normally, we can simply subtract
+ * the number of bytes we are replacing (dbt->dlen) from the
+ * number of bytes we are inserting (dbt->size). However, if
+ * we are doing a partial put off the end of a record, then this
+ * formula doesn't work, because we are essentially adding
+ * new bytes.
+ */
+ if (dbt->size > dbt->dlen) {
+ change = dbt->size - dbt->dlen;
+ is_plus = 1;
+ } else {
+ change = dbt->dlen - dbt->size;
+ is_plus = 0;
+ }
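+	/*
+	 * For example, replacing dlen == 2 bytes with size == 5 bytes of
+	 * new data grows the item: change == 3 and is_plus == 1.
+	 */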
+
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ oldtype = HPAGE_PTYPE(hk);
+ is_big = oldtype == H_OFFPAGE;
+
+ if (is_big) {
+ memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&off_pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ } else
+ len = LEN_HKEYDATA(dbp, hcp->page,
+ dbp->pgsize, H_DATAINDEX(hcp->indx));
+
+ beyond_eor = dbt->doff + dbt->dlen > len;
+ if (beyond_eor) {
+ /*
+ * The change is beyond the end of record. If change
+ * is a positive number, we can simply add the extension
+ * to it. However, if change is negative, then we need
+ * to figure out if the extension is larger than the
+ * negative change.
+ */
+ if (is_plus)
+ change += dbt->doff + dbt->dlen - len;
+ else if (dbt->doff + dbt->dlen - len > change) {
+ /* Extension bigger than change */
+ is_plus = 1;
+ change = (dbt->doff + dbt->dlen - len) - change;
+ } else /* Extension is smaller than change. */
+ change -= (dbt->doff + dbt->dlen - len);
+ }
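+	/*
+	 * For example: len == 10, doff == 8, dlen == 4, size == 1.  Only
+	 * 2 of the 4 bytes to be replaced really exist, so the record
+	 * shrinks from 10 bytes to doff + size == 9: change ends up as 1
+	 * with is_plus == 0.
+	 */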
+
+ newlen = (is_plus ? len + change : len - change);
+ if (is_big || beyond_eor || ISBIG(hcp, newlen) ||
+ (is_plus && change > P_FREESPACE(dbp, hcp->page))) {
+ /*
+ * If we are in cases 4 or 5 then is_plus will be true.
+ * If we don't have a transaction then we cannot roll back,
+ * make sure there is enough room for the new page.
+ */
+ if (is_plus && dbc->txn == NULL &&
+ dbp->mpf->mfp->maxpgno != 0 &&
+ dbp->mpf->mfp->maxpgno == dbp->mpf->mfp->last_pgno)
+ return (__db_space_err(dbp));
+ /*
+ * Cases 4-6 -- two subcases.
+ * A. This is not really a partial operation, but an overwrite.
+ * Simple del and add works.
+ * B. This is a partial and we need to construct the data that
+ * we are really inserting (yuck).
+ * In both cases, we need to grab the key off the page (in
+ * some cases we could do this outside of this routine; for
+ * cleanliness we do it here. If you happen to be on a big
+ * key, this could be a performance hit).
+ */
+ memset(&tmp, 0, sizeof(tmp));
+ if ((ret = __db_ret(dbc, hcp->page, H_KEYINDEX(hcp->indx),
+ &tmp, &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ return (ret);
+
+ /* Preserve duplicate info. */
+ dup_flag = F_ISSET(hcp, H_ISDUP);
+ /* Streaming insert. */
+ if (is_big && !dup_flag && !DB_IS_PRIMARY(dbp) &&
+ F_ISSET(dbt, DB_DBT_PARTIAL) && dbt->doff == len) {
+ /*
+ * If the cursor has not already cached the last page
+ * in the offpage chain, we need to walk the chain to
+ * be sure that the page has been read.
+ */
+ if (hcp->stream_start_pgno != off_pgno ||
+ hcp->stream_off > dbt->doff || dbt->doff >
+ hcp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ memset(&tdata, 0, sizeof(DBT));
+ tdata.doff = dbt->doff - 1;
+ /*
+ * Set the length to 1, to force __db_goff
+ * to do the traversal.
+ */
+ tdata.dlen = tdata.ulen = 1;
+ tdata.data = &tmp_ch;
+ tdata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ /*
+ * Read to the last page. It will be cached
+ * in the cursor.
+ */
+ if ((ret = __db_goff(dbc, &tdata, len,
+ off_pgno, NULL, NULL)) != 0)
+ return (ret);
+ }
+ /*
+ * Since this is an append, dlen is irrelevant (there
+ * are no bytes to overwrite). We need the caller's
+ * DBT size to end up with the total size of the item.
+ * From now on, use dlen as the length of the user's
+ * data that we are going to append.
+ * Don't futz with the caller's DBT any more than we
+ * have to in order to send back the size.
+ */
+ tdata = *dbt;
+ tdata.dlen = dbt->size;
+ tdata.size = newlen;
+ new_dbt = &tdata;
+ F_SET(new_dbt, DB_DBT_STREAMING);
+ type = H_KEYDATA;
+ }
+
+ /*
+ * In cases 4-6, a delete and insert works, but we need to
+ * track and update any cursors pointing to the item being
+ * moved.
+ */
+ orig_pgno = PGNO(hcp->page);
+ orig_indx = hcp->indx;
+ if ((ret = __ham_get_clist(dbp,
+ orig_pgno, orig_indx, &carray)) != 0)
+ goto err;
+
+ if (dbt->doff == 0 && dbt->dlen == len) {
+ type = (dup_flag ? H_DUPLICATE : H_KEYDATA);
+ new_dbt = dbt;
+ } else if (!F_ISSET(dbt, DB_DBT_STREAMING)) { /* Case B */
+ type = HPAGE_PTYPE(hk) != H_OFFPAGE ?
+ HPAGE_PTYPE(hk) : H_KEYDATA;
+ memset(&tdata, 0, sizeof(tdata));
+ memsize = 0;
+ if ((ret = __db_ret(dbc, hcp->page,
+ H_DATAINDEX(hcp->indx), &tdata,
+ &memp, &memsize)) != 0)
+ goto err;
+
+ /* Now shift old data around to make room for new. */
+ if (is_plus) {
+ if ((ret = __os_realloc(env,
+ tdata.size + change, &tdata.data)) != 0)
+					goto err;
+ memp = tdata.data;
+ memsize = tdata.size + change;
+ memset((u_int8_t *)tdata.data + tdata.size,
+ 0, change);
+ }
+ end = (u_int8_t *)tdata.data + tdata.size;
+
+ src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
+ if (src < end && tdata.size > dbt->doff + dbt->dlen) {
+ len = tdata.size - (dbt->doff + dbt->dlen);
+ if (is_plus)
+ dest = src + change;
+ else
+ dest = src - change;
+ memmove(dest, src, len);
+ }
+ memcpy((u_int8_t *)tdata.data + dbt->doff,
+ dbt->data, dbt->size);
+ if (is_plus)
+ tdata.size += change;
+ else
+ tdata.size -= change;
+ new_dbt = &tdata;
+ }
+ if ((ret = __ham_del_pair(dbc, HAM_DEL_NO_CURSOR |
+ (F_ISSET(dbt, DB_DBT_STREAMING) ? HAM_DEL_IGNORE_OFFPAGE :
+ 0), NULL)) != 0)
+ goto err;
+ /*
+ * Save the state of the cursor after the delete, so that we
+ * can adjust any cursors impacted by the delete. Don't just
+ * update the cursors now, to avoid ambiguity in reversing the
+ * adjustments during abort.
+ */
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ if ((ret = __ham_add_el(dbc, &tmp, new_dbt, type)) != 0)
+ goto err;
+ F_SET(hcp, dup_flag);
+
+ /*
+ * If the delete/insert pair caused the item to be moved
+ * to another location (which is possible for duplicate sets
+ * that are moved onto another page in the bucket), then update
+ * any impacted cursors.
+ */
+ if (((HASH_CURSOR*)dbc_n->internal)->pgno != hcp->pgno ||
+ ((HASH_CURSOR*)dbc_n->internal)->indx != hcp->indx) {
+ /*
+ * Set any cursors pointing to items in the moved
+ * duplicate set to the destination location and reset
+ * the deleted flag. This can't be done earlier, since
+ * the insert location is not computed until the actual
+ * __ham_add_el call is made.
+ */
+ if (carray != NULL) {
+ for (i = 0; carray[i] != NULL; i++) {
+ cp = (HASH_CURSOR*)carray[i]->internal;
+ cp->pgno = hcp->pgno;
+ cp->indx = hcp->indx;
+ F_CLR(cp, H_DELETED);
+ found = 1;
+ }
+ /*
+ * Only log the update once, since the recovery
+ * code iterates through all open cursors and
+ * applies the change to all matching cursors.
+ */
+ if (found && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret =
+ __ham_chgpg_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ DB_HAM_CHGPG, orig_pgno, hcp->pgno,
+ orig_indx, hcp->indx)) != 0)
+ goto err;
+ }
+ }
+ /*
+ * Update any cursors impacted by the delete. Do this
+ * after chgpg log so that recovery does not re-bump
+ * cursors pointing to the deleted item.
+ */
+ ret = __hamc_update(dbc_n, 0, DB_HAM_CURADJ_DEL, 0);
+ }
+
+err: if (dbc_n != NULL && (t_ret = __dbc_close(dbc_n)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (carray != NULL)
+ __os_free(env, carray);
+ if (memp != NULL)
+ __os_free(env, memp);
+ return (ret);
+ }
+
+ /*
+ * Set up pointer into existing data. Do it before the log
+ * message so we can use it inside of the log setup.
+ */
+ beg = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ beg += dbt->doff;
+
+ /*
+ * If we are going to have to move bytes at all, figure out
+ * all the parameters here. Then log the call before moving
+ * anything around.
+ */
+ if (DBC_LOGGING(dbc)) {
+ old_dbt.data = beg;
+ old_dbt.size = dbt->dlen;
+ if ((ret = __ham_replace_log(dbp, dbc->txn, &new_lsn,
+ 0, PGNO(hcp->page),
+ (u_int32_t)H_DATAINDEX(hcp->indx), &LSN(hcp->page),
+ (int32_t)dbt->doff, OP_SET(oldtype, hcp->page),
+ &old_dbt, OP_SET(newtype, hcp->page), dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ LSN(hcp->page) = new_lsn; /* Structure assignment. */
+
+ __ham_onpage_replace(dbp, hcp->page, (u_int32_t)H_DATAINDEX(hcp->indx),
+ (int32_t)dbt->doff, change, is_plus, dbt);
+
+ return (0);
+}
+
+/*
+ * __ham_onpage_replace --
+ *	Replace data on a page with new data, possibly growing or shrinking
+ * what's there.  This is called on two different occasions.  On one (from
+ * replpair) we are interested in changing only the data.  On the other (from
+ * recovery) we are replacing the entire data (header and all) with a new
+ * element.  In the latter case, the off argument is negative.
+ * pagep: the page that we're changing.
+ * ndx: page index of the element that is growing/shrinking.
+ * off: offset at which we are beginning the replacement.
+ * change: the number of bytes that the element is growing/shrinking.
+ * is_plus: non-zero if the element is growing, zero if it is shrinking.
+ * dbt: the new data that gets written at the replacement offset.
+ *
+ * PUBLIC: void __ham_onpage_replace __P((DB *, PAGE *, u_int32_t,
+ * PUBLIC: int32_t, u_int32_t, int, DBT *));
+ */
+void
+__ham_onpage_replace(dbp, pagep, ndx, off, change, is_plus, dbt)
+ DB *dbp;
+ PAGE *pagep;
+ u_int32_t ndx;
+ int32_t off;
+ u_int32_t change;
+ int is_plus;
+ DBT *dbt;
+{
+ db_indx_t i, *inp;
+ int32_t len;
+ size_t pgsize;
+ u_int8_t *src, *dest;
+ int zero_me;
+
+ pgsize = dbp->pgsize;
+ inp = P_INP(dbp, pagep);
+ if (change != 0) {
+ zero_me = 0;
+ src = (u_int8_t *)(pagep) + HOFFSET(pagep);
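+		/*
+		 * len measures the bytes between the free-space boundary
+		 * (HOFFSET) and the point where the replacement region
+		 * begins; that whole span slides by change bytes.
+		 */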
+ if (off < 0)
+ len = inp[ndx] - HOFFSET(pagep);
+ else if ((u_int32_t)off >=
+ LEN_HKEYDATA(dbp, pagep, pgsize, ndx)) {
+ len = (int32_t)(HKEYDATA_DATA(P_ENTRY(dbp, pagep, ndx))
+ + LEN_HKEYDATA(dbp, pagep, pgsize, ndx) - src);
+ zero_me = 1;
+ } else
+ len = (int32_t)(
+ (HKEYDATA_DATA(P_ENTRY(dbp, pagep, ndx)) + off) -
+ src);
+ if (is_plus)
+ dest = src - change;
+ else
+ dest = src + change;
+ memmove(dest, src, (size_t)len);
+ if (zero_me)
+ memset(dest + len, 0, change);
+
+ /* Now update the indices. */
+ for (i = ndx; i < NUM_ENT(pagep); i++) {
+ if (is_plus)
+ inp[i] -= change;
+ else
+ inp[i] += change;
+ }
+ if (is_plus)
+ HOFFSET(pagep) -= change;
+ else
+ HOFFSET(pagep) += change;
+ }
+ if (off >= 0)
+ memcpy(HKEYDATA_DATA(P_ENTRY(dbp, pagep, ndx)) + off,
+ dbt->data, dbt->size);
+ else
+ memcpy(P_ENTRY(dbp, pagep, ndx), dbt->data, dbt->size);
+}
+
+/*
+ * __ham_merge_pages --
+ * Merge pages from one bucket to another.
+ * PUBLIC: int __ham_merge_pages __P((DBC *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_COMPACT *));
+ */
+int
+__ham_merge_pages(dbc, tobucket, frombucket, c_data)
+ DBC *dbc;
+ u_int32_t tobucket, frombucket;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DBC **carray;
+ DB_LOCK tlock, firstlock;
+ DB_LSN from_lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp, *cp;
+ PAGE *to_pagep, *first_pagep,
+ *from_pagep, *last_pagep, *next_pagep, *prev_pagep;
+ db_pgno_t to_pgno, first_pgno, from_pgno;
+ u_int32_t len;
+ db_indx_t dest_indx, n, num_ent;
+ int check_trunc, found, i, ret;
+
+ dbp = dbc->dbp;
+ carray = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hcp->pgno = PGNO_INVALID;
+ to_pagep = first_pagep = NULL;
+ from_pagep = last_pagep = next_pagep = prev_pagep = NULL;
+ from_pgno = PGNO_INVALID;
+ LOCK_INIT(tlock);
+ LOCK_INIT(firstlock);
+
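+	/*
+	 * When called from DB->compact, compact_truncate is the page
+	 * number above which we try to move records to lower-numbered
+	 * pages, so that the end of the file can be truncated.
+	 */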
+ check_trunc =
+ c_data == NULL ? 0 : c_data->compact_truncate != PGNO_INVALID;
+ to_pgno = BUCKET_TO_PAGE(hcp, tobucket);
+ if ((ret = __db_lget(dbc,
+ 0, to_pgno, DB_LOCK_WRITE, 0, &tlock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &to_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &to_pagep)) != 0)
+ goto err;
+
+ /* Sort any unsorted pages before adding to the page. */
+ if (to_pagep->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc, to_pagep)) != 0)
+			goto err;
+
+ /* Fetch the first page of the bucket we are getting rid of. */
+ from_pgno = BUCKET_TO_PAGE(hcp, frombucket);
+ if ((ret = __db_lget(dbc,
+ 0, from_pgno, DB_LOCK_WRITE, 0, &firstlock)) != 0)
+ goto err;
+next_page:
+ /*
+ * from_pagep is the starting point in the bucket at which records
+ * are moved to the new bucket.
+ */
+ if (from_pagep == NULL &&
+ (ret = __memp_fget(mpf, &from_pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &from_pagep)) != 0)
+ goto err;
+ if ((ret = __ham_get_clist(dbp, from_pgno, NDX_INVALID, &carray)) != 0)
+ goto err;
+
+ hcp->indx = 0;
+ hcp->pgno = from_pgno;
+ hcp->page = from_pagep;
+ num_ent = NUM_ENT(from_pagep);
+ for (n = 0; n < num_ent; n += 2) {
+ /*
+ * Figure out how many bytes we need on the from
+ * page to store the key/data pair.
+ */
+ len = LEN_HITEM(dbp, from_pagep,
+ dbp->pgsize, H_DATAINDEX(hcp->indx)) +
+ LEN_HITEM(dbp, from_pagep,
+ dbp->pgsize, H_KEYINDEX(hcp->indx)) +
+ 2 * sizeof(db_indx_t);
+
+ /*
+ * Find a page that will fit this data. We don't go back
+ * to a page, so we may leave some space if there is a big
+ * variation in record size.
+ */
+ while (P_FREESPACE(dbp, to_pagep) < len) {
+ to_pgno = NEXT_PGNO(to_pagep);
+ if (to_pgno == PGNO_INVALID) {
+ next_pagep = to_pagep;
+ if ((ret =
+ __ham_add_ovflpage(dbc, &next_pagep)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ to_pagep, dbc->priority)) != 0)
+ goto err;
+ to_pagep = next_pagep;
+ next_pagep = NULL;
+ if (c_data != NULL &&
+ c_data->compact_pages_free > 0)
+ c_data->compact_pages_free--;
+ to_pgno = PGNO(to_pagep);
+ } else {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ to_pagep, dbc->priority)) != 0)
+ goto err;
+ to_pagep = NULL;
+ if ((ret = __memp_fget(mpf,
+ &to_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY,
+ &to_pagep)) != 0)
+ goto err;
+
+ /*
+ * Sort any unsorted pages before adding
+ * to the page.
+ */
+ if (to_pagep->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc,
+ to_pagep)) != 0)
+ goto err;
+ }
+ }
+ dest_indx = NDX_INVALID;
+ if ((ret = __ham_copypair(dbc,
+ from_pagep, hcp->indx, to_pagep, &dest_indx, 1)) != 0)
+ goto err;
+
+ /* Update any cursors pointing at the moved item. */
+ if (carray != NULL) {
+ found = 0;
+ for (i = 0; carray[i] != NULL; i++) {
+ cp =
+ (HASH_CURSOR *)carray[i]->internal;
+ if (cp->pgno == from_pgno &&
+ cp->indx == n) {
+ cp->pgno = PGNO(to_pagep);
+ cp->indx = dest_indx;
+ cp->bucket = tobucket;
+ found = 1;
+ }
+ }
+ /*
+ * Only log the update once, since the recovery
+ * code iterates through all open cursors and
+ * applies the change to all matching cursors.
+ */
+ if (found && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret =
+ __ham_chgpg_log(dbp,
+ dbc->txn, &from_lsn, 0,
+ DB_HAM_SPLIT, from_pgno,
+ PGNO(to_pagep), n, dest_indx)) != 0)
+ goto err;
+ }
+ }
+ /*
+ * If this is the head of the bucket, delete the record.
+ * Otherwise we will just free the page after the loop.
+ */
+ if (PREV_PGNO(from_pagep) == PGNO_INVALID) {
+ if ((ret = __ham_del_pair(dbc,
+ HAM_DEL_IGNORE_OFFPAGE | HAM_DEL_NO_CURSOR,
+ from_pagep)) != 0)
+ goto err;
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+					goto err;
+ ++hcp->hdr->nelem;
+ }
+ } else
+ hcp->indx += 2;
+ }
+ /*
+ * If there are more pages in the bucket then we need to process them.
+ * First we may remove a page that is empty. If there is a next
+ * page then save the previous one for relinking.
+ */
+ from_pgno = NEXT_PGNO(from_pagep);
+ if (PREV_PGNO(from_pagep) != PGNO_INVALID) {
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(prev_pagep), 0, PGNO(from_pagep),
+ PGNO_INVALID, PGNO(prev_pagep),
+ &LSN(prev_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(prev_pagep));
+
+ NEXT_PGNO(prev_pagep) = PGNO_INVALID;
+
+ if ((ret = __db_free(dbc, from_pagep, 0)) != 0) {
+ from_pagep = NULL;
+ goto err;
+ }
+ if (c_data != NULL)
+ c_data->compact_pages_free++;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, prev_pagep, dbc->priority)) != 0)
+ goto err;
+ prev_pagep = NULL;
+ } else if (from_pgno != PGNO_INVALID)
+ prev_pagep = from_pagep;
+ else if ((ret = __memp_fput(mpf,
+ dbc->thread_info, from_pagep, dbc->priority)) != 0)
+ goto err;
+
+ from_pagep = NULL;
+ hcp->page = NULL;
+ if (carray != NULL)
+ __os_free(env, carray);
+ carray = NULL;
+
+ /*
+ * The head of the bucket has been copied. Try to figure out
+ * if we should just relink the following pages or try to merge
+ * them into existing pages. This is quick and dirty: if it
+ * looks like the data will fit on the current "to" page then
+ * merge it, otherwise just do the linking.
+ * If this was called from DB->compact it will be better to copy
+ * the data to lower numbered pages.
+ */
+ if (check_trunc && from_pgno > c_data->compact_truncate)
+ goto next_page;
+
+ /*
+ * first_pgno will be the first page of a list that gets
+ * relinked to the new bucket. last_pagep will point at the
+ * last page of the linked list.
+ */
+ first_pgno = from_pgno;
+ last_pagep = NULL;
+ while (from_pgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf,
+ &from_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &from_pagep)) != 0)
+ goto err;
+ if (P_FREESPACE(dbp, to_pagep) >
+ (dbp->pgsize - HOFFSET(from_pagep)) +
+ (NUM_ENT(from_pagep) * sizeof(db_indx_t)))
+ break;
+ if (check_trunc && from_pgno > c_data->compact_truncate)
+ break;
+ from_pgno = NEXT_PGNO(from_pagep);
+ if (last_pagep != NULL && last_pagep != first_pagep &&
+ (ret = __memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority)) != 0)
+ goto err;
+ last_pagep = from_pagep;
+ if (first_pagep == NULL)
+ first_pagep = from_pagep;
+ from_pagep = NULL;
+ }
+
+ /* Link the chain of "full" pages into the "to" bucket. */
+ if (first_pgno != PGNO_INVALID && first_pgno != from_pgno) {
+ DB_ASSERT(dbp->env, first_pagep != NULL);
+ next_pagep = NULL;
+ if (NEXT_PGNO(to_pagep) != PGNO_INVALID && (ret =
+ __memp_fget(mpf, &NEXT_PGNO(to_pagep), dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &next_pagep)) != 0)
+ goto err;
+
+ if (last_pagep == NULL)
+ last_pagep = first_pagep;
+ DB_ASSERT(dbp->env, last_pagep != NULL);
+ /*
+ * At the point we have:
+ * to_pagep -- the page that we are linking to.
+ * first_pagep -- the page that is first in the list.
+ * last_pagep -- the page that is the last in the list.
+ * prev_pagep -- the page that points at first_pagep.
+ * next_pagep -- the next page after the list.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(to_pagep), 0, NEXT_PGNO(to_pagep),
+ first_pgno, to_pgno, &LSN(to_pagep),
+ PGNO_INVALID, NULL)) != 0)
+ goto err;
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(first_pagep), 0, PREV_PGNO(first_pagep),
+ to_pgno, PGNO_INVALID, NULL, first_pgno,
+ &LSN(first_pagep))) != 0)
+ goto err;
+ if (next_pagep != NULL) {
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(next_pagep), 0, PREV_PGNO(next_pagep),
+ PGNO(last_pagep), PGNO_INVALID, NULL,
+ PGNO(next_pagep), &LSN(next_pagep))) != 0)
+ goto err;
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(last_pagep), 0, NEXT_PGNO(last_pagep),
+ PGNO(next_pagep), PGNO(last_pagep),
+ &LSN(last_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ } else if (NEXT_PGNO(last_pagep) != PGNO_INVALID &&
+ (ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(last_pagep), 0, NEXT_PGNO(last_pagep),
+ PGNO_INVALID, PGNO(last_pagep),
+ &LSN(last_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ if (prev_pagep != NULL &&
+ (ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(prev_pagep), 0, NEXT_PGNO(prev_pagep),
+ NEXT_PGNO(last_pagep), PGNO(prev_pagep),
+ &LSN(prev_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ } else {
+ LSN_NOT_LOGGED(LSN(to_pagep));
+ LSN_NOT_LOGGED(LSN(first_pagep));
+ LSN_NOT_LOGGED(LSN(last_pagep));
+ if (next_pagep != NULL)
+				LSN_NOT_LOGGED(LSN(next_pagep));
+ }
+ if (prev_pagep != NULL)
+ NEXT_PGNO(prev_pagep) = NEXT_PGNO(last_pagep);
+ NEXT_PGNO(last_pagep) = NEXT_PGNO(to_pagep);
+ NEXT_PGNO(to_pagep) = first_pgno;
+ PREV_PGNO(first_pagep) = to_pgno;
+ if (next_pagep != NULL) {
+ PREV_PGNO(next_pagep) = PGNO(last_pagep);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority)) != 0)
+ goto err;
+ next_pagep = NULL;
+ }
+ if (last_pagep != first_pagep && (ret = __memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority)) != 0)
+ goto err;
+ last_pagep = NULL;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, first_pagep, dbc->priority)) != 0)
+ goto err;
+ first_pagep = NULL;
+ } else if (last_pagep != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority)) != 0)
+ goto err;
+
+ if (from_pagep == NULL) {
+ from_pagep = first_pagep;
+ first_pagep = NULL;
+ }
+ if (from_pgno != PGNO_INVALID)
+ goto next_page;
+
+ if (prev_pagep != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, prev_pagep, dbc->priority)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, dbc->thread_info, to_pagep, dbc->priority);
+ return (ret);
+
+err: if (last_pagep != NULL && last_pagep != first_pagep)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority);
+ if (first_pagep != NULL && first_pagep != from_pagep)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, first_pagep, dbc->priority);
+ if (next_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority);
+ if (from_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, from_pagep, dbc->priority);
+ if (to_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, to_pagep, dbc->priority);
+ if (prev_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, prev_pagep, dbc->priority);
+ hcp->page = NULL;
+ (void)__TLPUT(dbc, tlock);
+ (void)__TLPUT(dbc, firstlock);
+ if (carray != NULL)
+ __os_free(env, carray);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
+ */
+int
+__ham_split_page(dbc, obucket, nbucket)
+ DBC *dbc;
+ u_int32_t obucket, nbucket;
+{
+ DB *dbp;
+ DBC **carray, *tmp_dbc;
+ DBT key, page_dbt;
+ DB_LOCK block;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp, *cp;
+ PAGE **pp, *old_pagep, *temp_pagep, *new_pagep, *next_pagep;
+ db_indx_t n, dest_indx;
+ db_pgno_t bucket_pgno, npgno, next_pgno;
+ u_int32_t big_len, len;
+ int found, i, ret, t_ret;
+ void *big_buf;
+
+ dbp = dbc->dbp;
+ carray = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ temp_pagep = old_pagep = new_pagep = NULL;
+ npgno = PGNO_INVALID;
+ LOCK_INIT(block);
+
+ bucket_pgno = BUCKET_TO_PAGE(hcp, obucket);
+ if ((ret = __db_lget(dbc,
+ 0, bucket_pgno, DB_LOCK_WRITE, 0, &block)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &bucket_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &old_pagep)) != 0)
+ goto err;
+
+ /* Sort any unsorted pages before doing a hash split. */
+ if (old_pagep->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc, old_pagep)) != 0)
+			goto err;
+
+ /* Properly initialize the new bucket page. */
+ npgno = BUCKET_TO_PAGE(hcp, nbucket);
+ if ((ret = __memp_fget(mpf, &npgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &new_pagep)) != 0)
+ goto err;
+ P_INIT(new_pagep,
+ dbp->pgsize, npgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+
+ temp_pagep = hcp->split_buf;
+ memcpy(temp_pagep, old_pagep, dbp->pgsize);
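+	/*
+	 * Work from a copy of the old bucket page: the original is
+	 * re-initialized below, and both it and the new bucket page are
+	 * refilled by re-hashing each key.
+	 */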
+
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = old_pagep;
+ if ((ret = __ham_splitdata_log(dbp,
+ dbc->txn, &new_lsn, 0, SPLITOLD,
+ PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ LSN(old_pagep) = new_lsn; /* Structure assignment. */
+
+ P_INIT(old_pagep, dbp->pgsize, PGNO(old_pagep), PGNO_INVALID,
+ PGNO_INVALID, 0, P_HASH);
+
+ big_len = 0;
+ big_buf = NULL;
+ memset(&key, 0, sizeof(key));
+ while (temp_pagep != NULL) {
+ if ((ret = __ham_get_clist(dbp,
+ PGNO(temp_pagep), NDX_INVALID, &carray)) != 0)
+ goto err;
+
+ for (n = 0; n < (db_indx_t)NUM_ENT(temp_pagep); n += 2) {
+ if ((ret = __db_ret(dbc, temp_pagep, H_KEYINDEX(n),
+ &key, &big_buf, &big_len)) != 0)
+ goto err;
+
+ if (__ham_call_hash(dbc, key.data, key.size) == obucket)
+ pp = &old_pagep;
+ else
+ pp = &new_pagep;
+
+ /*
+ * Figure out how many bytes we need on the new
+ * page to store the key/data pair.
+ */
+ len = LEN_HITEM(dbp, temp_pagep, dbp->pgsize,
+ H_DATAINDEX(n)) +
+ LEN_HITEM(dbp, temp_pagep, dbp->pgsize,
+ H_KEYINDEX(n)) +
+ 2 * sizeof(db_indx_t);
+
+ if (P_FREESPACE(dbp, *pp) < len) {
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = *pp;
+ if ((ret = __ham_splitdata_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ SPLITNEW, PGNO(*pp), &page_dbt,
+ &LSN(*pp))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+ LSN(*pp) = new_lsn;
+ next_pagep = *pp;
+ if ((ret =
+ __ham_add_ovflpage(dbc, &next_pagep)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, *pp, dbc->priority)) != 0)
+ goto err;
+ *pp = next_pagep;
+ }
+
+ dest_indx = NDX_INVALID;
+ if ((ret = __ham_copypair(dbc, temp_pagep,
+ H_KEYINDEX(n), *pp, &dest_indx, 0)) != 0)
+ goto err;
+
+ /*
+ * Update any cursors that were pointing to items
+ * shuffled because of this insert.
+ * Use __hamc_update, since the cursor adjustments are
+ * the same as those required for an insert. The
+ * overhead of creating a cursor is worthwhile to save
+ * replicating the adjustment functionality.
+ * Adjusting shuffled cursors needs to be done prior to
+ * adjusting any cursors that were pointing to the
+ * moved item.
+ * All pages in a bucket are sorted, but the items are
+ * not sorted across pages within a bucket. This means
+ * that splitting the first page in a bucket into two
+ * new buckets won't require any cursor shuffling,
+ * since all inserts will be appends. Splitting of the
+ * second etc page from the initial bucket could
+ * cause an item to be inserted at any location on a
+ * page (since items already inserted from page 1 of
+ * the initial bucket may overlap), so only adjust
+ * cursors for the second etc pages within a bucket.
+ */
+ if (PGNO(temp_pagep) != bucket_pgno) {
+ if ((ret = __db_cursor_int(dbp,
+ dbc->thread_info, dbc->txn, dbp->type,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID,
+ &tmp_dbc)) != 0)
+ goto err;
+ hcp = (HASH_CURSOR*)tmp_dbc->internal;
+ hcp->pgno = PGNO(*pp);
+ hcp->indx = dest_indx;
+ hcp->dup_off = 0;
+ hcp->order = 0;
+ if ((ret = __hamc_update(
+ tmp_dbc, len, DB_HAM_CURADJ_ADD, 0)) != 0)
+ goto err;
+ if ((ret = __dbc_close(tmp_dbc)) != 0)
+ goto err;
+ }
+ /* Update any cursors pointing at the moved item. */
+ if (carray != NULL) {
+ found = 0;
+ for (i = 0; carray[i] != NULL; i++) {
+ cp =
+ (HASH_CURSOR *)carray[i]->internal;
+ if (cp->pgno == PGNO(temp_pagep) &&
+ cp->indx == n) {
+ cp->pgno = PGNO(*pp);
+ cp->indx = dest_indx;
+ if (cp->pgno == PGNO(old_pagep))
+ cp->bucket = obucket;
+ else
+ cp->bucket = nbucket;
+ found = 1;
+ }
+ }
+ /*
+ * Only log the update once, since the recovery
+ * code iterates through all open cursors and
+ * applies the change to all matching cursors.
+ */
+ if (found && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret =
+ __ham_chgpg_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ DB_HAM_SPLIT, PGNO(temp_pagep),
+ PGNO(*pp), n, dest_indx)) != 0)
+ goto err;
+ }
+ }
+ }
+ next_pgno = NEXT_PGNO(temp_pagep);
+
+		/* Done with temp_pagep; free it if it's a chained overflow page. */
+ if (PGNO(temp_pagep) != bucket_pgno && (ret =
+ __db_free(dbc, temp_pagep, 0)) != 0) {
+ temp_pagep = NULL;
+ goto err;
+ }
+
+ if (next_pgno == PGNO_INVALID)
+ temp_pagep = NULL;
+ else if ((ret = __memp_fget(mpf,
+ &next_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &temp_pagep)) != 0)
+ goto err;
+
+ if (temp_pagep != NULL) {
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = temp_pagep;
+ if ((ret = __ham_splitdata_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ SPLITOLD, PGNO(temp_pagep),
+ &page_dbt, &LSN(temp_pagep))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+ LSN(temp_pagep) = new_lsn;
+ }
+
+ if (carray != NULL) /* We never knew its size. */
+ __os_free(env, carray);
+ carray = NULL;
+ }
+ if (big_buf != NULL)
+ __os_free(env, big_buf);
+
+ /*
+ * If the original bucket spanned multiple pages, then we've got
+ * a pointer to a page that used to be on the bucket chain. It
+ * should be deleted.
+ */
+ if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
+ (ret = __db_free(dbc, temp_pagep, 0)) != 0) {
+ temp_pagep = NULL;
+ goto err;
+ }
+
+ /*
+ * Write new buckets out.
+ */
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = old_pagep;
+ if ((ret = __ham_splitdata_log(dbp, dbc->txn,
+ &new_lsn, 0, SPLITNEW, PGNO(old_pagep), &page_dbt,
+ &LSN(old_pagep))) != 0)
+ goto err;
+ LSN(old_pagep) = new_lsn;
+
+ page_dbt.data = new_pagep;
+ if ((ret = __ham_splitdata_log(dbp, dbc->txn, &new_lsn, 0,
+ SPLITNEW, PGNO(new_pagep), &page_dbt,
+ &LSN(new_pagep))) != 0)
+ goto err;
+ LSN(new_pagep) = new_lsn;
+ } else {
+ LSN_NOT_LOGGED(LSN(old_pagep));
+ LSN_NOT_LOGGED(LSN(new_pagep));
+ }
+
+ ret = __memp_fput(mpf, dbc->thread_info, old_pagep, dbc->priority);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (0) {
+err: if (old_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, old_pagep, dbc->priority);
+ if (new_pagep != NULL) {
+ P_INIT(new_pagep, dbp->pgsize,
+ npgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ }
+ if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, temp_pagep, dbc->priority);
+ }
+ if ((t_ret = __TLPUT(dbc, block)) != 0 && ret == 0)
+ ret = t_ret;
+ if (carray != NULL) /* We never knew its size. */
+ __os_free(env, carray);
+ return (ret);
+}
+
+/*
+ * Add the given pair to the page. The page in question may already be
+ * held (i.e. it was already gotten). If it is, then the page is passed
+ * in via the pagep parameter. On return, pagep will contain the page
+ * to which we just added something. This allows us to link overflow
+ * pages and return the new page having correctly put the last page.
+ *
+ * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, u_int32_t));
+ */
+int
+__ham_add_el(dbc, key, val, type)
+ DBC *dbc;
+ const DBT *key, *val;
+ u_int32_t type;
+{
+ const DBT *pkey, *pdata;
+ DB *dbp;
+ DBT key_dbt, data_dbt;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HOFFPAGE doff, koff;
+ PAGE *new_pagep;
+ db_pgno_t next_pgno, pgno;
+ u_int32_t data_size, data_type, key_size, key_type;
+ u_int32_t pages, pagespace, pairsize;
+ int do_expand, is_keybig, is_databig, match, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ do_expand = 0;
+
+ pgno = hcp->seek_found_page != PGNO_INVALID ?
+ hcp->seek_found_page : hcp->pgno;
+ if (hcp->page == NULL && (ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_CREATE, &hcp->page)) != 0)
+ return (ret);
+
+ key_size = HKEYDATA_PSIZE(key->size);
+ data_size = HKEYDATA_PSIZE(val->size);
+ is_keybig = ISBIG(hcp, key->size);
+ is_databig = ISBIG(hcp, val->size);
+ if (is_keybig)
+ key_size = HOFFPAGE_PSIZE;
+ if (is_databig)
+ data_size = HOFFPAGE_PSIZE;
+
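+	/*
+	 * pairsize is the pair's on-page footprint: HKEYDATA_PSIZE bytes
+	 * for an item stored inline, HOFFPAGE_PSIZE bytes for one that
+	 * will live on overflow pages.
+	 */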
+ pairsize = key_size + data_size;
+
+ /* Advance to first page in chain with room for item. */
+ while (H_NUMPAIRS(hcp->page) && NEXT_PGNO(hcp->page) != PGNO_INVALID) {
+ /*
+ * This may not be the end of the chain, but the pair may fit
+ * anyway. Check if it's a bigpair that fits or a regular
+ * pair that fits.
+ */
+ if (P_FREESPACE(dbp, hcp->page) >= pairsize)
+ break;
+ next_pgno = NEXT_PGNO(hcp->page);
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Check if we need to allocate a new page.
+ */
+ if (P_FREESPACE(dbp, hcp->page) < pairsize) {
+ do_expand = 1;
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ new_pagep = hcp->page;
+ if ((ret = __ham_add_ovflpage(dbc, &new_pagep)) != 0)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ return (ret);
+ }
+ hcp->page = new_pagep;
+ hcp->pgno = PGNO(hcp->page);
+ }
+
+ /*
+ * If we don't have a transaction then make sure we will not
+ * run out of file space before updating the key or data.
+ */
+ if (dbc->txn == NULL &&
+ dbp->mpf->mfp->maxpgno != 0 && (is_keybig || is_databig)) {
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ pages = 0;
+ if (is_databig)
+ pages = ((data_size - 1) / pagespace) + 1;
+ if (is_keybig) {
+ pages += ((key->size - 1) / pagespace) + 1;
+ if (pages >
+ (dbp->mpf->mfp->maxpgno - dbp->mpf->mfp->last_pgno))
+ return (__db_space_err(dbp));
+ }
+ }
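+	/*
+	 * Illustrative arithmetic (the numbers are examples, not from the
+	 * original source): with a 4096-byte page and pagespace close to
+	 * that, a 100KB data item needs ((102400 - 1) / 4096) + 1 = 25
+	 * overflow pages, and the check above rejects the insert when that
+	 * many pages would push last_pgno past maxpgno.
+	 */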
+
+ if ((ret = __memp_dirty(mpf,
+ &hcp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+
+ /*
+ * Update cursor.
+ */
+ hcp->indx = hcp->seek_found_indx;
+ F_CLR(hcp, H_DELETED);
+ if (is_keybig) {
+ koff.type = H_OFFPAGE;
+ UMRW_SET(koff.unused[0]);
+ UMRW_SET(koff.unused[1]);
+ UMRW_SET(koff.unused[2]);
+ if ((ret = __db_poff(dbc, key, &koff.pgno)) != 0)
+ return (ret);
+ koff.tlen = key->size;
+ key_dbt.data = &koff;
+ key_dbt.size = sizeof(koff);
+ pkey = &key_dbt;
+ key_type = H_OFFPAGE;
+ } else {
+ pkey = key;
+ key_type = H_KEYDATA;
+ }
+
+ if (is_databig) {
+ doff.type = H_OFFPAGE;
+ UMRW_SET(doff.unused[0]);
+ UMRW_SET(doff.unused[1]);
+ UMRW_SET(doff.unused[2]);
+ if ((ret = __db_poff(dbc, val, &doff.pgno)) != 0)
+ return (ret);
+ doff.tlen = val->size;
+ data_dbt.data = &doff;
+ data_dbt.size = sizeof(doff);
+ pdata = &data_dbt;
+ data_type = H_OFFPAGE;
+ } else {
+ pdata = val;
+ data_type = type;
+ }
+
+ /* Sort any unsorted pages before doing the insert. */
+ if (((PAGE *)hcp->page)->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc, hcp->page)) != 0)
+ return (ret);
+
+ /*
+ * If inserting on the page found initially, then use the saved index.
+	 * If inserting on a different page, resolve the index now so it can
+	 * be logged.
+	 * The page might be different if the P_FREESPACE constraint failed
+	 * (due to a partial put that increases the data size).
+ */
+ if (PGNO(hcp->page) != hcp->seek_found_page) {
+ if ((ret = __ham_getindex(dbc, hcp->page, pkey,
+ key_type, &match, &hcp->seek_found_indx)) != 0)
+ return (ret);
+ hcp->seek_found_page = PGNO(hcp->page);
+
+ DB_ASSERT(dbp->env, hcp->seek_found_indx <= NUM_ENT(hcp->page));
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_insdel_log(dbp, dbc->txn, &new_lsn, 0,
+ PUTPAIR, PGNO(hcp->page), (u_int32_t)hcp->seek_found_indx,
+ &LSN(hcp->page), OP_SET(key_type, hcp->page), pkey,
+ OP_SET(data_type, hcp->page), pdata)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(hcp->page) = new_lsn; /* Structure assignment. */
+
+ if ((ret = __ham_insertpair(dbc, hcp->page,
+ &hcp->seek_found_indx, pkey, pdata, key_type, data_type)) != 0)
+ return (ret);
+
+ /*
+ * Adjust any cursors that were pointing at items whose indices were
+ * shuffled due to the insert.
+ */
+ if ((ret = __hamc_update(dbc, pairsize, DB_HAM_CURADJ_ADD, 0)) != 0)
+ return (ret);
+
+ /*
+ * For splits, we are going to update item_info's page number
+ * field, so that we can easily return to the same page the
+ * next time we come in here. For other operations, this doesn't
+ * matter, since this is the last thing that happens before we return
+ * to the user program.
+ */
+ hcp->pgno = PGNO(hcp->page);
+ /*
+	 * When an item moves from one page in a bucket to another (due to an
+	 * expanding on-page duplicate set, or a partial put that increases the
+	 * size of an item), the destination index needs to be saved so that the
+	 * __ham_replpair code can update any cursors impacted by the move. For
+ * other operations, this does not matter, since this is the last thing
+ * that happens before we return to the user program.
+ */
+ hcp->indx = hcp->seek_found_indx;
+
+ /*
+ * XXX
+ * Maybe keep incremental numbers here.
+ */
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ hcp->hdr->nelem++;
+ }
+
+ if (do_expand || (hcp->hdr->ffactor != 0 &&
+ (u_int32_t)H_NUMPAIRS(hcp->page) > hcp->hdr->ffactor))
+ F_SET(hcp, H_EXPAND);
+ return (0);
+}
+
+/*
+ * Special insert pair call -- copies a key/data pair from one page to
+ * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
+ * H_DUPLICATE, H_OFFDUP). Splits are logged at a high level, so they
+ * need no logging here; callers that do need the insert logged pass a
+ * nonzero log argument.
+ *
+ * dest_indx is an optional parameter; it serves several purposes:
+ * * Ignored if NULL.
+ * * Used as the insert index if non-NULL and not NDX_INVALID.
+ * * Populated with the insert index if non-NULL and NDX_INVALID.
+ *
+ * PUBLIC: int __ham_copypair __P((DBC *, PAGE *, u_int32_t,
+ * PUBLIC: PAGE *, db_indx_t *, int));
+ */
+int
+__ham_copypair(dbc, src_page, src_ndx, dest_page, dest_indx, log)
+ DBC *dbc;
+ PAGE *src_page;
+ u_int32_t src_ndx;
+ PAGE *dest_page;
+ db_indx_t *dest_indx;
+ int log;
+{
+ DB *dbp;
+ DBT tkey, tdata;
+ db_indx_t kindx, dindx, dest;
+ u_int32_t ktype, dtype;
+ int match, ret;
+
+ dbp = dbc->dbp;
+ ret = 0;
+ memset(&tkey, 0, sizeof(tkey));
+ memset(&tdata, 0, sizeof(tdata));
+
+ ktype = HPAGE_TYPE(dbp, src_page, H_KEYINDEX(src_ndx));
+ dtype = HPAGE_TYPE(dbp, src_page, H_DATAINDEX(src_ndx));
+ kindx = H_KEYINDEX(src_ndx);
+ dindx = H_DATAINDEX(src_ndx);
+ if (ktype == H_OFFPAGE) {
+ tkey.data = P_ENTRY(dbp, src_page, kindx);
+ tkey.size = LEN_HITEM(dbp, src_page, dbp->pgsize, kindx);
+ } else {
+ tkey.data = HKEYDATA_DATA(P_ENTRY(dbp, src_page, kindx));
+ tkey.size = LEN_HKEYDATA(dbp, src_page, dbp->pgsize, kindx);
+ }
+ if (dtype == H_OFFPAGE || dtype == H_OFFDUP) {
+ tdata.data = P_ENTRY(dbp, src_page, dindx);
+ tdata.size = LEN_HITEM(dbp, src_page, dbp->pgsize, dindx);
+ } else {
+ tdata.data = HKEYDATA_DATA(P_ENTRY(dbp, src_page, dindx));
+ tdata.size = LEN_HKEYDATA(dbp, src_page, dbp->pgsize, dindx);
+ }
+ if (dest_indx != NULL)
+ dest = *dest_indx;
+ else
+ dest = NDX_INVALID;
+ if (dest == NDX_INVALID) {
+ if ((ret = __ham_getindex(dbc,
+ dest_page, &tkey, ktype, &match, &dest)) != 0)
+ return (ret);
+ /* It is an error to insert a duplicate key */
+ DB_ASSERT(dbp->env, match != 0);
+ }
+
+ if (log == 1) {
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_insdel_log(dbp, dbc->txn,
+ &LSN(dest_page), 0, PUTPAIR,
+ PGNO(dest_page), (u_int32_t)dest, &LSN(dest_page),
+ OP_SET(ktype, dest_page), &tkey,
+ OP_SET(dtype, dest_page), &tdata)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(dest_page));
+ }
+
+ if ((ret = __ham_insertpair(dbc, dest_page, &dest,
+ &tkey, &tdata, ktype, dtype)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbp->env, dtype != H_DUPLICATE ||
+ HPAGE_TYPE(dbp, dest_page, H_DATAINDEX(dest)) == dtype);
+
+ if (dest_indx != NULL)
+ *dest_indx = dest;
+
+ return (ret);
+}
+
+/*
+ * __ham_add_ovflpage --
+ *
+ * Returns:
+ *	0 on success: *pp points to the new page; non-zero on error: *pp is
+ *	not valid.
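+ *
+ *	The new page is linked into the chain immediately after the page
+ *	passed in via pp; the caller must already have marked that page
+ *	dirty (see the DB_ASSERT below).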
+ *
+ * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE **));
+ */
+int
+__ham_add_ovflpage(dbc, pp)
+ DBC *dbc;
+ PAGE **pp;
+{
+ DB *dbp;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *new_pagep, *pagep;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ pagep = *pp;
+ *pp = NULL;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+
+ if ((ret = __db_new(dbc, P_HASH, NULL, &new_pagep)) != 0)
+ return (ret);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_newpage_log(dbp, dbc->txn, &new_lsn, 0,
+ PUTOVFL, PGNO(pagep), &LSN(pagep), PGNO(new_pagep),
+ &LSN(new_pagep), PGNO_INVALID, NULL)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(pagep) = LSN(new_pagep) = new_lsn;
+ NEXT_PGNO(pagep) = PGNO(new_pagep);
+
+ PREV_PGNO(new_pagep) = PGNO(pagep);
+
+ *pp = new_pagep;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t));
+ */
+int
+__ham_get_cpage(dbc, mode)
+ DBC *dbc;
+ db_lockmode_t mode;
+{
+ DB *dbp;
+ DB_LOCK tmp_lock;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * There are four cases with respect to buckets and locks.
+ * 1. If there is no lock held, then if we are locking, we should
+ * get the lock.
+ * 2. If there is a lock held, it's for the current bucket, and it's
+ * for the right mode, we don't need to do anything.
+ * 3. If there is a lock held for the current bucket but it's not
+ * strong enough, we need to upgrade.
+ * 4. If there is a lock, but it's for a different bucket, then we need
+ * to release the existing lock and get a new lock.
+ */
+ LOCK_INIT(tmp_lock);
+ if (STD_LOCKING(dbc)) {
+ if (hcp->lbucket != hcp->bucket) { /* Case 4 */
+ if ((ret = __TLPUT(dbc, hcp->lock)) != 0)
+ return (ret);
+ LOCK_INIT(hcp->lock);
+ hcp->stream_start_pgno = PGNO_INVALID;
+ }
+
+ /*
+ * See if we have the right lock. If we are doing
+ * dirty reads we assume the write lock has been downgraded.
+ */
+ if ((LOCK_ISSET(hcp->lock) &&
+ ((hcp->lock_mode == DB_LOCK_READ ||
+ F_ISSET(dbp, DB_AM_READ_UNCOMMITTED)) &&
+ mode == DB_LOCK_WRITE))) {
+ /* Case 3. */
+ tmp_lock = hcp->lock;
+ LOCK_INIT(hcp->lock);
+ }
+
+ /* Acquire the lock. */
+		if (!LOCK_ISSET(hcp->lock))
+			/* Cases 1, 3, and 4. */
+			ret = __ham_lock_bucket(dbc, mode);
+
+ if (ret == 0) {
+ hcp->lock_mode = mode;
+ hcp->lbucket = hcp->bucket;
+ /* Case 3: release the original lock. */
+ if ((ret = __ENV_LPUT(dbp->env, tmp_lock)) != 0)
+ return (ret);
+ } else if (LOCK_ISSET(tmp_lock))
+ hcp->lock = tmp_lock;
+ }
+
+ if (ret == 0 && hcp->page == NULL) {
+ if (hcp->pgno == PGNO_INVALID)
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if ((ret = __memp_fget(mpf,
+ &hcp->pgno, dbc->thread_info, dbc->txn,
+ (mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0) |
+ DB_MPOOL_CREATE, &hcp->page)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * Get a new page at the cursor, putting the last page if necessary.
+ *
+ * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t));
+ */
+int
+__ham_next_cpage(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *p;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (hcp->page != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0)
+ return (ret);
+ hcp->stream_start_pgno = PGNO_INVALID;
+ hcp->page = NULL;
+
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE, &p)) != 0)
+ return (ret);
+
+ hcp->page = p;
+ hcp->pgno = pgno;
+ hcp->indx = 0;
+
+ return (0);
+}
+
+/*
+ * __ham_lock_bucket --
+ * Get the lock on a particular bucket.
+ *
+ * PUBLIC: int __ham_lock_bucket __P((DBC *, db_lockmode_t));
+ */
+int
+__ham_lock_bucket(dbc, mode)
+ DBC *dbc;
+ db_lockmode_t mode;
+{
+ HASH_CURSOR *hcp;
+ db_pgno_t pgno;
+ int gotmeta, ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ gotmeta = hcp->hdr == NULL ? 1 : 0;
+ if (gotmeta)
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+ pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (gotmeta)
+ if ((ret = __ham_release_meta(dbc)) != 0)
+ return (ret);
+
+ ret = __db_lget(dbc, 0, pgno, mode, 0, &hcp->lock);
+
+ hcp->lock_mode = mode;
+ return (ret);
+}
+
+/*
+ * __ham_dpair --
+ * Delete a pair on a page, paying no attention to what the pair
+ * represents. The caller is responsible for freeing up duplicates
+ * or offpage entries that might be referenced by this pair.
+ *
+ * Recovery assumes that this may be called without the metadata
+ * page pinned.
+ *
+ * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
+ */
+void
+__ham_dpair(dbp, p, indx)
+ DB *dbp;
+ PAGE *p;
+ u_int32_t indx;
+{
+ db_indx_t delta, n, *inp;
+ u_int8_t *dest, *src;
+
+ inp = P_INP(dbp, p);
+ /*
+ * Compute "delta", the amount we have to shift all of the
+ * offsets. To find the delta, we just need to calculate
+ * the size of the pair of elements we are removing.
+ */
+ delta = H_PAIRSIZE(dbp, p, dbp->pgsize, indx);
+
+ /*
+ * The hard case: we want to remove something other than
+ * the last item on the page. We need to shift data and
+ * offsets down.
+ */
+ if ((db_indx_t)indx != NUM_ENT(p) - 2) {
+		/*
+		 * Move the data: src is the first occupied byte on the
+		 * page, and everything from there up to the deleted pair's
+		 * data shifts delta bytes toward the end of the page.
+		 */
+ src = (u_int8_t *)p + HOFFSET(p);
+
+ /*
+ * Destination is delta bytes beyond src. This might
+ * be an overlapping copy, so we have to use memmove.
+ */
+ dest = src + delta;
+ memmove(dest, src, inp[H_DATAINDEX(indx)] - HOFFSET(p));
+ }
+
+ /* Adjust page metadata. */
+ HOFFSET(p) = HOFFSET(p) + delta;
+ NUM_ENT(p) = NUM_ENT(p) - 2;
+
+ /* Adjust the offsets. */
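+	/*
+	 * The entry now at slot n formerly lived at slot n + 2; its
+	 * offset grows by delta because its data just moved delta bytes
+	 * toward the end of the page.
+	 */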
+ for (n = (db_indx_t)indx; n < (db_indx_t)(NUM_ENT(p)); n++)
+ inp[n] = inp[n + 2] + delta;
+}
+
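+/*
+ * __hamc_delpg_getorder --
+ *	__db_walk_cursors callback: find the highest order among deleted
+ *	hash cursors already positioned at (new_pgno, indx), so that
+ *	cursors moved onto that page can be assigned orders above it
+ *	(see __hamc_delpg below).
+ */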
+static int
+__hamc_delpg_getorder(cp, my_dbc, orderp, new_pgno, indx, args)
+ DBC *cp, *my_dbc;
+ u_int32_t *orderp;
+ db_pgno_t new_pgno;
+ u_int32_t indx;
+ void *args;
+{
+ HASH_CURSOR *hcp;
+
+ COMPQUIET(args, NULL);
+
+ if (cp == my_dbc || cp->dbtype != DB_HASH)
+ return (0);
+ hcp = (HASH_CURSOR *)cp->internal;
+ if (hcp->pgno == new_pgno &&
+ !MVCC_SKIP_CURADJ(cp, new_pgno)) {
+ if (hcp->indx == indx &&
+ F_ISSET(hcp, H_DELETED) &&
+ hcp->order > *orderp)
+ *orderp = hcp->order;
+ }
+ return (0);
+}
+
+struct __hamc_delpg_setorder_args {
+ db_pgno_t new_pgno;
+ u_int32_t order;
+ db_ham_mode op;
+ DB_TXN *my_txn;
+};
+
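+/*
+ * __hamc_delpg_setorder --
+ *	__db_walk_cursors callback: move cursors off the emptied page as
+ *	directed by args->op, bumping the order of moved deleted cursors
+ *	by the amount computed in the getorder pass; *foundp is set when
+ *	an adjusted cursor belongs to another transaction, so the caller
+ *	knows the adjustment must be logged.
+ */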
+static int
+__hamc_delpg_setorder(cp, my_dbc, foundp, old_pgno, indx, vargs)
+ DBC *cp, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t old_pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ HASH_CURSOR *hcp;
+ struct __hamc_delpg_setorder_args *args;
+
+ if (cp == my_dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)cp->internal;
+ args = vargs;
+
+ if (hcp->pgno == old_pgno &&
+ !MVCC_SKIP_CURADJ(cp, old_pgno)) {
+ switch (args->op) {
+ case DB_HAM_DELFIRSTPG:
+ /*
+ * We're moving all items,
+ * regardless of index.
+ */
+ hcp->pgno = args->new_pgno;
+
+ /*
+ * But we have to be careful of
+ * the order values.
+ */
+ if (hcp->indx == indx)
+ hcp->order += args->order;
+ break;
+ case DB_HAM_DELMIDPG:
+ hcp->pgno = args->new_pgno;
+ DB_ASSERT(cp->dbp->env, hcp->indx == 0 &&
+ F_ISSET(hcp, H_DELETED));
+ hcp->order += args->order;
+ break;
+ case DB_HAM_DELLASTPG:
+ hcp->pgno = args->new_pgno;
+ DB_ASSERT(cp->dbp->env, hcp->indx == 0 &&
+ F_ISSET(hcp, H_DELETED));
+ hcp->indx = indx;
+ hcp->order += args->order;
+ break;
+ default:
+ return (__db_unknown_path(
+ cp->dbp->env, "__hamc_delpg"));
+ }
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+
+/*
+ * __hamc_delpg --
+ *
+ * Adjust the cursors after we've emptied a page in a bucket, taking
+ * care that when we move cursors pointing to deleted items, their
+ * orders don't collide with the orders of cursors on the page we move
+ * them to (since after this function is called, cursors with the same
+ * index on the two pages will be otherwise indistinguishable--they'll
+ * all have pgno new_pgno). There are three cases:
+ *
+ * 1) The emptied page is the first page in the bucket. In this
+ * case, we've copied all the items from the second page into the
+ * first page, so the first page is new_pgno and the second page is
+ * old_pgno. new_pgno is empty, but can have deleted cursors
+ * pointing at indx 0, so we need to be careful of the orders
+ * there. This is DB_HAM_DELFIRSTPG.
+ *
+ * 2) The page is somewhere in the middle of a bucket. Our caller
+ * can just delete such a page, so it's old_pgno. old_pgno is
+ * empty, but may have deleted cursors pointing at indx 0, so we
+ * need to be careful of indx 0 when we move those cursors to
+ * new_pgno. This is DB_HAM_DELMIDPG.
+ *
+ * 3) The page is the last in a bucket. Again the empty page is
+ * old_pgno, and again it should only have cursors that are deleted
+ * and at indx == 0. This time, though, there's no next page to
+ * move them to, so we set them to indx == num_ent on the previous
+ * page--and indx == num_ent is the index whose cursors we need to
+ * be careful of. This is DB_HAM_DELLASTPG.
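+ *
+ * For example (illustrative): under DB_HAM_DELLASTPG with num_ent == 4,
+ * a deleted cursor at indx 0 on the emptied last page moves to indx 4 on
+ * new_pgno (the previous page), with its order raised above any order
+ * already in use by a deleted cursor at that index.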
+ */
+static int
+__hamc_delpg(dbc, old_pgno, new_pgno, num_ent, op, orderp)
+ DBC *dbc;
+ db_pgno_t old_pgno, new_pgno;
+ u_int32_t num_ent;
+ db_ham_mode op;
+ u_int32_t *orderp;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ db_indx_t indx;
+ int ret;
+ u_int32_t found;
+ struct __hamc_delpg_setorder_args args;
+
+ /* Which is the worrisome index? */
+ indx = (op == DB_HAM_DELLASTPG) ? num_ent : 0;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Find the highest order of any cursor our movement
+ * may collide with.
+ */
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_delpg_getorder, &args.order, new_pgno, indx, NULL)) != 0)
+ return (ret);
+ args.order++;
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.op = op;
+ args.new_pgno = new_pgno;
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_delpg_setorder, &found, old_pgno, indx, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_chgpg_log(dbp, args.my_txn, &lsn, 0, op,
+ old_pgno, new_pgno, indx, args.order)) != 0)
+ return (ret);
+ }
+ *orderp = args.order;
+ return (0);
+}
diff --git a/src/hash/hash_rec.c b/src/hash/hash_rec.c
new file mode 100644
index 00000000..58965569
--- /dev/null
+++ b/src/hash/hash_rec.c
@@ -0,0 +1,1896 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+
+static int __ham_alloc_pages __P((DBC *, __ham_groupalloc_args *, DB_LSN *));
+static int __ham_alloc_pages_42
+ __P((DBC *, __ham_groupalloc_42_args *, DB_LSN *));
+static int __ham_chgpg_recover_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * __ham_insdel_recover --
+ *
+ * PUBLIC: int __ham_insdel_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_insdel_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_insdel_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t dindx;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_insdel_print);
+ REC_INTRO(__ham_insdel_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ /* If the page is not here then it was later truncated. */
+ if (!IS_ZERO_LSN(argp->pagelsn))
+ goto done;
+ /*
+ * This page was created by a group allocation and
+		 * the file may not have been extended yet.
+ * Create the page if necessary.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ /*
+ * Two possible things going on:
+ * redo a delete/undo a put: delete the item from the page.
+ * redo a put/undo a delete: add the item to the page.
+ * If we are undoing a delete, then the information logged is the
+ * entire entry off the page, not just the data of a dbt. In
+ * this case, we want to copy it back onto the page verbatim.
+	 * We do this by calling __ham_insertpair with the type H_OFFPAGE
+	 * instead
+ * of H_KEYDATA.
+ */
+ if ((argp->opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) ||
+ (argp->opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) {
+ /*
+ * Need to redo a PUT or undo a delete.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ dindx = (db_indx_t)argp->ndx;
+ if ((ret = __ham_insertpair(dbc, pagep, &dindx, &argp->key,
+ &argp->data, OP_MODE_GET(argp->keytype),
+ OP_MODE_GET(argp->datatype))) != 0)
+ goto out;
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ } else if ((argp->opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) ||
+ (argp->opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) {
+ /* Need to undo a put or redo a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ __ham_dpair(file_dbp, pagep, argp->ndx);
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Return the previous LSN. */
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_insdel_42_recover --
+ *
+ * PUBLIC: int __ham_insdel_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_insdel_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_insdel_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t dindx;
+ u_int32_t dtype, ktype, opcode;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_insdel_print);
+ REC_INTRO(__ham_insdel_42_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ /* If the page is not here then it was later truncated. */
+ if (!IS_ZERO_LSN(argp->pagelsn))
+ goto done;
+ /*
+ * This page was created by a group allocation and
+		 * the file may not have been extended yet.
+ * Create the page if necessary.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ /*
+ * Two possible things going on:
+ * redo a delete/undo a put: delete the item from the page.
+ * redo a put/undo a delete: add the item to the page.
+ * If we are undoing a delete, then the information logged is the
+ * entire entry off the page, not just the data of a dbt. In
+ * this case, we want to copy it back onto the page verbatim.
+	 * We do this by calling __ham_insertpair with the type H_OFFPAGE
+	 * instead
+ * of H_KEYDATA.
+ */
+ opcode = OPCODE_OF(argp->opcode);
+ if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) ||
+ (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) {
+ /*
+ * Need to redo a PUT or undo a delete.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ?
+ H_OFFPAGE : H_KEYDATA;
+ if (PAIR_ISDATADUP(argp->opcode))
+ dtype = H_DUPLICATE;
+ else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode))
+ dtype = H_OFFPAGE;
+ else
+ dtype = H_KEYDATA;
+ dindx = (db_indx_t)argp->ndx;
+ if ((ret = __ham_insertpair(dbc, pagep, &dindx,
+ &argp->key, &argp->data, ktype, dtype)) != 0)
+ goto out;
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ } else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) ||
+ (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) {
+ /* Need to undo a put or redo a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ __ham_dpair(file_dbp, pagep, argp->ndx);
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Return the previous LSN. */
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_newpage_recover --
+ * This log message is used when we add/remove overflow pages. This
+ * message takes care of the pointer chains, not the data on the pages.
+ *
+ * PUBLIC: int __ham_newpage_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_newpage_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_newpage_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int change, cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_newpage_print);
+ REC_INTRO(__ham_newpage_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->new_pgno, &pagep, ppage);
+ change = 0;
+
+ /*
+ * There are potentially three pages we need to check: the one
+ * that we created/deleted, the one before it and the one after
+ * it.
+ */
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
+ /* Redo a create new page or undo a delete new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->new_pgno,
+ argp->prev_pgno, argp->next_pgno, 0, P_HASH);
+ change = 1;
+ } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
+ /*
+ * Redo a delete or undo a create new page. All we
+ * really need to do is change the LSN.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ change = 1;
+ }
+
+ if (change)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Now do the prev page. */
+ppage: if (argp->prev_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ change = 0;
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
+ /* Redo a create new page or undo a delete new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->new_pgno;
+ change = 1;
+ } else if ((cmp_p == 0 &&
+ DB_REDO(op) && argp->opcode == DELOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
+ /* Redo a delete or undo a create new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next_pgno;
+ change = 1;
+ }
+
+ if (change)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+
+	/* Now do the next page. */
+npage: if (argp->next_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ change = 0;
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
+ /* Redo a create new page or undo a delete new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->new_pgno;
+ change = 1;
+ } else if ((cmp_p == 0 &&
+ DB_REDO(op) && argp->opcode == DELOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
+ /* Redo a delete or undo a create new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev_pgno;
+ change = 1;
+ }
+
+ if (change)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_replace_recover --
+ * This log message refers to partial puts that are local to a single
+ * page. You can think of them as special cases of the more general
+ * insdel log message.
+ *
+ * PUBLIC: int __ham_replace_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_replace_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_replace_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DBT dbt;
+ PAGE *pagep;
+ u_int32_t change;
+ int cmp_n, cmp_p, is_plus, modified, off, ret;
+ u_int8_t *hk;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_replace_print);
+ REC_INTRO(__ham_replace_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ memset(&dbt, 0, sizeof(dbt));
+ modified = 0;
+
+ /*
+ * Before we know the direction of the transformation we will
+ * determine the size differential; then once we know if we are
+ * redoing or undoing, we'll adjust the sign (is_plus) appropriately.
+ */
+ if (argp->newitem.size > argp->olditem.size) {
+ change = argp->newitem.size - argp->olditem.size;
+ is_plus = 1;
+ } else {
+ change = argp->olditem.size - argp->newitem.size;
+ is_plus = 0;
+ }
+ /*
+ * When chaining from a "regular" record to an off page record
+ * the old record does not contain a header while the new record
+ * does and is at an offset of -1 relative to the data part of
+ * the record. We add this to the amount of the change (which is
+ * an absolute value). If we are undoing then the offset is not
+ * used in the placement of the data.
+ */
+ off = argp->off;
+ if (off < 0 &&
+ (OP_MODE_GET(argp->oldtype) == H_DUPLICATE ||
+ OP_MODE_GET(argp->oldtype) == H_KEYDATA)) {
+ change -= (u_int32_t)off;
+ if (DB_UNDO(op))
+ off = 0;
+ }
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Reapply the change as specified. */
+ dbt.data = argp->newitem.data;
+ dbt.size = argp->newitem.size;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ /*
+ * The is_plus flag is set properly to reflect
+ * newitem.size - olditem.size.
+ */
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the already applied change. */
+ dbt.data = argp->olditem.data;
+ dbt.size = argp->olditem.size;
+ /*
+ * Invert is_plus to reflect sign of
+ * olditem.size - newitem.size.
+ */
+ is_plus = !is_plus;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->pagelsn;
+ modified = 1;
+ }
+
+ if (modified) {
+ __ham_onpage_replace(file_dbp, pagep,
+ argp->ndx, off, change, is_plus, &dbt);
+ if (argp->oldtype != argp->newtype) {
+ hk = P_ENTRY(file_dbp, pagep, argp->ndx);
+ if (DB_REDO(op))
+ HPAGE_PTYPE(hk) = OP_MODE_GET(argp->newtype);
+ else
+ HPAGE_PTYPE(hk) = OP_MODE_GET(argp->oldtype);
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_replace_42_recover --
+ * This log message refers to partial puts that are local to a single
+ * page. You can think of them as special cases of the more general
+ * insdel log message.
+ *
+ * PUBLIC: int __ham_replace_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_replace_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_replace_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DBT dbt;
+ PAGE *pagep;
+ u_int32_t change;
+ int cmp_n, cmp_p, is_plus, modified, ret;
+ u_int8_t *hk;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_replace_print);
+ REC_INTRO(__ham_replace_42_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ memset(&dbt, 0, sizeof(dbt));
+ modified = 0;
+
+ /*
+ * Before we know the direction of the transformation we will
+ * determine the size differential; then once we know if we are
+ * redoing or undoing, we'll adjust the sign (is_plus) appropriately.
+ */
+ if (argp->newitem.size > argp->olditem.size) {
+ change = argp->newitem.size - argp->olditem.size;
+ is_plus = 1;
+ } else {
+ change = argp->olditem.size - argp->newitem.size;
+ is_plus = 0;
+ }
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Reapply the change as specified. */
+ dbt.data = argp->newitem.data;
+ dbt.size = argp->newitem.size;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ /*
+ * The is_plus flag is set properly to reflect
+ * newitem.size - olditem.size.
+ */
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the already applied change. */
+ dbt.data = argp->olditem.data;
+ dbt.size = argp->olditem.size;
+ /*
+ * Invert is_plus to reflect sign of
+ * olditem.size - newitem.size.
+ */
+ is_plus = !is_plus;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->pagelsn;
+ modified = 1;
+ }
+
+ if (modified) {
+ __ham_onpage_replace(file_dbp, pagep,
+ argp->ndx, argp->off, change, is_plus, &dbt);
+ if (argp->makedup) {
+ hk = P_ENTRY(file_dbp, pagep, argp->ndx);
+ if (DB_REDO(op))
+ HPAGE_PTYPE(hk) = H_DUPLICATE;
+ else
+ HPAGE_PTYPE(hk) = H_KEYDATA;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_splitdata_recover --
+ *
+ * PUBLIC: int __ham_splitdata_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_splitdata_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_splitdata_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_splitdata_print);
+ REC_INTRO(__ham_splitdata_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ /* If the page is not here then it was later truncated. */
+ if (!IS_ZERO_LSN(argp->pagelsn))
+ goto done;
+ /*
+ * This page was created by a group allocation and
+		 * the file may not have been extended yet.
+ * Create the page if necessary.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ /*
+ * There are three types of log messages here. Two are related
+ * to an actual page split operation, one for the old page
+ * and one for the new pages created. The original image in the
+ * SPLITOLD record is used for undo. The image in the SPLITNEW
+ * is used for redo. We should never have a case where there is
+ * a redo operation and the SPLITOLD record is on disk, but not
+ * the SPLITNEW record. Therefore, we only have work to do when
+ * redo NEW messages and undo OLD messages, but we have to update
+ * LSNs in both cases.
+ *
+ * The third message is generated when a page is sorted (SORTPAGE). In
+ * an undo the original image in the SORTPAGE is used. In a redo we
+ * recreate the sort operation by calling __ham_sort_page.
+ */
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->opcode == SPLITNEW)
+ /* Need to redo the split described. */
+ memcpy(pagep, argp->pageimage.data,
+ argp->pageimage.size);
+ else if (argp->opcode == SORTPAGE) {
+ if ((ret = __ham_sort_page(dbc, NULL, pagep)) != 0)
+ goto out;
+ }
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->opcode == SPLITOLD || argp->opcode == SORTPAGE) {
+ /* Put back the old image. */
+ memcpy(pagep, argp->pageimage.data,
+ argp->pageimage.size);
+ } else
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno,
+ PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN(pagep) = argp->pagelsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_copypage_recover --
+ * Recovery function for copypage.
+ *
+ * PUBLIC: int __ham_copypage_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_copypage_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_copypage_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_copypage_print);
+ REC_INTRO(__ham_copypage_read, ip, 0);
+
+ /* This is the bucket page. */
+ REC_FGET(mpf, ip, argp->pgno, &pagep, donext);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->page.data, argp->page.size);
+ PGNO(pagep) = argp->pgno;
+ PREV_PGNO(pagep) = PGNO_INVALID;
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID,
+ argp->next_pgno, 0, P_HASH);
+ LSN(pagep) = argp->pagelsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+donext: /* Now fix up the "next" page. */
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, do_nn);
+
+ /* For REDO just update the LSN. For UNDO copy page back. */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->page.data, argp->page.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Now fix up the next's next page. */
+do_nn: if (argp->nnext_pgno == PGNO_INVALID)
+ goto done;
+
+ REC_FGET(mpf, ip, argp->nnext_pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nnextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nnextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->pgno;
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->next_pgno;
+ LSN(pagep) = argp->nnextlsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_metagroup_recover --
+ * Recovery function for metagroup.
+ *
+ * PUBLIC: int __ham_metagroup_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_metagroup_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_metagroup_args *argp;
+ DB_THREAD_INFO *ip;
+ HASH_CURSOR *hcp;
+ DB *file_dbp;
+ DBMETA *mmeta;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, did_alloc, groupgrow, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ did_alloc = 0;
+ REC_PRINT(__ham_metagroup_print);
+ REC_INTRO(__ham_metagroup_read, ip, 1);
+
+ /*
+ * This logs the virtual create of pages pgno to pgno + bucket.
+ * The log record contains:
+ * bucket: old maximum bucket
+ * pgno: page number of the new bucket.
+	 * Since __db_log2 rounds up, we can tell that we are about to double
+	 * the hash table by checking whether argp->bucket + 1 is a power of 2.
+	 * If it is, then we are allocating an entire doubling of pages;
+	 * otherwise, we are simply allocating one new page.
+ */
+ groupgrow =
+ (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
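+	/*
+	 * For example (illustrative): if argp->bucket == 7, bucket + 1 == 8
+	 * is a power of two, so this record covered an entire doubling
+	 * (buckets 8..15); if argp->bucket == 8, only the single page for
+	 * bucket 9 was allocated.
+	 */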
+ pgno = argp->pgno;
+ if (argp->newalloc)
+ pgno += argp->bucket;
+
+ pagep = NULL;
+ ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep);
+
+ /* If we are undoing, then we don't want to create the page. */
+ if (ret != 0 && DB_REDO(op))
+ ret = __memp_fget(mpf,
+ &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep);
+ else if (ret == DB_PAGE_NOTFOUND)
+ goto do_meta;
+ if (ret != 0) {
+ if (ret != ENOSPC)
+ goto out;
+ pgno = 0;
+ goto do_meta;
+ }
+
+ /*
+ * When we get here then either we did not grow the file
+ * (groupgrow == 0) or we did grow the file and the allocation
+ * of those new pages succeeded.
+ */
+ did_alloc = groupgrow;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* If this record allocated the pages give them back. */
+ if (argp->newalloc) {
+ if (pagep != NULL && (ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+ if ((ret = __memp_ftruncate(mpf, NULL, ip,
+ argp->pgno, 0)) != 0)
+ goto out;
+ } else {
+ /*
+ * Otherwise just roll the page back to its
+ * previous state.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = argp->pagelsn;
+ }
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+ /*
+	 * If an earlier aborted allocation used one of our pages, it may
+	 * be in the wrong state; read all the pages in the group and
+	 * initialize them to be empty.
+ */
+ if (DB_REDO(op) && argp->newalloc) {
+ for (pgno = argp->pgno;
+ pgno < argp->pgno + argp->bucket; pgno++) {
+ if ((ret = __memp_fget(mpf,
+ &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+
+ if (IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ PGNO_INVALID, PGNO_INVALID, PGNO_INVALID,
+ 0, P_HASH);
+ }
+ if ((ret =
+ __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ }
+ }
+
+do_meta:
+ /* Now we have to update the meta-data page. */
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+ cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
+ cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ CHECK_ABORT(env, op, cmp_n, &hcp->hdr->dbmeta.lsn, lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ ++hcp->hdr->max_bucket;
+ if (groupgrow) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask =
+ (argp->bucket + 1) | hcp->hdr->low_mask;
+ }
+ hcp->hdr->dbmeta.lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->max_bucket = argp->bucket;
+ if (groupgrow) {
+ hcp->hdr->high_mask = argp->bucket;
+ hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
+ }
+ hcp->hdr->dbmeta.lsn = argp->metalsn;
+ }
+
+ /*
+ * Now we need to fix up the spares array. Each entry in the
+ * spares array indicates the beginning page number for the
+ * indicated doubling.
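+	 * (A bucket's page number is the bucket number plus the spares
+	 * entry for its doubling; the doubling recorded here starts at
+	 * bucket argp->bucket + 1 on page argp->pgno, so the stored offset
+	 * is argp->pgno - (argp->bucket + 1), i.e. (argp->pgno -
+	 * argp->bucket) - 1, as assigned below.)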
+ */
+ if (cmp_p == 0 && did_alloc && !DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
+ (argp->pgno - argp->bucket) - 1;
+ }
+ if (cmp_n == 0 && groupgrow && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->spares[
+ __db_log2(argp->bucket + 1) + 1] = PGNO_INVALID;
+ }
+
+ /*
+ * Finally, we need to potentially fix up the last_pgno field
+ * in the master meta-data page (which may or may not be the
+ * same as the hash header page).
+ */
+ if (argp->mmpgno != argp->mpgno) {
+ if ((ret = __memp_fget(mpf,
+ &argp->mmpgno, ip, NULL, DB_MPOOL_EDIT, &mmeta)) != 0) {
+ if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto out;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
+ cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = argp->mmetalsn;
+ }
+ } else {
+ mmeta = (DBMETA *)hcp->hdr;
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ }
+
+ if (cmp_n == 0 && DB_UNDO(op))
+ mmeta->last_pgno = argp->last_pgno;
+ else if (cmp_p == 0 && DB_REDO(op) && mmeta->last_pgno < pgno)
+ mmeta->last_pgno = pgno;
+
+ if (argp->mmpgno != argp->mpgno &&
+ (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
+ goto out;
+ mmeta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, dbc->priority);
+ if (dbc != NULL)
+ (void)__ham_release_meta(dbc);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_contract_recover --
+ * Recovery function for contracting a hash table
+ *
+ * PUBLIC: int __ham_contract_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_contract_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_contract_args *argp;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ HASH_CURSOR *hcp;
+ HMETA *meta;
+ int cmp_n, cmp_p, ret, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__ham_contract_print);
+ REC_INTRO(__ham_contract_read, ip, 1);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto done;
+ meta = hcp->hdr;
+ cmp_n = LOG_COMPARE(lsnp, &meta->dbmeta.lsn);
+ cmp_p = LOG_COMPARE(&meta->dbmeta.lsn, &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &meta->dbmeta.lsn, &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->max_bucket = argp->bucket - 1;
+ if (argp->bucket == meta->low_mask + 1) {
+ meta->spares[
+ __db_log2(argp->bucket) + 1] = PGNO_INVALID;
+ meta->high_mask = meta->low_mask;
+ meta->low_mask >>= 1;
+ }
+ meta->dbmeta.lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->max_bucket = argp->bucket;
+ if (argp->bucket == meta->high_mask + 1) {
+ meta->spares[__db_log2(argp->bucket) + 1] =
+ argp->pgno - argp->bucket;
+ meta->low_mask = meta->high_mask;
+ meta->high_mask = meta->max_bucket | meta->low_mask;
+ }
+ meta->dbmeta.lsn = argp->meta_lsn;
+ }
+ *lsnp = argp->prev_lsn;
+
+out: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+done: REC_CLOSE;
+}
+
+/*
+ * __ham_groupalloc_recover --
+ * Recover the batch creation of a set of pages for a new database.
+ *
+ * PUBLIC: int __ham_groupalloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_groupalloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_groupalloc_args *argp;
+ DB_THREAD_INFO *ip;
+ DBMETA *mmeta;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ REC_PRINT(__ham_groupalloc_print);
+ REC_INTRO(__ham_groupalloc_read, ip, 1);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(mmeta));
+ cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(mmeta), lsnp);
+
+ /*
+ * Basically, we used mpool to allocate a chunk of pages.
+ * We need to either add those to a free list (in the undo
+ * case) or initialize them (in the redo case).
+ *
+ * If we are redoing and this is a hash subdatabase, it's possible
+ * that the pages were never allocated, so we'd better check for
+ * that and handle it here.
+ */
+ pgno = argp->start_pgno + argp->num - 1;
+ if (DB_REDO(op)) {
+ if ((ret = __ham_alloc_pages(dbc, argp, lsnp)) != 0)
+ goto out;
+ if (cmp_p == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ LSN(mmeta) = *lsnp;
+ }
+ } else if (DB_UNDO(op)) {
+ /*
+ * Fetch the last page and determine if it is in
+ * the post allocation state.
+ */
+ pagep = NULL;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, NULL, DB_MPOOL_EDIT, &pagep)) == 0) {
+ if (LOG_COMPARE(&pagep->lsn, lsnp) != 0) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ /*
+ * If the last page was allocated then truncate back
+ * to the first page.
+ */
+ if (pagep != NULL) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ if ((ret = __memp_ftruncate(mpf, NULL,
+ ip, argp->start_pgno, 0)) != 0)
+ goto out;
+ }
+
+ /*
+ * If we are rolling back the metapage, then make
+		 * sure it reflects the correct last_pgno.
+ */
+ if (cmp_n == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ mmeta->last_pgno = argp->last_pgno;
+ }
+ pgno = 0;
+ if (cmp_n == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ LSN(mmeta) = argp->meta_lsn;
+ }
+ }
+
+ /*
+ * Set the last page number to the current value.
+ */
+ if (pgno > mmeta->last_pgno) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ mmeta->last_pgno = pgno;
+ }
+
+done: if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, file_dbp->priority);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_alloc_pages --
+ *
+ * Called during redo of a file create. We create new pages in the file
+ * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a
+ * __crdel_metasub message. If we manage to crash without the newly written
+ * pages getting to disk (I'm not sure this can happen anywhere except our
+ * test suite?!), then we need to go through and recreate the final pages.
+ * Hash normally has holes in its files and handles them appropriately.
+ */
+static int
+__ham_alloc_pages(dbc, argp, lsnp)
+ DBC *dbc;
+ __ham_groupalloc_args *argp;
+ DB_LSN *lsnp;
+{
+ DB *file_dbp;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int ret;
+
+ file_dbp = dbc->dbp;
+ mpf = file_dbp->mpf;
+ ip = dbc->thread_info;
+
+ /* Read the last page of the allocation. */
+ pgno = argp->start_pgno + argp->num - 1;
+
+ /* If the page exists, and it has been initialized, then we're done. */
+ if ((ret =
+ __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) == 0) {
+ if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
+ goto reinit_page;
+ return (__memp_fput(mpf, ip, pagep, dbc->priority));
+ }
+
+ /* Had to create the page. */
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ return (__db_pgerr(dbc->dbp, pgno, ret));
+
+reinit_page:
+ /* Initialize the newly allocated page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, dbc->dbp->pgsize,
+ pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ pagep->lsn = *lsnp;
+
+out: return (__memp_fput(mpf, ip, pagep, dbc->priority));
+}
+
+/*
+ * __ham_changeslot_recover --
+ * Recovery function for changeslot.
+ * When we compact a hash database we may change one of the spares slots
+ *	to point at a new block of pages.
+ *
+ * PUBLIC: int __ham_changeslot_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_changeslot_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_changeslot_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HASH_CURSOR *hcp;
+ HMETA *meta;
+ u_int32_t bucket;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+
+ REC_PRINT(__ham_changeslot_print);
+ REC_INTRO(__ham_changeslot_read, ip, 1);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+ meta = hcp->hdr;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+
+ bucket = argp->slot == 0 ? 0 : 1 << (argp->slot - 1);
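+	/*
+	 * For illustration (hypothetical numbers): spares[slot] stores the
+	 * first page of a doubling minus the first bucket of that doubling.
+	 * Slot 3 covers buckets 4..7, so its first bucket is
+	 * 1 << (3 - 1) == 4; if the new block starts at page 100, redo
+	 * stores 100 - 4 == 96 in spares[3].
+	 */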
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->spares[argp->slot] = argp->new - bucket;
+ LSN(meta) = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->spares[argp->slot] = argp->old - bucket;
+ LSN(meta) = argp->meta_lsn;
+ }
+ *lsnp = argp->prev_lsn;
+ ret = __ham_release_meta(dbc);
+
+done:
+out: REC_CLOSE;
+}
+
+/*
+ * __ham_curadj_recover --
+ * Undo cursor adjustments if a subtransaction fails.
+ *
+ * PUBLIC: int __ham_curadj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_curadj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_curadj_args *argp;
+ db_ham_curadj mode, hamc_mode;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__ham_curadj_print);
+ REC_INTRO(__ham_curadj_read, ip, 1);
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ mode = (db_ham_curadj)argp->add;
+
+ /*
+ * Reverse the logged operation, so that the consequences are reversed
+ * by the __hamc_update code.
+ */
+ switch (mode) {
+ case DB_HAM_CURADJ_DEL:
+ hamc_mode = DB_HAM_CURADJ_ADD;
+ break;
+ case DB_HAM_CURADJ_ADD:
+ hamc_mode = DB_HAM_CURADJ_DEL;
+ break;
+ case DB_HAM_CURADJ_ADDMOD:
+ hamc_mode = DB_HAM_CURADJ_DELMOD;
+ break;
+ case DB_HAM_CURADJ_DELMOD:
+ hamc_mode = DB_HAM_CURADJ_ADDMOD;
+ break;
+ default:
+ __db_errx(env, DB_STR("1122",
+ "Invalid flag in __ham_curadj_recover"));
+ ret = EINVAL;
+ goto out;
+ }
+
+ /*
+	 * Undo the adjustment by reinitializing the cursor to look like
+	 * the one that was used to do the adjustment, then invert the
+	 * add mode so that __hamc_update undoes the adjustment.
+ */
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hcp->pgno = argp->pgno;
+ hcp->indx = argp->indx;
+ hcp->dup_off = argp->dup_off;
+ hcp->order = argp->order;
+ if (mode == DB_HAM_CURADJ_DEL)
+ F_SET(hcp, H_DELETED);
+ (void)__hamc_update(dbc, argp->len, hamc_mode, argp->is_dup);
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+static int
+__ham_chgpg_recover_func(cp, my_dbc, countp, pgno, indx, vargs)
+ DBC *cp, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ BTREE_CURSOR *opdcp;
+ HASH_CURSOR *lcp;
+ u_int32_t order;
+ int ret;
+ __ham_chgpg_args *argp;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ COMPQUIET(pgno, 0);
+ lcp = (HASH_CURSOR *)cp->internal;
+ argp = vargs;
+
+ /* Overloaded field for DB_HAM_DEL*PG */
+ order = argp->new_indx;
+
+ switch (argp->mode) {
+ case DB_HAM_DELFIRSTPG:
+ if (lcp->pgno != argp->new_pgno ||
+ MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ break;
+ if (lcp->indx != indx ||
+ !F_ISSET(lcp, H_DELETED) ||
+ lcp->order >= order) {
+ lcp->pgno = argp->old_pgno;
+ if (lcp->indx == indx)
+ lcp->order -= order;
+ }
+ break;
+ case DB_HAM_DELMIDPG:
+ case DB_HAM_DELLASTPG:
+ if (lcp->pgno == argp->new_pgno &&
+ lcp->indx == indx &&
+ F_ISSET(lcp, H_DELETED) &&
+ lcp->order >= order &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
+ lcp->pgno = argp->old_pgno;
+ lcp->order -= order;
+ lcp->indx = 0;
+ }
+ break;
+ case DB_HAM_CHGPG:
+ /*
+ * If we're doing a CHGPG, we're undoing
+ * the move of a non-deleted item to a
+ * new page. Any cursors with the deleted
+ * flag set do not belong to this item;
+ * don't touch them.
+ */
+ if (F_ISSET(lcp, H_DELETED))
+ break;
+ /* FALLTHROUGH */
+ case DB_HAM_SPLIT:
+ if (lcp->pgno == argp->new_pgno &&
+ lcp->indx == argp->new_indx &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
+ lcp->indx = argp->old_indx;
+ lcp->pgno = argp->old_pgno;
+ }
+ break;
+ case DB_HAM_DUP:
+ if (lcp->opd == NULL)
+ break;
+ opdcp = (BTREE_CURSOR *)lcp->opd->internal;
+ if (opdcp->pgno != argp->new_pgno ||
+ opdcp->indx != argp->new_indx ||
+ MVCC_SKIP_CURADJ(lcp->opd, opdcp->pgno))
+ break;
+
+ if (F_ISSET(opdcp, C_DELETED))
+ F_SET(lcp, H_DELETED);
+ /*
+ * We can't close a cursor while we have the
+ * dbp mutex locked, since c_close reacquires
+ * it. It should be safe to drop the mutex
+ * here, though, since newly opened cursors
+ * are put only at the end of the tailq and
+ * the cursor we're adjusting can't be closed
+ * under us.
+ */
+ MUTEX_UNLOCK(cp->dbp->env, cp->dbp->mutex);
+ ret = __dbc_close(lcp->opd);
+ MUTEX_LOCK(cp->dbp->env, cp->dbp->mutex);
+ if (ret != 0)
+ return (ret);
+ lcp->opd = NULL;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __ham_chgpg_recover --
+ * Undo cursor adjustments if a subtransaction fails.
+ *
+ * PUBLIC: int __ham_chgpg_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_chgpg_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_chgpg_args *argp;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ int ret;
+ u_int32_t count;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__ham_chgpg_print);
+ REC_INTRO(__ham_chgpg_read, ip, 0);
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ ret = __db_walk_cursors(file_dbp, dbc,
+ __ham_chgpg_recover_func, &count, 0, argp->old_indx, argp);
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+/*
+ * __ham_metagroup_42_recover --
+ * Recovery function for metagroup.
+ *
+ * PUBLIC: int __ham_metagroup_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_metagroup_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_metagroup_42_args *argp;
+ DB_THREAD_INFO *ip;
+ HASH_CURSOR *hcp;
+ DB *file_dbp;
+ DBMETA *mmeta;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ int cmp_n, cmp_p, did_alloc, groupgrow, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ did_alloc = 0;
+ REC_PRINT(__ham_metagroup_42_print);
+ REC_INTRO(__ham_metagroup_42_read, ip, 1);
+
+ /*
+	 * This logs the virtual creation of pages pgno through pgno + bucket.
+	 * If HAVE_FTRUNCATE is not supported, the mpool page allocation is
+	 * not transaction protected, so we can never undo it.  Even in an
+	 * abort, we have to allocate these pages to the hash table if they
+ * were actually created. In particular, during disaster
+ * recovery the metapage may be before this point if we
+ * are rolling backward. If the file has not been extended
+ * then the metapage could not have been updated.
+ * The log record contains:
+ * bucket: old maximum bucket
+ * pgno: page number of the new bucket.
+	 * We round up on log calculations, so we can figure out whether we
+	 * are about to double the hash table by checking whether
+	 * argp->bucket+1 is a power of 2.  If it is, we are allocating an
+	 * entire doubling of pages; otherwise, we are simply allocating one
+	 * new page.
+ */
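+	/*
+	 * For illustration (hypothetical numbers): if argp->bucket == 3,
+	 * then bucket + 1 == 4 == 1 << __db_log2(4), a power of 2, so
+	 * groupgrow is set and a whole doubling (buckets 4..7) is being
+	 * allocated.  If argp->bucket == 5, bucket + 1 == 6 is not a power
+	 * of 2 and only one new page is being added.
+	 */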
+ groupgrow =
+ (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
+ pgno = argp->pgno;
+ if (argp->newalloc)
+ pgno += argp->bucket;
+
+ flags = 0;
+ pagep = NULL;
+ LF_SET(DB_MPOOL_CREATE);
+ ret = __memp_fget(mpf, &pgno, ip, NULL, flags, &pagep);
+
+ if (ret != 0) {
+ if (ret != ENOSPC)
+ goto out;
+ pgno = 0;
+ goto do_meta;
+ }
+
+ /*
+ * When we get here then either we did not grow the file
+ * (groupgrow == 0) or we did grow the file and the allocation
+ * of those new pages succeeded.
+ */
+ did_alloc = groupgrow;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Otherwise just roll the page back to its
+ * previous state.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = argp->pagelsn;
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+do_meta:
+ /* Now we have to update the meta-data page. */
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+ cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
+ cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ ++hcp->hdr->max_bucket;
+ if (groupgrow) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask =
+ (argp->bucket + 1) | hcp->hdr->low_mask;
+ }
+ hcp->hdr->dbmeta.lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->max_bucket = argp->bucket;
+ if (groupgrow) {
+ hcp->hdr->high_mask = argp->bucket;
+ hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
+ }
+ hcp->hdr->dbmeta.lsn = argp->metalsn;
+ }
+
+ /*
+ * Now we need to fix up the spares array. Each entry in the
+ * spares array indicates the beginning page number for the
+ * indicated doubling. We need to fill this in whenever the
+	 * spares array is invalid; since we never reclaim pages, we
+	 * have to allocate the pages to the spares array in both
+ * the redo and undo cases.
+ */
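+	/*
+	 * For illustration (hypothetical numbers): with argp->bucket == 3
+	 * the new doubling is buckets 4..7, whose spares entry is
+	 * __db_log2(3 + 1) + 1 == 3.  If the new bucket's page is
+	 * argp->pgno == 14, the entry stored is (14 - 3) - 1 == 10.
+	 */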
+ if (did_alloc &&
+ hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
+ (argp->pgno - argp->bucket) - 1;
+ }
+
+ /*
+ * Finally, we need to potentially fix up the last_pgno field
+ * in the master meta-data page (which may or may not be the
+ * same as the hash header page).
+ */
+ if (argp->mmpgno != argp->mpgno) {
+ if ((ret = __memp_fget(mpf, &argp->mmpgno, ip, NULL,
+ DB_MPOOL_EDIT, &mmeta)) != 0) {
+ if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto out;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
+ cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = argp->mmetalsn;
+ }
+ } else {
+ mmeta = (DBMETA *)hcp->hdr;
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ }
+
+ if (mmeta->last_pgno < pgno)
+ mmeta->last_pgno = pgno;
+
+ if (argp->mmpgno != argp->mpgno &&
+ (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
+ goto out;
+ mmeta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, dbc->priority);
+ if (dbc != NULL)
+ (void)__ham_release_meta(dbc);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_groupalloc_42_recover --
+ * Recover the batch creation of a set of pages for a new database.
+ *
+ * PUBLIC: int __ham_groupalloc_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_groupalloc_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DBMETA *mmeta;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ db_pgno_t pgno;
+ int cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ REC_PRINT(__ham_groupalloc_42_print);
+ REC_INTRO(__ham_groupalloc_42_read, ip, 1);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
+
+ /*
+ * Basically, we used mpool to allocate a chunk of pages.
+ * We need to either add those to a free list (in the undo
+ * case) or initialize them (in the redo case).
+ *
+ * If we are redoing and this is a hash subdatabase, it's possible
+ * that the pages were never allocated, so we'd better check for
+ * that and handle it here.
+ */
+ pgno = argp->start_pgno + argp->num - 1;
+ if (DB_REDO(op)) {
+ if ((ret = __ham_alloc_pages_42(dbc, argp, lsnp)) != 0)
+ goto out;
+ if (cmp_p == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ LSN(mmeta) = *lsnp;
+ }
+ } else if (DB_UNDO(op)) {
+ /*
+ * We cannot roll back 4.2 style allocations.
+ */
+ __db_errx(env, DB_STR("1123",
+"Cannot replicate prepared transactions from master running release 4.2."));
+ ret = __env_panic(env, EINVAL);
+ goto out;
+ }
+
+ /*
+ * In both REDO and UNDO, we have grown the file and need to make
+	 * sure that last_pgno is correct.  With HAVE_FTRUNCATE, pgno
+	 * will only be valid on REDO.
+ */
+ if (pgno > mmeta->last_pgno) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->last_pgno = pgno;
+ }
+
+done: if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, dbc->priority);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_alloc_pages_42 --
+ *
+ * Called during redo of a file create. We create new pages in the file
+ * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a
+ * __crdel_metasub message. If we manage to crash without the newly written
+ * pages getting to disk (I'm not sure this can happen anywhere except our
+ * test suite?!), then we need to go through and re-create the final pages.
+ * Hash normally has holes in its files and handles them appropriately.
+ */
+static int
+__ham_alloc_pages_42(dbc, argp, lsnp)
+ DBC *dbc;
+ __ham_groupalloc_42_args *argp;
+ DB_LSN *lsnp;
+{
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int ret;
+
+ mpf = dbc->dbp->mpf;
+ ip = dbc->thread_info;
+
+ /* Read the last page of the allocation. */
+ pgno = argp->start_pgno + argp->num - 1;
+
+ /* If the page exists, and it has been initialized, then we're done. */
+ if ((ret = __memp_fget(mpf,
+ &pgno, ip, NULL, 0, &pagep)) == 0) {
+ if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
+ goto reinit_page;
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ return (ret);
+ return (0);
+ }
+
+ /* Had to create the page. */
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) != 0)
+ return (__db_pgerr(dbc->dbp, pgno, ret));
+
+reinit_page:
+ /* Initialize the newly allocated page. */
+ P_INIT(pagep,
+ dbc->dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ pagep->lsn = *lsnp;
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ return (ret);
+
+ return (0);
+}
diff --git a/src/hash/hash_reclaim.c b/src/hash/hash_reclaim.c
new file mode 100644
index 00000000..ce3f6d9e
--- /dev/null
+++ b/src/hash/hash_reclaim.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * __ham_reclaim --
+ * Reclaim the pages from a subdatabase and return them to the
+ * parent free list. For now, we link each freed page on the list
+ * separately. If people really store hash databases in subdatabases
+ * and do a lot of creates and deletes, this is going to be a problem,
+ * because hash needs chunks of contiguous storage. We may eventually
+ * need to go to a model where we maintain the free list with chunks of
+ * contiguous pages as well.
+ *
+ * PUBLIC: int __ham_reclaim __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *txn, u_int32_t));
+ */
+int
+__ham_reclaim(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ /* Open up a cursor that we'll use for traversing. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err;
+
+ /* Write lock the metapage for deallocations. */
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ goto err;
+
+ /* Avoid locking every page, we have the handle locked exclusive. */
+ F_SET(dbc, DBC_DONTLOCK);
+
+ if ((ret = __ham_traverse(dbc, DB_LOCK_WRITE,
+ __db_reclaim_callback, &flags, 1)) != 0)
+ goto err;
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ if ((ret = __ham_release_meta(dbc)) != 0)
+ goto err;
+ return (0);
+
+err: if (hcp->hdr != NULL)
+ (void)__ham_release_meta(dbc);
+ (void)__dbc_close(dbc);
+ return (ret);
+}
+
+/*
+ * __ham_truncate --
+ * Reclaim the pages from a subdatabase and return them to the
+ * parent free list.
+ *
+ * PUBLIC: int __ham_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__ham_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ u_int32_t count;
+ int ret, t_ret;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ count = 0;
+
+ ret = __ham_traverse(dbc,
+ DB_LOCK_WRITE, __db_truncate_callback, &count, 1);
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (countp != NULL)
+ *countp = count;
+ return (ret);
+}
diff --git a/src/hash/hash_stat.c b/src/hash/hash_stat.c
new file mode 100644
index 00000000..683ce5a6
--- /dev/null
+++ b/src/hash/hash_stat.c
@@ -0,0 +1,518 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+
+#ifdef HAVE_STATISTICS
+static int __ham_stat_callback __P((DBC *, PAGE *, void *, int *));
+
+/*
+ * __ham_stat --
+ * Gather/print the hash statistics
+ *
+ * PUBLIC: int __ham_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__ham_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HASH_STAT *sp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ mpf = dbp->mpf;
+ sp = NULL;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+ /* Copy the fields that we have. */
+ sp->hash_nkeys = hcp->hdr->dbmeta.key_count;
+ sp->hash_ndata = hcp->hdr->dbmeta.record_count;
+ /*
+ * Don't take the page number from the meta-data page -- that value is
+	 * only maintained in the primary database, and we may have been
+	 * called on a subdatabase.
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0)
+ goto err;
+ sp->hash_pagecnt = pgno + 1;
+ sp->hash_pagesize = dbp->pgsize;
+ sp->hash_buckets = hcp->hdr->max_bucket + 1;
+ sp->hash_magic = hcp->hdr->dbmeta.magic;
+ sp->hash_version = hcp->hdr->dbmeta.version;
+ sp->hash_metaflags = hcp->hdr->dbmeta.flags;
+ sp->hash_ffactor = hcp->hdr->ffactor;
+
+ if (flags == DB_FAST_STAT)
+ goto done;
+
+ /* Walk the free list, counting pages. */
+ for (sp->hash_free = 0, pgno = hcp->hdr->dbmeta.free;
+ pgno != PGNO_INVALID;) {
+ ++sp->hash_free;
+
+ if ((ret = __memp_fget(mpf,
+ &pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ pgno = h->next_pgno;
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ }
+
+ /* Now traverse the rest of the table. */
+ sp->hash_nkeys = 0;
+ sp->hash_ndata = 0;
+ if ((ret = __ham_traverse(dbc,
+ DB_LOCK_READ, __ham_stat_callback, sp, 0)) != 0)
+ goto err;
+
+ if (!F_ISSET(dbp, DB_AM_RDONLY)) {
+ /*
+ * A transaction is not required for DB->stat, so this update
+ * can't safely make a copy of the meta page. We have to
+ * update in place.
+ */
+ if ((ret = __ham_dirty_meta(dbc,
+ (dbc->txn == NULL) ? DB_MPOOL_EDIT : 0)) != 0)
+ goto err;
+ hcp->hdr->dbmeta.key_count = sp->hash_nkeys;
+ hcp->hdr->dbmeta.record_count = sp->hash_ndata;
+ }
+
+done: if ((ret = __ham_release_meta(dbc)) != 0)
+ goto err;
+
+ *(DB_HASH_STAT **)spp = sp;
+ return (0);
+
+err: if (sp != NULL)
+ __os_ufree(env, sp);
+
+ if (hcp->hdr != NULL)
+ (void)__ham_release_meta(dbc);
+
+ return (ret);
+}
+
+/*
+ * __ham_stat_print --
+ * Display hash statistics.
+ *
+ * PUBLIC: int __ham_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__ham_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_HASH_DUP, "duplicates" },
+ { DB_HASH_SUBDB, "multiple-databases" },
+ { DB_HASH_DUPSORT, "sorted duplicates" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ ENV *env;
+ DB_HASH_STAT *sp;
+ int lorder, ret;
+ const char *s;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __ham_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Hash database information:");
+ }
+ __db_msg(env, "%lx\tHash magic number", (u_long)sp->hash_magic);
+ __db_msg(env,
+ "%lu\tHash version number", (u_long)sp->hash_version);
+ (void)__db_get_lorder(dbp, &lorder);
+ switch (lorder) {
+ case 1234:
+ s = "Little-endian";
+ break;
+ case 4321:
+ s = "Big-endian";
+ break;
+ default:
+ s = "Unrecognized byte order";
+ break;
+ }
+ __db_msg(env, "%s\tByte order", s);
+ __db_prflags(env, NULL, sp->hash_metaflags, fn, NULL, "\tFlags");
+ __db_dl(env,
+ "Number of pages in the database", (u_long)sp->hash_pagecnt);
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->hash_pagesize);
+ __db_dl(env, "Specified fill factor", (u_long)sp->hash_ffactor);
+ __db_dl(env,
+ "Number of keys in the database", (u_long)sp->hash_nkeys);
+ __db_dl(env,
+ "Number of data items in the database", (u_long)sp->hash_ndata);
+
+ __db_dl(env, "Number of hash buckets", (u_long)sp->hash_buckets);
+ __db_dl_pct(env, "Number of bytes free on bucket pages",
+ (u_long)sp->hash_bfree, DB_PCT_PG(
+ sp->hash_bfree, sp->hash_buckets, sp->hash_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of overflow pages", (u_long)sp->hash_bigpages);
+ __db_dl_pct(env, "Number of bytes free in overflow pages",
+ (u_long)sp->hash_big_bfree, DB_PCT_PG(
+ sp->hash_big_bfree, sp->hash_bigpages, sp->hash_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of bucket overflow pages", (u_long)sp->hash_overflows);
+ __db_dl_pct(env,
+ "Number of bytes free in bucket overflow pages",
+ (u_long)sp->hash_ovfl_free, DB_PCT_PG(
+ sp->hash_ovfl_free, sp->hash_overflows, sp->hash_pagesize), "ff");
+
+ __db_dl(env, "Number of duplicate pages", (u_long)sp->hash_dup);
+ __db_dl_pct(env, "Number of bytes free in duplicate pages",
+ (u_long)sp->hash_dup_free, DB_PCT_PG(
+ sp->hash_dup_free, sp->hash_dup, sp->hash_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of pages on the free list", (u_long)sp->hash_free);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+static int
+__ham_stat_callback(dbc, pagep, cookie, putp)
+ DBC *dbc;
+ PAGE *pagep;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_BTREE_STAT bstat;
+ DB_HASH_STAT *sp;
+ db_indx_t indx, len, off, tlen, top;
+ u_int8_t *hk;
+ int ret;
+
+ *putp = 0;
+ sp = cookie;
+ dbp = dbc->dbp;
+
+ switch (pagep->type) {
+ case P_INVALID:
+ /*
+ * Hash pages may be wholly zeroed; this is not a bug.
+ * Obviously such pages have no data, so we can just proceed.
+ */
+ break;
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ /*
+ * We count the buckets and the overflow pages
+ * separately and tally their bytes separately
+ * as well. We need to figure out if this page
+ * is a bucket.
+ */
+ if (PREV_PGNO(pagep) == PGNO_INVALID)
+ sp->hash_bfree += P_FREESPACE(dbp, pagep);
+ else {
+ sp->hash_overflows++;
+ sp->hash_ovfl_free += P_FREESPACE(dbp, pagep);
+ }
+ top = NUM_ENT(pagep);
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ switch (*H_PAIRDATA(dbp, pagep, indx)) {
+ case H_OFFDUP:
+ break;
+ case H_OFFPAGE:
+ case H_KEYDATA:
+ sp->hash_ndata++;
+ break;
+ case H_DUPLICATE:
+ tlen = LEN_HDATA(dbp, pagep, 0, indx);
+ hk = H_PAIRDATA(dbp, pagep, indx);
+ for (off = 0; off < tlen;
+ off += len + 2 * sizeof(db_indx_t)) {
+ sp->hash_ndata++;
+ memcpy(&len,
+ HKEYDATA_DATA(hk)
+ + off, sizeof(db_indx_t));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(pagep)));
+ }
+ }
+ sp->hash_nkeys += H_NUMPAIRS(pagep);
+ break;
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ /*
+ * These are all btree pages; get a correct
+ * cookie and call them. Then add appropriate
+ * fields into our stat structure.
+ */
+ memset(&bstat, 0, sizeof(bstat));
+ if ((ret = __bam_stat_callback(dbc, pagep, &bstat, putp)) != 0)
+ return (ret);
+ sp->hash_dup++;
+ sp->hash_dup_free += bstat.bt_leaf_pgfree +
+ bstat.bt_dup_pgfree + bstat.bt_int_pgfree;
+ sp->hash_ndata += bstat.bt_ndata;
+ break;
+ case P_OVERFLOW:
+ sp->hash_bigpages++;
+ sp->hash_big_bfree += P_OVFLSPACE(dbp, dbp->pgsize, pagep);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(pagep)));
+ }
+
+ return (0);
+}
+
+/*
+ * __ham_print_cursor --
+ * Display the current cursor.
+ *
+ * PUBLIC: void __ham_print_cursor __P((DBC *));
+ */
+void
+__ham_print_cursor(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { H_CONTINUE, "H_CONTINUE" },
+ { H_DELETED, "H_DELETED" },
+ { H_DUPONLY, "H_DUPONLY" },
+ { H_EXPAND, "H_EXPAND" },
+ { H_ISDUP, "H_ISDUP" },
+ { H_NEXT_NODUP, "H_NEXT_NODUP" },
+ { H_NOMORE, "H_NOMORE" },
+ { H_OK, "H_OK" },
+ { 0, NULL }
+ };
+ ENV *env;
+ HASH_CURSOR *cp;
+
+ env = dbc->env;
+ cp = (HASH_CURSOR *)dbc->internal;
+
+ STAT_ULONG("Bucket traversing", cp->bucket);
+ STAT_ULONG("Bucket locked", cp->lbucket);
+ STAT_ULONG("Duplicate set offset", cp->dup_off);
+ STAT_ULONG("Current duplicate length", cp->dup_len);
+ STAT_ULONG("Total duplicate set length", cp->dup_tlen);
+ STAT_ULONG("Bytes needed for add", cp->seek_size);
+ STAT_ULONG("Page on which we can insert", cp->seek_found_page);
+ STAT_ULONG("Order", cp->order);
+ __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags");
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__ham_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+/*
+ * __ham_traverse
+ * Traverse an entire hash table. We use the callback so that we
+ * can use this both for stat collection and for deallocation.
+ *
+ * PUBLIC: int __ham_traverse __P((DBC *, db_lockmode_t,
+ * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *, int));
+ */
+int
+__ham_traverse(dbc, mode, callback, cookie, look_past_max)
+ DBC *dbc;
+ db_lockmode_t mode;
+ int (*callback) __P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+ int look_past_max;
+{
+ DB *dbp;
+ DBC *opd;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HKEYDATA *hk;
+ db_pgno_t pgno, opgno;
+ int did_put, i, ret, t_ret;
+ u_int32_t bucket, spares_entry;
+
+ dbp = dbc->dbp;
+ opd = NULL;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * In a perfect world, we could simply read each page in the file
+ * and look at its page type to tally the information necessary.
+	 * Unfortunately, the bucket locking that hash tables do to make
+	 * locking easy makes this a pain in the butt.  We have to traverse
+	 * duplicate, overflow and big pages from the bucket so that we
+	 * don't access anything that isn't properly locked.
+	 */
+ for (bucket = 0;; bucket++) {
+ /*
+ * We put the loop exit condition check here, because
+ * it made for a really vile extended ?: that made SCO's
+ * compiler drop core.
+ *
+ * If look_past_max is not set, we can stop at max_bucket;
+ * if it is set, we need to include pages that are part of
+ * the current doubling but beyond the highest bucket we've
+ * split into, as well as pages from a "future" doubling
+ * that may have been created within an aborted
+ * transaction. To do this, keep looping (and incrementing
+ * bucket) until the corresponding spares array entries
+ * cease to be defined.
+ */
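+		/*
+		 * For illustration: __db_log2(bucket + 1) maps buckets
+		 * 0, 1, 2, 3, 4, ... to spares entries 0, 1, 2, 2, 3, ...,
+		 * so the loop keeps going while the doubling that would
+		 * contain the bucket still has a spares entry allocated.
+		 */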
+ if (look_past_max) {
+ spares_entry = __db_log2(bucket + 1);
+ if (spares_entry >= NCACHED ||
+ hcp->hdr->spares[spares_entry] == 0)
+ break;
+ } else {
+ if (bucket > hcp->hdr->max_bucket)
+ break;
+ }
+
+ hcp->bucket = bucket;
+ hcp->pgno = pgno = BUCKET_TO_PAGE(hcp, bucket);
+ for (ret = __ham_get_cpage(dbc, mode); ret == 0;
+ ret = __ham_next_cpage(dbc, pgno)) {
+
+ /*
+ * If we are cleaning up pages past the max_bucket,
+ * then they may be on the free list and have their
+ * next pointers set, but they should be ignored. In
+ * fact, we really ought to just skip anybody who is
+ * not a valid page.
+ */
+ if (TYPE(hcp->page) == P_INVALID)
+ break;
+ pgno = NEXT_PGNO(hcp->page);
+
+ /*
+ * Go through each item on the page checking for
+ * duplicates (in which case we have to count the
+ * duplicate pages) or big key/data items (in which
+ * case we have to count those pages).
+ */
+ for (i = 0; i < NUM_ENT(hcp->page); i++) {
+ hk = (HKEYDATA *)P_ENTRY(dbp, hcp->page, i);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_OFFDUP:
+ memcpy(&opgno, HOFFDUP_PGNO(hk),
+ sizeof(db_pgno_t));
+ if ((ret = __dbc_newopd(dbc,
+ opgno, NULL, &opd)) != 0)
+ return (ret);
+ if ((ret = __bam_traverse(opd,
+ DB_LOCK_READ, opgno,
+ callback, cookie))
+ != 0)
+ goto err;
+ if ((ret = __dbc_close(opd)) != 0)
+ return (ret);
+ opd = NULL;
+ break;
+ case H_OFFPAGE:
+ /*
+ * We are about to get a big page
+ * which will use the same spot that
+ * the current page uses, so we need
+ * to restore the current page before
+ * looking at it again.
+ */
+ memcpy(&opgno, HOFFPAGE_PGNO(hk),
+ sizeof(db_pgno_t));
+ if ((ret = __db_traverse_big(dbc,
+ opgno, callback, cookie)) != 0)
+ goto err;
+ break;
+ case H_KEYDATA:
+ case H_DUPLICATE:
+ break;
+ default:
+ ret = __db_unknown_path(
+ dbp->env, "__ham_traverse");
+ goto err;
+ }
+ }
+
+ /* Call the callback on main pages. */
+ if ((ret = callback(dbc,
+ hcp->page, cookie, &did_put)) != 0)
+ goto err;
+
+ if (did_put)
+ hcp->page = NULL;
+ if (pgno == PGNO_INVALID)
+ break;
+ }
+ if (ret != 0)
+ goto err;
+
+ if (hcp->page != NULL) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0)
+ return (ret);
+ hcp->page = NULL;
+ }
+
+ }
+err: if (opd != NULL &&
+ (t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/hash/hash_stub.c b/src/hash/hash_stub.c
new file mode 100644
index 00000000..57337ea9
--- /dev/null
+++ b/src/hash/hash_stub.c
@@ -0,0 +1,470 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_HASH
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * If the library wasn't compiled with the Hash access method, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+/*
+ * __db_no_hash_am --
+ * Error when a Berkeley DB build doesn't include the access method.
+ *
+ * PUBLIC: int __db_no_hash_am __P((ENV *));
+ */
+int
+__db_no_hash_am(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("1133",
+ "library build did not include support for the Hash access method"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__ham_30_hashmeta(dbp, real_name, obuf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *obuf;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(obuf, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_30_sizefix(dbp, fhp, realname, metabuf)
+ DB *dbp;
+ DB_FH *fhp;
+ char *realname;
+ u_int8_t *metabuf;
+{
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(realname, NULL);
+ COMPQUIET(metabuf, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_31_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_31_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_46_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_46_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__hamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ COMPQUIET(other_dbc, NULL);
+ COMPQUIET(result, NULL);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__hamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ COMPQUIET(recnop, NULL);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__hamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ COMPQUIET(new_dbc, NULL);
+ return (__db_no_hash_am(orig_dbc->env));
+}
+
+int
+__hamc_init(dbc)
+ DBC *dbc;
+{
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_db_close(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__ham_db_create(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__ham_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__ham_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__ham_meta2pgset(dbp, vdp, hmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *hmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(hmeta, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(pgset, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_metachk(dbp, name, hashm)
+ DB *dbp;
+ const char *name;
+ HMETA *hashm;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(hashm, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_metagroup_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, (db_recops)0);
+ COMPQUIET(info, NULL);
+ return (__db_no_hash_am(env));
+}
+
+int
+__ham_mswap(env, pg)
+ ENV *env;
+ void *pg;
+{
+ COMPQUIET(pg, NULL);
+ return (__db_no_hash_am(env));
+}
+
+int
+__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, (db_recops)0);
+ COMPQUIET(info, NULL);
+ return (__db_no_hash_am(env));
+}
+
+int
+__ham_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(name, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(ip, NULL);
+ return (__db_no_hash_am(mdbp->env));
+}
+
+int
+__ham_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(base_pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+void
+__ham_print_cursor(dbc)
+ DBC *dbc;
+{
+ (void)__db_no_hash_am(dbc->env);
+}
+
+int
+__ham_quick_delete(dbc)
+ DBC *dbc;
+{
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_reclaim(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ COMPQUIET(txn, NULL);
+ COMPQUIET(ip, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(h, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ COMPQUIET(countp, NULL);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_vrfy_hashing(dbc, nentries, m, thisbucket, pgno, flags, hfunc)
+ DBC *dbc;
+ u_int32_t nentries;
+ HMETA *m;
+ u_int32_t thisbucket;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+{
+ COMPQUIET(nentries, 0);
+ COMPQUIET(m, NULL);
+ COMPQUIET(thisbucket, 0);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(hfunc, NULL);
+ return (__db_no_hash_am(dbc->dbp->env));
+}
+
+int
+__ham_vrfy_meta(dbp, vdp, m, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *m;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(m, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_vrfy_structure(dbp, vdp, meta_pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(meta_pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+#endif /* !HAVE_HASH */
diff --git a/src/hash/hash_upgrade.c b/src/hash/hash_upgrade.c
new file mode 100644
index 00000000..f66a7a58
--- /dev/null
+++ b/src/hash/hash_upgrade.c
@@ -0,0 +1,323 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/db_upgrade.h"
+
+/*
+ * __ham_30_hashmeta --
+ * Upgrade the database from version 4/5 to version 6.
+ *
+ * PUBLIC: int __ham_30_hashmeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__ham_30_hashmeta(dbp, real_name, obuf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *obuf;
+{
+ ENV *env;
+ HASHHDR *oldmeta;
+ HMETA30 newmeta;
+ u_int32_t *o_spares, *n_spares;
+ u_int32_t fillf, i, maxb, max_entry, nelem;
+ int ret;
+
+ env = dbp->env;
+ memset(&newmeta, 0, sizeof(newmeta));
+
+ oldmeta = (HASHHDR *)obuf;
+
+ /*
+	 * The first 32 bytes are similar.  The only changes are the new
+	 * version number, the removal of ovfl_point and the addition of
+	 * the page type field.
+ */
+
+ newmeta.dbmeta.lsn = oldmeta->lsn;
+ newmeta.dbmeta.pgno = oldmeta->pgno;
+ newmeta.dbmeta.magic = oldmeta->magic;
+ newmeta.dbmeta.version = 6;
+ newmeta.dbmeta.pagesize = oldmeta->pagesize;
+ newmeta.dbmeta.type = P_HASHMETA;
+
+ /* Move flags */
+ newmeta.dbmeta.flags = oldmeta->flags;
+
+ /* Copy the free list, which has changed its name but works the same. */
+ newmeta.dbmeta.free = oldmeta->last_freed;
+
+	/* Copy: max_bucket, high_mask, low_mask, ffactor, nelem, h_charkey */
+ newmeta.max_bucket = oldmeta->max_bucket;
+ newmeta.high_mask = oldmeta->high_mask;
+ newmeta.low_mask = oldmeta->low_mask;
+ newmeta.ffactor = oldmeta->ffactor;
+ newmeta.nelem = oldmeta->nelem;
+ newmeta.h_charkey = oldmeta->h_charkey;
+
+ /*
+ * There was a bug in 2.X versions where the nelem could go negative.
+ * In general, this is considered "bad." If it does go negative
+ * (that is, very large and positive), we'll die trying to dump and
+ * load this database. So, let's see if we can fix it here.
+ */
+ nelem = newmeta.nelem;
+ fillf = newmeta.ffactor;
+ maxb = newmeta.max_bucket;
+
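+	/*
+	 * For illustration (hypothetical numbers): with ffactor == 10 and
+	 * max_bucket == 100, any nelem above (10 * 100) / 2 == 500 is taken
+	 * as evidence of an overflowed counter and nelem is reset to 0.
+	 */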
+ if ((fillf != 0 && fillf * maxb < 2 * nelem) ||
+ (fillf == 0 && nelem > 0x8000000))
+ newmeta.nelem = 0;
+
+ /*
+ * We now have to convert the spares array. The old spares array
+ * contained the total number of extra pages allocated prior to
+ * the bucket that begins the next doubling. The new spares array
+ * contains the page number of the first bucket in the next doubling
+ * MINUS the bucket number of that bucket.
+ */
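+	/*
+	 * For illustration (hypothetical numbers): with no extra pages,
+	 * bucket b lives on page b + 1 (page 0 is the meta page), so a new
+	 * entry is 1.  If o_spares[i - 1] == 5 extra pages were allocated
+	 * before the doubling, its first bucket is pushed 5 pages further
+	 * out and n_spares[i] == 1 + 5 == 6.
+	 */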
+ o_spares = oldmeta->spares;
+ n_spares = newmeta.spares;
+ max_entry = __db_log2(maxb + 1); /* highest spares entry in use */
+ n_spares[0] = 1;
+ for (i = 1; i < NCACHED && i <= max_entry; i++)
+ n_spares[i] = 1 + o_spares[i - 1];
+
+ /* Replace the unique ID. */
+ if ((ret = __os_fileid(env, real_name, 1, newmeta.dbmeta.uid)) != 0)
+ return (ret);
+
+ /* Overwrite the original. */
+ memcpy(oldmeta, &newmeta, sizeof(newmeta));
+
+ return (0);
+}
+
+/*
+ * __ham_30_sizefix --
+ * Make sure that all hash pages belonging to the current
+ * hash doubling are within the bounds of the file.
+ *
+ * PUBLIC: int __ham_30_sizefix __P((DB *, DB_FH *, char *, u_int8_t *));
+ */
+int
+__ham_30_sizefix(dbp, fhp, realname, metabuf)
+ DB *dbp;
+ DB_FH *fhp;
+ char *realname;
+ u_int8_t *metabuf;
+{
+ u_int8_t buf[DB_MAX_PGSIZE];
+ ENV *env;
+ HMETA30 *meta;
+ db_pgno_t last_actual, last_desired;
+ int ret;
+ size_t nw;
+ u_int32_t pagesize;
+
+ env = dbp->env;
+ memset(buf, 0, DB_MAX_PGSIZE);
+
+ meta = (HMETA30 *)metabuf;
+ pagesize = meta->dbmeta.pagesize;
+
+ /*
+ * Get the last page number. To do this, we'll need dbp->pgsize
+ * to be set right, so slam it into place.
+ */
+ dbp->pgsize = pagesize;
+ if ((ret = __db_lastpgno(dbp, realname, fhp, &last_actual)) != 0)
+ return (ret);
+
+ /*
+ * The last bucket in the doubling is equal to high_mask; calculate
+ * the page number that implies.
+ */
+ last_desired = BS_TO_PAGE(meta->high_mask, meta->spares);
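+	/*
+	 * For illustration (hypothetical numbers): BS_TO_PAGE adds the
+	 * spares entry for a bucket's doubling to the bucket number, so
+	 * with high_mask == 7 and spares[__db_log2(8)] == spares[3] == 1,
+	 * the last bucket of the doubling lives on page 7 + 1 == 8.
+	 */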
+
+ /*
+ * If last_desired > last_actual, we need to grow the file. Write
+ * a zeroed page where last_desired would go.
+ */
+ if (last_desired > last_actual) {
+ if ((ret = __os_seek(
+ env, fhp, last_desired, pagesize, 0)) != 0)
+ return (ret);
+ if ((ret = __os_write(env, fhp, buf, pagesize, &nw)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __ham_31_hashmeta --
+ * Upgrade the database from version 6 to version 7.
+ *
+ * PUBLIC: int __ham_31_hashmeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_31_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HMETA30 *oldmeta;
+ HMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (HMETA31 *)h;
+ oldmeta = (HMETA30 *)h;
+
+ /*
+ * Copy the fields down the page.
+ * The fields may overlap so start at the bottom and use memmove().
+ */
+ memmove(newmeta->spares, oldmeta->spares, sizeof(oldmeta->spares));
+ newmeta->h_charkey = oldmeta->h_charkey;
+ newmeta->nelem = oldmeta->nelem;
+ newmeta->ffactor = oldmeta->ffactor;
+ newmeta->low_mask = oldmeta->low_mask;
+ newmeta->high_mask = oldmeta->high_mask;
+ newmeta->max_bucket = oldmeta->max_bucket;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Update the version. */
+ newmeta->dbmeta.version = 7;
+
+ /* Upgrade the flags. */
+ if (LF_ISSET(DB_DUPSORT))
+ F_SET(&newmeta->dbmeta, DB_HASH_DUPSORT);
+
+ *dirtyp = 1;
+ return (0);
+}
+
+/*
+ * __ham_31_hash --
+ * Upgrade the database hash leaf pages.
+ *
+ * PUBLIC: int __ham_31_hash
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_31_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HKEYDATA *hk;
+ db_pgno_t pgno, tpgno;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(flags, 0);
+
+ ret = 0;
+ for (indx = 0; indx < NUM_ENT(h); indx += 2) {
+ hk = (HKEYDATA *)H_PAIRDATA(dbp, h, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFDUP) {
+ memcpy(&pgno, HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
+ tpgno = pgno;
+ if ((ret = __db_31_offdup(dbp, real_name, fhp,
+ LF_ISSET(DB_DUPSORT) ? 1 : 0, &tpgno)) != 0)
+ break;
+ if (pgno != tpgno) {
+ *dirtyp = 1;
+ memcpy(HOFFDUP_PGNO(hk),
+ &tpgno, sizeof(db_pgno_t));
+ }
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __ham_46_hashmeta --
+ * Upgrade the database from version 8 to version 9.
+ *
+ * PUBLIC: int __ham_46_hashmeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_46_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HMETA33 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (HMETA33 *)h;
+ /* Update the version. */
+ newmeta->dbmeta.version = 9;
+ *dirtyp = 1;
+
+ return (0);
+}
+
+/*
+ * __ham_46_hash --
+ * Upgrade the database hash leaf pages.
+ * From version 8 databases to version 9.
+ * Involves sorting leaf pages, no format change.
+ *
+ * PUBLIC: int __ham_46_hash
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_46_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+
+ if ((ret = __db_cursor(dbp, NULL, NULL, &dbc, 0)) != 0)
+ return (ret);
+ *dirtyp = 1;
+ ret = __ham_sort_page(dbc, NULL, h);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/hash/hash_verify.c b/src/hash/hash_verify.c
new file mode 100644
index 00000000..662e7ac8
--- /dev/null
+++ b/src/hash/hash_verify.c
@@ -0,0 +1,1157 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __ham_dups_unsorted __P((DB *, u_int8_t *, u_int32_t));
+static int __ham_vrfy_bucket __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
+ u_int32_t));
+static int __ham_vrfy_item __P((DB *,
+ VRFY_DBINFO *, db_pgno_t, PAGE *, u_int32_t, u_int32_t));
+
+/*
+ * __ham_vrfy_meta --
+ * Verify the hash-specific part of a metadata page.
+ *
+ * Note that unlike btree, we don't save things off, because we
+ * will need almost everything again to verify each page, and the
+ * amount of state here is significant.
+ *
+ * PUBLIC: int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__ham_vrfy_meta(dbp, vdp, m, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *m;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ HASH *hashp;
+ VRFY_PAGEINFO *pip;
+ int i, ret, t_ret, isbad;
+ u_int32_t pwr, mbucket;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ hashp = dbp->h_internal;
+
+ if (hashp != NULL && hashp->h_hash != NULL)
+ hfunc = hashp->h_hash;
+ else
+ hfunc = __ham_func5;
+
+ /*
+ * If we came through __db_vrfy_pagezero, we have already checked the
+	 * common fields.  However, we used the on-disk metadata page, which
+	 * may have been stale.  We now have the page from mpool, so check
+	 * that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &m->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* h_charkey */
+ if (!LF_ISSET(DB_NOORDERCHK))
+ if (m->h_charkey != hfunc(dbp, CHARKEY, sizeof(CHARKEY))) {
+ EPRINT((env, DB_STR_A("1096",
+"Page %lu: database has custom hash function; reverify with DB_NOORDERCHK set",
+ "%lu"), (u_long)pgno));
+ /*
+ * Return immediately; this is probably a sign of user
+ * error rather than database corruption, so we want to
+ * avoid extraneous errors.
+ */
+ isbad = 1;
+ goto err;
+ }
+
+	/* max_bucket must not be greater than the last pgno. */
+ if (m->max_bucket > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("1097",
+ "Page %lu: Impossible max_bucket %lu on meta page",
+ "%lu %lu"), (u_long)pgno, (u_long)m->max_bucket));
+ /*
+ * Most other fields depend somehow on max_bucket, so
+ * we just return--there will be lots of extraneous
+ * errors.
+ */
+ isbad = 1;
+ goto err;
+ }
+
+ /*
+ * max_bucket, high_mask and low_mask: high_mask must be one
+ * less than the next power of two above max_bucket, and
+ * low_mask must be one less than the power of two below it.
+ */
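+	/*
+	 * For illustration (hypothetical numbers): with max_bucket == 5,
+	 * __db_log2(6) == 3 gives pwr == 8, so high_mask must be 7 and
+	 * low_mask must be 3.
+	 */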
+ pwr = (m->max_bucket == 0) ? 1 : 1 << __db_log2(m->max_bucket + 1);
+ if (m->high_mask != pwr - 1) {
+ EPRINT((env, DB_STR_A("1098",
+ "Page %lu: incorrect high_mask %lu, should be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)m->high_mask,
+ (u_long)pwr - 1));
+ isbad = 1;
+ }
+ pwr >>= 1;
+ if (m->low_mask != pwr - 1) {
+ EPRINT((env, DB_STR_A("1099",
+ "Page %lu: incorrect low_mask %lu, should be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)m->low_mask,
+ (u_long)pwr - 1));
+ isbad = 1;
+ }
+
+ /* ffactor: no check possible. */
+ pip->h_ffactor = m->ffactor;
+
+ /*
+ * nelem: just make sure it's not astronomical for now. This is the
+ * same check that hash_upgrade does, since there was a bug in 2.X
+ * which could make nelem go "negative".
+ */
+ if (m->nelem > 0x80000000) {
+ EPRINT((env, DB_STR_A("1100",
+ "Page %lu: suspiciously high nelem of %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)m->nelem));
+ isbad = 1;
+ pip->h_nelem = 0;
+ } else
+ pip->h_nelem = m->nelem;
+
+ /* flags */
+ if (F_ISSET(&m->dbmeta, DB_HASH_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&m->dbmeta, DB_HASH_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ /* XXX: Why is the DB_HASH_SUBDB flag necessary? */
+
+ /* spares array */
+ for (i = 0; i < NCACHED && m->spares[i] != 0; i++) {
+ /*
+		 * We set mbucket to the maximum bucket that would use a given
+		 * spares entry; the page it maps to must never be greater
+		 * than last_pgno.
+ */
+ mbucket = (1 << i) - 1;
+ if (BS_TO_PAGE(mbucket, m->spares) > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("1101",
+ "Page %lu: spares array entry %d is invalid",
+ "%lu %d"), (u_long)pgno, i));
+ isbad = 1;
+ }
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy --
+ * Verify hash page.
+ *
+ * PUBLIC: int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ham_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t ent, himark, inpend;
+ db_indx_t *inp;
+ int isbad, ret, t_ret;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_HASH && TYPE(h) != P_HASH_UNSORTED) {
+ ret = __db_unknown_path(env, "__ham_vrfy");
+ goto err;
+ }
+
+ /* Verify and save off fields common to all PAGEs. */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+	 * Verify inp[].  The offset of each entry from 0 to NUM_ENT(h)
+	 * must be lower than the previous one, higher than the current end
+	 * of the inp array, and lower than the page size.
+ *
+ * In any case, we return immediately if things are bad, as it would
+ * be unsafe to proceed.
+ */
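+	/*
+	 * For illustration (hypothetical numbers): on a 4096-byte page,
+	 * himark starts at 4096; if inp[0] == 4000 and inp[1] == 3900,
+	 * each offset is below the previous himark and above inpend, which
+	 * grows by sizeof(db_indx_t) per entry, so the items and the
+	 * entries array never overlap.
+	 */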
+ inp = P_INP(dbp, h);
+ for (ent = 0, himark = dbp->pgsize,
+ inpend = (u_int32_t)((u_int8_t *)inp - (u_int8_t *)h);
+ ent < NUM_ENT(h); ent++)
+ if (inp[ent] >= himark) {
+ EPRINT((env, DB_STR_A("1102",
+ "Page %lu: item %lu is out of order or nonsensical",
+ "%lu %lu"), (u_long)pgno, (u_long)ent));
+ isbad = 1;
+ goto err;
+ } else if (inpend >= himark) {
+ EPRINT((env, DB_STR_A("1103",
+ "Page %lu: entries array collided with data",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ goto err;
+
+ } else {
+ himark = inp[ent];
+ inpend += sizeof(db_indx_t);
+ if ((ret = __ham_vrfy_item(
+ dbp, vdp, pgno, h, ent, flags)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor_int(dbp, vdp->thread_info, NULL, DB_HASH,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ if (!LF_ISSET(DB_NOORDERCHK) && TYPE(h) == P_HASH &&
+ (ret = __ham_verify_sorted_page(dbc, h)) != 0)
+ isbad = 1;
+
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy_item --
+ * Given a hash page and an offset, sanity-check the item itself,
+ * and save off any overflow items or off-page dup children as necessary.
+ */
+static int
+__ham_vrfy_item(dbp, vdp, pgno, h, i, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ u_int32_t i, flags;
+{
+ HOFFDUP hod;
+ HOFFPAGE hop;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ db_indx_t offset, len, dlen, elen;
+ int ret, t_ret;
+ u_int8_t *databuf;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (HPAGE_TYPE(dbp, h, i)) {
+ case H_KEYDATA:
+ /* Nothing to do here--everything but the type field is data */
+ break;
+ case H_DUPLICATE:
+ /* Are we a datum or a key? Better be the former. */
+ if (i % 2 == 0) {
+ EPRINT((dbp->env, DB_STR_A("1104",
+ "Page %lu: hash key stored as duplicate item %lu",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ }
+ /*
+ * Dups are encoded as a series within a single HKEYDATA,
+ * in which each dup is surrounded by a copy of its length
+		 * on either side (so that the series can be walked in either
+		 * direction).  We loop through this series and make sure
+ * each dup is reasonable.
+ *
+ * Note that at this point, we've verified item i-1, so
+ * it's safe to use LEN_HKEYDATA (which looks at inp[i-1]).
+ */
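+		/*
+		 * A sketch of one element in the series (each length copy
+		 * is a db_indx_t; layout only, not to scale):
+		 *
+		 *   [dlen][ ...dlen bytes of data... ][dlen]
+		 *
+		 * DUP_SIZE(dlen) spans the datum plus both length copies;
+		 * the trailing copy is what makes a backwards walk possible.
+		 */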
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
+ databuf = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
+ for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
+ memcpy(&dlen, databuf + offset, sizeof(db_indx_t));
+
+ /* Make sure the length is plausible. */
+ if (offset + DUP_SIZE(dlen) > len) {
+ EPRINT((dbp->env, DB_STR_A("1105",
+ "Page %lu: duplicate item %lu has bad length",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Make sure the second copy of the length is the
+ * same as the first.
+ */
+ memcpy(&elen,
+ databuf + offset + dlen + sizeof(db_indx_t),
+ sizeof(db_indx_t));
+ if (elen != dlen) {
+ EPRINT((dbp->env, DB_STR_A("1106",
+ "Page %lu: duplicate item %lu has two different lengths",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ }
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (!LF_ISSET(DB_NOORDERCHK) &&
+ __ham_dups_unsorted(dbp, databuf, len))
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+ break;
+ case H_OFFPAGE:
+ /* Offpage item. Make sure pgno is sane, save off. */
+ memcpy(&hop, P_ENTRY(dbp, h, i), HOFFPAGE_SIZE);
+ if (!IS_VALID_PGNO(hop.pgno) || hop.pgno == pip->pgno ||
+ hop.pgno == PGNO_INVALID) {
+ EPRINT((dbp->env, DB_STR_A("1107",
+ "Page %lu: offpage item %lu has bad pgno %lu",
+ "%lu %lu %lu"), (u_long)pip->pgno, (u_long)i,
+ (u_long)hop.pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ child.pgno = hop.pgno;
+ child.type = V_OVERFLOW;
+ child.tlen = hop.tlen; /* This will get checked later. */
+ if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
+ goto err;
+ break;
+ case H_OFFDUP:
+ /* Offpage duplicate item. Same drill. */
+ memcpy(&hod, P_ENTRY(dbp, h, i), HOFFDUP_SIZE);
+ if (!IS_VALID_PGNO(hod.pgno) || hod.pgno == pip->pgno ||
+ hod.pgno == PGNO_INVALID) {
+ EPRINT((dbp->env, DB_STR_A("1108",
+ "Page %lu: offpage item %lu has bad page number",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ child.pgno = hod.pgno;
+ child.type = V_DUPLICATE;
+ if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
+ goto err;
+ F_SET(pip, VRFY_HAS_DUPS);
+ break;
+ default:
+ EPRINT((dbp->env, DB_STR_A("1109",
+ "Page %lu: item %lu has bad type", "%lu %lu"),
+ (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __ham_vrfy_structure --
+ * Verify the structure of a hash database.
+ *
+ * PUBLIC: int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ham_vrfy_structure(dbp, vdp, meta_pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ DB *pgset;
+ DB_MPOOLFILE *mpf;
+ HMETA *m;
+ PAGE *h;
+ VRFY_PAGEINFO *pip;
+ int isbad, p, ret, t_ret;
+ db_pgno_t pgno;
+ u_int32_t bucket, spares_entry;
+
+ mpf = dbp->mpf;
+ pgset = vdp->pgset;
+ h = NULL;
+ ret = isbad = 0;
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, meta_pgno, &p)) != 0)
+ return (ret);
+ if (p != 0) {
+ EPRINT((dbp->env, DB_STR_A("1110",
+ "Page %lu: Hash meta page referenced twice", "%lu"),
+ (u_long)meta_pgno));
+ return (DB_VERIFY_BAD);
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, meta_pgno)) != 0)
+ return (ret);
+
+ /* Get the meta page; we'll need it frequently. */
+ if ((ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &m)) != 0)
+ return (ret);
+
+ /* Loop through bucket by bucket. */
+ for (bucket = 0; bucket <= m->max_bucket; bucket++)
+ if ((ret =
+ __ham_vrfy_bucket(dbp, vdp, m, bucket, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * There may be unused hash pages corresponding to buckets
+ * that have been allocated but not yet used. These may be
+ * part of the current doubling above max_bucket, or they may
+ * correspond to buckets that were used in a transaction
+ * that then aborted.
+ *
+ * Loop through them, as far as the spares array defines them,
+ * and make sure they're all empty.
+ *
+ * Note that this should be safe, since we've already verified
+ * that the spares array is sane.
+ */
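+	/*
+	 * Worked example of the mapping below, assuming the usual
+	 * BS_TO_PAGE definition, bucket + spares[__db_log2(bucket + 1)]:
+	 * bucket 5 selects spares entry __db_log2(6) == 3, so its first
+	 * page is page 5 + m->spares[3].
+	 */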
+ for (bucket = m->max_bucket + 1; spares_entry = __db_log2(bucket + 1),
+ spares_entry < NCACHED && m->spares[spares_entry] != 0; bucket++) {
+ pgno = BS_TO_PAGE(bucket, m->spares);
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ goto err;
+
+		/* It's okay if such a page is totally zeroed; unmark it. */
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+
+ /* It's also OK if this page is simply invalid. */
+ if (pip->type == P_INVALID) {
+ if ((ret = __db_vrfy_putpageinfo(dbp->env,
+ vdp, pip)) != 0)
+ goto err;
+ continue;
+ }
+
+ if (pip->type != P_HASH && pip->type != P_HASH_UNSORTED) {
+ EPRINT((dbp->env, DB_STR_A("1111",
+ "Page %lu: hash bucket %lu maps to non-hash page",
+ "%lu %lu"), (u_long)pgno, (u_long)bucket));
+ isbad = 1;
+ } else if (pip->entries != 0) {
+ EPRINT((dbp->env, DB_STR_A("1112",
+ "Page %lu: non-empty page in unused hash bucket %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)bucket));
+ isbad = 1;
+ } else {
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((dbp->env, DB_STR_A("1113",
+ "Page %lu: above max_bucket referenced",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ } else {
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_putpageinfo(dbp->env,
+ vdp, pip)) != 0)
+ goto err;
+ continue;
+ }
+ }
+
+ /* If we got here, it's an error. */
+ (void)__db_vrfy_putpageinfo(dbp->env, vdp, pip);
+ goto err;
+ }
+
+err: if ((t_ret = __memp_fput(mpf, vdp->thread_info, m, dbp->priority)) != 0)
+ return (t_ret);
+ if (h != NULL &&
+ (t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
+ return (t_ret);
+	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy_bucket --
+ * Verify a given bucket.
+ */
+static int
+__ham_vrfy_bucket(dbp, vdp, m, bucket, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *m;
+ u_int32_t bucket, flags;
+{
+ ENV *env;
+ HASH *hashp;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *mip, *pip;
+ int ret, t_ret, isbad, p;
+ db_pgno_t pgno, next_pgno;
+ DBC *cc;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+
+ env = dbp->env;
+ isbad = 0;
+ pip = NULL;
+ cc = NULL;
+
+ hashp = dbp->h_internal;
+ if (hashp != NULL && hashp->h_hash != NULL)
+ hfunc = hashp->h_hash;
+ else
+ hfunc = __ham_func5;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO(m), &mip)) != 0)
+ return (ret);
+
+ /* Calculate the first pgno for this bucket. */
+ pgno = BS_TO_PAGE(bucket, m->spares);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ goto err;
+
+ /*
+	 * Hash pages that nothing has ever hashed to may never have actually
+	 * come into existence; it is possible, and legal, for the first page
+	 * in a bucket not to exist.  In that case the VRFY_NONEXISTENT flag
+	 * would have been set in __db_vrfy_walkpages.
+ */
+ if (F_ISSET(pip, VRFY_NONEXISTENT))
+ goto err;
+
+ /* Make sure we got a plausible page number. */
+ if (pgno > vdp->last_pgno ||
+ (pip->type != P_HASH && pip->type != P_HASH_UNSORTED)) {
+ EPRINT((env, DB_STR_A("1114",
+ "Page %lu: impossible first page in bucket %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)bucket));
+ /* Unsafe to continue. */
+ isbad = 1;
+ goto err;
+ }
+
+ if (pip->prev_pgno != PGNO_INVALID) {
+ EPRINT((env, DB_STR_A("1115",
+ "Page %lu: first page in hash bucket %lu has a prev_pgno",
+ "%lu %lu"), (u_long)pgno, (u_long)bucket));
+ isbad = 1;
+ }
+
+ /*
+ * Set flags for dups and sorted dups.
+ */
+ flags |= F_ISSET(mip, VRFY_HAS_DUPS) ? DB_ST_DUPOK : 0;
+ flags |= F_ISSET(mip, VRFY_HAS_DUPSORT) ? DB_ST_DUPSORT : 0;
+
+ /* Loop until we find a fatal bug, or until we run out of pages. */
+ for (;;) {
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_pgset_get(vdp->pgset,
+ vdp->thread_info, vdp->txn, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((env, DB_STR_A("1116",
+ "Page %lu: hash page referenced twice", "%lu"),
+ (u_long)pgno));
+ isbad = 1;
+ /* Unsafe to continue. */
+ goto err;
+ } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+
+ /*
+ * Hash pages that nothing has ever hashed to may never
+ * have actually come into existence, and may appear to be
+ * entirely zeroed. This is acceptable, and since there's
+ * no real way for us to know whether this has actually
+ * occurred, we clear the "wholly zeroed" flag on every
+ * hash page. A wholly zeroed page, by nature, will appear
+ * to have no flags set and zero entries, so should
+ * otherwise verify correctly.
+ */
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+
+ /* If we have dups, our meta page had better know about it. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS) &&
+ !F_ISSET(mip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1117",
+ "Page %lu: duplicates present in non-duplicate database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ /*
+ * If the database has sorted dups, this page had better
+ * not have unsorted ones.
+ */
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT) &&
+ F_ISSET(pip, VRFY_DUPS_UNSORTED)) {
+ EPRINT((env, DB_STR_A("1118",
+ "Page %lu: unsorted dups in sorted-dup database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ /* Walk overflow chains and offpage dup trees. */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pip->pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW) {
+ if ((ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | DB_ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ } else if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(dbp,
+ vdp, child->pgno, flags)) != 0) {
+ isbad = 1;
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ child->pgno, NULL, NULL,
+ flags | DB_ST_RECNUM | DB_ST_DUPSET | DB_ST_TOPLEVEL,
+ NULL, NULL, NULL)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+ /* Close the cursor on vdp, open one on dbp */
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ if ((ret = __db_cursor_int(dbp, vdp->thread_info, NULL,
+ DB_HASH, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &cc)) != 0)
+ goto err;
+ /* If it's safe to check that things hash properly, do so. */
+ if (isbad == 0 && !LF_ISSET(DB_NOORDERCHK) &&
+ (ret = __ham_vrfy_hashing(cc, pip->entries,
+ m, bucket, pgno, flags, hfunc)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ next_pgno = pip->next_pgno;
+ ret = __db_vrfy_putpageinfo(env, vdp, pip);
+
+ pip = NULL;
+ if (ret != 0)
+ goto err;
+
+ if (next_pgno == PGNO_INVALID)
+ break; /* End of the bucket. */
+
+ /* We already checked this, but just in case... */
+ if (!IS_VALID_PGNO(next_pgno)) {
+ EPRINT((env, DB_STR_A("1119",
+ "Page %lu: hash page has bad next_pgno", "%lu"),
+ (u_long)pgno));
+ isbad = 1;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
+ goto err;
+
+ if (pip->prev_pgno != pgno) {
+ EPRINT((env, DB_STR_A("1120",
+ "Page %lu: hash page has bad prev_pgno", "%lu"),
+ (u_long)next_pgno));
+ isbad = 1;
+ }
+ pgno = next_pgno;
+ }
+
+err: if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ if (mip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
+ ret = t_ret;
+ if (pip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy_hashing --
+ * Verify that all items on a given hash page hash correctly.
+ *
+ * PUBLIC: int __ham_vrfy_hashing __P((DBC *,
+ * PUBLIC: u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t,
+ * PUBLIC: u_int32_t (*) __P((DB *, const void *, u_int32_t))));
+ */
+int
+__ham_vrfy_hashing(dbc, nentries, m, thisbucket, pgno, flags, hfunc)
+ DBC *dbc;
+ u_int32_t nentries;
+ HMETA *m;
+ u_int32_t thisbucket;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+{
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t hval, bucket;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ ret = isbad = 0;
+
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_REALLOC);
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
+ return (ret);
+
+ for (i = 0; i < nentries; i += 2) {
+ /*
+ * We've already verified the page integrity and that of any
+ * overflow chains linked off it; it is therefore safe to use
+ * __db_ret. It's also not all that much slower, since we have
+ * to copy every hash item to deal with alignment anyway; we
+ * can tweak this a bit if this proves to be a bottleneck,
+ * but for now, take the easy route.
+ */
+ if ((ret = __db_ret(dbc, h, i, &dbt, NULL, NULL)) != 0)
+ goto err;
+ hval = hfunc(dbp, dbt.data, dbt.size);
+
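+		/*
+		 * Linear-hashing bucket selection; e.g. (values assumed for
+		 * illustration), with max_bucket == 5, high_mask == 0x7 and
+		 * low_mask == 0x3, a hash value of 14 gives 14 & 0x7 == 6,
+		 * which exceeds max_bucket, so the item belongs in
+		 * 14 & 0x3 == 2.
+		 */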
+ bucket = hval & m->high_mask;
+ if (bucket > m->max_bucket)
+ bucket = bucket & m->low_mask;
+
+ if (bucket != thisbucket) {
+ EPRINT((dbp->env, DB_STR_A("1121",
+ "Page %lu: item %lu hashes incorrectly", "%lu %lu"),
+ (u_long)pgno, (u_long)i));
+ isbad = 1;
+ }
+ }
+
+err: if (dbt.data != NULL)
+ __os_ufree(dbp->env, dbt.data);
+ if ((t_ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
+ return (t_ret);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_salvage --
+ * Safely dump out anything that looks like a key on an alleged
+ * hash page.
+ *
+ * PUBLIC: int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
+ * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBT dbt, key_dbt, unkdbt;
+ db_pgno_t dpgno;
+ int ret, err_ret, t_ret;
+ u_int32_t himark, i, ovfl_bufsz;
+ u_int8_t *hk, *p;
+ void *buf, *key_buf;
+ db_indx_t dlen, len, tlen;
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+
+ DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
+
+ err_ret = 0;
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ ovfl_bufsz = dbp->pgsize;
+
+ himark = dbp->pgsize;
+ for (i = 0;; i++) {
+ /* If we're not aggressive, break when we hit NUM_ENT(h). */
+ if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
+ break;
+
+ /*
+		 * Verify the current item.  If we're beyond NUM_ENT, errors
+		 * are expected and ignored.
+ */
+ ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 0, flags, &himark, NULL);
+ /* If this returned a fatality, it's time to break. */
+ if (ret == DB_VERIFY_FATAL) {
+ if (i >= NUM_ENT(h))
+ ret = 0;
+ break;
+ } else if (ret != 0 && i >= NUM_ENT(h)) {
+ /* Not a reportable error, but don't salvage item. */
+ ret = 0;
+ } else if (ret == 0) {
+ /* Set len to total entry length. */
+ len = LEN_HITEM(dbp, h, dbp->pgsize, i);
+ hk = P_ENTRY(dbp, h, i);
+ if (len == 0 || len > dbp->pgsize ||
+ (u_int32_t)(hk + len - (u_int8_t *)h) >
+ dbp->pgsize) {
+ /* Item is unsafely large; skip it. */
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ switch (HPAGE_PTYPE(hk)) {
+ case H_KEYDATA:
+ /* Update len to size of item. */
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
+keydata: memcpy(buf, HKEYDATA_DATA(hk), len);
+ dbt.size = len;
+ dbt.data = buf;
+ if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ break;
+ case H_OFFPAGE:
+ if (len < HOFFPAGE_SIZE) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ memcpy(&dpgno,
+ HOFFPAGE_PGNO(hk), sizeof(dpgno));
+ if ((ret = __db_safe_goff(dbp,
+ vdp, dpgno, &dbt, &buf,
+ &ovfl_bufsz, flags)) != 0) {
+ err_ret = ret;
+ (void)__db_vrfy_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, 0, vdp);
+ /* fallthrough to end of case */
+ } else if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ break;
+ case H_OFFDUP:
+ if (len < HOFFDUP_SIZE) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ memcpy(&dpgno,
+ HOFFDUP_PGNO(hk), sizeof(dpgno));
+ /* UNKNOWN iff pgno is bad or we're a key. */
+ if (!IS_VALID_PGNO(dpgno) || (i % 2 == 0)) {
+ if ((ret =
+ __db_vrfy_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ } else if ((ret = __db_salvage_duptree(dbp,
+ vdp, dpgno, &dbt, handle, callback,
+ flags | DB_SA_SKIPFIRSTKEY)) != 0)
+ err_ret = ret;
+ break;
+ case H_DUPLICATE:
+ /*
+				 * This is an on-page duplicate item; iterate
+ * over the duplicate set, printing out
+ * key/data pairs.
+ */
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
+ /*
+ * If this item is at an even index it must be
+ * a key item and it should never be of type
+ * H_DUPLICATE. If we are in aggressive mode,
+ * print the item out as a normal key, and let
+ * the user resolve the discrepancy.
+ */
+ if (i % 2 == 0) {
+ err_ret = ret;
+ if (LF_ISSET(DB_AGGRESSIVE))
+ goto keydata;
+ break;
+ }
+
+ /*
+ * Check to ensure that the item size is
+ * greater than the smallest possible on page
+ * duplicate.
+ */
+ if (len <
+ HKEYDATA_SIZE(2 * sizeof(db_indx_t))) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+
+ /*
+ * Copy out the key from the dbt, it is still
+ * present from the previous pass.
+ */
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ if ((ret = __os_malloc(
+ dbp->env, dbt.size, &key_buf)) != 0)
+ return (ret);
+ memcpy(key_buf, buf, dbt.size);
+ key_dbt.data = key_buf;
+ key_dbt.size = dbt.size;
+ key_dbt.flags = DB_DBT_USERMEM;
+
+ /* Loop until we hit the total length. */
+ for (tlen = 0; tlen + sizeof(db_indx_t) < len;
+ tlen += dlen + 2 * sizeof(db_indx_t)) {
+ /*
+				 * Print the key for every duplicate
+				 * item except the first, since the key
+				 * was already output once by the
+				 * previous iteration.
+ */
+ if (tlen != 0) {
+ if ((ret = __db_vrfy_prdbt(
+ &key_dbt, 0, " ", handle,
+ callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ }
+ p = HKEYDATA_DATA(hk) + tlen;
+ memcpy(&dlen, p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ /*
+ * If dlen is too long, print all the
+ * rest of the dup set in a chunk.
+ */
+ if (dlen + tlen + sizeof(db_indx_t) >
+ len) {
+ dlen = len -
+ (tlen + sizeof(db_indx_t));
+ err_ret = DB_VERIFY_BAD;
+ }
+ memcpy(buf, p, dlen);
+ dbt.size = dlen;
+ dbt.data = buf;
+ if ((ret = __db_vrfy_prdbt(&dbt, 0, " ",
+ handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ }
+ __os_free(dbp->env, key_buf);
+ break;
+ default:
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ break;
+ err_ret = DB_VERIFY_BAD;
+ break;
+ }
+ }
+ }
+
+ __os_free(dbp->env, buf);
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+ return ((ret == 0 && err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __ham_meta2pgset --
+ * Return the set of hash pages corresponding to the given
+ * known-good meta page.
+ *
+ * PUBLIC: int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
+ * PUBLIC: DB *));
+ */
+int
+__ham_meta2pgset(dbp, vdp, hmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *hmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t bucket, totpgs;
+ int ret, val;
+
+ /*
+ * We don't really need flags, but leave them for consistency with
+ * __bam_meta2pgset.
+ */
+ COMPQUIET(flags, 0);
+ ip = vdp->thread_info;
+
+ DB_ASSERT(dbp->env, pgset != NULL);
+
+ mpf = dbp->mpf;
+ totpgs = 0;
+
+ /*
+ * Loop through all the buckets, pushing onto pgset the corresponding
+ * page(s) for each one.
+ */
+ for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
+ pgno = BS_TO_PAGE(bucket, hmeta->spares);
+
+ /*
+ * We know the initial pgno is safe because the spares array has
+ * been verified.
+ *
+ * Safely walk the list of pages in this bucket.
+ */
+ for (;;) {
+ if ((ret =
+ __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
+ return (ret);
+ if (TYPE(h) == P_HASH || TYPE(h) == P_HASH_UNSORTED) {
+
+ /*
+ * Make sure we don't go past the end of
+ * pgset.
+ */
+ if (++totpgs > vdp->last_pgno) {
+ (void)__memp_fput(mpf,
+ ip, h, dbp->priority);
+ return (DB_VERIFY_BAD);
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0) {
+ (void)__memp_fput(mpf,
+ ip, h, dbp->priority);
+ return (ret);
+ }
+
+ pgno = NEXT_PGNO(h);
+ } else
+ pgno = PGNO_INVALID;
+
+ if ((ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
+ return (ret);
+
+ /* If the new pgno is wonky, go onto the next bucket. */
+ if (!IS_VALID_PGNO(pgno) ||
+ pgno == PGNO_INVALID)
+ break;
+
+ /*
+ * If we've touched this page before, we have a cycle;
+ * go on to the next bucket.
+ */
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &val)) != 0)
+ return (ret);
+ if (val != 0)
+ break;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __ham_dups_unsorted --
+ * Takes a known-safe hash duplicate set and its total length.
+ * Returns 1 if there are out-of-order duplicates in this set,
+ * 0 if there are not.
+ */
+static int
+__ham_dups_unsorted(dbp, buf, len)
+ DB *dbp;
+ u_int8_t *buf;
+ u_int32_t len;
+{
+ DBT a, b;
+ db_indx_t offset, dlen;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ memset(&a, 0, sizeof(DBT));
+ memset(&b, 0, sizeof(DBT));
+
+ func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+
+ /*
+ * Loop through the dup set until we hit the end or we find
+ * a pair of dups that's out of order. b is always the current
+ * dup, a the one before it.
+ */
+ for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
+ memcpy(&dlen, buf + offset, sizeof(db_indx_t));
+ b.data = buf + offset + sizeof(db_indx_t);
+ b.size = dlen;
+
+ if (a.data != NULL && func(dbp, &a, &b) > 0)
+ return (1);
+
+ a.data = b.data;
+ a.size = b.size;
+ }
+
+ return (0);
+}
diff --git a/src/heap/heap.c b/src/heap/heap.c
new file mode 100644
index 00000000..ab404658
--- /dev/null
+++ b/src/heap/heap.c
@@ -0,0 +1,2812 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __heap_bulk __P((DBC *, DBT *, u_int32_t));
+static int __heap_getpage __P((DBC *, u_int32_t, u_int8_t *));
+static int __heapc_close __P((DBC *, db_pgno_t, int *));
+static int __heapc_del __P((DBC *, u_int32_t));
+static int __heapc_destroy __P((DBC *));
+static int __heapc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __heapc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __heapc_reloc __P((DBC *, DBT *, DBT *));
+static int __heapc_reloc_partial __P((DBC *, DBT *, DBT *));
+static int __heapc_split __P((DBC *, DBT *, DBT *, int));
+
+/*
+ * Acquire a new page/lock. If we are already holding a page and a lock
+ * we discard those and get the new ones. In this case we can use
+ * LCK_COUPLE to save a trip to the lock manager.  If we are not holding a
+ * page or lock, we simply get a new lock and page.  The lock is released
+ * with a transactional lock put.
+ */
+#undef ACQUIRE
+#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, flags, mflags, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ if ((pagep) != NULL) { \
+ ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } \
+ if ((ret) == 0 && STD_LOCKING(dbc)) \
+ ret = __db_lget(dbc, \
+ LOCK_ISSET(lock) ? LCK_COUPLE : 0, \
+ lpgno, mode, flags, &(lock)); \
+ if ((ret) == 0) \
+ ret = __memp_fget(__mpf, &(fpgno), \
+ (dbc)->thread_info, (dbc)->txn, mflags, &(pagep)); \
+} while (0)
+
+/* Acquire a new page/lock for a heap cursor */
+#undef ACQUIRE_CUR
+#define ACQUIRE_CUR(dbc, mode, p, flags, mflags, ret) do { \
+ HEAP_CURSOR *__cp = (HEAP_CURSOR *)(dbc)->internal; \
+ if (p != __cp->pgno) \
+ __cp->pgno = PGNO_INVALID; \
+ ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, flags, mflags, ret); \
+ if ((ret) == 0) { \
+ __cp->pgno = p; \
+ __cp->lock_mode = (mode); \
+ } \
+} while (0)
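+/*
+ * Typical usage, as in __heapc_get below (a sketch): step the cursor to
+ * page pgno with a read lock, mapping "no such page" to not-found:
+ *
+ *	ACQUIRE_CUR(dbc, DB_LOCK_READ, pgno, 0, 0, ret);
+ *	if (ret == DB_PAGE_NOTFOUND)
+ *		ret = DB_NOTFOUND;
+ */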
+
+/* Discard the current page/lock for a cursor, indicate txn lock release */
+#undef DISCARD
+#define DISCARD(dbc, pagep, lock, tlock, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __t_ret; \
+ __t_ret = 0; \
+ if ((pagep) != NULL) { \
+ __t_ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ if (tlock == 1) \
+ __t_ret = __TLPUT((dbc), lock); \
+ else \
+ __t_ret = __LPUT((dbc), lock); \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+} while (0)
+
+/*
+ * __heapc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __heapc_init __P((DBC *));
+ */
+int
+__heapc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ if (dbc->internal == NULL)
+ if ((ret = __os_calloc(
+ env, 1, sizeof(HEAP_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __heap_bulk;
+ dbc->am_close = __heapc_close;
+ dbc->am_del = __heapc_del;
+ dbc->am_destroy = __heapc_destroy;
+ dbc->am_get = __heapc_get;
+ dbc->am_put = __heapc_put;
+ dbc->am_writelock = NULL;
+
+ return (0);
+}
+
+static int
+__heap_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_RID prev_rid, rid;
+ DBT sdata;
+ HEAP_CURSOR *cp;
+ HEAPHDR *hdr;
+ HEAPSPLITHDR *shdr;
+ PAGE *pg;
+ db_lockmode_t lock_type;
+ int is_key, ret;
+ int32_t *offp;
+ u_int32_t data_size, key_size, needed, space;
+ u_int8_t *dbuf, *np;
+
+ ret = 0;
+ dbp = dbc->dbp;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ hdr = NULL;
+ shdr = NULL;
+
+ /* Check for additional bits for locking */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ /*
+ * np is the next place to copy things into the buffer.
+ * dbuf always stays at the beginning of the buffer.
+ */
+ dbuf = data->data;
+ np = dbuf;
+
+	/* Track the space left; reserve room for the termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ offp = (int32_t *)((u_int8_t *)dbuf + data->ulen);
+ offp--;
+
+ /*
+ * key_size and data_size hold the 32-bit aligned size of the key and
+ * data values written to the buffer.
+ */
+ key_size = DB_ALIGN(DB_HEAP_RID_SZ, sizeof(u_int32_t));
+ data_size = 0;
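+	/*
+	 * (For example, assuming a 4-byte db_pgno_t and a 2-byte db_indx_t,
+	 * DB_HEAP_RID_SZ is 6 and key_size rounds up to 8.)
+	 */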
+
+ /* is_key indicates whether keys are returned. */
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+
+next_pg:
+ rid.indx = cp->indx;
+ rid.pgno = cp->pgno;
+ pg = cp->page;
+
+ /*
+ * Write records to the buffer, in the format needed by the DB_MULTIPLE
+ * macros. For a description of the data layout, see db.h.
+ */
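+	/*
+	 * Roughly (a sketch, not to scale): records are copied in from the
+	 * front of the buffer while the offset/size table grows backwards
+	 * from the end, with a -1 terminator written last:
+	 *
+	 *   [key][data][key][data] ...  ... [-1][size][off] ... [size][off]
+	 *   dbuf/np -->                      <-- offp
+	 */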
+ do {
+ if (HEAP_OFFSETTBL(dbp, pg)[rid.indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, pg, rid.indx);
+ /*
+ * If this is a split record and not the first piece of the
+ * record, skip it.
+ */
+ if (F_ISSET(hdr, HEAP_RECSPLIT) &&
+ !F_ISSET(hdr, HEAP_RECFIRST))
+ continue;
+
+ /*
+ * Calculate how much space is needed to add this record. If
+ * there's not enough, we're done. If we haven't written any
+ * data to the buffer, or if we are doing a DBP->get, return
+ * DB_BUFFER_SMALL.
+ */
+ needed = 0;
+ if (is_key)
+ needed = 2 * sizeof(*offp) + key_size;
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ shdr = (HEAPSPLITHDR *)hdr;
+ data_size = DB_ALIGN(shdr->tsize, sizeof(u_int32_t));
+ } else
+ data_size = DB_ALIGN(hdr->size, sizeof(u_int32_t));
+ needed += 2 * sizeof(*offp) + data_size;
+
+ if (needed > space) {
+ if (np == dbuf || F_ISSET(dbc, DBC_FROM_DB_GET)) {
+ data->size = (u_int32_t)DB_ALIGN(
+ needed + data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ break;
+ }
+
+ if (is_key) {
+ memcpy(np, &rid, key_size);
+ *offp-- = (int32_t)(np - dbuf);
+ *offp-- = (int32_t)DB_HEAP_RID_SZ;
+ np += key_size;
+ }
+
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ /*
+ * Use __heapc_gsplit to write a split record to the
+ * return buffer. gsplit will return any fetched pages
+ * to the cache, but will leave the cursor's current
+ * page alone.
+ */
+ memset(&sdata, 0, sizeof(DBT));
+ sdata.data = np;
+ sdata.size = sdata.ulen = shdr->tsize;
+ sdata.flags = DB_DBT_USERMEM;
+ /* gsplit expects the cursor to be positioned. */
+ cp->pgno = rid.pgno;
+ cp->indx = rid.indx;
+ if ((ret = __heapc_gsplit(
+ dbc, &sdata, NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ memcpy(np,
+ (u_int8_t *)hdr + sizeof(HEAPHDR), hdr->size);
+ }
+ *offp-- = (int32_t)(np - dbuf);
+ if (F_ISSET(hdr, HEAP_RECSPLIT))
+ *offp-- = (int32_t)shdr->tsize;
+ else
+ *offp-- = (int32_t)hdr->size;
+ np += data_size;
+ space -= needed;
+ prev_rid = rid;
+
+ /*
+ * The data and "metadata" ends of the buffer should never
+ * overlap.
+ */
+ DB_ASSERT(dbp->env, (void *)np <= (void *)offp);
+ } while (++rid.indx < NUM_ENT(pg));
+
+ /* If we are off the page then try the next page. */
+ if (rid.indx >= NUM_ENT(pg)) {
+ rid.pgno++;
+ ACQUIRE_CUR(dbc, lock_type, rid.pgno, 0, 0, ret);
+ if (ret == 0) {
+ cp->indx = 0;
+ goto next_pg;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ return (ret);
+ }
+
+ DB_ASSERT(dbp->env, (ret == 0 || ret == DB_PAGE_NOTFOUND));
+ cp->indx = prev_rid.indx;
+ cp->pgno = prev_rid.pgno;
+
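+	/* Terminate the offset table; -1 ends DB_MULTIPLE iteration. */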
+ *offp = -1;
+
+ return (0);
+}
+
+static int
+__heapc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ DB_MPOOLFILE *mpf;
+ HEAP_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, 0);
+
+ cp = (HEAP_CURSOR *)dbc->internal;
+ mpf = dbc->dbp->mpf;
+ ret = 0;
+
+ /* Release the page/lock held by the cursor. */
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ if (ret == 0 && !LOCK_ISSET(cp->lock))
+ cp->lock_mode = DB_LOCK_NG;
+
+ return (ret);
+}
+
+static int
+__heapc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_RID next_rid, orig_rid;
+ DB_MPOOLFILE *mpf;
+ DBT hdr_dbt, log_dbt;
+ HEAP *h;
+ HEAPHDR *hdr;
+ HEAPPG *rpage;
+ HEAP_CURSOR *cp;
+ db_pgno_t region_pgno;
+ int oldspacebits, ret, spacebits, t_ret;
+ u_int16_t data_size, size;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = dbp->heap_internal;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ rpage = NULL;
+ COMPQUIET(flags, 0);
+
+ /*
+ * We need to be able to reset the cursor after deleting a record split
+ * across multiple pages.
+ */
+ orig_rid.pgno = cp->pgno;
+ orig_rid.indx = cp->indx;
+
+ /*
+ * This code is always called with a page lock but no page.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+
+ /* We have a read lock, but need a write lock. */
+start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ return (ret);
+
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), oldspacebits);
+
+ hdr = (HEAPHDR *)P_ENTRY(dbp, cp->page, cp->indx);
+ data_size = DB_ALIGN(hdr->size, sizeof(u_int32_t));
+ size = data_size + HEAP_HDRSIZE(hdr);
+ if (size < sizeof(HEAPSPLITHDR))
+ size = sizeof(HEAPSPLITHDR);
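+	/*
+	 * (Every slot is charged at least sizeof(HEAPSPLITHDR) bytes,
+	 * presumably so any freed slot can later hold a split-record
+	 * piece; __heap_ditem asserts this minimum.)
+	 */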
+ if (F_ISSET(hdr, HEAP_RECSPLIT) && !F_ISSET(hdr, HEAP_RECLAST)) {
+ next_rid.pgno = F_ISSET(hdr, HEAP_RECLAST) ?
+ PGNO_INVALID : ((HEAPSPLITHDR *)hdr)->nextpg;
+ next_rid.indx = F_ISSET(hdr, HEAP_RECLAST) ?
+ PGNO_INVALID : ((HEAPSPLITHDR *)hdr)->nextindx;
+ } else {
+ next_rid.pgno = PGNO_INVALID;
+ next_rid.indx = 0;
+ }
+
+ /* Log the deletion. */
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(hdr);
+ log_dbt.data = (u_int8_t *)hdr + hdr_dbt.size;
+ log_dbt.size = data_size;
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_ditem(dbc, cp->page, cp->indx, size)) != 0)
+ goto err;
+
+ /*
+ * If the deleted item lived in a region prior to our current, back up
+ * the current region, giving us a chance to reuse the newly available
+ * space on the next insert.
+ */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+ if (region_pgno < h->curregion)
+ h->curregion = region_pgno;
+
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), spacebits);
+
+ if (spacebits != oldspacebits) {
+ /*
+ * Get the region page. We never lock the region page, the data
+ * page lock locks the corresponding bits in the bitmap and
+ * latching serializes access.
+ */
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+ HEAP_SETSPACE(dbp, rpage,
+ cp->pgno - region_pgno - 1, spacebits);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ rpage = NULL;
+
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+
+ if (ret == 0 && next_rid.pgno != PGNO_INVALID) {
+ cp->pgno = next_rid.pgno;
+ cp->indx = next_rid.indx;
+ goto start;
+ }
+
+ cp->pgno = orig_rid.pgno;
+ cp->indx = orig_rid.indx;
+
+ return (ret);
+}
+
+/*
+ * __heap_ditem --
+ * Remove an item from a page.
+ *
+ * PUBLIC: int __heap_ditem
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__heap_ditem(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ db_indx_t first, i, max, off, *offtbl, span;
+ u_int8_t *src, *dest;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbp->env, TYPE(pagep) == P_HEAP);
+ DB_ASSERT(dbp->env, nbytes == DB_ALIGN(nbytes, sizeof(u_int32_t)));
+ DB_ASSERT(dbp->env, nbytes >= sizeof(HEAPSPLITHDR));
+
+ offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, pagep);
+ off = offtbl[indx];
+ /*
+	 * Find the lowest offset on the page and adjust any offsets that are
+	 * about to be moved.  If the deleted item is the lowest offset on the
+	 * page, everything still works; that is not a special case.
+	 */
+ max = HEAP_HIGHINDX(pagep);
+ first = HOFFSET(pagep);
+ for (i = 0; i <= max; i++) {
+ if (offtbl[i] < off && offtbl[i] != 0)
+ offtbl[i] += nbytes;
+ }
+ offtbl[indx] = 0;
+
+ /*
+ * Coalesce free space at the beginning of the page. Shift all the data
+ * preceding the deleted entry down, overwriting the deleted entry.
+ */
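+	/*
+	 * A sketch, with the free region ending at HOFFSET ("first") and the
+	 * deleted item occupying [off, off + nbytes):
+	 *
+	 *   before:  [.. free ..][ A ][deleted][ B ]
+	 *   after:   [..... free .....][ A ][ B ]
+	 *
+	 * Everything in [first, off) slides up by nbytes, and the offsets of
+	 * those moved entries were already bumped by nbytes above.
+	 */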
+ src = (u_int8_t *)(pagep) + first;
+ dest = src + nbytes;
+ span = off - first;
+ memmove(dest, src, span);
+#ifdef DIAGNOSTIC
+ memset(src, CLEAR_BYTE, nbytes);
+#endif
+
+ /* Update the page's metadata. */
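+	/*
+	 * (HEAP_HIGHINDX tracks the highest occupied slot and HEAP_FREEINDX
+	 * the lowest slot an insert should try; the adjustments below walk
+	 * HIGHINDX down past empty slots and keep FREEINDX <= HIGHINDX + 1.)
+	 */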
+ NUM_ENT(pagep)--;
+ HOFFSET(pagep) += nbytes;
+ if (indx < HEAP_FREEINDX(pagep))
+ HEAP_FREEINDX(pagep) = indx;
+ while (HEAP_HIGHINDX(pagep) > 0 && offtbl[HEAP_HIGHINDX(pagep)] == 0)
+ HEAP_HIGHINDX(pagep)--;
+ if (NUM_ENT(pagep) == 0)
+ HEAP_FREEINDX(pagep) = 0;
+ else if (HEAP_FREEINDX(pagep) > HEAP_HIGHINDX(pagep) + 1)
+ HEAP_FREEINDX(pagep) = HEAP_HIGHINDX(pagep) + 1;
+
+ return (0);
+}
+
+static int
+__heapc_destroy(dbc)
+ DBC *dbc;
+{
+ HEAP_CURSOR *cp;
+
+ cp = (HEAP_CURSOR *)dbc->internal;
+ __os_free(dbc->env, cp);
+ dbc->internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __heapc_get --
+ * Get using a cursor (heap).
+ */
+static int
+__heapc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_HEAP_RID rid;
+ DB_MPOOLFILE *mpf;
+ DB_LOCK meta_lock;
+ DBT tmp_val;
+ HEAP *h;
+ HEAPHDR *hdr;
+ HEAPMETA *meta;
+ HEAPPG *dpage;
+ HEAP_CURSOR *cp;
+ db_lockmode_t lock_type;
+ db_pgno_t pgno;
+ int cmp, f_indx, found, getpage, indx, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = dbp->heap_internal;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ LOCK_INIT(meta_lock);
+ COMPQUIET(pgnop, NULL);
+
+ if (F_ISSET(key, DB_DBT_USERMEM) && key->ulen < DB_HEAP_RID_SZ) {
+ key->size = DB_HEAP_RID_SZ;
+ return (DB_BUFFER_SMALL);
+ }
+
+ /* Check for additional bits for locking */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ ret = 0;
+ found = getpage = FALSE;
+ meta = NULL;
+ dpage = NULL;
+ switch (flags) {
+	case DB_CURRENT:
+ /*
+		 * Acquire the current page with a read lock unless the
+		 * user has asked for a write lock.  Make sure the page
+		 * and record still exist.
+ */
+ ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
+ if (ret != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (HEAP_OFFSETTBL(dbp, cp->page)[cp->indx] == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, cp->indx);
+ if (F_ISSET(hdr, HEAP_RECSPLIT) &&
+ !F_ISSET(hdr, HEAP_RECFIRST)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ break;
+ case DB_FIRST:
+ /*
+		 * The region pages do not distinguish between an empty
+		 * page and a page with something on it.  So we grab the
+		 * first possible data page and look for the lowest index
+		 * with data.  If the page is empty, we go on to the next
+		 * page and look.  If there is no page, there are no records.
+ */
+first: pgno = FIRST_HEAP_DPAGE;
+ while (!found) {
+ /* Put old lock/page and get the new lock/page */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+			if (ret != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+ /*
+ * The page needs to be a data page with entries on
+ * it. If page is good, loop through the offset table
+ * finding first non-split record or first piece of a
+ * split record, then set up cursor.
+ */
+ if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) {
+ for (indx = 0;
+ indx <= HEAP_HIGHINDX(dpage); indx++) {
+ if (HEAP_OFFSETTBL(
+ dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(
+ dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->pgno = pgno;
+ cp->indx = indx;
+ break;
+ }
+ }
+ if (!found)
+ pgno++;
+ } else
+ pgno++;
+ }
+ break;
+ case DB_LAST:
+ /*
+ * Grab the metadata page to find the last page, and start
+ * there looking backwards for the record with the highest
+ * index and return that one.
+ */
+last: pgno = PGNO_BASE_MD;
+ ACQUIRE(dbc, DB_LOCK_READ,
+ pgno, meta_lock, pgno, meta, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+
+ pgno = meta->dbmeta.last_pgno;
+
+ /*
+ * It is possible to have another page added while we are
+		 * searching backwards for the last record.  There is no need
+		 * to block that case by holding the meta page lock.
+ */
+ DISCARD(dbc, meta, meta_lock, 1, ret);
+ if (ret != 0)
+ goto err;
+
+ while (!found) {
+ /* Don't look earlier than the first data page. */
+ if (pgno < FIRST_HEAP_DPAGE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Put old lock/page and get the new lock/page. */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+ dpage = (HEAPPG *)cp->page;
+ /*
+ * The page needs to be a data page with entries on
+			 * it.  If the page is good, search backwards until a
+			 * non-split record or the first piece of a split record
+ * is found.
+ */
+ if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) {
+ for (indx = HEAP_HIGHINDX(dpage);
+ indx >= 0; indx--) {
+ if (HEAP_OFFSETTBL(
+ dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(
+ dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->pgno = pgno;
+ cp->indx = indx;
+ break;
+ }
+ }
+ if (!found)
+ pgno--;
+ } else
+ pgno--;
+ }
+ break;
+ case DB_NEXT_NODUP:
+ case DB_NEXT:
+		/* If the cursor is not initialized, behave as DB_FIRST. */
+ if (dbc->internal->pgno == PGNO_INVALID)
+ goto first;
+
+ /*
+ * Acquire the current page with the lock we have already,
+ * unless user has asked for a write lock.
+ */
+ ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+ dpage = (HEAPPG *)cp->page;
+
+ /* At end of current page, must get next page */
+ if (cp->indx >= HEAP_HIGHINDX(dpage))
+ getpage = TRUE;
+
+ while (!found) {
+ if (getpage) {
+ pgno = cp->pgno + 1;
+
+ /* Put current page/lock and get next one */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+ if (ret != 0) {
+ /* Beyond last page? */
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+
+ /*
+				 * If the page is not a data page (e.g.
+				 * it's a region page), or it's a data
+				 * page without entries, try again.
+ */
+ if (TYPE(dpage) != P_HEAP ||
+ (TYPE(dpage) == P_HEAP &&
+ NUM_ENT(dpage) == 0))
+ continue;
+
+ /* When searching, indx gets bumped to 0 */
+ cp->indx = -1;
+ getpage = FALSE;
+ }
+
+ /*
+ * Bump index and loop through the offset table finding
+ * first nonzero entry. If the offset is for a split
+ * record, make sure it's the first piece of the split
+ * record. HEAP_HIGHINDX always points to highest filled
+ * entry on page.
+ */
+ cp->indx++;
+			for (indx = cp->indx;
+			    indx <= HEAP_HIGHINDX(dpage); indx++) {
+ if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->indx = indx;
+ break;
+ }
+ }
+
+ /* Nothing of interest on page, so try next */
+ if (!found)
+ getpage = TRUE;
+ }
+ break;
+ case DB_PREV_NODUP:
+ case DB_PREV:
+		/* If the cursor is not initialized, behave as DB_LAST. */
+ if (dbc->internal->pgno == PGNO_INVALID)
+ goto last;
+
+ /*
+ * Acquire the current page with the lock we have already,
+ * unless user has asked for a write lock.
+ */
+ ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+ dpage = (HEAPPG *)cp->page;
+
+ /*
+ * Loop through indexes and find first used slot. Check if
+ * already at the first slot.
+ */
+		for (f_indx = 0; (f_indx <= HEAP_HIGHINDX(dpage)) &&
+		    (HEAP_OFFSETTBL(dbp, dpage)[f_indx] == 0); f_indx++)
+			;
+
+ /* At the beginning of current page, must get new page */
+ if (cp->indx == 0 || cp->indx <= f_indx) {
+ if (cp->pgno == FIRST_HEAP_DPAGE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ getpage = TRUE;
+ }
+
+ while (!found) {
+ if (getpage) {
+ pgno = cp->pgno - 1;
+ /* Do not go past first page */
+ if (pgno < FIRST_HEAP_DPAGE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ /* Put current page/lock and get prev page. */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+
+ dpage = (HEAPPG *)cp->page;
+
+ /*
+				 * If the page is not a data page (e.g.
+				 * it's a region page), or it's a data
+				 * page without entries, try again.
+ */
+ if (TYPE(dpage) != P_HEAP ||
+ (TYPE(dpage) == P_HEAP &&
+ NUM_ENT(dpage) == 0))
+ continue;
+
+				/* The decrement below lands on the high index. */
+ cp->indx = HEAP_HIGHINDX(dpage) + 1;
+ getpage = FALSE;
+ }
+
+ /*
+ * Decrement index and loop through the offset table
+ * finding previous nonzero entry.
+ */
+ cp->indx--;
+			for (indx = cp->indx; indx >= 0; indx--) {
+ if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->indx = indx;
+ break;
+ }
+ }
+
+ /* Nothing of interest on page, so try previous */
+ if (!found)
+ getpage = TRUE;
+ }
+ break;
+ case DB_GET_BOTH_RANGE:
+ case DB_GET_BOTH:
+ case DB_SET_RANGE:
+ case DB_SET:
+ pgno = ((DB_HEAP_RID *)key->data)->pgno;
+ indx = ((DB_HEAP_RID *)key->data)->indx;
+
+ /* First make sure we're trying to get a data page. */
+ if (pgno == PGNO_BASE_MD ||
+ pgno == HEAP_REGION_PGNO(dbp, pgno)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Lock the data page and get it. */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+
+ if (ret != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+
+		/* Validate the requested index; fail if out of range. */
+ if ((indx > HEAP_HIGHINDX(dpage)) ||
+ (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)) {
+ DISCARD(dbc, cp->page, cp->lock, 0, ret);
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (F_ISSET(hdr, HEAP_RECSPLIT) &&
+ !F_ISSET(hdr, HEAP_RECFIRST)) {
+ DISCARD(dbc, cp->page, cp->lock, 0, ret);
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ cp->pgno = pgno;
+ cp->indx = indx;
+
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTH_RANGE) {
+ memset(&tmp_val, 0, sizeof(DBT));
+			/* Does the data match? */
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ tmp_val.flags = DB_DBT_MALLOC;
+				if ((ret = __heapc_gsplit(
+				    dbc, &tmp_val, NULL, NULL)) != 0)
+ goto err;
+ } else {
+ tmp_val.data =
+ (void *)((u_int8_t *)hdr + sizeof(HEAPHDR));
+ tmp_val.size = hdr->size;
+ }
+ cmp = __bam_defcmp(dbp, &tmp_val, data);
+ if (F_ISSET(&tmp_val, DB_DBT_MALLOC))
+ __os_ufree(dbp->env, tmp_val.data);
+ if (cmp != 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+
+ break;
+ case DB_NEXT_DUP:
+ case DB_PREV_DUP:
+ ret = DB_NOTFOUND;
+ goto err;
+ default:
+ /* DB_GET_RECNO, DB_JOIN_ITEM, DB_SET_RECNO are invalid */
+ ret = __db_unknown_flag(dbp->env, "__heap_get", flags);
+		goto err;
+	}
+
+err:	if (ret == 0) {
+ if (key != NULL) {
+ rid.pgno = cp->pgno;
+ rid.indx = cp->indx;
+ ret = __db_retcopy(dbp->env, key, &rid,
+ DB_HEAP_RID_SZ, &dbc->rkey->data, &dbc->rkey->ulen);
+ F_SET(key, DB_DBT_ISSET);
+		}
+	} else {
+ if (meta != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ if (LOCK_ISSET(meta_lock))
+ (void)__LPUT(dbc, meta_lock);
+ if (LOCK_ISSET(cp->lock))
+ (void)__LPUT(dbc, cp->lock);
+ }
+ DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ return (ret);
+}
+
+#undef IS_FIRST
+#define IS_FIRST (last_rid.pgno == PGNO_INVALID)
+/*
+ * __heapc_reloc_partial --
+ * Move data from a too-full page to a new page. The old data page must
+ * be write locked before calling this method.
+ */
+static int
+__heapc_reloc_partial(dbc, key, data)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+{
+ DB *dbp;
+ DBT hdr_dbt, log_dbt, t_data, t_key;
+ DB_HEAP_RID last_rid, next_rid;
+ HEAPHDR *old_hdr;
+ HEAPSPLITHDR new_hdr;
+ HEAP_CURSOR *cp;
+ int add_bytes, ret;
+ u_int32_t buflen, data_size, dlen, doff, left, old_size;
+ u_int32_t remaining, size;
+ u_int8_t *buf, *olddata;
+
+ dbp = dbc->dbp;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ memset(&hdr_dbt, 0, sizeof(DBT));
+ memset(&log_dbt, 0, sizeof(DBT));
+ buf = NULL;
+ COMPQUIET(key, NULL);
+
+ /* We only work on partial puts. */
+ DB_ASSERT(dbp->env, F_ISSET(data, DB_DBT_PARTIAL));
+
+ /*
+ * Start by calculating the data_size, total size of the new record, and
+ * dlen, the number of bytes we will actually overwrite. Keep a local
+ * copy of doff, we'll adjust it as we see pieces of the record so that
+ * it's always relative to the current piece of data.
+ */
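+	/*
+	 * A worked example, numbers assumed for illustration: old_size == 10,
+	 * doff == 4, data->dlen == 3 and data->size == 5 overwrite bytes
+	 * 4..6 and give data_size == 10 - 3 + 5 == 12.  If instead
+	 * doff == 12, the put is post-pending: bytes 10 and 11 are
+	 * zero-filled and data_size == doff + data->size == 17.
+	 */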
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT))
+ old_size = ((HEAPSPLITHDR *)old_hdr)->tsize;
+ else
+ old_size = old_hdr->size;
+ doff = data->doff;
+ if (old_size < doff) {
+ /* Post-pending */
+ dlen = data->dlen;
+ data_size = doff + data->size;
+ } else {
+ if (old_size - doff < data->dlen)
+ dlen = old_size - doff;
+ else
+ dlen = data->dlen;
+ data_size = old_size - dlen + data->size;
+ }
+
+ /*
+ * We don't need a buffer large enough to hold the data_size
+ * bytes, just one large enough to hold the bytes that will be
+ * written to an individual page. We'll realloc to the necessary size
+ * as needed.
+ */
+ buflen = 0;
+ buf = NULL;
+
+ /*
+ * We are updating an existing record, which will grow into a split
+ * record. The strategy is to overwrite the existing record (or each
+ * piece of the record if the record is already split.) If the new
+ * record is shorter than the old, delete any extra pieces. If the new
+ * record is longer than the old, use heapc_split() to write the extra
+ * data.
+ *
+ * We start each loop with old_hdr pointed at the header for the old
+ * record and the necessary page write locked in cp->page.
+ */
+ last_rid.pgno = PGNO_INVALID;
+ last_rid.indx = 0;
+ add_bytes = 1;
+ left = data_size;
+ memset(&t_data, 0, sizeof(DBT));
+ remaining = 0;
+ for (;;) {
+ /* Figure out if we have a next piece. */
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ next_rid.pgno = ((HEAPSPLITHDR *)old_hdr)->nextpg;
+ next_rid.indx = ((HEAPSPLITHDR *)old_hdr)->nextindx;
+ } else {
+ next_rid.pgno = PGNO_INVALID;
+ next_rid.indx = 0;
+ }
+
+ /*
+ * Before we delete the old data, use it to construct the new
+ * data. First figure out the size of the new piece, including
+ * any remaining data from the last piece.
+ */
+ if (doff >= old_hdr->size)
+ if (F_ISSET(old_hdr, HEAP_RECLAST) ||
+ !F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ /* Post-pending. */
+ data_size = doff + data->size;
+ } else {
+ /* The new piece is just the old piece. */
+ data_size = old_hdr->size;
+ }
+ else if (doff + dlen > old_hdr->size)
+ /*
+ * Some of the to-be-overwritten bytes are on the next
+ * piece, but we'll append all the new bytes to this
+ * piece if we haven't already written them.
+ */
+ data_size = doff + (add_bytes ? data->size : 0);
+ else
+ data_size = old_hdr->size -
+ dlen + (add_bytes ? data->size : 0);
+ data_size += remaining;
+
+ if (data_size > buflen) {
+			if ((ret = __os_realloc(dbp->env,
+			    data_size, &buf)) != 0)
+				goto err;
+ buflen = data_size;
+ }
+ t_data.data = buf;
+
+ /*
+		 * Adjust past any remaining bytes; they've already been moved
+ * to the beginning of the buffer.
+ */
+ buf += remaining;
+ remaining = 0;
+
+ olddata = (u_int8_t *)old_hdr + HEAP_HDRSIZE(old_hdr);
+ if (doff >= old_hdr->size) {
+ memcpy(buf, olddata, old_hdr->size);
+ doff -= old_hdr->size;
+ if (F_ISSET(old_hdr, HEAP_RECLAST) ||
+ !F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ /* Post-pending. */
+ buf += old_hdr->size;
+ memset(buf, '\0', doff);
+ buf += doff;
+ memcpy(buf, data->data, data->size);
+ }
+ } else {
+ /* Preserve the first doff bytes. */
+ memcpy(buf, olddata, doff);
+ buf += doff;
+ olddata += doff;
+ /* Copy in the new bytes, if needed. */
+ if (add_bytes) {
+ memcpy(buf, data->data, data->size);
+ buf += data->size;
+ add_bytes = 0;
+ }
+ /* Skip dlen bytes. */
+ if (doff + dlen < old_hdr->size) {
+ olddata += dlen;
+ memcpy(buf,
+ olddata, old_hdr->size - doff - dlen);
+ dlen = 0;
+ } else
+ /*
+ * The data to be removed spills over onto the
+ * following page(s). Adjust dlen to account
+ * for the bytes removed from this page.
+ */
+ dlen = doff + dlen - old_hdr->size;
+ doff = 0;
+ }
+ buf = t_data.data;
+
+ /* Delete the old data, after logging it. */
+ old_size = DB_ALIGN(
+ old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ if (old_size < sizeof(HEAPSPLITHDR))
+ old_size = sizeof(HEAPSPLITHDR);
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_ditem(
+ dbc, cp->page, cp->indx, old_size)) != 0)
+ goto err;
+
+ if (left == 0)
+ /*
+			 * We've finished writing the new record; we're just
+ * cleaning up the old record now.
+ */
+ goto next_pg;
+
+ if (data_size == 0 && !IS_FIRST) {
+ /*
+ * This piece is being completely removed. We need to
+ * adjust the header of the previous piece now.
+ */
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ last_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+
+ cp->indx = last_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+
+ if (DBC_LOGGING(dbc)) {
+ old_size = DB_ALIGN(old_hdr->size +
+ HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data =
+ (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ ((HEAPSPLITHDR *)old_hdr)->nextpg = next_rid.pgno;
+ ((HEAPSPLITHDR *)old_hdr)->nextindx = next_rid.indx;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+
+ goto next_pg;
+ }
+
+ /* Set up the header for the new record. */
+ memset(&new_hdr, 0, sizeof(HEAPSPLITHDR));
+ new_hdr.std_hdr.flags = HEAP_RECSPLIT;
+ /*
+ * If next_rid.pgno == PGNO_INVALID and there's still more data,
+ * we'll come back and correct the header once we know where the
+ * next piece lives.
+ */
+ new_hdr.nextpg = next_rid.pgno;
+ new_hdr.nextindx = next_rid.indx;
+ /*
+ * Figure out how much we can fit on the page, rounding down to
+ * a multiple of 4. If we will have to expand the offset table,
+ * account for that. It needs to be enough to at least fit the
+ * split header.
+ */
+ size = HEAP_FREESPACE(dbp, cp->page);
+ if (NUM_ENT(cp->page) == 0 ||
+ cp->indx > HEAP_HIGHINDX(cp->page))
+ size -= sizeof(db_indx_t);
+ /* Round down to a multiple of 4. */
+ size = DB_ALIGN(
+ size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
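+		/*
+		 * (E.g. size == 15 becomes DB_ALIGN(12, 4) == 12, while an
+		 * already-aligned size == 16 becomes DB_ALIGN(13, 4) == 16.)
+		 */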
+ DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
+
+ /*
+ * We try to fill the page, but cannot write more than
+		 * t_data.size bytes; that's all we have in memory.
+ */
+ new_hdr.std_hdr.size = (u_int16_t)
+ (size - sizeof(HEAPSPLITHDR));
+ if (new_hdr.std_hdr.size > data_size)
+ new_hdr.std_hdr.size = data_size;
+ if (new_hdr.std_hdr.size >= left) {
+ new_hdr.std_hdr.size = left;
+ new_hdr.std_hdr.flags |= HEAP_RECLAST;
+ new_hdr.nextpg = PGNO_INVALID;
+ new_hdr.nextindx = 0;
+ }
+ if (IS_FIRST) {
+ new_hdr.std_hdr.flags |= HEAP_RECFIRST;
+ new_hdr.tsize = left;
+ }
+
+ /* Now write the new data to the page. */
+ t_data.size = new_hdr.std_hdr.size;
+ hdr_dbt.data = &new_hdr;
+ hdr_dbt.size = sizeof(HEAPSPLITHDR);
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp,
+ dbc->txn, &LSN(cp->page), 0,
+ DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, cp->indx, size, &hdr_dbt, &t_data)) != 0)
+ goto err;
+
+ left -= new_hdr.std_hdr.size;
+ /*
+ * If any data couldn't fit on this page, it has to go onto the
+ * next. Copy it to the front of the buffer and it will be
+ * preserved in the next loop.
+ */
+ if (new_hdr.std_hdr.size < data_size) {
+ remaining = data_size - new_hdr.std_hdr.size;
+ memmove(buf, buf + new_hdr.std_hdr.size, remaining);
+ }
+
+ /*
+		 * Remember this piece's RID; we may need to update the header
+ * if the next data piece is removed, or if this is the final
+ * piece and we add data to the end of the record.
+ */
+next_pg: last_rid.pgno = cp->pgno;
+ last_rid.indx = cp->indx;
+ /* Get the next page, if any. */
+ if (next_rid.pgno != PGNO_INVALID) {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ next_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+ cp->indx = next_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ DB_ASSERT(dbp->env,
+ HEAP_HIGHINDX(cp->page) <= cp->indx);
+ DB_ASSERT(dbp->env, F_ISSET(old_hdr, HEAP_RECSPLIT));
+ } else {
+ /* Discard the page and drop the lock, txn-ally. */
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ if (ret != 0)
+ goto err;
+ break;
+ }
+ }
+
+ /*
+ * If there is more work to do, let heapc_split do it. After
+ * heapc_split returns we need to update nextpg and nextindx in the
+ * header of the last piece we wrote above.
+ *
+ * For logging purposes, we "delete" the old record and then "add" the
+ * record. This makes redo/undo work as-is, but we won't actually
+ * delete and re-add the record.
+ */
+ if (left > 0) {
+ memset(&t_key, 0, sizeof(DBT));
+ t_key.size = t_key.ulen = sizeof(DB_HEAP_RID);
+ t_key.data = &next_rid;
+ t_key.flags = DB_DBT_USERMEM;
+ t_data.size = left;
+ if ((ret = __heapc_split(dbc, &t_key, &t_data, 0)) != 0)
+ goto err;
+
+ ACQUIRE_CUR(dbc,
+ DB_LOCK_WRITE, last_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+
+ cp->indx = last_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+
+ if (DBC_LOGGING(dbc)) {
+ old_size = DB_ALIGN(old_hdr->size +
+ HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ ((HEAPSPLITHDR *)old_hdr)->nextpg = next_rid.pgno;
+ ((HEAPSPLITHDR *)old_hdr)->nextindx = next_rid.indx;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (buf != NULL)
+ __os_free(dbp->env, buf);
+ return (ret);
+}
+
+/*
+ * __heapc_reloc --
+ * Move data from a too-full page to a new page. The old data page must
+ * be write locked before calling this method.
+ */
+static int
+__heapc_reloc(dbc, key, data)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+{
+ DB *dbp;
+ DBT hdr_dbt, log_dbt, t_data, t_key;
+ DB_HEAP_RID last_rid, next_rid;
+ HEAPHDR *old_hdr;
+ HEAPSPLITHDR new_hdr;
+ HEAP_CURSOR *cp;
+ int is_first, ret;
+ u_int32_t left, old_size, size;
+
+ dbp = dbc->dbp;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ memset(&hdr_dbt, 0, sizeof(DBT));
+ memset(&log_dbt, 0, sizeof(DBT));
+ COMPQUIET(key, NULL);
+
+ /*
+ * We are updating an existing record, which will grow into a split
+ * record. The strategy is to overwrite the existing record (or each
+ * piece of the record if the record is already split.) If the new
+ * record is shorter than the old, delete any extra pieces. If the new
+ * record is longer than the old, use heapc_split() to write the extra
+ * data.
+ *
+	 * We start each loop with t_data.data positioned at the next byte to be
+ * written, old_hdr pointed at the header for the old record and the
+ * necessary page write locked in cp->page.
+ */
+ is_first = 1;
+ left = data->size;
+ memset(&t_data, 0, sizeof(DBT));
+ t_data.data = data->data;
+ for (;;) {
+ /* Figure out if we have a next piece. */
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ next_rid.pgno = ((HEAPSPLITHDR *)old_hdr)->nextpg;
+ next_rid.indx = ((HEAPSPLITHDR *)old_hdr)->nextindx;
+ } else {
+ next_rid.pgno = PGNO_INVALID;
+ next_rid.indx = 0;
+ }
+
+ /* Delete the old data, after logging it. */
+ old_size = DB_ALIGN(
+ old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ if (old_size < sizeof(HEAPSPLITHDR))
+ old_size = sizeof(HEAPSPLITHDR);
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_ditem(
+ dbc, cp->page, cp->indx, old_size)) != 0)
+ goto err;
+
+ if (left == 0)
+ /*
+ * We've finished writing the new record, we're just
+ * cleaning up the old record now.
+ */
+ goto next_pg;
+
+ /* Set up the header for the new record. */
+ memset(&new_hdr, 0, sizeof(HEAPSPLITHDR));
+ new_hdr.std_hdr.flags = HEAP_RECSPLIT;
+ /* We'll set this later if next_rid.pgno == PGNO_INVALID. */
+ new_hdr.nextpg = next_rid.pgno;
+ new_hdr.nextindx = next_rid.indx;
+ /*
+ * Figure out how much we can fit on the page, rounding down to
+ * a multiple of 4. If we will have to expand the offset table,
+		 * account for that. It needs to be enough to at least fit the
+ * split header.
+ */
+ size = HEAP_FREESPACE(dbp, cp->page);
+ if (NUM_ENT(cp->page) == 0 ||
+ cp->indx > HEAP_HIGHINDX(cp->page))
+ size -= sizeof(db_indx_t);
+ /* Round down to a multiple of 4. */
+ size = DB_ALIGN(
+ size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
+ new_hdr.std_hdr.size =
+ (u_int16_t)(size - sizeof(HEAPSPLITHDR));
+ if (new_hdr.std_hdr.size >= left) {
+ new_hdr.std_hdr.size = left;
+ new_hdr.std_hdr.flags |= HEAP_RECLAST;
+ new_hdr.nextpg = PGNO_INVALID;
+ new_hdr.nextindx = 0;
+ }
+ if (is_first) {
+ new_hdr.std_hdr.flags |= HEAP_RECFIRST;
+ new_hdr.tsize = left;
+ is_first = 0;
+ }
+
+ /* Now write the new data to the page. */
+ t_data.size = new_hdr.std_hdr.size;
+ hdr_dbt.data = &new_hdr;
+ hdr_dbt.size = sizeof(HEAPSPLITHDR);
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp,
+ dbc->txn, &LSN(cp->page), 0,
+ DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, cp->indx, size, &hdr_dbt, &t_data)) != 0)
+ goto err;
+
+ left -= new_hdr.std_hdr.size;
+ t_data.data = (u_int8_t *)(t_data.data) + new_hdr.std_hdr.size;
+
+ /* Get the next page, if any. */
+next_pg: if (next_rid.pgno != PGNO_INVALID) {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ next_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+ cp->indx = next_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ } else {
+ /*
+ * Remember the final piece's RID, we may need to update
+ * the header after writing the rest of the record.
+ */
+ last_rid.pgno = cp->pgno;
+ last_rid.indx = cp->indx;
+ /* Discard the page and drop the lock, txn-ally. */
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ if (ret != 0)
+ goto err;
+ break;
+ }
+ }
+
+ /*
+ * If there is more work to do, let heapc_split do it. After
+ * heapc_split returns we need to update nextpg and nextindx in the
+ * header of the last piece we wrote above.
+ *
+ * For logging purposes, we "delete" the old record and then "add" the
+ * record. This makes redo/undo work as-is, but we won't actually
+ * delete and re-add the record.
+ */
+ if (left > 0) {
+ memset(&t_key, 0, sizeof(DBT));
+ t_key.size = t_key.ulen = sizeof(DB_HEAP_RID);
+ t_key.data = &next_rid;
+ t_key.flags = DB_DBT_USERMEM;
+ t_data.size = left;
+ if ((ret = __heapc_split(dbc, &t_key, &t_data, 0)) != 0)
+ goto err;
+
+ ACQUIRE_CUR(dbc,
+ DB_LOCK_WRITE, last_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+
+ cp->indx = last_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+
+ if (DBC_LOGGING(dbc)) {
+ old_size = DB_ALIGN(old_hdr->size +
+ HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ ((HEAPSPLITHDR *)old_hdr)->nextpg = next_rid.pgno;
+ ((HEAPSPLITHDR *)old_hdr)->nextindx = next_rid.indx;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+			    (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ return (ret);
+}
+
+/*
+ * __heapc_put --
+ *
+ * Put using a cursor. If the given key exists, update the associated data. If
+ * the given key does not exist, return an error.
+ */
+static int
+__heapc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT hdr_dbt, log_dbt, new_data;
+ DB_MPOOLFILE *mpf;
+ HEAPHDR hdr, *old_hdr;
+ HEAP_CURSOR *cp;
+ PAGE *rpage;
+ db_pgno_t region_pgno;
+ int oldspace, ret, space, t_ret;
+ u_int32_t data_size, dlen, new_size, old_flags, old_size, tot_size;
+ u_int8_t *buf, *olddata, *src, *dest;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ rpage = NULL;
+ buf = dest = src = NULL;
+ dlen = 0;
+
+ if (flags != DB_CURRENT) {
+ /* We're going to write following the get, so use RMW. */
+ old_flags = dbc->flags;
+ F_SET(dbc, DBC_RMW);
+ ret = __heapc_get(dbc, key, data, DB_SET, pgnop);
+ F_CLR(key, DB_DBT_ISSET);
+ dbc->flags = old_flags;
+ DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (ret != 0)
+ return (ret);
+ else if (flags == DB_NOOVERWRITE)
+ return (DB_KEYEXIST);
+ if ((ret = __memp_dirty(mpf, &cp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ } else {
+ /* We have a read lock, but need a write lock. */
+ if (STD_LOCKING(dbc) && cp->lock_mode != DB_LOCK_WRITE &&
+ (ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf, &cp->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ return (ret);
+ }
+
+ /* We've got the page locked and stored in cp->page. */
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), oldspace);
+
+ /*
+ * Figure out the spacing issue. There is a very rare corner case where
+ * we don't have enough space on the page to expand the data. Splitting
+	 * the record results in a larger header; if the page is jam-packed
+	 * there might not be room for the larger header.
+ *
+ * hdr->size is the size of the stored data, it doesn't include any
+ * padding.
+ */
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ /* Need data.size + header size, 4-byte aligned. */
+ old_size =
+ DB_ALIGN(old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ if (old_size < sizeof(HEAPSPLITHDR))
+ old_size = sizeof(HEAPSPLITHDR);
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT))
+ tot_size = ((HEAPSPLITHDR *)old_hdr)->tsize;
+ else
+ tot_size = old_hdr->size;
+ if (tot_size < data->doff) {
+ /* Post-pending */
+ dlen = data->dlen;
+ data_size = data->doff + data->size;
+ } else {
+ if (tot_size - data->doff < data->dlen)
+ dlen = tot_size - data->doff;
+ else
+ dlen = data->dlen;
+ data_size = tot_size - dlen + data->size;
+ }
+ } else
+ data_size = data->size;
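+	/*
+	 * A sketch of the arithmetic above with made-up numbers: for
+	 * tot_size 20, doff 10, dlen 5 and data->size 8, dlen stays 5
+	 * and data_size = 20 - 5 + 8 = 23. With doff 30, past the end
+	 * of the record, the write is post-pending: dlen is data->dlen
+	 * and data_size = 30 + 8 = 38, the gap being zero-filled below.
+	 */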
+ new_size = DB_ALIGN(data_size + sizeof(HEAPHDR), sizeof(u_int32_t));
+ if (new_size < sizeof(HEAPSPLITHDR))
+ new_size = sizeof(HEAPSPLITHDR);
+
+ /* Check whether we actually have enough space on this page. */
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT) ||
+ (new_size > old_size &&
+ new_size - old_size > HEAP_FREESPACE(dbp, cp->page))) {
+ /*
+		 * We've got to split the record; there's not enough room on
+		 * the page. Splitting the record will remove old_size bytes and
+ * introduce at least sizeof(HEAPSPLITHDR).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ return (__heapc_reloc_partial(dbc, key, data));
+ else
+ return (__heapc_reloc(dbc, key, data));
+ }
+
+ memset(&new_data, 0, sizeof(DBT));
+ new_data.size = data_size;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ /*
+ * Before replacing the old data, we need to use it to build the
+ * new data.
+ */
+ if ((ret = __os_malloc(dbp->env, data_size, &buf)) != 0)
+ goto err;
+ new_data.data = buf;
+
+ /*
+ * Preserve data->doff bytes at the start, or all of the old
+ * record plus padding, if post-pending.
+ */
+ olddata = (u_int8_t *)old_hdr + sizeof(HEAPHDR);
+ if (data->doff > old_hdr->size) {
+ memcpy(buf, olddata, old_hdr->size);
+ buf += old_hdr->size;
+ memset(buf, '\0', data->doff - old_hdr->size);
+ buf += data->doff - old_hdr->size;
+ } else {
+ memcpy(buf, olddata, data->doff);
+ buf += data->doff;
+ }
+
+ /* Now copy in the user's data. */
+ memcpy(buf, data->data, data->size);
+ buf += data->size;
+
+ /* Fill in remaining data from the old record, skipping dlen. */
+ if (data->doff < old_hdr->size) {
+ olddata += data->doff + data->dlen;
+ memcpy(buf,
+ olddata, old_hdr->size - data->doff - data->dlen);
+ }
+ } else {
+ new_data.data = data->data;
+ }
+
+ /*
+ * Do the update by deleting the old record and writing the new
+ * record. Start by logging the entire operation.
+ */
+ memset(&hdr, 0, sizeof(HEAPHDR));
+ hdr.size = data_size;
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ hdr_dbt.data = &hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(&hdr);
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ new_size, &hdr_dbt, &new_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_ditem(dbc, cp->page, cp->indx, old_size)) != 0)
+ goto err;
+ hdr_dbt.data = &hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(&hdr);
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, cp->indx, new_size, &hdr_dbt, &new_data)) != 0)
+ goto err;
+
+ /* Check whether we need to update the space bitmap. */
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), space);
+
+ if (space != oldspace) {
+ /* Get the region page with an exclusive latch. */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+
+ HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ __os_free(dbp->env, new_data.data);
+
+ if (ret != 0 && LOCK_ISSET(cp->lock))
+ (void)__TLPUT(dbc, cp->lock);
+
+ return (ret);
+}
+
+/*
+ * __heap_getpage --
+ * Return a page with sufficient free space. The page will be write locked
+ * and marked dirty.
+ */
+static int
+__heap_getpage(dbc, size, avail)
+ DBC *dbc;
+ u_int32_t size;
+ u_int8_t *avail;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DB_LOCK meta_lock;
+ DB_LSN meta_lsn;
+ DB_MPOOLFILE *mpf;
+ HEAP *h;
+ HEAPPG *rpage;
+ HEAP_CURSOR *cp;
+ db_pgno_t data_pgno, *lkd_pgs, meta_pgno, region_pgno, start_region;
+ int i, lk_mode, max, p, ret, space, start, t_ret;
+
+ LOCK_INIT(meta_lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ h = dbp->heap_internal;
+ start_region = region_pgno = h->curregion;
+ max = HEAP_REGION_SIZE(dbp);
+ i = ret = t_ret = 0;
+ lkd_pgs = NULL;
+
+ /*
+ * The algorithm for finding a page:
+ *
+ * Look in the space bitmap of the current region page for a data page
+ * with at least size bytes free. Once we find a page, try to lock it
+ * and if we get the lock we're done.
+ *
+ * Don't wait for a locked region page, just move on to the next region
+ * page, creating it if it doesn't exist. If the size of the heap
+ * database is not constrained, just keep creating regions and extending
+ * the database until we find a page with space. If the database size
+ * is constrained, loop back to the first region page from the final
+ * region page. If we wind up making it all the way back to where our
+ * search began, we need to start waiting for locked region pages. If
+ * we finish another loop through the database waiting for every region
+ * page, we know there's no room.
+ */
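+
+	/*
+	 * Layout sketch (assuming the first region page directly follows
+	 * the metadata page): each region page is followed by
+	 * HEAP_REGION_SIZE(dbp) data pages, so the data page for bitmap
+	 * slot p is pgno region_pgno + p + 1 and the next region page is
+	 * at region_pgno + HEAP_REGION_SIZE(dbp) + 1, as computed below.
+	 */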
+
+ /*
+ * Figure out the % of the page the data will occupy and translate that
+ * to the relevant bit-map value we need to look for.
+ */
+ HEAP_CALCSPACEBITS(dbp, size, space);
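+	/*
+	 * Note that the comparison in the search below (skip a page when
+	 * its HEAP_SPACE() value is greater than space) implies that
+	 * larger bitmap values denote less free space; either way the
+	 * 2-bit value is only a coarse estimate.
+	 */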
+
+ /*
+ * Get the current region page, with a shared latch. On the first loop
+ * through a fixed size database, we move on to the next region if the
+ * page is locked. On the second loop, we wait for locked region
+ * pages. If the database isn't fixed size, we never wait, we'll
+ * eventually get to use one of the region pages we create.
+ */
+ lk_mode = DB_MPOOL_TRY;
+find: while ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, lk_mode, &rpage)) != 0 ||
+ TYPE(rpage) != P_IHEAP) {
+ if (ret == DB_LOCK_NOTGRANTED)
+ goto next_region;
+ if (ret != 0 && ret != DB_PAGE_NOTFOUND)
+ return (ret);
+ /*
+ * The region page doesn't exist, or hasn't been initialized,
+ * create it, then try again. If the page exists, we have to
+ * drop it before initializing the region.
+ */
+ if (ret == 0 && (ret = __memp_fput(
+ mpf, dbc->thread_info, rpage, dbc->priority)) != 0)
+ return (ret);
+
+ if ((ret = __heap_create_region(dbc, region_pgno)) != 0)
+ return (ret);
+ }
+
+ start = h->curpgindx;
+ /*
+ * If this is the last region page in a fixed size db, figure out the
+ * maximum pgno in the bitmap.
+ */
+ if (region_pgno + max > h->maxpgno)
+ max = h->maxpgno - region_pgno;
+ /*
+ * Look in the bitmap for a page with sufficient free space. We use i
+ * in a slightly strange way. Because the 2-bits in the bitmap are only
+ * an estimate, there is a chance the data won't fit on the page we
+ * choose. In that case, we re-start the process and want to be able to
+ * resume this loop where we left off.
+ */
+ for (; i < max; i++) {
+ p = start + i;
+ if (p >= max)
+ p -= max;
+ if ((*avail = HEAP_SPACE(dbp, rpage, p)) > space)
+ continue;
+ data_pgno = region_pgno + p + 1;
+ ACQUIRE_CUR(dbc,
+ DB_LOCK_WRITE, data_pgno, DB_LOCK_NOWAIT, 0, ret);
+ /*
+ * If we have the lock and the page or have the lock and need to
+		 * create the page, we're good. If we don't have the lock, try
+		 * to find a different page.
+ */
+ if (ret == 0 || ret == DB_PAGE_NOTFOUND)
+ break;
+ else if (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK) {
+ ret = 0;
+ continue;
+ } else
+ goto err;
+ }
+
+ /*
+	 * Keep a worst-case upper bound on the highest used page in the region.
+ */
+ if (i < max && data_pgno > rpage->high_pgno) {
+ if ((ret = __memp_dirty(mpf,
+ &rpage, dbc->thread_info, NULL, dbc->priority, 0)) != 0)
+ goto err;
+ /* We might have blocked, check again */
+ if (data_pgno > rpage->high_pgno)
+ rpage->high_pgno = data_pgno;
+ }
+
+ /* Done with the region page, even if we didn't find a page. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0) {
+ /* Did not read the data page, so we can release its lock. */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto err;
+ }
+ rpage = NULL;
+
+ if (i >= max) {
+ /*
+ * No free pages on this region page, advance to the next region
+ * page. If we're at the end of a fixed size heap db, loop
+ * around to the first region page. There is not currently a
+ * data page locked.
+ */
+next_region: region_pgno += HEAP_REGION_SIZE(dbp) + 1;
+
+ if (region_pgno > h->maxpgno)
+ region_pgno = FIRST_HEAP_RPAGE;
+
+ if (region_pgno == start_region) {
+ /*
+ * We're in a fixed size db and we've looped through all
+ * region pages.
+ */
+
+ if (lk_mode == DB_MPOOL_TRY) {
+ /*
+ * We may have missed a region page with room,
+ * because we didn't wait for locked pages. Try
+ * another loop, waiting for all pages.
+ */
+ lk_mode = 0;
+ } else {
+ /*
+ * We've seen every region page, because we
+ * waited for all pages. No room.
+ */
+ ret = DB_HEAP_FULL;
+ goto err;
+ }
+ }
+
+ h->curregion = region_pgno;
+ h->curpgindx = 0;
+ i = 0;
+ goto find;
+ }
+
+ /*
+ * At this point we have the page locked. If we have the page, we need
+ * to mark it dirty. If we don't have the page (or if the page is
+ * empty) we need to create and initialize it.
+ */
+ if (cp->pgno == PGNO_INVALID || PGNO(cp->page) == PGNO_INVALID) {
+ /*
+ * The data page needs to be created and the metadata page needs
+ * to be updated. Once we get the metadata page, we must not
+		 * jump to err; the metadata page and lock are put back here.
+ *
+ * It is possible that the page was created by an aborted txn,
+ * in which case the page exists but is all zeros. We still
+ * need to "create" it and log the creation.
+		 */
+
+ meta_pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, LCK_ALWAYS, meta_pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &meta_lock)) != 0) {
+ /*
+			 * We don't want to block while holding a latch on
+			 * a page off the end of the file. The page could
+			 * get truncated by another thread and we would
+			 * deadlock.
+ */
+ p = cp->page != NULL;
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ if (t_ret != 0 ||
+ (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK))
+ goto pg_err;
+ if ((ret = __db_lget(dbc, LCK_ALWAYS, meta_pgno,
+ DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto pg_err;
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ data_pgno, 0, DB_MPOOL_CREATE, ret);
+ /*
+ * We can race, having read this page when it was
+ * less than last_pgno but now an aborted
+ * allocation can make this page beyond last_pgno
+ * so we must free it. If we can't get the
+ * lock on the page again, then some other
+ * thread will handle the issue.
+ */
+ if (ret != 0) {
+pg_err: if (p != 0) {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ data_pgno, 0, 0, t_ret);
+ if (t_ret == 0 &&
+ PGNO(cp->page) == PGNO_INVALID) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info,
+ cp->page, dbc->priority);
+ (void)__memp_fget(mpf,
+ &data_pgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_FREE, &cp->page);
+ }
+ (void)__LPUT(dbc, cp->lock);
+ }
+ (void)__LPUT(dbc, meta_lock);
+ goto err;
+ }
+ /* Check if we lost a race. */
+ if (PGNO(cp->page) != PGNO_INVALID) {
+ if ((ret = __LPUT(dbc, meta_lock)) != 0)
+ goto err;
+ goto check;
+ }
+ }
+
+ /*
+ * Before creating a new page in this region, check that the
+ * region page still exists. By this point, the transaction
+ * that created the region must have aborted or committed,
+ * because we now hold the metadata lock. If we can't get the
+ * latch, the page must exist.
+ */
+ ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_TRY, &rpage);
+ if (ret == DB_LOCK_NOTGRANTED)
+ ret = 0;
+ else if (ret != 0) {
+ /*
+ * Free up the metadata lock. If this was an error
+ * other than a missing region page, bail.
+ */
+ if ((t_ret = __LPUT(dbc, meta_lock)) != 0)
+ ret = t_ret;
+ if (ret != DB_PAGE_NOTFOUND)
+ goto err;
+ /*
+ * The region no longer exists. Release the page's lock
+ * (we haven't created the page yet) and find a new page
+ * on a different region.
+ */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto find;
+ } else
+ ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority);
+ rpage = NULL;
+ if (ret != 0)
+ goto meta_unlock;
+
+ if ((ret = __memp_fget(mpf, &meta_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ /* Log the page creation. Can't jump to err if it fails. */
+ if (DBC_LOGGING(dbc))
+ ret = __heap_pg_alloc_log(dbp,
+ dbc->txn, &LSN(meta), 0, &LSN(meta), meta_pgno,
+ data_pgno, (u_int32_t)P_HEAP, meta->last_pgno);
+ else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ /*
+		 * We may have created a page earlier with a larger page number;
+		 * check before updating the metadata page.
+ */
+ if (ret == 0 && data_pgno > meta->last_pgno)
+ meta->last_pgno = data_pgno;
+ meta_lsn = LSN(meta);
+
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ if (ret != 0)
+ goto meta_unlock;
+
+ /* If the page doesn't actually exist we need to create it. */
+ if (cp->pgno == PGNO_INVALID) {
+ cp->pgno = data_pgno;
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto meta_unlock;
+ DB_ASSERT(dbp->env, cp->pgno == data_pgno);
+ } else if ((ret = __memp_dirty(mpf, &cp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ /* Did not read the page, so we can release the lock. */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto meta_unlock;
+ }
+
+ /* Now that we have the page we initialize it and we're done. */
+ P_INIT(cp->page,
+ dbp->pgsize, cp->pgno, P_INVALID, P_INVALID, 0, P_HEAP);
+ LSN(cp->page) = meta_lsn;
+
+meta_unlock: if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ } else {
+ /* Check whether we actually have enough space on this page. */
+check: if (size + sizeof(db_indx_t) > HEAP_FREESPACE(dbp, cp->page)) {
+ /* Put back the page and lock, they were never used. */
+ DISCARD(dbc, cp->page, cp->lock, 0, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Re-start the bitmap check on the next page. */
+ i++;
+ goto find;
+ }
+
+ if ((ret = __memp_dirty(mpf, &cp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ /* Did not read the page, so we can release the lock. */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto err;
+ }
+ }
+
+ h->curpgindx = data_pgno - region_pgno - 1;
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __heap_append --
+ * Add an item to a heap database.
+ *
+ * PUBLIC: int __heap_append
+ * PUBLIC: __P((DBC *, DBT *, DBT *));
+ */
+int
+__heap_append(dbc, key, data)
+ DBC *dbc;
+ DBT *data, *key;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ DB_HEAP_RID rid;
+ DB_MPOOLFILE *mpf;
+ HEAPPG *rpage;
+ HEAPHDR hdr;
+ HEAP_CURSOR *cp;
+ db_indx_t indx;
+ db_pgno_t region_pgno;
+ int ret, space, t_ret;
+ u_int8_t avail;
+ u_int32_t data_size;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ ret = t_ret = 0;
+ rpage = NULL;
+ cp = (HEAP_CURSOR *)dbc->internal;
+
+ /* Need data.size + header size, 4-byte aligned. */
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ data_size = DB_ALIGN(data->doff +
+ data->size + sizeof(HEAPHDR), sizeof(u_int32_t));
+ else
+ data_size = DB_ALIGN(
+ data->size + sizeof(HEAPHDR), sizeof(u_int32_t));
+
+ if (data_size >= HEAP_MAXDATASIZE(dbp))
+ return (__heapc_split(dbc, key, data, 1));
+ else if (data_size < sizeof(HEAPSPLITHDR))
+ data_size = sizeof(HEAPSPLITHDR);
+
+ if ((ret = __heap_getpage(dbc, data_size, &avail)) != 0)
+ goto err;
+
+ indx = HEAP_FREEINDX(cp->page);
+ memset(&hdr, 0, sizeof(HEAPHDR));
+ hdr.size = data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ hdr.size += data->doff;
+ tmp_dbt.data = &hdr;
+ tmp_dbt.size = sizeof(HEAPHDR);
+
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)indx,
+ data_size, &tmp_dbt, data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_pitem(
+ dbc, (PAGE *)cp->page, indx, data_size, &tmp_dbt, data)) != 0)
+ goto err;
+
+ rid.pgno = cp->pgno;
+ rid.indx = indx;
+ cp->indx = indx;
+
+ /* Check whether we need to update the space bitmap. */
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), space);
+
+ if (space != avail) {
+ /* Get the region page with an exclusive latch. */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+
+ HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (cp->page != NULL) {
+ DISCARD(dbc, cp->page, cp->lock, 1, t_ret);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbp->env, key,
+ &rid, DB_HEAP_RID_SZ, &dbc->rkey->data, &dbc->rkey->ulen);
+
+ return (ret);
+}
+
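+/*
+ * __heapc_split --
+ *	Write a record in split pieces, working from the end of the record
+ * back to the front so each piece can point at the piece after it. If
+ * is_first is set, the first piece is marked HEAP_RECFIRST and carries
+ * the record's total size.
+ */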
+static int
+__heapc_split(dbc, key, data, is_first)
+ DBC *dbc;
+ DBT *key, *data;
+ int is_first;
+{
+ DB *dbp;
+ DBT hdr_dbt, t_data;
+ DB_HEAP_RID rid;
+ DB_MPOOLFILE *mpf;
+ HEAPPG *rpage;
+ HEAPSPLITHDR hdrs;
+ HEAP_CURSOR *cp;
+ db_indx_t indx;
+ db_pgno_t region_pgno;
+ int ret, spacebits, t_ret;
+ u_int32_t buflen, doff, left, size;
+ u_int8_t availbits, *buf;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ memset(&hdrs, 0, sizeof(HEAPSPLITHDR));
+ memset(&t_data, 0, sizeof(DBT));
+ hdrs.std_hdr.flags = HEAP_RECSPLIT | HEAP_RECLAST;
+
+ doff = data->doff;
+ rpage = NULL;
+ ret = t_ret = 0;
+ indx = 0;
+ buf = NULL;
+ buflen = 0;
+
+ /*
+ * Write the record to multiple pages, in chunks starting from the end.
+	 * To reconstruct during a get we need the RID of the next chunk, so if
+	 * we work our way from back to front during writing we always know the
+	 * RID of the "next" chunk: it's the chunk we just wrote.
+ */
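+	/*
+	 * For example (illustrative): a record split into pieces A, B
+	 * and C is written C first, then B (whose header points at C),
+	 * then A (whose header points at B); A's RID is what is copied
+	 * back to the caller as the key.
+	 */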
+ t_data.data = (u_int8_t *)data->data + data->size;
+ left = data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ left += data->doff;
+ }
+ hdrs.tsize = left;
+ while (left > 0) {
+ size = DB_ALIGN(left + sizeof(HEAPSPLITHDR), sizeof(u_int32_t));
+ if (size < sizeof(HEAPSPLITHDR))
+ size = sizeof(HEAPSPLITHDR);
+
+ if (size > HEAP_MAXDATASIZE(dbp))
+ /*
+ * Data won't fit on a single page, find one at least
+ * 33% free.
+ */
+ size = DB_ALIGN(dbp->pgsize / 3, sizeof(u_int32_t));
+ else
+ hdrs.std_hdr.flags |= HEAP_RECFIRST;
+
+ if ((ret = __heap_getpage(dbc, size, &availbits)) != 0)
+ return (ret);
+
+ /*
+ * size is the total number of bytes being written to the page.
+ * The header holds the size of the data being written.
+ */
+ if (F_ISSET(&(hdrs.std_hdr), HEAP_RECFIRST)) {
+ hdrs.std_hdr.size = left;
+ /*
+ * If we're called from heapc_reloc, we are only writing
+ * a piece of the full record and shouldn't set
+ * HEAP_RECFIRST.
+ */
+ if (!is_first)
+ F_CLR(&(hdrs.std_hdr), HEAP_RECFIRST);
+ } else {
+ /*
+ * Figure out how much room is on the page. If we will
+ * have to expand the offset table, account for that.
+ */
+ size = HEAP_FREESPACE(dbp, cp->page);
+ if (NUM_ENT(cp->page) == 0 ||
+ HEAP_FREEINDX(cp->page) > HEAP_HIGHINDX(cp->page))
+ size -= sizeof(db_indx_t);
+ /* Round down to a multiple of 4. */
+ size = DB_ALIGN(
+ size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
+ hdrs.std_hdr.size =
+ (u_int16_t)(size - sizeof(HEAPSPLITHDR));
+ }
+
+ /*
+ * t_data.data points at the end of the data left to write. Now
+ * that we know how much we're going to write to this page, we
+ * can adjust the pointer to point at the start of the data to
+ * be written.
+ *
+ * If DB_DBT_PARTIAL is set, once data->data is exhausted, we
+ * have to pad with data->doff bytes (or as much as can fit on
+ * this page.) left - doff gives the number of bytes to use
+ * from data->data. Once that can't fill t_data, we have to
+ * start padding.
+ */
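+		/*
+		 * Worked example with made-up numbers: with left 10, doff 6
+		 * and room for 8 bytes on the page, 8 - (10 - 6) == 4 bytes
+		 * of zero padding are written, followed by the 4 remaining
+		 * data bytes, and doff drops to 2 for the next (earlier)
+		 * chunk.
+		 */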
+ t_data.data = (u_int8_t *)(t_data.data) - hdrs.std_hdr.size;
+ DB_ASSERT(dbp->env, (F_ISSET(data, DB_DBT_PARTIAL) ||
+ t_data.data >= data->data));
+ t_data.size = hdrs.std_hdr.size;
+ if (F_ISSET(data, DB_DBT_PARTIAL) &&
+ t_data.size > left - doff) {
+ if (buflen < t_data.size) {
+ if (__os_realloc(
+ dbp->env, t_data.size, &buf) != 0)
+ return (ENOMEM);
+ buflen = t_data.size;
+ }
+ /*
+ * We have to figure out how much data remains. left
+ * includes doff, so we need (left - doff) bytes from
+ * data. We also need the amount of padding that can
+ * fit on the page. That's the amount we can fit on the
+ * page minus the bytes we're taking from data.
+ */
+ t_data.data = buf;
+ memset(buf, '\0', t_data.size - left + doff);
+ buf += t_data.size - left + doff;
+ memcpy(buf, data->data, left - doff);
+ doff -= t_data.size - left + doff;
+ buf = t_data.data;
+ }
+ hdr_dbt.data = &hdrs;
+ hdr_dbt.size = sizeof(HEAPSPLITHDR);
+ indx = HEAP_FREEINDX(cp->page);
+
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp,
+ dbc->txn, &LSN(cp->page), 0,
+ DB_ADD_HEAP, cp->pgno, (u_int32_t)indx,
+ size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, indx, size, &hdr_dbt, &t_data)) != 0)
+ goto err;
+ F_CLR(&(hdrs.std_hdr), HEAP_RECLAST);
+ left -= hdrs.std_hdr.size;
+
+ /*
+ * Save the rid where we just wrote, this is the "next"
+ * chunk.
+ */
+ hdrs.nextpg = cp->pgno;
+ hdrs.nextindx = indx;
+
+ /* Check whether we need to update the space bitmap. */
+ HEAP_CALCSPACEBITS(dbp,
+ HEAP_FREESPACE(dbp, cp->page), spacebits);
+
+ if (spacebits != availbits) {
+ /* Get the region page with an exclusive latch. */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info,
+ NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+
+ HEAP_SETSPACE(dbp,
+ rpage, cp->pgno - region_pgno - 1, spacebits);
+ ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority);
+ rpage = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
+ }
+
+ rid.pgno = cp->pgno;
+ rid.indx = indx;
+ cp->indx = indx;
+
+err: if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cp->page != NULL) {
+ DISCARD(dbc, cp->page, cp->lock, 1, t_ret);
+ if (ret == 0)
+ ret = t_ret;
+ }
+ if (buf != NULL)
+ __os_free(dbp->env, buf);
+
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbp->env, key,
+ &rid, DB_HEAP_RID_SZ, &dbc->rkey->data, &dbc->rkey->ulen);
+ DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ return (ret);
+}
+
+/*
+ * __heap_pitem --
+ * Put an item on a heap page. Copy all bytes from the header (if any)
+ * first and then copy from data.
+ *
+ * PUBLIC: int __heap_pitem __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__heap_pitem(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ DB *dbp;
+ u_int8_t *buf;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbp->env, TYPE(pagep) == P_HEAP);
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+ DB_ASSERT(dbp->env, nbytes == DB_ALIGN(nbytes, sizeof(u_int32_t)));
+ DB_ASSERT(dbp->env, DB_ALIGN(((HEAPHDR *)hdr->data)->size,
+ sizeof(u_int32_t)) >= data->size);
+ DB_ASSERT(dbp->env, nbytes >= hdr->size + data->size);
+
+ /*
+ * We're writing data either as a result of DB->put or as a result of
+ * undo-ing a delete. If we're undo-ing a delete we just need to write
+ * the bytes from hdr to the page. Otherwise, we need to construct a
+ * heap header, etc.
+ */
+ HEAP_OFFSETTBL(dbp, pagep)[indx] = HOFFSET(pagep) - nbytes;
+ buf = P_ENTRY(dbp, pagep, indx);
+	DB_ASSERT(dbp->env, buf > (u_int8_t *)&HEAP_OFFSETTBL(dbp, pagep)[indx]);
+
+ if (hdr != NULL) {
+ memcpy(buf, hdr->data, hdr->size);
+ buf += hdr->size;
+ }
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ memset(buf, 0, data->doff);
+ buf += data->doff;
+ }
+ memcpy(buf, data->data, data->size);
+
+ /*
+ * Update data page header. If DEBUG/DIAGNOSTIC is set, the page might
+ * be filled with 0xdb, so we can't just look for a 0 in the offset
+ * table. We used the first available index, so start there and scan
+ * forward. If the table is full, the first available index is the
+ * highest index plus one.
+ */
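+	/*
+	 * Illustrative case: entries at indx 0 and 2 and the new item
+	 * just placed at indx 1. The scan below starts at 1, finds no
+	 * zero offset through HEAP_HIGHINDX (2), and so leaves
+	 * HEAP_FREEINDX at HEAP_HIGHINDX + 1, i.e. 3.
+	 */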
+ if (indx > HEAP_HIGHINDX(pagep)) {
+ if (NUM_ENT(pagep) == 0)
+ HEAP_FREEINDX(pagep) = 0;
+ else if (HEAP_FREEINDX(pagep) >= indx) {
+ if (indx > (u_int32_t)HEAP_HIGHINDX(pagep) + 1)
+ HEAP_FREEINDX(pagep) = HEAP_HIGHINDX(pagep) + 1;
+ else
+ HEAP_FREEINDX(pagep) = indx + 1;
+ }
+ while (++HEAP_HIGHINDX(pagep) < indx)
+			HEAP_OFFSETTBL(dbp, pagep)[HEAP_HIGHINDX(pagep)] = 0;
+ } else {
+ for (; indx <= HEAP_HIGHINDX(pagep); indx++)
+ if (HEAP_OFFSETTBL(dbp, pagep)[indx] == 0)
+ break;
+ HEAP_FREEINDX(pagep) = indx;
+ }
+ HOFFSET(pagep) -= nbytes;
+ NUM_ENT(pagep)++;
+
+ return (0);
+}
+
+/*
+ * __heapc_dup --
+ * Duplicate a heap cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __heapc_dup __P((DBC *, DBC *));
+ */
+int
+__heapc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ HEAP_CURSOR *orig, *new;
+
+ orig = (HEAP_CURSOR *)orig_dbc->internal;
+ new = (HEAP_CURSOR *)new_dbc->internal;
+ new->flags = orig->flags;
+ return (0);
+}
+
+/*
+ * __heapc_gsplit --
+ * Get a heap split record. The page pointed to by the cursor must
+ * be the first segment of this record.
+ *
+ * PUBLIC: int __heapc_gsplit __P((DBC *,
+ * PUBLIC: DBT *, void **, u_int32_t *));
+ */
+int
+__heapc_gsplit(dbc, dbt, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DB_HEAP_RID rid;
+ DB_LOCK data_lock;
+ HEAP_CURSOR *cp;
+ ENV *env;
+ HEAPPG *dpage;
+ HEAPSPLITHDR *hdr;
+ db_indx_t bytes;
+ u_int32_t curoff, needed, start, tlen;
+ u_int8_t *p, *src;
+ int putpage, ret, t_ret;
+
+ LOCK_INIT(data_lock);
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ putpage = FALSE;
+ ret = 0;
+
+ /*
+ * We should have first page, locked already in cursor. Get the
+ * record id out of the cursor and set up local variables.
+ */
+ DB_ASSERT(env, cp->page != NULL);
+ rid.pgno = cp->pgno;
+ rid.indx = cp->indx;
+ dpage = cp->page;
+ hdr = (HEAPSPLITHDR *)P_ENTRY(dbp, dpage, rid.indx);
+ DB_ASSERT(env, hdr->tsize != 0);
+ tlen = hdr->tsize;
+
+ /*
+	 * If we are doing a partial retrieval, figure out how much we are
+ * actually going to get.
+ */
+ if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ start = dbt->doff;
+ if (start > tlen)
+ needed = 0;
+ else if (dbt->dlen > tlen - start)
+ needed = tlen - start;
+ else
+ needed = dbt->dlen;
+ } else {
+ start = 0;
+ needed = tlen;
+ }
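+	/*
+	 * E.g., with made-up numbers: for a 100-byte record, doff 90
+	 * and dlen 20 yield needed = 100 - 90 = 10; doff 120 yields
+	 * needed = 0 and we return an empty dbt below.
+	 */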
+
+ /*
+ * If the caller has not requested any data, return success. This
+ * "early-out" also avoids setting up the streaming optimization when
+ * no page would be retrieved. If it were removed, the streaming code
+ * should only initialize when needed is not 0.
+ */
+ if (needed == 0) {
+ dbt->size = 0;
+ return (0);
+ }
+
+ /*
+ * Check if the buffer is big enough; if it is not and we are
+ * allowed to malloc space, then we'll malloc it. If we are
+ * not (DB_DBT_USERMEM), then we'll set the dbt and return
+ * appropriately.
+ */
+ if (F_ISSET(dbt, DB_DBT_USERCOPY))
+ goto skip_alloc;
+
+ /* Allocate any necessary memory. */
+ if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ if (needed > dbt->ulen) {
+ dbt->size = needed;
+ return (DB_BUFFER_SMALL);
+ }
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+ if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) {
+ if ((ret = __os_realloc(env, needed, bpp)) != 0)
+ return (ret);
+ *bpsz = needed;
+ dbt->data = *bpp;
+ } else if (bpp != NULL)
+ dbt->data = *bpp;
+ else {
+ DB_ASSERT(env,
+ F_ISSET(dbt,
+ DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) ||
+ bpsz != NULL || bpp != NULL);
+ return (DB_BUFFER_SMALL);
+ }
+
+skip_alloc:
+ /*
+ * Go through each of the pieces, copying the data on each one
+ * into the buffer. Never copy more than the total data length.
+	 * We start with the page that is currently pointed to by
+	 * the cursor.
+ */
+ curoff = 0;
+ dbt->size = needed;
+ for (p = dbt->data; needed > 0;) {
+ /* Check if we need any bytes from this page */
+ if (curoff + hdr->std_hdr.size >= start) {
+ bytes = hdr->std_hdr.size;
+ src = (u_int8_t *)hdr +
+ P_TO_UINT16(sizeof(HEAPSPLITHDR));
+ if (start > curoff) {
+ src += start - curoff;
+ bytes -= start - curoff;
+ }
+ if (bytes > needed)
+ bytes = needed;
+ if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+ /*
+ * The offset into the DBT is the total size
+ * less the amount of data still needed. Care
+ * needs to be taken if doing a partial copy
+ * beginning at an offset other than 0.
+ */
+ if ((ret = env->dbt_usercopy(
+ dbt, dbt->size - needed,
+ src, bytes, DB_USERCOPY_SETDATA)) != 0) {
+ if (putpage)
+ (void)__memp_fput(
+ mpf, dbc->thread_info,
+ dpage, dbp->priority);
+
+ return (ret);
+ }
+ } else
+ memcpy(p, src, bytes);
+ p += bytes;
+ needed -= bytes;
+ }
+ curoff += hdr->std_hdr.size;
+
+ /* Find next record piece as long as it exists */
+ if (!F_ISSET((HEAPHDR *)hdr, HEAP_RECLAST)) {
+ rid.pgno = hdr->nextpg;
+ rid.indx = hdr->nextindx;
+
+ /*
+ * First pass through here, we are using the
+ * page pointed to by the cursor, and this page
+			 * will get put when the cursor is closed.
+ * Only pages specifically gotten in this loop
+ * need to be put back.
+ */
+ if (putpage) {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+				    dpage, dbp->priority)) != 0)
+ goto err;
+ dpage = NULL;
+ if ((ret = __TLPUT(dbc, data_lock)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_lget(dbc, 0, rid.pgno,
+ DB_LOCK_READ, 0, &data_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &rid.pgno,
+ dbc->thread_info, dbc->txn, 0, &dpage)) != 0)
+ goto err;
+ hdr = (HEAPSPLITHDR *)P_ENTRY(dbp, dpage, rid.indx);
+ putpage = TRUE;
+
+ /*
+ * If we have the last piece of this record and we're
+ * reading the entire record, then what we need should
+ * equal what is remaining.
+ */
+ if (F_ISSET((HEAPHDR *)hdr, HEAP_RECLAST) &&
+ !F_ISSET(dbt, DB_DBT_PARTIAL) &&
+ (hdr->std_hdr.size != needed)) {
+ __db_errx(env, DB_STR_A("1167",
+ "Incorrect record size in header: %s: rid %lu.%lu",
+ "%s %lu %lu"), dbc->dbp->fname,
+ (u_long)(cp->pgno), (u_long)(cp->indx));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ goto err;
+ }
+ }
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (putpage && dpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, dpage, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, data_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __heapc_refresh --
+ *	Do the proper setup for cursor reuse.
+ *
+ * PUBLIC: int __heapc_refresh __P((DBC *));
+ */
+int
+__heapc_refresh(dbc)
+ DBC *dbc;
+{
+ HEAP_CURSOR *cp;
+
+ cp = (HEAP_CURSOR *)dbc->internal;
+
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+ cp->flags = 0;
+
+ return (0);
+}
diff --git a/src/heap/heap.src b/src/heap/heap.src
new file mode 100644
index 00000000..47bd4bb0
--- /dev/null
+++ b/src/heap/heap.src
@@ -0,0 +1,101 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __heap
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/heap.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * addrem -- Add or remove an entry from a heap db.
+ *
+ * opcode: identifies if this is an add or delete.
+ * fileid: file identifier of the file being modified.
+ * pgno: page number.
+ * indx: location at which to insert or delete.
+ * nbytes: number of bytes added/removed to/from the page.
+ * hdr: header for the data item.
+ * dbt: data that is to be added or deleted.
+ * pagelsn: former lsn of the page.
+ */
+BEGIN addrem 49 151
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+DBT hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * pg_alloc: used to record allocating a new page in a heap database.
+ *
+ * meta_lsn: the lsn of the metadata page.
+ * meta_pgno: the metadata page.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * last_pgno: the last page in the file after this op.
+ */
+BEGIN pg_alloc 49 152
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * trunc_meta -- Used to record truncation of a heap database's meta page
+ *
+ * fileid: file identifier of the file being modified.
+ * pgno: page number.
+ * last_pgno: value of last_pgno on meta page
+ * key_count: value of key_count on meta page
+ * record_count: value of record_count on meta page
+ * curregion: value of curregion on meta page
+ * nregions: value of nregions on meta page
+ * pagelsn: former lsn of the page.
+ */
+BEGIN trunc_meta 49 153
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG last_pgno u_int32_t lu
+ARG key_count u_int32_t lu
+ARG record_count u_int32_t lu
+ARG curregion u_int32_t lu
+ARG nregions u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * trunc_page -- Used to record truncation of a heap database's region page
+ *
+ * fileid: file identifier of the file being modified.
+ * pgno: page number.
+ * old_data: the contents of the page before truncation
+ * is_region: whether the truncated page is a region page
+ * pagelsn: former lsn of the page.
+ */
+BEGIN trunc_page 49 154
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+DBT old_data DBT s
+ARG is_region u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+END
+
+
diff --git a/src/heap/heap_auto.c b/src/heap/heap_auto.c
new file mode 100644
index 00000000..1cb705f4
--- /dev/null
+++ b/src/heap/heap_auto.c
@@ -0,0 +1,73 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __heap_addrem_desc[] = {
+ {LOGREC_ARG, SSZ(__heap_addrem_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__heap_addrem_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_addrem_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_DBT, SSZ(__heap_addrem_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__heap_addrem_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__heap_addrem_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_pg_alloc_desc[] = {
+ {LOGREC_DB, SSZ(__heap_pg_alloc_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__heap_pg_alloc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, ptype), "ptype", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_trunc_meta_desc[] = {
+ {LOGREC_DB, SSZ(__heap_trunc_meta_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, key_count), "key_count", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, record_count), "record_count", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, curregion), "curregion", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, nregions), "nregions", "%lu"},
+ {LOGREC_POINTER, SSZ(__heap_trunc_meta_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_trunc_page_desc[] = {
+ {LOGREC_DB, SSZ(__heap_trunc_page_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_trunc_page_args, pgno), "pgno", "%lu"},
+ {LOGREC_DBT, SSZ(__heap_trunc_page_args, old_data), "old_data", ""},
+ {LOGREC_ARG, SSZ(__heap_trunc_page_args, is_region), "is_region", "%lu"},
+ {LOGREC_POINTER, SSZ(__heap_trunc_page_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __heap_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__heap_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_addrem_recover, DB___heap_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_pg_alloc_recover, DB___heap_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_meta_recover, DB___heap_trunc_meta)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_page_recover, DB___heap_trunc_page)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/heap/heap_autop.c b/src/heap/heap_autop.c
new file mode 100644
index 00000000..b767203b
--- /dev/null
+++ b/src/heap/heap_autop.c
@@ -0,0 +1,105 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_HEAP
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __heap_addrem_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_addrem_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_addrem", __heap_addrem_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_pg_alloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_pg_alloc", __heap_pg_alloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_trunc_meta_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_trunc_meta_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_trunc_meta", __heap_trunc_meta_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_trunc_page_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_trunc_page_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_trunc_page", __heap_trunc_page_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__heap_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_addrem_print, DB___heap_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_pg_alloc_print, DB___heap_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_meta_print, DB___heap_trunc_meta)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_page_print, DB___heap_trunc_page)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_HEAP */
diff --git a/src/heap/heap_backup.c b/src/heap/heap_backup.c
new file mode 100644
index 00000000..4588b0ba
--- /dev/null
+++ b/src/heap/heap_backup.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+
+/*
+ * __heap_backup --
+ * Copy a heap database file coordinated with mpool.
+ *
+ * PUBLIC: int __heap_backup __P((DB_ENV *, DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_FH *, void *, u_int32_t));
+ */
+int
+__heap_backup(dbenv, dbp, ip, fp, handle, flags)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_FH *fp;
+ void *handle;
+ u_int32_t flags;
+{
+ HEAPPG *p;
+ db_pgno_t chunk_pgno, high_pgno, max_pgno;
+ int ret;
+
+ max_pgno = dbp->mpf->mfp->last_pgno;
+ chunk_pgno = FIRST_HEAP_RPAGE;
+
+ for (;;) {
+ /*
+ * Get the chunk page and the chunk's highest used page.
+		 * Immediately return the page; it makes error handling easier.
+ */
+ if ((ret = __memp_fget(dbp->mpf,
+ &chunk_pgno, ip, NULL, 0, &p)) != 0)
+ break;
+ high_pgno = p->high_pgno;
+ if ((ret = __memp_fput(dbp->mpf,
+ ip, p, DB_PRIORITY_UNCHANGED)) != 0)
+ break;
+
+ /*
+		 * Back up all the used pages in this chunk, starting at the
+		 * chunk page. If this is the very first chunk, be sure to
+		 * back up the db meta page, too.
+ */
+ if ((ret = __memp_backup_mpf(dbenv->env, dbp->mpf, ip,
+ chunk_pgno == FIRST_HEAP_RPAGE ? 0 : chunk_pgno,
+ high_pgno, fp, handle, flags)) != 0)
+ break;
+ chunk_pgno += HEAP_REGION_SIZE(dbp) + 1;
+ if (chunk_pgno > max_pgno)
+ break;
+ }
+
+ return (ret);
+}
diff --git a/src/heap/heap_conv.c b/src/heap/heap_conv.c
new file mode 100644
index 00000000..9f432d13
--- /dev/null
+++ b/src/heap/heap_conv.c
@@ -0,0 +1,93 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/heap.h"
+
+/*
+ * __heap_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __heap_pgin __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__heap_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_HEAPMETA ? __heap_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __heap_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __heap_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__heap_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_HEAPMETA ? __heap_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __heap_mswap --
+ * Swap the bytes on the heap metadata page.
+ *
+ * PUBLIC: int __heap_mswap __P((ENV *, PAGE *));
+ */
+int
+__heap_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* curregion */
+ SWAP32(p); /* nregions */
+ SWAP32(p); /* gbytes */
+ SWAP32(p); /* bytes */
+ SWAP32(p); /* region_size */
+ p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+
+ return (0);
+}
diff --git a/src/heap/heap_method.c b/src/heap/heap_method.c
new file mode 100644
index 00000000..f938b5e7
--- /dev/null
+++ b/src/heap/heap_method.c
@@ -0,0 +1,168 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+
+/*
+ * __heap_db_create --
+ * Heap specific initialization of the DB structure.
+ *
+ * PUBLIC: int __heap_db_create __P((DB *));
+ */
+int
+__heap_db_create(dbp)
+ DB *dbp;
+{
+ HEAP *h;
+ int ret;
+
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(HEAP), &h)) != 0)
+ return (ret);
+ dbp->heap_internal = h;
+ h->region_size = 0;
+
+ dbp->get_heapsize = __heap_get_heapsize;
+ dbp->get_heap_regionsize = __heap_get_heap_regionsize;
+ dbp->set_heapsize = __heap_set_heapsize;
+ dbp->set_heap_regionsize = __heap_set_heap_regionsize;
+
+ return (0);
+}
+
+/*
+ * __heap_db_close --
+ * Heap specific discard of the DB structure.
+ *
+ * PUBLIC: int __heap_db_close __P((DB *));
+ */
+int
+__heap_db_close(dbp)
+ DB *dbp;
+{
+ HEAP *h;
+ int ret;
+
+ ret = 0;
+ if ((h = dbp->heap_internal) == NULL)
+ return (0);
+
+ __os_free(dbp->env, h);
+ dbp->heap_internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __heap_get_heapsize --
+ * Get the initial size of the heap.
+ *
+ * PUBLIC: int __heap_get_heapsize __P((DB *, u_int32_t *, u_int32_t *));
+ */
+int
+__heap_get_heapsize(dbp, gbytes, bytes)
+ DB *dbp;
+ u_int32_t *gbytes, *bytes;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ h = dbp->heap_internal;
+ *gbytes = h->gbytes;
+ *bytes = h->bytes;
+
+ return (0);
+}
+
+/*
+ * __heap_get_heap_regionsize --
+ * Get the region size of the heap.
+ *
+ * PUBLIC: int __heap_get_heap_regionsize __P((DB *, u_int32_t *));
+ */
+int
+__heap_get_heap_regionsize(dbp, npages)
+ DB *dbp;
+ u_int32_t *npages;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ h = dbp->heap_internal;
+ *npages = h->region_size;
+
+ return (0);
+}
+
+/*
+ * __heap_set_heapsize --
+ * Set the initial size of the heap.
+ *
+ * PUBLIC: int __heap_set_heapsize __P((DB *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__heap_set_heapsize(dbp, gbytes, bytes, flags)
+ DB *dbp;
+ u_int32_t gbytes, bytes, flags;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_heapsize");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ COMPQUIET(flags, 0);
+ h = dbp->heap_internal;
+ h->gbytes = gbytes;
+ h->bytes = bytes;
+
+ return (0);
+}
+
+/*
+ * __heap_set_heap_regionsize --
+ * Set the region size of the heap.
+ *
+ * PUBLIC: int __heap_set_heap_regionsize __P((DB *, u_int32_t));
+ */
+int
+__heap_set_heap_regionsize(dbp, npages)
+ DB *dbp;
+ u_int32_t npages;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_heap_regionsize");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ if (npages == 0) {
+ __db_errx(dbp->env, DB_STR("1168", "region size may not be 0"));
+ return (EINVAL);
+ }
+
+ h = dbp->heap_internal;
+ h->region_size = npages;
+
+ return (0);
+}
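+
+/*
+ * Example (a sketch): both sizing methods must be called before DB->open.
+ * A fixed 2GB heap with 100-page regions might be configured as:
+ *
+ *	if ((ret = dbp->set_heapsize(dbp, 2, 0, 0)) != 0 ||
+ *	    (ret = dbp->set_heap_regionsize(dbp, 100)) != 0)
+ *		goto err;
+ */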
+
+/*
+ * __heap_exist --
+ *	Test whether Heap support is compiled into the library; used by the
+ *	Perl interface.
+ *
+ * PUBLIC: int __heap_exist __P((void));
+ */
+int
+__heap_exist()
+{
+ return (1);
+}
diff --git a/src/heap/heap_open.c b/src/heap/heap_open.c
new file mode 100644
index 00000000..6827450d
--- /dev/null
+++ b/src/heap/heap_open.c
@@ -0,0 +1,439 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/fop.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static void __heap_init_meta __P((DB *, HEAPMETA *, db_pgno_t, DB_LSN*));
+
+/*
+ * __heap_open --
+ * Open a heap.
+ *
+ * PUBLIC: int __heap_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__heap_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ HEAP *h;
+ db_pgno_t npgs;
+ int ret;
+
+ h = (HEAP *)dbp->heap_internal;
+ COMPQUIET(name, NULL);
+
+ ret = __heap_read_meta(dbp, ip, txn, base_pgno, flags);
+
+ if (h->gbytes != 0 || h->bytes != 0) {
+ /*
+ * We don't have to worry about rounding with gbytes, as pgsize
+		 * We don't have to worry about rounding with gbytes, as pgsize
+		 * is always a power of 2, but we round up if bytes isn't
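+		 * For example (a sketch): gbytes = 1, bytes = 600 with a
+		 * 4096-byte page gives npgs = 262144 + 1 = 262145, so
+		 * maxpgno is 262144.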
+ */
+ npgs = (db_pgno_t)(h->gbytes * (GIGABYTE / dbp->pgsize));
+		npgs += (db_pgno_t)((h->bytes + dbp->pgsize - 1) / dbp->pgsize);
+ h->maxpgno = npgs - 1;
+ if (h->maxpgno < FIRST_HEAP_DPAGE) {
+ __db_errx(dbp->env,
+ "requested database size is too small");
+ return (EINVAL);
+ }
+ } else
+		/* If not a fixed size heap, set maxpgno to its maximum value. */
+ h->maxpgno = UINT32_MAX;
+
+ return (ret);
+}
+
+/*
+ * __heap_metachk --
+ *	Check the heap metadata page against the database handle.
+ *
+ * PUBLIC: int __heap_metachk __P((DB *, const char *, HEAPMETA *));
+ */
+int
+__heap_metachk(dbp, name, hm)
+ DB *dbp;
+ const char *name;
+ HEAPMETA *hm;
+{
+ ENV *env;
+ HEAP *h;
+ int ret;
+ u_int32_t vers;
+
+ env = dbp->env;
+ h = (HEAP *)dbp->heap_internal;
+
+ /*
+ * At this point, all we know is that the magic number is for a Heap.
+	 * Check the version; the database may be out of date.
+ */
+ vers = hm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 1:
+ break;
+ default:
+ __db_errx(env,
+ "%s: unsupported heap version: %lu", name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if needed. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __heap_mswap(env, (PAGE *)hm)) != 0)
+ return (ret);
+
+ /* Check application info against metadata info. */
+ if (h->gbytes != 0 || h->bytes != 0)
+ if (h->gbytes != hm->gbytes || h->bytes != hm->bytes) {
+ __db_errx(env, DB_STR_A("1155",
+ "%s: specified heap size does not match size set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ /* Set the page size. */
+ dbp->pgsize = hm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, hm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+}
+
+/*
+ * __heap_read_meta --
+ * Read the meta page and set up the internal structure.
+ *
+ * PUBLIC: int __heap_read_meta __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+ */
+int
+__heap_read_meta(dbp, ip, txn, meta_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ HEAPMETA *meta;
+ HEAP *h;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ meta = NULL;
+ h = dbp->heap_internal;
+ LOCK_INIT(metalock);
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /* Get a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Get the metadata page. */
+ if ((ret =
+ __db_lget(dbc, 0, meta_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &meta_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ /*
+ * If the magic number is set, the heap has been created. Correct
+ * any fields that may not be right. Note, all of the local flags
+ * were set by DB->open.
+ *
+ * Otherwise, we'd better be in recovery or abort, in which case the
+ * metadata page will be created/initialized elsewhere.
+ */
+ if (meta->dbmeta.magic == DB_HEAPMAGIC) {
+ h->curregion = meta->curregion;
+ h->curpgindx = 0;
+ h->gbytes = meta->gbytes;
+ h->bytes = meta->bytes;
+ h->region_size = meta->region_size;
+
+ if (PGNO(meta) == PGNO_BASE_MD && !F_ISSET(dbp, DB_AM_RECOVER))
+ __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno);
+ } else {
+ DB_ASSERT(dbp->env,
+ IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
+ }
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __heap_new_file --
+ * Create the necessary pages to begin a new database file.
+ *
+ * PUBLIC: int __heap_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__heap_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ HEAP *h;
+ HEAPMETA *meta;
+ HEAPPG *region;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ u_int32_t max_size;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ buf = NULL;
+ h = (HEAP *)dbp->heap_internal;
+ max_size = HEAP_REGION_COUNT(dbp, dbp->pgsize);
+
+ if (h->region_size == 0)
+ h->region_size = HEAP_DEFAULT_REGION_MAX(dbp) > max_size ?
+ max_size : HEAP_DEFAULT_REGION_MAX(dbp);
+ else if (h->region_size > max_size) {
+ __db_errx(dbp->env, DB_STR_A("1169",
+ "region size may not be larger than %lu",
+ "%lu"), (u_long)max_size);
+ return (EINVAL);
+ }
+
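+	/*
+	 * Two creation paths follow: a purely in-memory database builds its
+	 * meta-data and first region pages directly in the buffer pool,
+	 * while an on-disk database writes the initial pages through
+	 * __fop_write so that the file creation itself is logged and
+	 * recoverable.
+	 */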
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build the meta-data page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ __heap_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ ret = __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta);
+ if ((t_ret =
+ __memp_fput(mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Build the first region page. */
+ pgno = 1;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &region)) != 0)
+ goto err;
+ memset(region, 0, dbp->pgsize);
+
+ P_INIT(region,
+ dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ LSN_NOT_LOGGED(region->lsn);
+ ret = __db_log_page(
+ dbp, txn, &region->lsn, pgno, (PAGE *)region);
+ if ((t_ret = __memp_fput(
+ mpf, ip, region, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ region = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build the meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (HEAPMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ __heap_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Build the first region page */
+ memset(buf, 0, dbp->pgsize);
+ region = (HEAPPG *)buf;
+ P_INIT(region,
+ dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ LSN_NOT_LOGGED(region->lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, region->pgno, region, &pdbt)) != 0)
+ goto err;
+ if ((ret =
+ __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA,
+ fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ region = NULL;
+ }
+
+err: if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
+
+/*
+ * __heap_create_region --
+ *	Create a region page.
+ *
+ * PUBLIC: int __heap_create_region __P((DBC *, db_pgno_t));
+ */
+int
+__heap_create_region(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DB_LOCK meta_lock;
+ DB_MPOOLFILE *mpf;
+ HEAPMETA *meta;
+ HEAPPG *region;
+ db_pgno_t meta_pgno;
+ int ret, t_ret;
+
+ LOCK_INIT(meta_lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ region = NULL;
+
+ /* We may need to update the last page number on the metadata page. */
+ meta_pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, meta_pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf, &meta_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &meta)) != 0) {
+ (void)__LPUT(dbc, meta_lock);
+ return (ret);
+ }
+
+ ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+ NULL, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &region);
+
+ if (ret != 0 || region->pgno != 0)
+ /*
+ * There's been an error or someone got here before us and
+ * created the page. Either way, our work here is done.
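+	 * (A page freshly created in mpool is expected to come back
+	 * zero-filled, so a nonzero pgno marks it as already initialized.)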
+ */
+ goto done;
+
+ /* Log the page creation. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_pg_alloc_log(dbp,
+ dbc->txn, &LSN(meta), 0, &LSN(meta), meta_pgno,
+ pgno, (u_int32_t)P_IHEAP, meta->dbmeta.last_pgno)) != 0)
+ goto done;
+ } else
+ LSN_NOT_LOGGED(LSN(&meta->dbmeta));
+
+ memset((void *)region, 0, dbp->pgsize);
+ P_INIT(region,
+ dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ LSN(region) = LSN(&meta->dbmeta);
+
+ /*
+	 * We may have created a page earlier with a larger page number;
+	 * check before updating the metadata page.
+ */
+ if (pgno > meta->dbmeta.last_pgno)
+ meta->dbmeta.last_pgno = pgno;
+ if (HEAP_REGION_NUM(dbp, pgno) > meta->nregions)
+ meta->nregions = HEAP_REGION_NUM(dbp, pgno);
+
+done: if (region != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, region, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+	if ((t_ret = __memp_fput(
+	    mpf, dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+static void
+__heap_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ HEAPMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+ HEAP *h;
+ ENV *env;
+
+ env = dbp->env;
+ h = dbp->heap_internal;
+
+ memset(meta, 0, sizeof(HEAPMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_HEAPMAGIC;
+ meta->dbmeta.version = DB_HEAPVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_HEAPMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = FIRST_HEAP_RPAGE;
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+ meta->gbytes = h->gbytes;
+ meta->bytes = h->bytes;
+ meta->region_size = h->region_size;
+ meta->nregions = 1;
+ meta->curregion = 1;
+}
diff --git a/src/heap/heap_rec.c b/src/heap/heap_rec.c
new file mode 100644
index 00000000..578a61c4
--- /dev/null
+++ b/src/heap/heap_rec.c
@@ -0,0 +1,386 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+/*
+ * __heap_addrem_recover --
+ * Recovery function for addrem.
+ *
+ * PUBLIC: int __heap_addrem_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_addrem_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep, *regionp;
+ db_pgno_t region_pgno;
+ int cmp_n, cmp_p, modified, oldspace, ret, space;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_addrem_print);
+ REC_INTRO(__heap_addrem_read, ip, 1);
+ region_pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+
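+	/*
+	 * Standard LSN-based recovery test: if cmp_p == 0 the page still
+	 * carries its pre-operation LSN, so the redo applies; if cmp_n == 0
+	 * the page carries this record's LSN, so the undo applies.
+	 */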
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_HEAP) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_HEAP)) {
+ /* We are either redo-ing an add or undoing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_pitem(dbc, pagep,
+ argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_HEAP) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_HEAP)) {
+ /* We are either undoing an add or redo-ing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_ditem(
+ dbc, pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
+ if (modified) {
+ REC_FGET(mpf, ip, region_pgno, &regionp, done);
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+
+ /* Update the available space bitmap, if necessary. */
+ HEAP_CALCSPACEBITS(
+ file_dbp, HEAP_FREESPACE(file_dbp, pagep), space);
+ oldspace = HEAP_SPACE(file_dbp, regionp,
+ argp->pgno - region_pgno - 1);
+ if (space != oldspace) {
+ REC_DIRTY(mpf, ip, dbc->priority, &regionp);
+ HEAP_SETSPACE(file_dbp,
+ regionp, argp->pgno - region_pgno - 1, space);
+ }
+ if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __heap_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __heap_pg_alloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_pg_alloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_pg_alloc_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ HEAPMETA *meta;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HEAPPG *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, ret, trunc;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ pagep = NULL;
+
+ REC_PRINT(__heap_pg_alloc_print);
+ REC_INTRO(__heap_pg_alloc_read, ip, 0);
+
+ trunc = 0;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else {
+ ret = 0;
+ goto done;
+ }
+ }
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = *lsnp;
+ if (argp->pgno > meta->dbmeta.last_pgno)
+ meta->dbmeta.last_pgno = argp->pgno;
+ if (argp->ptype == P_IHEAP &&
+ HEAP_REGION_NUM(file_dbp, argp->pgno) > meta->nregions)
+ meta->nregions = HEAP_REGION_NUM(file_dbp, argp->pgno);
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = argp->meta_lsn;
+ if (meta->dbmeta.last_pgno != argp->last_pgno) {
+ if (file_dbp->mpf->mfp->last_pgno ==
+ meta->dbmeta.last_pgno)
+ trunc = 1;
+ meta->dbmeta.last_pgno = argp->last_pgno;
+ }
+ if (argp->ptype == P_IHEAP &&
+ HEAP_REGION_NUM(file_dbp, argp->pgno) == meta->nregions) {
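+			/*
+			 * Walk nregions back until the restored last_pgno
+			 * again falls inside the highest remaining region.
+			 */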
+ do
+ meta->nregions--;
+ while (argp->last_pgno <
+ (meta->nregions - 1) * HEAP_REGION_SIZE(file_dbp));
+ }
+ }
+	/*
+	 * Fix up the allocated page.  If we're undoing and the page doesn't
+	 * exist, there's nothing to do; if the page does exist, we simply
+	 * zero it out.  Otherwise, if we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist) and update its
+	 * LSN.
+	 */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ ret = 0;
+ goto do_meta;
+ }
+ if ((ret = __memp_fget(mpf,
+ &argp->pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ if (DB_REDO(op) && IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, 0, argp->ptype);
+ LSN(pagep) = *lsnp;
+ } else if ((cmp_n == 0 || IS_ZERO_LSN(LSN(pagep))) && DB_UNDO(op)) {
+ if (argp->pgno == file_dbp->mpf->mfp->last_pgno)
+ trunc = 1;
+ else if (!IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memset(pagep, 0, file_dbp->pgsize);
+ }
+ }
+ /* If the page is newly allocated and aborted, give it back. */
+ if (pagep != NULL && (trunc == 1 ||
+ (IS_ZERO_LSN(LSN(pagep)) && TYPE(pagep) != P_IHEAP))) {
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ if ((ret = __memp_fget(mpf,
+ &argp->pgno, ip, NULL, DB_MPOOL_FREE, &pagep)) != 0)
+ goto out;
+ if (trunc == 0 && argp->pgno <= mpf->mfp->last_flushed_pgno) {
+ /*
+ * If this page is on disk we need to zero it.
+ * This is safe since we never free pages other
+ * than backing out an allocation, so there can
+ * not be a previous allocate and free of this
+ * page that is reflected on disk.
+ */
+ if ((ret = __db_zero_extend(env, mpf->fhp,
+ argp->pgno, argp->pgno, file_dbp->pgsize)) != 0)
+ goto out;
+ }
+ }
+	/*
+	 * Keep the region page's high_pgno up to date.  This is not logged,
+	 * so we always need to check it.
+	 */
+ if (DB_REDO(op)) {
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno);
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0)
+ goto out;
+ if (pagep->high_pgno >= argp->pgno)
+ goto done;
+ if ((ret = __memp_dirty(mpf, &pagep, ip, NULL,
+ DB_PRIORITY_UNCHANGED, 0)) != 0)
+ goto done;
+ pagep->high_pgno = argp->pgno;
+ }
+
+do_meta:
+ if (trunc == 1 &&
+ (ret = __memp_ftruncate(mpf, NULL, ip, meta->dbmeta.last_pgno + 1,
+ MP_TRUNC_RECOVER | MP_TRUNC_NOCACHE)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __heap_trunc_meta_recover --
+ * Recovery function for trunc_meta.
+ *
+ * PUBLIC: int __heap_trunc_meta_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_meta_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_trunc_meta_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HEAPMETA *meta;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_trunc_meta_print);
+ REC_INTRO(__heap_trunc_meta_read, ip, 1);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+ meta = (HEAPMETA *)pagep;
+
+ if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ meta->dbmeta.last_pgno = argp->last_pgno;
+ meta->dbmeta.key_count = argp->key_count;
+ meta->dbmeta.record_count = argp->record_count;
+ meta->curregion = argp->curregion;
+ meta->nregions = argp->nregions;
+ LSN(meta) = argp->pagelsn;
+ } else if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+		/* Set last_pgno to 1 to account for the region page. */
+ meta->dbmeta.last_pgno = 1;
+ meta->dbmeta.key_count = 0;
+ meta->dbmeta.record_count = 0;
+ meta->curregion = FIRST_HEAP_RPAGE;
+ meta->nregions = 1;
+ LSN(meta) = *lsnp;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn,
+ ip, PGNO_BASE_MD + 1, MP_TRUNC_NOCACHE)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __heap_trunc_page_recover --
+ * Recovery function for trunc_page.
+ *
+ * PUBLIC: int __heap_trunc_page_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_page_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_trunc_page_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ int cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_trunc_page_print);
+ REC_INTRO(__heap_trunc_page_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op))
+ goto done;
+ if ((ret = __memp_fget(mpf,
+ &argp->pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+
+ if (DB_UNDO(op) && IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ memcpy(pagep, argp->old_data.data, argp->old_data.size);
+ LSN(pagep) = argp->pagelsn;
+ } else if (cmp_p == 0 && DB_REDO(op)) {
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_FREE, &pagep)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
diff --git a/src/heap/heap_reclaim.c b/src/heap/heap_reclaim.c
new file mode 100644
index 00000000..8cedb223
--- /dev/null
+++ b/src/heap/heap_reclaim.c
@@ -0,0 +1,152 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __heap_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __heap_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__heap_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp;
+ DB_LOCK lock, meta_lock;
+ DB_MPOOLFILE *mpf;
+ DBT log_dbt;
+ HEAPHDR *hdr;
+ HEAPMETA *meta;
+ HEAPPG *pg;
+ db_pgno_t pgno;
+ int i, ret, t_ret;
+ u_int32_t count, next_region, region_size;
+
+ LOCK_INIT(lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ count = 0;
+ next_region = FIRST_HEAP_RPAGE;
+ region_size = HEAP_REGION_SIZE(dbp);
+
+	/* Traverse the entire database, starting with the metadata page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0) {
+		/* We never got the meta page; nothing to put at err. */
+		(void)__TLPUT(dbc, meta_lock);
+		return (ret);
+ }
+
+ for (;;) {
+ pgno++;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ break;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ break;
+ }
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(DBT));
+ log_dbt.data = pg;
+ log_dbt.size = dbp->pgsize;
+ if ((ret = __heap_trunc_page_log(dbp, dbc->txn,
+ &LSN(pg), 0, pgno,
+ &log_dbt, (pgno == next_region), &LSN(pg))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+
+ if (pgno == next_region) {
+ DB_ASSERT(dbp->env, TYPE(pg) == P_IHEAP);
+ next_region += region_size + 1;
+ } else {
+ /*
+ * We can't use pg->entries to calculate the record
+ * count, because it can include split records. So we
+ * check the header for each entry and only count
+ * non-split records and the first piece of split
+ * records. But if the page is empty, there's no work to
+ * do.
+ */
+ if (NUM_ENT(pg) != 0)
+ for (i = 0; i <= HEAP_HIGHINDX(pg); i++) {
+ if (HEAP_OFFSETTBL(dbp, pg)[i] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, pg, i);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST))
+ count++;
+ }
+ }
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ break;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_FREE, &pg)) != 0)
+ break;
+ }
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (countp != NULL && ret == 0)
+ *countp = count;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_trunc_meta_log(dbp, dbc->txn, &LSN(meta), 0,
+ meta->dbmeta.pgno, meta->dbmeta.last_pgno,
+ meta->dbmeta.key_count, meta->dbmeta.record_count,
+ meta->curregion, meta->nregions, &LSN(meta))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ meta->dbmeta.key_count = 0;
+ meta->dbmeta.record_count = 0;
+ meta->dbmeta.last_pgno = PGNO_BASE_MD + 1;
+ meta->curregion = 1;
+ meta->nregions = 1;
+
+ if ((ret = __memp_ftruncate(mpf, dbc->txn,
+ dbc->thread_info, PGNO_BASE_MD + 1, MP_TRUNC_NOCACHE)) != 0)
+ goto err;
+
+ /* Create the first region. */
+ pgno = PGNO_BASE_MD + 1;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ memset(pg, 0, dbp->pgsize);
+ P_INIT(pg,
+ dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ ret = __db_log_page(dbp, dbc->txn, &pg->lsn, pgno, (PAGE *)pg);
+ if ((t_ret = __memp_fput(
+ mpf, dbc->thread_info, pg, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+	if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/heap/heap_stat.c b/src/heap/heap_stat.c
new file mode 100644
index 00000000..9f4361a7
--- /dev/null
+++ b/src/heap/heap_stat.c
@@ -0,0 +1,289 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __heap_stat --
+ * Gather/print the heap statistics
+ *
+ * PUBLIC: int __heap_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__heap_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_STAT *sp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HEAPMETA *meta;
+ db_pgno_t metapgno;
+ int ret, t_ret, write_meta;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ meta = NULL;
+ LOCK_INIT(metalock);
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ sp = NULL;
+ ret = t_ret = write_meta = 0;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ /* Get the metadata page for the entire database. */
+ metapgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, metapgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &metapgno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ sp->heap_metaflags = meta->dbmeta.flags;
+ sp->heap_pagecnt = meta->dbmeta.last_pgno + 1;
+ sp->heap_pagesize = meta->dbmeta.pagesize;
+ sp->heap_magic = meta->dbmeta.magic;
+ sp->heap_version = meta->dbmeta.version;
+ sp->heap_nregions = meta->nregions;
+ sp->heap_regionsize = meta->region_size;
+
+ if (LF_ISSET(DB_FAST_STAT)) {
+ sp->heap_nrecs = meta->dbmeta.record_count;
+ } else {
+ /* Count the entries in the database. */
+ if ((ret = __heap_traverse(dbc, __heap_stat_callback, sp)) != 0)
+ goto err;
+
+ write_meta = !F_ISSET(dbp, DB_AM_RDONLY) &&
+ (!MULTIVERSION(dbp) || dbc->txn != NULL);
+ if (write_meta) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __db_lget(dbc,
+ 0, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &metapgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ meta->dbmeta.key_count = sp->heap_nrecs;
+ meta->dbmeta.record_count = sp->heap_nrecs;
+ }
+ }
+
+ *(DB_HEAP_STAT **)spp = sp;
+
+err: /* Discard metadata page. */
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0 && sp != NULL) {
+ __os_ufree(env, sp);
+		*(DB_HEAP_STAT **)spp = NULL;
+ }
+
+ return (ret);
+}
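+
+/*
+ * Example (a sketch): applications reach this through DB->stat; DB_FAST_STAT
+ * returns the record count cached on the meta page without a full
+ * traversal:
+ *
+ *	DB_HEAP_STAT *sp;
+ *
+ *	if ((ret = dbp->stat(dbp, NULL, &sp, DB_FAST_STAT)) == 0) {
+ *		printf("records: %lu\n", (u_long)sp->heap_nrecs);
+ *		free(sp);
+ *	}
+ */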
+
+/*
+ * __heap_stat_print --
+ * Display heap statistics.
+ *
+ * PUBLIC: int __heap_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__heap_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_STAT *sp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __heap_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Heap database information:");
+ }
+ __db_msg(env, "%lx\tHeap magic number", (u_long)sp->heap_magic);
+ __db_msg(env, "%lu\tHeap version number", (u_long)sp->heap_version);
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->heap_pagesize);
+ __db_dl(env,
+ "Number of records in the database", (u_long)sp->heap_nrecs);
+ __db_dl(env, "Number of database pages", (u_long)sp->heap_pagecnt);
+ __db_dl(env, "Number of database regions", (u_long)sp->heap_nregions);
+ __db_dl(env,
+ "Number of pages in a region", (u_long)sp->heap_regionsize);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __heap_print_cursor --
+ * Display the current cursor.
+ *
+ * PUBLIC: void __heap_print_cursor __P((DBC *));
+ */
+void
+__heap_print_cursor(dbc)
+ DBC *dbc;
+{
+ COMPQUIET(dbc, NULL);
+
+ return;
+}
+
+/*
+ * __heap_stat_callback --
+ * Statistics callback.
+ *
+ * PUBLIC: int __heap_stat_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__heap_stat_callback(dbc, h, cookie, putp)
+ DBC *dbc;
+ PAGE *h;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_HEAP_STAT *sp;
+ HEAPHDR *hdr;
+ int i;
+
+ dbp = dbc->dbp;
+ sp = cookie;
+ *putp = 0;
+
+ switch (TYPE(h)) {
+ case P_HEAP:
+ /*
+ * We can't just use NUM_ENT, otherwise we'd mis-count split
+ * records.
+ */
+ for (i = 0; i < NUM_ENT(h); i++) {
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST))
+ sp->heap_nrecs++;
+ }
+ break;
+ case P_HEAPMETA: /* Fallthrough */
+ case P_IHEAP: /* Fallthrough */
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__heap_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+/*
+ * __heap_traverse --
+ * Walk a Heap database.
+ *
+ * PUBLIC: int __heap_traverse __P((DBC *,
+ * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__heap_traverse(dbc, callback, cookie)
+ DBC *dbc;
+ int (*callback)__P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t pgno;
+ int already_put, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ LOCK_INIT(lock);
+ pgno = FIRST_HEAP_DPAGE;
+
+ for (;;) {
+ already_put = 0;
+ h = NULL;
+
+ if ((ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ break;
+ if ((ret = __memp_fget(mpf,
+ &pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ ret = callback(dbc, h, cookie, &already_put);
+
+ if (!already_put && (t_ret = __memp_fput(
+ mpf, dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ break;
+ pgno++;
+ }
+
+ return (ret);
+}
diff --git a/src/heap/heap_stub.c b/src/heap/heap_stub.c
new file mode 100644
index 00000000..b4feb2f3
--- /dev/null
+++ b/src/heap/heap_stub.c
@@ -0,0 +1,328 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id:
+ */
+
+#ifndef HAVE_HEAP
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+
+/*
+ * If the library wasn't compiled with the Heap access method, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+/*
+ * __db_no_heap_am --
+ * Error when a Berkeley DB build doesn't include the access method.
+ *
+ * PUBLIC: int __db_no_heap_am __P((ENV *));
+ */
+int
+__db_no_heap_am(env)
+ ENV *env;
+{
+ __db_errx(env,
+ "library build did not include support for the Heap access method");
+ return (DB_OPNOTSUP);
+}
+
+int
+__heap_db_create(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__heap_db_close(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__heap_get_heapsize(dbp, gbytes, bytes)
+ DB *dbp;
+ u_int32_t *gbytes, *bytes;
+{
+ COMPQUIET(gbytes, NULL);
+ COMPQUIET(bytes, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heapc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ COMPQUIET(new_dbc, NULL);
+ return (__db_no_heap_am(orig_dbc->env));
+}
+
+int
+__heapc_gsplit(dbc, dbt, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(bpp, NULL);
+ COMPQUIET(bpsz, NULL);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_backup(dbenv, dbp, ip, fp, handle, flags)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_FH *fp;
+ void *handle;
+ u_int32_t flags;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(ip, NULL);
+ COMPQUIET(fp, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbenv->env));
+}
+
+int
+__heapc_init(dbc)
+ DBC *dbc;
+{
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__heap_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__heap_meta2pgset(dbp, vdp, heapmeta, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *heapmeta;
+ DB *pgset;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(heapmeta, NULL);
+ COMPQUIET(pgset, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_metachk(dbp, name, hm)
+ DB *dbp;
+ const char *name;
+ HEAPMETA *hm;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(hm, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(name, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(base_pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+void
+__heap_print_cursor(dbc)
+ DBC *dbc;
+{
+ (void)__db_no_heap_am(dbc->env);
+}
+
+int
+__heapc_refresh(dbc)
+ DBC *dbc;
+{
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ COMPQUIET(countp, NULL);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_vrfy(dbp, vdbp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdbp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(h, NULL);
+ COMPQUIET(vdbp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(meta, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_exist()
+{
+ return (0);
+}
+#endif /* !HAVE_HEAP */
diff --git a/src/heap/heap_verify.c b/src/heap/heap_verify.c
new file mode 100644
index 00000000..ea15c28b
--- /dev/null
+++ b/src/heap/heap_verify.c
@@ -0,0 +1,468 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __heap_safe_gsplit __P((DB *, VRFY_DBINFO *, PAGE *, db_indx_t,
+ DBT *));
+static int __heap_verify_offset_cmp __P((const void *, const void *));
+
+/*
+ * __heap_vrfy_meta --
+ * Verify the heap-specific part of a metadata page.
+ *
+ * PUBLIC: int __heap_vrfy_meta __P((DB *, VRFY_DBINFO *, HEAPMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__heap_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ HEAP *h;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t last_pgno, max_pgno, npgs;
+ int isbad, ret;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ isbad = 0;
+ /*
+ * Heap can't be used in subdatabases, so if this isn't set
+ * something very odd is going on.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE))
+ EPRINT((dbp->env, DB_STR_A("1156",
+ "Page %lu: Heap databases must be one-per-file",
+ "%lu"), (u_long)pgno));
+
+ /*
+ * We have already checked the common fields in __db_vrfy_pagezero.
+	 * However, we used the on-disk metadata page, which may have been
+	 * stale.  We now have the page from mpool, so check that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+	 * Check that nregions is correct.  The last page in the database
+	 * must belong to the nregions-th region.
+ */
+ h = (HEAP *)dbp->heap_internal;
+ h->region_size = meta->region_size;
+ last_pgno = meta->dbmeta.last_pgno;
+ if (meta->nregions != HEAP_REGION_NUM(dbp, last_pgno)) {
+ EPRINT((dbp->env, DB_STR_A("1157",
+ "Page %lu: Number of heap regions incorrect",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ /*
+ * Check that last_pgno doesn't surpass the end of a fixed size
+ * database.
+ */
+ if (meta->gbytes != 0 || meta->bytes != 0) {
+ /*
+ * We don't have to worry about rounding with gbytes, as pgsize
+		 * is always a power of 2, but we round down if bytes isn't
+ * a multiple of the page size.
+ */
+ npgs = (db_pgno_t)(meta->gbytes * (GIGABYTE / dbp->pgsize));
+ npgs += (db_pgno_t)(meta->bytes / dbp->pgsize);
+ max_pgno = npgs - 1;
+ if (last_pgno > max_pgno) {
+ EPRINT((dbp->env, DB_STR_A("1158",
+ "Page %lu: last_pgno beyond end of fixed size heap",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ }
+
+err: if (LF_ISSET(DB_SALVAGE))
+ ret = __db_salvage_markdone(vdp, pgno);
+
+ return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __heap_vrfy --
+ * Verify a heap data or internal page.
+ *
+ * PUBLIC: int __heap_vrfy __P((DB *,
+ * PUBLIC: VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__heap_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ HEAPHDR *hdr;
+ int cnt, i, j, ret;
+ db_indx_t *offsets, *offtbl, end;
+
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0)
+		return (ret);
+
+ if (TYPE(h) == P_IHEAP)
+ /* Nothing to verify on a region page. */
+ return (0);
+
+ offtbl = HEAP_OFFSETTBL(dbp, h);
+
+ if ((ret = __os_malloc(dbp->env,
+ NUM_ENT(h) * sizeof(db_indx_t), &offsets)) != 0)
+		return (ret);
+
+ /*
+ * Build a sorted list of all the offsets in the table. Entries in the
+ * offset table are not always sorted. While we're here, check that
+ * flags are sane.
+ */
+ cnt = 0;
+ for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
+ if (offtbl[i] == 0)
+ /* Unused index. */
+ continue;
+ if (cnt >= NUM_ENT(h)) {
+ /* Unexpected entry in the offset table. */
+ EPRINT((dbp->env, DB_STR_A("1159",
+ "Page %lu: incorrect number of entries in page's offset table",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) &&
+ F_ISSET(hdr, HEAP_RECFIRST | HEAP_RECLAST)) {
+ EPRINT((dbp->env, DB_STR_A("1165",
+ "Page %lu: record %lu has invalid flags",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ offsets[cnt] = offtbl[i];
+ cnt++;
+ }
+ if (cnt == 0) {
+ /* Empty page. */
+ ret = 0;
+ goto err;
+ }
+ qsort(offsets, cnt, sizeof(db_indx_t), __heap_verify_offset_cmp);
+
+ /*
+ * Now check that the record at each offset does not overlap the next
+ * record. We can't use the P_ENTRY macro because we've kept track of
+ * the offsets, not the indexes.
+ */
+ for (i = 0; i < cnt - 1; i++) {
+ hdr = (HEAPHDR *)((u_int8_t *)h + offsets[i]);
+ end = offsets[i] + HEAP_HDRSIZE(hdr) + hdr->size;
+ if (end > offsets[i+1]) {
+ /*
+ * Find the record number for this offset, for the error
+ * msg.
+ */
+ for (j = 0; j < HEAP_HIGHINDX(h); j++)
+ if (offtbl[j] == offsets[i])
+ break;
+ EPRINT((dbp->env, DB_STR_A("1160",
+ "Page %lu: record %lu (length %lu) overlaps next record",
+ "%lu %lu %lu"),
+ (u_long)pgno, (u_long)j, (u_long)hdr->size));
+ ret = DB_VERIFY_BAD;
+ }
+ }
+
+	/* Finally, check that the last record doesn't overflow the page. */
+ hdr = (HEAPHDR *)((u_int8_t *)h + offsets[i]);
+ end = offsets[i] + HEAP_HDRSIZE(hdr) + hdr->size;
+ if (end > dbp->pgsize) {
+ /* Find the record number for this offset, for the error msg. */
+ for (j = 0; j < HEAP_HIGHINDX(h); j++)
+ if (offtbl[j] == offsets[i])
+ break;
+ EPRINT((dbp->env, DB_STR_A("1161",
+ "Page %lu: record %lu (length %lu) beyond end of page",
+ "%lu %lu %lu"),
+ (u_long)pgno, (u_long)j, (u_long)hdr->size));
+ ret = DB_VERIFY_BAD;
+ }
+
+ err: __os_free(dbp->env, offsets);
+ return (ret);
+}
+
+static int
+__heap_verify_offset_cmp(off1, off2)
+ const void *off1;
+ const void *off2;
+{
+ return (*(db_indx_t *)off1 - *(db_indx_t *)off2);
+}
+
+/*
+ * __heap_vrfy_structure --
+ * Verify the structure of a heap database.
+ *
+ * PUBLIC: int __heap_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+ */
+int
+__heap_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i, next_region, high_pgno;
+	int isbad, ret, t_ret;
+
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+ return (ret);
+
+ if (pip->type != P_HEAPMETA) {
+ EPRINT((dbp->env, DB_STR_A("1162",
+ "Page %lu: heap database has no meta page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_pgset_inc(
+ vdp->pgset, vdp->thread_info, vdp->txn, 0)) != 0)
+ goto err;
+
+ /*
+ * Not much structure to verify. Just make sure region pages are where
+ * they're supposed to be. If we don't have FTRUNCATE, there could be
+ * a zero'd out page where the region page is supposed to be.
+ */
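+	/*
+	 * With region size R, region pages land at pgno 1, R + 2, 2R + 3,
+	 * and so on (assuming FIRST_HEAP_RPAGE is 1): each region page is
+	 * followed by R data pages.
+	 */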
+ next_region = FIRST_HEAP_RPAGE;
+ high_pgno = 0;
+ for (i = 1; i <= vdp->last_pgno; i++) {
+ /* Send feedback to the application about our progress. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 ||
+ (ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ return (ret);
+ if (i != next_region &&
+ pip->type != P_HEAP && pip->type != P_INVALID) {
+ EPRINT((dbp->env, DB_STR_A("1163",
+ "Page %lu: heap database page of incorrect type %lu",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ } else if (i == next_region && pip->type != P_IHEAP
+#ifndef HAVE_FTRUNCATE
+ && pip->type != P_INVALID
+#endif
+ ) {
+ EPRINT((dbp->env, DB_STR_A("1164",
+ "Page %lu: heap database missing region page (page type %lu)",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, i)) != 0)
+ goto err;
+
+ if (i == next_region) {
+ high_pgno = pip->prev_pgno;
+ next_region += HEAP_REGION_SIZE(dbp) + 1;
+ } else if (pip->type != P_INVALID && i > high_pgno) {
+ EPRINT((dbp->env, DB_STR_A("1166",
+			    "Page %lu: heap database page beyond high page in region",
+ "%lu"), (u_long) i));
+ isbad = 1;
+ }
+ }
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __heap_salvage --
+ * Safely dump out anything that looks like a record on an alleged heap
+ * data page.
+ *
+ * PUBLIC: int __heap_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBT dbt;
+ HEAPHDR *hdr;
+ db_indx_t i, *offtbl;
+ int err_ret, ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ memset(&dbt, 0, sizeof(DBT));
+
+ offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, h);
+ err_ret = ret = t_ret = 0;
+
+ /*
+	 * Walk the page, dumping non-split records and retrieving split
+	 * records when the first piece is encountered.
+ */
+ for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
+ if (offtbl[i] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ if (!F_ISSET(hdr, HEAP_RECFIRST))
+ continue;
+ /*
+			 * We don't completely trust hdr->tsize if it's huge;
+			 * gsplit() is able to realloc as needed.
+ */
+ dbt.size = ((HEAPSPLITHDR *)hdr)->tsize;
+ if (dbt.size > dbp->pgsize * 4)
+ dbt.size = dbp->pgsize * 4;
+ if ((ret =
+ __os_malloc(dbp->env, dbt.size, &dbt.data)) != 0)
+ goto err;
+			if ((t_ret =
+			    __heap_safe_gsplit(dbp, vdp, h, i, &dbt)) != 0)
+				err_ret = t_ret;
+ } else {
+ dbt.data = (u_int8_t *)hdr + HEAP_HDRSIZE(hdr);
+ dbt.size = hdr->size;
+ }
+
+ if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ if (F_ISSET(hdr, HEAP_RECSPLIT))
+ __os_free(dbp->env, dbt.data);
+ }
+
+err: if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+ return ((ret == 0 && err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __heap_safe_gsplit --
+ *	Given a page and an index, retrieve a split record.
+ */
+static int
+__heap_safe_gsplit(dbp, vdp, h, i, dbt)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_indx_t i;
+ DBT *dbt;
+{
+ DB_MPOOLFILE *mpf;
+ HEAPSPLITHDR *hdr;
+ int gotpg, ret, t_ret;
+ u_int32_t bufsz, reclen;
+ u_int8_t *buf;
+
+ mpf = dbp->mpf;
+ buf = dbt->data;
+ bufsz = dbt->size;
+ dbt->size = 0;
+ ret = 0;
+
+ gotpg = 0;
+ for (;;) {
+ hdr = (HEAPSPLITHDR *)P_ENTRY(dbp, h, i);
+ reclen = hdr->std_hdr.size;
+ /* First copy the data from this page */
+ if (dbt->size + reclen > bufsz) {
+ bufsz = dbt->size + reclen;
+ if ((ret = __os_realloc(
+ dbp->env, bufsz, &dbt->data)) != 0)
+ goto err;
+ buf = (u_int8_t *)dbt->data + dbt->size;
+ }
+ memcpy(buf, (u_int8_t *)hdr + sizeof(HEAPSPLITHDR), reclen);
+ buf += reclen;
+ dbt->size += reclen;
+
+ /* If we're not at the end of the record, grab the next page. */
+ if (F_ISSET(&hdr->std_hdr, HEAP_RECLAST))
+ break;
+ if (gotpg && (ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ gotpg = 0;
+ if ((ret = __memp_fget(mpf,
+ &hdr->nextpg, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ gotpg = 1;
+ i = hdr->nextindx;
+ }
+
+err: if (gotpg && (t_ret = __memp_fput(
+ mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+		ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __heap_meta2pgset --
+ *	Given a known-good meta page, populate pgset with the db_pgno_t's
+ * corresponding to the pages in the heap. This is just all pages in the
+ * database.
+ *
+ * PUBLIC: int __heap_meta2pgset __P((DB *, VRFY_DBINFO *, HEAPMETA *, DB *));
+ */
+int
+__heap_meta2pgset(dbp, vdp, heapmeta, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *heapmeta;
+ DB *pgset;
+{
+ db_pgno_t pgno, last;
+ int ret;
+
+ COMPQUIET(dbp, NULL);
+
+ last = heapmeta->dbmeta.last_pgno;
+ ret = 0;
+
+ for (pgno = 1; pgno <= last; pgno++)
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ break;
+ return (ret);
+}
diff --git a/src/hmac/hmac.c b/src/hmac/hmac.c
new file mode 100644
index 00000000..4febfc60
--- /dev/null
+++ b/src/hmac/hmac.c
@@ -0,0 +1,223 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Some parts of this code originally written by Adam Stubblefield,
+ * -- astubble@rice.edu.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h" /* for hash.h only */
+#include "dbinc/hash.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+
+#define HMAC_OUTPUT_SIZE 20
+#define HMAC_BLOCK_SIZE 64
+
+static void __db_hmac __P((u_int8_t *, u_int8_t *, size_t, u_int8_t *));
+
+/*
+ * !!!
+ * All of these functions use a ctx structure on the stack. The __db_SHA1Init
+ * call does not initialize the 64-byte buffer portion of it. The
+ * underlying SHA1 functions will properly pad the buffer if the data length
+ * is less than 64-bytes, so there isn't a chance of reading uninitialized
+ * memory.  Although it would be cleaner to do a memset(ctx.buffer, 0, 64),
+ * we do not want to incur that performance penalty if we don't have to.
+ */
+
+/*
+ * __db_hmac --
+ * Do a hashed MAC.
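+ *	This is the standard HMAC construction (RFC 2104) instantiated
+ *	with SHA-1: H(K ^ opad, H(K ^ ipad, data)) over a 64-byte block.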
+ */
+static void
+__db_hmac(k, data, data_len, mac)
+ u_int8_t *k, *data, *mac;
+ size_t data_len;
+{
+ SHA1_CTX ctx;
+ u_int8_t key[HMAC_BLOCK_SIZE];
+ u_int8_t ipad[HMAC_BLOCK_SIZE];
+ u_int8_t opad[HMAC_BLOCK_SIZE];
+ u_int8_t tmp[HMAC_OUTPUT_SIZE];
+ int i;
+
+ memset(key, 0x00, HMAC_BLOCK_SIZE);
+ memset(ipad, 0x36, HMAC_BLOCK_SIZE);
+ memset(opad, 0x5C, HMAC_BLOCK_SIZE);
+
+ memcpy(key, k, HMAC_OUTPUT_SIZE);
+
+ for (i = 0; i < HMAC_BLOCK_SIZE; i++) {
+ ipad[i] ^= key[i];
+ opad[i] ^= key[i];
+ }
+
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, ipad, HMAC_BLOCK_SIZE);
+ __db_SHA1Update(&ctx, data, data_len);
+ __db_SHA1Final(tmp, &ctx);
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, opad, HMAC_BLOCK_SIZE);
+ __db_SHA1Update(&ctx, tmp, HMAC_OUTPUT_SIZE);
+ __db_SHA1Final(mac, &ctx);
+ return;
+}
+
+/*
+ * __db_chksum --
+ * Create a MAC/SHA1 checksum.
+ *
+ * PUBLIC: void __db_chksum __P((void *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t *, u_int8_t *));
+ */
+void
+__db_chksum(hdr, data, data_len, mac_key, store)
+ void *hdr;
+ u_int8_t *data;
+ size_t data_len;
+ u_int8_t *mac_key;
+ u_int8_t *store;
+{
+ int sumlen;
+ u_int32_t hash4;
+
+	/*
+	 * Since the checksum may sit on the very page of data we are
+	 * checksumming (and so be overwritten by the result), we zero out
+	 * the checksum value first so that a known value is there when we
+	 * later verify the checksum.
+	 * If we are passed a log header, XOR in the prev and len fields so
+	 * we have some redundancy on these fields.  Mostly we need to be
+	 * sure that we detect a race when doing hot backups and reading a
+	 * live log file.
+	 */
+ if (mac_key == NULL)
+ sumlen = sizeof(u_int32_t);
+ else
+ sumlen = DB_MAC_KEY;
+ if (hdr == NULL)
+ memset(store, 0, sumlen);
+ else
+		store = ((HDR *)hdr)->chksum;
+ if (mac_key == NULL) {
+ /* Just a hash, no MAC */
+ hash4 = __ham_func4(NULL, data, (u_int32_t)data_len);
+ if (hdr != NULL)
+ hash4 ^= ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len;
+ memcpy(store, &hash4, sumlen);
+ } else {
+ __db_hmac(mac_key, data, data_len, store);
+		if (hdr != NULL) {
+ ((int *)store)[0] ^= ((HDR *)hdr)->prev;
+ ((int *)store)[1] ^= ((HDR *)hdr)->len;
+ }
+ }
+ return;
+}
+/*
+ * __db_derive_mac --
+ * Create a MAC/SHA1 key.
+ *
+ * PUBLIC: void __db_derive_mac __P((u_int8_t *, size_t, u_int8_t *));
+ */
+void
+__db_derive_mac(passwd, plen, mac_key)
+ u_int8_t *passwd;
+ size_t plen;
+ u_int8_t *mac_key;
+{
+ SHA1_CTX ctx;
+
+ /* Compute the MAC key. mac_key must be 20 bytes. */
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Update(&ctx, (u_int8_t *)DB_MAC_MAGIC, strlen(DB_MAC_MAGIC));
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Final(mac_key, &ctx);
+
+ return;
+}
+
+/*
+ * __db_check_chksum --
+ * Verify a checksum.
+ *
+ * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch.
+ *
+ * PUBLIC: int __db_check_chksum __P((ENV *,
+ * PUBLIC: void *, DB_CIPHER *, u_int8_t *, void *, size_t, int));
+ */
+int
+__db_check_chksum(env, hdr, db_cipher, chksum, data, data_len, is_hmac)
+ ENV *env;
+ void *hdr;
+ DB_CIPHER *db_cipher;
+ u_int8_t *chksum;
+ void *data;
+ size_t data_len;
+ int is_hmac;
+{
+ int ret;
+ size_t sum_len;
+ u_int32_t hash4;
+ u_int8_t *mac_key, old[DB_MAC_KEY], new[DB_MAC_KEY];
+
+ /*
+ * If we are just doing checksumming and not encryption, then checksum
+ * is 4 bytes. Otherwise, it is DB_MAC_KEY size. Check for illegal
+ * combinations of crypto/non-crypto checksums.
+ */
+ if (is_hmac == 0) {
+ if (db_cipher != NULL) {
+ __db_errx(env, DB_STR("0195",
+ "Unencrypted checksum with a supplied encryption key"));
+ return (EINVAL);
+ }
+ sum_len = sizeof(u_int32_t);
+ mac_key = NULL;
+ } else {
+ if (db_cipher == NULL) {
+ __db_errx(env, DB_STR("0196",
+ "Encrypted checksum: no encryption key specified"));
+ return (EINVAL);
+ }
+ sum_len = DB_MAC_KEY;
+ mac_key = db_cipher->mac_key;
+ }
+
+ /*
+ * !!!
+ * Since the checksum might be on the page, we need to have known data
+ * there so that we can generate the same original checksum. We zero
+ * it out, just like we do in __db_chksum above.
+ * If there is a log header, XOR the prev and len fields.
+ */
+ if (hdr == NULL) {
+ memcpy(old, chksum, sum_len);
+ memset(chksum, 0, sum_len);
+ chksum = old;
+ }
+
+ if (mac_key == NULL) {
+ /* Just a hash, no MAC */
+ hash4 = __ham_func4(NULL, data, (u_int32_t)data_len);
+ if (hdr != NULL)
+ LOG_HDR_SUM(0, hdr, &hash4);
+ ret = memcmp((u_int32_t *)chksum, &hash4, sum_len) ? -1 : 0;
+ } else {
+ __db_hmac(mac_key, data, data_len, new);
+ if (hdr != NULL)
+ LOG_HDR_SUM(1, hdr, new);
+ ret = memcmp(chksum, new, sum_len) ? -1 : 0;
+ }
+
+ return (ret);
+}
diff --git a/src/hmac/sha1.c b/src/hmac/sha1.c
new file mode 100644
index 00000000..76069694
--- /dev/null
+++ b/src/hmac/sha1.c
@@ -0,0 +1,289 @@
+/*
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/hmac.h"
+
+/*
+SHA-1 in C
+By Steve Reid <sreid@sea-to-sky.net>
+100% Public Domain
+
+-----------------
+Modified 7/98
+By James H. Brown <jbrown@burgoyne.com>
+Still 100% Public Domain
+
+Corrected a problem which generated improper hash values on 16 bit machines.
+Routine SHA1Update changed from
+	void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned int len)
+to
+	void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned long len)
+
+The 'len' parameter was declared an int which works fine on 32 bit machines.
+However, on 16 bit machines an int is too small for the shifts being done
+against it.  This caused the hash function to generate incorrect values if
+len was greater than 8191 (8K - 1) due to the 'len << 3' on line 3 of
+SHA1Update().
+
+Since the file IO in main() reads 16K at a time, any file 8K or larger would
+be guaranteed to generate the wrong hash (e.g. Test Vector #3, a million
+"a"s).
+
+I also changed the declaration of variables i & j in SHA1Update to
+unsigned long from unsigned int for the same reason.
+
+These changes should make no difference to any 32 bit implementations since
+an int and a long are the same size in those environments.
+
+--
+I also corrected a few compiler warnings generated by Borland C.
+1. Added #include <process.h> for exit() prototype
+2. Removed unused variable 'j' in SHA1Final
+3. Changed exit(0) to return (0) at end of main.
+
+ALL changes I made can be located by searching for comments containing 'JHB'
+-----------------
+Modified 8/98
+By Steve Reid <sreid@sea-to-sky.net>
+Still 100% public domain
+
+1- Removed #include <process.h> and used return () instead of exit()
+2- Fixed overwriting of finalcount in SHA1Final() (discovered by Chris Hall)
+3- Changed email address from steve@edmweb.com to sreid@sea-to-sky.net
+
+-----------------
+Modified 4/01
+By Saul Kravitz <Saul.Kravitz@celera.com>
+Still 100% PD
+Modified to run on Compaq Alpha hardware.
+
+*/
+
+/*
+Test Vectors (from FIPS PUB 180-1)
+"abc"
+ A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+ 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+A million repetitions of "a"
+ 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+*/
+
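+/*
+ * A quick caller-side self-check against the first vector above might look
+ * like this (a sketch, not part of this file):
+ *
+ *	SHA1_CTX ctx;
+ *	unsigned char digest[20];
+ *
+ *	__db_SHA1Init(&ctx);
+ *	__db_SHA1Update(&ctx, (unsigned char *)"abc", 3);
+ *	__db_SHA1Final(digest, &ctx);
+ *
+ * After which digest holds A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D.
+ */
+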
+#define SHA1HANDSOFF
+
+/* #include <process.h> */ /* prototype for exit() - JHB */
+/* Using return () instead of exit() - SWR */
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#define blk0(i) is_bigendian ? block->l[i] : \
+ (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+ |(rol(block->l[i],8)&0x00FF00FF))
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
+ ^block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5); \
+ w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5); \
+ w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5); \
+ w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+#ifdef VERBOSE /* SAK */
+static void __db_SHAPrintContext __P((SHA1_CTX *, char *));
+static void
+__db_SHAPrintContext(context, msg)
+ SHA1_CTX *context;
+ char *msg;
+{
+ printf("%s (%d,%d) %x %x %x %x %x\n",
+ msg,
+ context->count[0], context->count[1],
+ context->state[0],
+ context->state[1],
+ context->state[2],
+ context->state[3],
+ context->state[4]);
+}
+#endif
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+
+/*
+ * __db_SHA1Transform --
+ *
+ * PUBLIC: void __db_SHA1Transform __P((u_int32_t *, unsigned char *));
+ */
+void
+__db_SHA1Transform(state, buffer)
+ u_int32_t *state;
+ unsigned char *buffer;
+{
+	u_int32_t a, b, c, d, e;
+	typedef union {
+		unsigned char c[64];
+		u_int32_t l[16];
+	} CHAR64LONG16;
+	CHAR64LONG16 *block;
+	int is_bigendian;
+#ifdef SHA1HANDSOFF
+	unsigned char workspace[64];
+
+	block = (CHAR64LONG16 *)workspace;
+	memcpy(block, buffer, 64);
+#else
+	block = (CHAR64LONG16 *)buffer;
+#endif
+ is_bigendian = __db_isbigendian();
+
+ /* Copy context->state[] to working vars */
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ /* 4 rounds of 20 operations each. Loop unrolled. */
+ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
+ R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
+ R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
+ R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
+ R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
+ R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
+ R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
+ R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
+ R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
+ R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
+ R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
+ R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
+ R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
+ R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
+ R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
+ R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
+ R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
+ R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
+ R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
+ R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
+ /* Add the working vars back into context.state[] */
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ /* Wipe variables */
+ a = b = c = d = e = 0;
+}
+
+/* SHA1Init - Initialize new context */
+
+/*
+ * __db_SHA1Init --
+ * Initialize new context
+ *
+ * PUBLIC: void __db_SHA1Init __P((SHA1_CTX *));
+ */
+void
+__db_SHA1Init(context)
+ SHA1_CTX *context;
+{
+ /* SHA1 initialization constants */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xEFCDAB89;
+ context->state[2] = 0x98BADCFE;
+ context->state[3] = 0x10325476;
+ context->state[4] = 0xC3D2E1F0;
+ context->count[0] = context->count[1] = 0;
+}
+
+/* Run your data through this. */
+
+/*
+ * __db_SHA1Update --
+ * Run your data through this.
+ *
+ * PUBLIC: void __db_SHA1Update __P((SHA1_CTX *, unsigned char *,
+ * PUBLIC: size_t));
+ */
+void
+__db_SHA1Update(context, data, len)
+ SHA1_CTX *context;
+ unsigned char *data;
+ size_t len;
+{
+	u_int32_t i, j;	/* JHB */
+
+#ifdef VERBOSE
+ __db_SHAPrintContext(context, DB_STR_P("before"));
+#endif
+ j = (context->count[0] >> 3) & 63;
+ if ((context->count[0] += (u_int32_t)len << 3) < (len << 3))
+ context->count[1]++;
+ context->count[1] += (u_int32_t)(len >> 29);
+ if ((j + len) > 63) {
+ memcpy(&context->buffer[j], data, (i = 64-j));
+ __db_SHA1Transform(context->state, context->buffer);
+ for ( ; i + 63 < len; i += 64) {
+ __db_SHA1Transform(context->state, &data[i]);
+ }
+ j = 0;
+	} else
+		i = 0;
+ memcpy(&context->buffer[j], &data[i], len - i);
+#ifdef VERBOSE
+ __db_SHAPrintContext(context, DB_STR_P("after "));
+#endif
+}
+
+/* Add padding and return the message digest. */
+
+/*
+ * __db_SHA1Final --
+ * Add padding and return the message digest.
+ *
+ * PUBLIC: void __db_SHA1Final __P((unsigned char *, SHA1_CTX *));
+ */
+void
+__db_SHA1Final(digest, context)
+ unsigned char *digest;
+ SHA1_CTX *context;
+{
+	u_int32_t i;	/* JHB */
+	unsigned char finalcount[8];
+
+ for (i = 0; i < 8; i++) {
+ finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 0 : 1)]
+ >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */
+ }
+ __db_SHA1Update(context, (unsigned char *)"\200", 1);
+ while ((context->count[0] & 504) != 448) {
+ __db_SHA1Update(context, (unsigned char *)"\0", 1);
+ }
+	/* Should cause a SHA1Transform(). */
+	__db_SHA1Update(context, finalcount, 8);
+ for (i = 0; i < 20; i++) {
+ digest[i] = (unsigned char)
+ ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255);
+ }
+ /* Wipe variables */
+ i = 0; /* JHB */
+ memset(context->buffer, 0, 64);
+ memset(context->state, 0, 20);
+ memset(context->count, 0, 8);
+ memset(finalcount, 0, 8); /* SWR */
+#ifdef SHA1HANDSOFF /* make SHA1Transform overwrite its own static vars */
+ __db_SHA1Transform(context->state, context->buffer);
+#endif
+}
+
+/*************************************************************/
diff --git a/src/lock/Design b/src/lock/Design
new file mode 100644
index 00000000..f82bc7e8
--- /dev/null
+++ b/src/lock/Design
@@ -0,0 +1,301 @@
+Synchronization in the Locking Subsystem
+
+This document describes how we implemented fine-grain locking
+in the lock manager (that is, locking on a hash bucket level instead of
+locking the entire region). We found that the increase in concurrency
+was not sufficient to warrant the increase in complexity or the additional
+cost of performing each lock operation. Therefore, we don't use this
+any more. Should we have to do fine-grain locking in a future release,
+this would be a reasonable starting point.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+1. Data structures
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+The lock manager maintains three different structures:
+
+Objects (__db_lockobj):
+ Describes an object that is locked. When used with DB, this consists
+ of a __db_ilock (a file identifier and a page number).
+
+Lockers (__db_locker):
+ Identifies a specific locker ID and maintains the head of a list of
+	locks held by a locker (for use during transaction commit/abort).
+
+Locks (__db_lock):
+ Describes a particular object lock held on behalf of a particular
+	locker ID.
+
+Objects and Lockers reference Locks.
+
+These structures are organized via two synchronized hash tables. Each
+hash table consists of two physical arrays: the array of actual hash
+buckets and an array of mutexes so we can lock individual buckets, rather
+than the whole table.
+
+One hash table contains Objects and the other hash table contains Lockers.
+Objects contain two lists of Locks, waiters and holders: holders currently
+hold a lock on the Object, waiters are Locks waiting to be granted.
+Each Locker heads a singly linked list connecting the Locks held on behalf
+of that specific locker ID.
+
+In the diagram below:
+
+Locker ID #1 holds a lock on Object #1 (L1) and Object #2 (L5), and is
+waiting on a lock on Object #1 (L3).
+
+Locker ID #2 holds a lock on Object #1 (L2) and is waiting on a lock for
+Object #2 (L7).
+
+Locker ID #3 is waiting for a lock on Object #2 (L6).
+
+ OBJECT -----------------------
+ HASH | |
+ ----|------------- |
+ ________ _______ | | ________ | |
+ | |-->| O1 |--|---|-->| O2 | | |
+ |_______| |_____| | | |______| V |
+ | | W H--->L1->L2 W H--->L5 | holders
+ |_______| | | | | V
+ | | ------->L3 \ ------->L6------>L7 waiters
+ |_______| / \ \
+ . . / \ \
+ . . | \ \
+ . . | \ -----------
+ |_______| | -------------- |
+ | | ____|____ ___|_____ _|______
+ |_______| | | | | | |
+ | | | LID1 | | LID2 | | LID3 |
+ |_______| |_______| |_______| |______|
+ ^ ^ ^
+ | | |
+ ___|________________________|________|___
+ LOCKER | | | | | | | | |
+ HASH | | | | | | | | |
+ | | | | | | | | |
+ |____|____|____|____|____|____|____|____|
+
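+In rough C terms, the structures above look something like the following
+(an abbreviated sketch for this document, not the shipped declarations;
+only the fields discussed here are shown):
+
+	typedef struct __db_lockobj {		/* A locked object. */
+		SH_DBT	lockobj;		/* Identifying data. */
+		SH_TAILQ_HEAD(...) holders;	/* Granted Locks. */
+		SH_TAILQ_HEAD(...) waiters;	/* Blocked Locks. */
+	} DB_LOCKOBJ;
+
+	typedef struct __db_locker {		/* A locker ID. */
+		u_int32_t id;
+		SH_LIST_HEAD(...) heldby;	/* Locks held by this ID. */
+	} DB_LOCKER;
+
+	struct __db_lock {			/* A single lock. */
+		roff_t	holder;			/* Owning Locker. */
+		roff_t	obj;			/* Locked Object. */
+		db_lockmode_t mode;		/* Requested/granted mode. */
+		SH_TAILQ_ENTRY links;		/* Object holder/waiter list. */
+		SH_LIST_ENTRY locker_links;	/* Locker's list of Locks. */
+	};
+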
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+2. Synchronization
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+There are four types of mutexes in the subsystem.
+
+Object mutexes:
+ These map one-to-one to each bucket in the Object hash table.
+ Holding a mutex on an Object bucket secures all the Objects in
+ that bucket as well as the Lock structures linked from those
+ Objects. All fields in the Locks EXCEPT the Locker links (the
+ links that attach Locks by Locker ID) are protected by these
+ mutexes.
+
+Locker mutexes:
+ These map one-to-one to each bucket in the Locker hash table.
+ Holding a mutex on a Locker bucket secures the Locker structures
+ and the Locker links in the Locks.
+
+Memory mutex:
+	This mutex allows calls to allocate/free memory, i.e., calls to
+ __db_shalloc and __db_shalloc_free, as well as manipulation of
+ the Object, Locker and Lock free lists.
+
+Region mutex:
+ This mutex is currently only used to protect the locker ids.
+ It may also be needed later to provide exclusive access to
+ the region for deadlock detection.
+
+Creating or removing a Lock requires locking both the Object lock and the
+Locker lock (and eventually the shalloc lock to return the item to the
+free list).
+
+The locking hierarchy is as follows:
+
+ The Region mutex may never be acquired after any other mutex.
+
+ The Object mutex may be acquired after the Region mutex.
+
+ The Locker mutex may be acquired after the Region and Object
+ mutexes.
+
+ The Memory mutex may be acquired after any mutex.
+
+So, if both an Object mutex and a Locker mutex are going to be acquired,
+the Object mutex must be acquired first.
+
+The Memory mutex may be acquired after any other mutex, but no other mutexes
+can be acquired once the Memory mutex is held.
+
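+For example, the create-lock path obeys the hierarchy as follows (pseudo-C;
+the mutex names are illustrative, not shipped identifiers):
+
+	MUTEX_LOCK(obj_bucket_mtx);	/* Object before Locker. */
+	MUTEX_LOCK(locker_bucket_mtx);
+	MUTEX_LOCK(memory_mtx);		/* Memory may be acquired last... */
+	lock = take_lock_off_free_list();
+	MUTEX_UNLOCK(memory_mtx);	/* ...but nothing follows it. */
+	link_lock_into_object_and_locker(lock);
+	MUTEX_UNLOCK(locker_bucket_mtx);
+	MUTEX_UNLOCK(obj_bucket_mtx);
+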
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+3. The algorithms:
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+The locking subsystem supports four basic operations:
+ Get a Lock (lock_get)
+
+ Release a Lock (lock_put)
+
+ Release all the Locks on a specific Object (lock_vec)
+
+ Release all the Locks for a specific Locker (lock_vec)
+
+Get a lock:
+ Acquire Object bucket mutex.
+ Acquire Locker bucket mutex.
+
+ Acquire Memory mutex.
+ If the Object does not exist
+ Take an Object off the freelist.
+ If the Locker doesn't exist
+ Take a Locker off the freelist.
+ Take a Lock off the free list.
+ Release Memory mutex.
+
+ Add Lock to the Object list.
+ Add Lock to the Locker list.
+ Release Locker bucket mutex
+
+ If the lock cannot be granted
+ Release Object bucket mutex
+ Acquire lock mutex (blocks)
+
+ Acquire Object bucket mutex
+	If lock acquisition did not succeed (e.g., deadlock)
+ Acquire Locker bucket mutex
+ If locker should be destroyed
+ Remove locker from hash table
+ Acquire Memory mutex
+ Return locker to free list
+ Release Memory mutex
+ Release Locker bucket mutex
+
+ If object should be released
+ Acquire Memory mutex
+ Return object to free list
+ Release Memory mutex
+
+ Release Object bucket mutex
+
+Release a lock:
+ Acquire Object bucket mutex.
+ (Requires that we be able to find the Object hash bucket
+ without looking inside the Lock itself.)
+
+	If releasing a single lock and the user-provided generation number
+ doesn't match the Lock's generation number, the Lock has been reused
+ and we return failure.
+
+ Enter lock_put_internal:
+ if the Lock is still on the Object's lists:
+ Increment Lock's generation number.
+ Remove Lock from the Object's list (NULL link fields).
+ Promote locks for the Object.
+
+ Enter locker_list_removal
+ Acquire Locker bucket mutex.
+ If Locker doesn't exist:
+ Release Locker bucket mutex
+ Release Object bucket mutex
+ Return error.
+ Else if Locker marked as deleted:
+ dont_release = TRUE
+ Else
+ Remove Lock from Locker list.
+ If Locker has no more locks
+ Remove Locker from table.
+ Acquire Memory mutex.
+ Return Locker to free list
+ Release Memory mutex
+ Release Locker bucket mutex.
+ Exit locker_list_removal
+
+ If (!dont_release)
+ Acquire Memory mutex
+ Return Lock to free list
+ Release Memory mutex
+
+ Exit lock_put_internal
+
+ Release Object bucket mutex
+
+Release all the Locks on a specific Object (lock_vec, DB_PUT_ALL_OBJ):
+
+ Acquire Object bucket mutex.
+
+ For each lock on the waiter list:
+ lock_put_internal
+ For each lock on the holder list:
+ lock_put_internal
+
+ Release Object bucket mutex.
+
+Release all the Locks for a specific Locker (lock_vec, DB_PUT_ALL):
+
+ Acquire Locker bucket mutex.
+ Mark Locker deleted.
+ Release Locker mutex.
+
+ For each lock on the Locker's list:
+ Remove from locker's list
+ (The lock could get put back on the free list in
+ lock_put and then could get reallocated and the
+ act of setting its locker links could clobber us.)
+ Perform "Release a Lock" above: skip locker_list_removal.
+
+ Acquire Locker bucket mutex.
+ Remove Locker
+ Release Locker mutex.
+
+ Acquire Memory mutex
+ Return Locker to free list
+ Release Memory mutex
+
+Deadlock detection (lock_detect):
+
+ For each bucket in Object table
+ Acquire the Object bucket mutex.
+ create waitsfor
+
+ For each bucket in Object table
+ Release the Object mutex.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+FAQ:
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+Q: Why do you need generation numbers?
+A: If a lock has been released due to a transaction abort (potentially in a
+   different process), and then the lock is released by a thread of control
+   unaware of the abort, the lock might have been re-allocated
+ to a different object. The generation numbers detect this problem.
+
+   Note that we assume reads/writes of lock generation numbers are atomic;
+   if they are not, it is theoretically possible that a re-allocated lock
+ could be mistaken for another lock.
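+
+   The guard itself is a simple comparison; a sketch of the check as it
+   appears on the lock_put path in lock.c:
+
+	lockp = R_ADDR(&lt->reginfo, lock->off);
+	if (lock->gen != lockp->gen)
+		return (EINVAL);	/* The lock was reused; fail the put. */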
+
+Q: Why is it safe to walk the Locker list without holding any mutexes at
+ all?
+A: Locks are created with both the Object and Locker bucket mutexes held.
+   Once created, they are removed in two ways:
+
+ a) when a specific Lock is released, in which case, the Object and
+ Locker bucket mutexes are again held, and
+
+   b) when all Locks for a specific Locker ID are released.
+
+ In case b), the Locker bucket mutex is held while the Locker chain is
+ marked as "destroyed", which blocks any further access to the Locker
+ chain. Then, each individual Object bucket mutex is acquired when each
+ individual Lock is removed.
+
+Q: What are the implications of doing fine grain locking?
+
+A: Since we no longer globally lock the entire region, lock_vec will no
+ longer be atomic. We still execute the items in a lock_vec in order,
+ so things like lock-coupling still work, but you can't make any
+ guarantees about atomicity.
+
+Q: How do I configure for FINE_GRAIN locking?
+
+A: We currently do not support any automatic configuration for FINE_GRAIN
+   locking.  When we do, we will need to document the atomicity discussion
+   above (it is bug-report #553).
+
+Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
diff --git a/src/lock/lock.c b/src/lock/lock.c
new file mode 100644
index 00000000..e4627734
--- /dev/null
+++ b/src/lock/lock.c
@@ -0,0 +1,2020 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __lock_allocobj __P((DB_LOCKTAB *, u_int32_t));
+static int __lock_alloclock __P((DB_LOCKTAB *, u_int32_t));
+static int __lock_freelock __P((DB_LOCKTAB *,
+ struct __db_lock *, DB_LOCKER *, u_int32_t));
+static int __lock_getobj
+ __P((DB_LOCKTAB *, const DBT *, u_int32_t, int, DB_LOCKOBJ **));
+static int __lock_get_api __P((ENV *,
+ u_int32_t, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
+static int __lock_inherit_locks __P((DB_LOCKTAB *, DB_LOCKER *, u_int32_t));
+static int __lock_same_family __P((DB_LOCKTAB *, DB_LOCKER *, DB_LOCKER *));
+static int __lock_put_internal __P((DB_LOCKTAB *,
+ struct __db_lock *, u_int32_t, u_int32_t));
+static int __lock_put_nolock __P((ENV *, DB_LOCK *, int *, u_int32_t));
+static int __lock_remove_waiter __P((DB_LOCKTAB *,
+ DB_LOCKOBJ *, struct __db_lock *, db_status_t));
+static int __lock_trade __P((ENV *, DB_LOCK *, DB_LOCKER *));
+static int __lock_vec_api __P((ENV *,
+ u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+
+static const char __db_lock_invalid[] = "%s: Lock is no longer valid";
+static const char __db_locker_invalid[] = "Locker is not valid";
+
+#ifdef DEBUG
+extern void __db_loadme (void);
+#endif
+
+/*
+ * __lock_vec_pp --
+ * ENV->lock_vec pre/post processing.
+ *
+ * PUBLIC: int __lock_vec_pp __P((DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+ */
+int
+__lock_vec_pp(dbenv, lid, flags, list, nlist, elistp)
+ DB_ENV *dbenv;
+ u_int32_t lid, flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_vec", DB_INIT_LOCK);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env,
+ "DB_ENV->lock_vec", flags, DB_LOCK_NOWAIT)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__lock_vec_api(env, lid, flags, list, nlist, elistp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__lock_vec_api(env, lid, flags, list, nlist, elistp)
+ ENV *env;
+ u_int32_t lid, flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ DB_LOCKER *sh_locker;
+ int ret;
+
+ if ((ret =
+ __lock_getlocker(env->lk_handle, lid, 0, &sh_locker)) == 0)
+ ret = __lock_vec(env, sh_locker, flags, list, nlist, elistp);
+ return (ret);
+}
+
+/*
+ * __lock_vec --
+ * ENV->lock_vec.
+ *
+ * Vector lock routine. This function takes a set of operations
+ * and performs them all at once. In addition, lock_vec provides
+ * functionality for lock inheritance, releasing all locks for a
+ * given locker (used during transaction commit/abort), releasing
+ * all locks on a given object, and generating debugging information.
+ *
+ * PUBLIC: int __lock_vec __P((ENV *,
+ * PUBLIC: DB_LOCKER *, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+ */
+int
+__lock_vec(env, sh_locker, flags, list, nlist, elistp)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ struct __db_lock *lp, *next_lock;
+	DB_LOCK lock;
+	DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DBT *objlist, *np;
+ u_int32_t ndx;
+ int did_abort, i, ret, run_dd, upgrade, writes;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ run_dd = 0;
+ LOCK_SYSTEM_LOCK(lt, region);
+ for (i = 0, ret = 0; i < nlist && ret == 0; i++)
+ switch (list[i].op) {
+ case DB_LOCK_GET_TIMEOUT:
+ LF_SET(DB_LOCK_SET_TIMEOUT);
+ /* FALLTHROUGH */
+ case DB_LOCK_GET:
+ if (IS_RECOVERING(env)) {
+ LOCK_INIT(list[i].lock);
+ break;
+ }
+ ret = __lock_get_internal(lt,
+ sh_locker, flags, list[i].obj,
+ list[i].mode, list[i].timeout, &list[i].lock);
+ break;
+ case DB_LOCK_INHERIT:
+ ret = __lock_inherit_locks(lt, sh_locker, flags);
+ break;
+ case DB_LOCK_PUT:
+ ret = __lock_put_nolock(env,
+ &list[i].lock, &run_dd, flags);
+ break;
+ case DB_LOCK_PUT_ALL: /* Put all locks. */
+ case DB_LOCK_PUT_READ: /* Put read locks. */
+ case DB_LOCK_UPGRADE_WRITE:
+ /* Upgrade was_write and put read locks. */
+ /*
+ * Since the locker may hold no
+ * locks (i.e., you could call abort before you've
+ * done any work), it's perfectly reasonable for there
+ * to be no locker; this is not an error.
+ */
+ if (sh_locker == NULL)
+ /*
+ * If ret is set, then we'll generate an
+ * error. If it's not set, we have nothing
+ * to do.
+ */
+ break;
+ upgrade = 0;
+ writes = 1;
+ if (list[i].op == DB_LOCK_PUT_READ)
+ writes = 0;
+ else if (list[i].op == DB_LOCK_UPGRADE_WRITE) {
+ if (F_ISSET(sh_locker, DB_LOCKER_DIRTY))
+ upgrade = 1;
+ writes = 0;
+ }
+ objlist = list[i].obj;
+ if (objlist != NULL) {
+ /*
+ * We know these should be ilocks,
+ * but they could be something else,
+ * so allocate room for the size too.
+ */
+ objlist->size =
+ sh_locker->nwrites * sizeof(DBT);
+ if ((ret = __os_malloc(env,
+ objlist->size, &objlist->data)) != 0)
+ goto up_done;
+ memset(objlist->data, 0, objlist->size);
+ np = (DBT *) objlist->data;
+ } else
+ np = NULL;
+
+ /* Now traverse the locks, releasing each one. */
+ for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
+ lp != NULL; lp = next_lock) {
+ sh_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ next_lock = SH_LIST_NEXT(lp,
+ locker_links, __db_lock);
+ if (writes == 1 ||
+ lp->mode == DB_LOCK_READ ||
+ lp->mode == DB_LOCK_READ_UNCOMMITTED) {
+ SH_LIST_REMOVE(lp,
+ locker_links, __db_lock);
+ sh_obj = SH_OFF_TO_PTR(lp,
+ lp->obj, DB_LOCKOBJ);
+ ndx = sh_obj->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ /*
+ * We are not letting lock_put_internal
+ * unlink the lock, so we'll have to
+ * update counts here.
+ */
+ if (lp->status == DB_LSTAT_HELD) {
+ DB_ASSERT(env,
+ sh_locker->nlocks != 0);
+ sh_locker->nlocks--;
+ if (IS_WRITELOCK(lp->mode))
+ sh_locker->nwrites--;
+ }
+ ret = __lock_put_internal(lt, lp,
+ sh_obj->indx,
+ DB_LOCK_FREE | DB_LOCK_DOALL);
+ OBJECT_UNLOCK(lt, region, ndx);
+ if (ret != 0)
+ break;
+ continue;
+ }
+ if (objlist != NULL) {
+ DB_ASSERT(env, (u_int8_t *)np <
+ (u_int8_t *)objlist->data +
+ objlist->size);
+ np->data = SH_DBT_PTR(&sh_obj->lockobj);
+ np->size = sh_obj->lockobj.size;
+ np++;
+ }
+ }
+ if (ret != 0)
+ goto up_done;
+
+ if (objlist != NULL)
+ if ((ret = __lock_fix_list(env,
+ objlist, sh_locker->nwrites)) != 0)
+ goto up_done;
+ switch (list[i].op) {
+ case DB_LOCK_UPGRADE_WRITE:
+ /*
+ * Upgrade all WWRITE locks to WRITE so
+ * that we can abort a transaction which
+ * was supporting dirty readers.
+ */
+ if (upgrade != 1)
+ goto up_done;
+ SH_LIST_FOREACH(lp, &sh_locker->heldby,
+ locker_links, __db_lock) {
+ if (lp->mode != DB_LOCK_WWRITE)
+ continue;
+ lock.off = R_OFFSET(&lt->reginfo, lp);
+ lock.gen = lp->gen;
+ F_SET(sh_locker, DB_LOCKER_INABORT);
+ if ((ret = __lock_get_internal(lt,
+ sh_locker, flags | DB_LOCK_UPGRADE,
+ NULL, DB_LOCK_WRITE, 0, &lock)) !=0)
+ break;
+ }
+ up_done:
+ /* FALLTHROUGH */
+ case DB_LOCK_PUT_READ:
+ case DB_LOCK_PUT_ALL:
+ break;
+ default:
+ break;
+ }
+ break;
+ case DB_LOCK_PUT_OBJ:
+ /* Remove all the locks associated with an object. */
+ OBJECT_LOCK(lt, region, list[i].obj, ndx);
+ if ((ret = __lock_getobj(lt, list[i].obj,
+ ndx, 0, &sh_obj)) != 0 || sh_obj == NULL) {
+ if (ret == 0)
+ ret = EINVAL;
+ OBJECT_UNLOCK(lt, region, ndx);
+ break;
+ }
+
+ /*
+ * Go through both waiters and holders. Don't bother
+ * to run promotion, because everyone is getting
+ * released. The processes waiting will still get
+ * awakened as their waiters are released.
+ */
+ for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
+ ret == 0 && lp != NULL;
+ lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock))
+ ret = __lock_put_internal(lt, lp, ndx,
+ DB_LOCK_UNLINK |
+ DB_LOCK_NOPROMOTE | DB_LOCK_DOALL);
+
+ /*
+ * On the last time around, the object will get
+			 * reclaimed by __lock_put_internal; structure the
+ * loop carefully so we do not get bitten.
+ */
+ for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
+ ret == 0 && lp != NULL;
+ lp = next_lock) {
+ next_lock = SH_TAILQ_NEXT(lp, links, __db_lock);
+ ret = __lock_put_internal(lt, lp, ndx,
+ DB_LOCK_UNLINK |
+ DB_LOCK_NOPROMOTE | DB_LOCK_DOALL);
+ }
+ OBJECT_UNLOCK(lt, region, ndx);
+ break;
+
+ case DB_LOCK_TIMEOUT:
+ ret = __lock_set_timeout_internal(env,
+ sh_locker, 0, DB_SET_TXN_NOW);
+ break;
+
+ case DB_LOCK_TRADE:
+ /*
+ * INTERNAL USE ONLY.
+ * Change the holder of the lock described in
+ * list[i].lock to the locker-id specified by
+ * the locker parameter.
+ */
+ /*
+ * You had better know what you're doing here.
+ * We are trading locker-id's on a lock to
+ * facilitate file locking on open DB handles.
+ * We do not do any conflict checking on this,
+ * so heaven help you if you use this flag under
+ * any other circumstances.
+ */
+ ret = __lock_trade(env, &list[i].lock, sh_locker);
+ break;
+#if defined(DEBUG) && defined(HAVE_STATISTICS)
+ case DB_LOCK_DUMP:
+ if (sh_locker == NULL)
+ break;
+
+ SH_LIST_FOREACH(
+ lp, &sh_locker->heldby, locker_links, __db_lock)
+ __lock_printlock(lt, NULL, lp, 1);
+ break;
+#endif
+ default:
+ __db_errx(env, DB_STR_A("2035",
+ "Invalid lock operation: %d", "%d"), list[i].op);
+ ret = EINVAL;
+ break;
+ }
+
+ if (ret == 0 && region->detect != DB_LOCK_NORUN &&
+ (region->need_dd || timespecisset(&region->next_timeout)))
+ run_dd = 1;
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ if (run_dd)
+ (void)__lock_detect(env, region->detect, &did_abort);
+
+ if (ret != 0 && elistp != NULL)
+ *elistp = &list[i - 1];
+
+ return (ret);
+}
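+
+/*
+ * Example (a sketch, not library code): a caller might release one specific
+ * lock and then all of the locker's remaining locks in a single vector
+ * call.  The locker id "lid" and the lock "lk" are assumed to have been
+ * obtained earlier:
+ *
+ *	DB_LOCKREQ req[2];
+ *
+ *	memset(req, 0, sizeof(req));
+ *	req[0].op = DB_LOCK_PUT;
+ *	req[0].lock = lk;
+ *	req[1].op = DB_LOCK_PUT_ALL;
+ *	ret = dbenv->lock_vec(dbenv, lid, 0, req, 2, NULL);
+ */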
+
+/*
+ * __lock_get_pp --
+ * ENV->lock_get pre/post processing.
+ *
+ * PUBLIC: int __lock_get_pp __P((DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *));
+ */
+int
+__lock_get_pp(dbenv, locker, flags, obj, lock_mode, lock)
+ DB_ENV *dbenv;
+ u_int32_t locker, flags;
+ DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_get", DB_INIT_LOCK);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->lock_get", flags,
+ DB_LOCK_NOWAIT | DB_LOCK_UPGRADE | DB_LOCK_SWITCH)) != 0)
+ return (ret);
+
+ if ((ret = __dbt_usercopy(env, obj)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__lock_get_api(env, locker, flags, obj, lock_mode, lock)),
+ 0, ret);
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, obj, NULL, NULL);
+ return (ret);
+}
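+
+/*
+ * Example (a sketch): acquiring and releasing a write lock through the
+ * public API.  Here "lid" is a locker id previously returned by
+ * DB_ENV->lock_id(), and "MyObject" names the thing being locked:
+ *
+ *	DBT obj;
+ *	DB_LOCK lock;
+ *
+ *	memset(&obj, 0, sizeof(obj));
+ *	obj.data = "MyObject";
+ *	obj.size = (u_int32_t)strlen("MyObject");
+ *	ret = dbenv->lock_get(dbenv, lid, 0, &obj, DB_LOCK_WRITE, &lock);
+ *	...
+ *	ret = dbenv->lock_put(dbenv, &lock);
+ */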
+
+static int
+__lock_get_api(env, locker, flags, obj, lock_mode, lock)
+ ENV *env;
+ u_int32_t locker, flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ DB_LOCKER *sh_locker;
+ DB_LOCKREGION *region;
+ int ret;
+
+ COMPQUIET(region, NULL);
+
+ region = env->lk_handle->reginfo.primary;
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_getlocker_int(env->lk_handle, locker, 0, &sh_locker);
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_LOCK(env->lk_handle, region);
+ if (ret == 0)
+ ret = __lock_get_internal(env->lk_handle,
+ sh_locker, flags, obj, lock_mode, 0, lock);
+ LOCK_SYSTEM_UNLOCK(env->lk_handle, region);
+ return (ret);
+}
+
+/*
+ * __lock_get --
+ * ENV->lock_get.
+ *
+ * PUBLIC: int __lock_get __P((ENV *,
+ * PUBLIC: DB_LOCKER *, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
+ */
+int
+__lock_get(env, locker, flags, obj, lock_mode, lock)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ DB_LOCKTAB *lt;
+ int ret;
+
+ lt = env->lk_handle;
+
+ if (IS_RECOVERING(env) && !LF_ISSET(DB_LOCK_IGNORE_REC)) {
+ LOCK_INIT(*lock);
+ return (0);
+ }
+
+ LOCK_SYSTEM_LOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+ ret = __lock_get_internal(lt, locker, flags, obj, lock_mode, 0, lock);
+ LOCK_SYSTEM_UNLOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+ return (ret);
+}
+/*
+ * __lock_alloclock -- allocate a lock from another partition.
+ * We assume we have the partition locked on entry and leave
+ * it unlocked on success since we will have to retry the lock operation.
+ * The mutex will still be locked if we are out of space.
+ */
+static int
+__lock_alloclock(lt, part_id)
+ DB_LOCKTAB *lt;
+ u_int32_t part_id;
+{
+#define FREE_LIST_HEAD free_locks
+#define STRUCT_NAME __db_lock
+#define CURRENT_COUNT st_locks
+#define MAX_COUNT st_maxlocks
+#define STEAL_NAME st_locksteals
+#define STEAL_EVENT steal
+
+#ifdef DEBUG
+ __db_loadme();
+#endif
+
+#include "lock_alloc.incl"
+}
+
+/*
+ * __lock_get_internal --
+ * All the work for lock_get (and for the GET option of lock_vec) is done
+ * inside of lock_get_internal.
+ *
+ * PUBLIC: int __lock_get_internal __P((DB_LOCKTAB *, DB_LOCKER *, u_int32_t,
+ * PUBLIC: const DBT *, db_lockmode_t, db_timeout_t, DB_LOCK *));
+ */
+int
+__lock_get_internal(lt, sh_locker, flags, obj, lock_mode, timeout, lock)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ db_timeout_t timeout;
+ DB_LOCK *lock;
+{
+ struct __db_lock *newl, *lp;
+ ENV *env;
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_THREAD_INFO *ip;
+ u_int32_t ndx, part_id;
+ int did_abort, ihold, grant_dirty, no_dd, ret, t_ret;
+ roff_t holder, sh_off;
+
+ /*
+ * We decide what action to take based on what locks are already held
+ * and what locks are in the wait queue.
+ */
+ enum {
+ GRANT, /* Grant the lock. */
+ UPGRADE, /* Upgrade the lock. */
+ HEAD, /* Wait at head of wait queue. */
+ SECOND, /* Wait as the second waiter. */
+ TAIL /* Wait at tail of the wait queue. */
+ } action;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ if (sh_locker == NULL) {
+ __db_errx(env, DB_STR("2036", "Locker does not exist"));
+ return (EINVAL);
+ }
+
+ DB_ASSERT(env, lock_mode == DB_LOCK_WAIT || !LF_ISSET(DB_LOCK_SWITCH));
+
+ no_dd = ret = 0;
+ newl = NULL;
+ sh_obj = NULL;
+
+ /* Check that the lock mode is valid. */
+ if (lock_mode >= (db_lockmode_t)region->nmodes) {
+ __db_errx(env, DB_STR_A("2037",
+ "DB_ENV->lock_get: invalid lock mode %lu", "%lu"),
+ (u_long)lock_mode);
+ return (EINVAL);
+ }
+
+again: if (obj == NULL) {
+ DB_ASSERT(env, LOCK_ISSET(*lock));
+ lp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lp->gen);
+ sh_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ ndx = sh_obj->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ } else {
+		/* Allocate a new shared-memory object. */
+ OBJECT_LOCK(lt, region, obj, lock->ndx);
+ ndx = lock->ndx;
+ if ((ret = __lock_getobj(lt,
+ obj, lock->ndx, !LF_ISSET(DB_LOCK_CHECK), &sh_obj)) != 0)
+ goto err;
+#ifdef DIAGNOSTIC
+ if (sh_obj == NULL) {
+ ret = ENOENT;
+ goto err;
+ }
+ if (LF_ISSET(DB_LOCK_UPGRADE)) {
+ DB_ASSERT(env, LOCK_ISSET(*lock));
+ lp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lp->gen);
+ DB_ASSERT(env,
+ SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ) == sh_obj);
+ }
+#endif
+ }
+
+#ifdef HAVE_STATISTICS
+ if (LF_ISSET(DB_LOCK_UPGRADE))
+ STAT_INC_VERB(env, lock, upgrade,
+ lt->obj_stat[ndx].st_nupgrade,
+ (DBT *) obj, sh_locker->id);
+ else if (!LF_ISSET(DB_LOCK_SWITCH | DB_LOCK_CHECK))
+ STAT_INC_VERB(env, lock, request,
+ lt->obj_stat[ndx].st_nrequests,
+ (DBT *) obj, sh_locker->id);
+#endif
+
+ /*
+ * Figure out if we can grant this lock or if it should wait.
+ * By default, we can grant the new lock if it does not conflict with
+ * anyone on the holders list OR anyone on the waiters list.
+ * The reason that we don't grant if there's a conflict is that
+ * this can lead to starvation (a writer waiting on a popularly
+ * read item will never be granted). The downside of this is that
+ * a waiting reader can prevent an upgrade from reader to writer,
+ * which is not uncommon.
+ *
+ * There are two exceptions to the no-conflict rule. First, if
+ * a lock is held by the requesting locker AND the new lock does
+ * not conflict with any other holders, then we grant the lock.
+ * The most common place this happens is when the holder has a
+ * WRITE lock and a READ lock request comes in for the same locker.
+ * If we do not grant the read lock, then we guarantee deadlock.
+ * Second, dirty readers are granted if at all possible while
+ * avoiding starvation, see below.
+ *
+ * In case of conflict, we put the new lock on the end of the waiters
+ * list, unless we are upgrading or this is a dirty reader in which
+ * case the locker goes at or near the front of the list.
+ */
+ ihold = 0;
+ grant_dirty = 0;
+ holder = 0;
+
+ /*
+	 * DB_LOCK_WAIT is a special case used by the queue
+ * access method when we want to get an entry which is past
+ * the end of the queue. With CDB we have a DB_READ_LOCK and
+ * need to switch it to DB_LOCK_WAIT. Otherwise we insert a
+	 * DB_LOCK_WAIT and then, after releasing the metadata
+	 * page, wait on it and join the waiters queue.  This must be
+ * done as a single operation so that another locker cannot
+ * get in and fail to wake us up.
+ */
+ if (lock_mode == DB_LOCK_WAIT)
+ lp = NULL;
+ else
+ lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
+
+ sh_off = R_OFFSET(&lt->reginfo, sh_locker);
+ for (; lp != NULL; lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
+ if (sh_off == lp->holder) {
+ if (lp->mode == lock_mode &&
+ lp->status == DB_LSTAT_HELD) {
+ if (LF_ISSET(DB_LOCK_UPGRADE))
+ goto upgrade;
+
+#ifdef DIAGNOSTIC
+ if (LF_ISSET(DB_LOCK_CHECK))
+ goto done;
+#endif
+
+ /*
+ * Lock is held, so we can increment the
+ * reference count and return this lock
+ * to the caller. We do not count reference
+ * increments towards the locks held by
+ * the locker.
+ */
+ lp->refcount++;
+ lock->off = R_OFFSET(&lt->reginfo, lp);
+ lock->gen = lp->gen;
+ lock->mode = lp->mode;
+ goto done;
+ } else {
+ ihold = 1;
+ }
+ } else if (__lock_same_family(lt,
+ R_ADDR(&lt->reginfo, lp->holder), sh_locker))
+ ihold = 1;
+ else if (CONFLICTS(lt, region, lp->mode, lock_mode))
+ break;
+ else if (lp->mode == DB_LOCK_READ ||
+ lp->mode == DB_LOCK_WWRITE) {
+ grant_dirty = 1;
+ holder = lp->holder;
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ if (LF_ISSET(DB_LOCK_CHECK)) {
+ ret = ENOENT;
+ goto err;
+ }
+#endif
+
+ /*
+ * If there are conflicting holders we will have to wait. If we
+ * already hold a lock on this object or are doing an upgrade or
+ * this is a dirty reader it goes to the head of the queue, everyone
+ * else to the back.
+ */
+ if (lp != NULL) {
+ if (ihold || LF_ISSET(DB_LOCK_UPGRADE) ||
+ lock_mode == DB_LOCK_READ_UNCOMMITTED)
+ action = HEAD;
+ else
+ action = TAIL;
+ } else {
+ if (LF_ISSET(DB_LOCK_UPGRADE))
+ action = UPGRADE;
+ else if (lock_mode == DB_LOCK_WAIT)
+ action = TAIL;
+ else if (ihold)
+ action = GRANT;
+ else {
+ /*
+ * Look for conflicting waiters.
+ */
+ SH_TAILQ_FOREACH(lp, &sh_obj->waiters, links, __db_lock)
+ if (lp->holder != sh_off &&
+ CONFLICTS(lt, region, lp->mode, lock_mode))
+ break;
+
+ /*
+ * If there are no conflicting holders or waiters,
+ * then we grant. Normally when we wait, we
+ * wait at the end (TAIL). However, the goal of
+			 * DIRTY_READ locks is to allow forward progress in the
+ * face of updating transactions, so we try to allow
+ * all DIRTY_READ requests to proceed as rapidly
+ * as possible, so long as we can prevent starvation.
+ *
+ * When determining how to queue a DIRTY_READ
+ * request:
+ *
+ * 1. If there is a waiting upgrading writer,
+ * then we enqueue the dirty reader BEHIND it
+ * (second in the queue).
+ * 2. Else, if the current holders are either
+			 *	    READ or WWRITE, we grant.
+			 *	 3. Else, queue SECOND, i.e., behind the first
+ * waiter.
+ *
+ * The end result is that dirty_readers get to run
+ * so long as other lockers are blocked. Once
+ * there is a locker which is only waiting on
+ * dirty readers then they queue up behind that
+ * locker so that it gets to run. In general
+ * this locker will be a WRITE which will shortly
+ * get downgraded to a WWRITE, permitting the
+ * DIRTY locks to be granted.
+ */
+ if (lp == NULL)
+ action = GRANT;
+ else if (grant_dirty &&
+ lock_mode == DB_LOCK_READ_UNCOMMITTED) {
+ /*
+ * An upgrade will be at the head of the
+ * queue.
+ */
+ lp = SH_TAILQ_FIRST(
+ &sh_obj->waiters, __db_lock);
+ if (lp->mode == DB_LOCK_WRITE &&
+ lp->holder == holder)
+ action = SECOND;
+ else
+ action = GRANT;
+ } else if (lock_mode == DB_LOCK_READ_UNCOMMITTED)
+ action = SECOND;
+ else
+ action = TAIL;
+ }
+ }
+
+ switch (action) {
+ case HEAD:
+ case TAIL:
+ case SECOND:
+ if (LF_ISSET(DB_LOCK_NOWAIT) && lock_mode != DB_LOCK_WAIT) {
+ ret = DB_LOCK_NOTGRANTED;
+ STAT_INC_VERB(env, lock, nowait_notgranted,
+ region->stat.st_lock_nowait,
+ (DBT *) obj, sh_locker->id);
+ goto err;
+ }
+ /* FALLTHROUGH */
+ case GRANT:
+ part_id = LOCK_PART(region, ndx);
+ /* Allocate a new lock. */
+ if ((newl = SH_TAILQ_FIRST(
+ &FREE_LOCKS(lt, part_id), __db_lock)) == NULL) {
+ if ((ret = __lock_alloclock(lt, part_id)) != 0)
+ goto err;
+ /* Allocation dropped the mutex, start over. */
+ OBJECT_UNLOCK(lt, region, ndx);
+ sh_obj = NULL;
+ goto again;
+ }
+ SH_TAILQ_REMOVE(
+ &FREE_LOCKS(lt, part_id), newl, links, __db_lock);
+
+#ifdef HAVE_STATISTICS
+ /*
+ * Keep track of the maximum number of locks allocated
+ * in each partition and the maximum number of locks
+ * used by any one bucket.
+ */
+ if (++lt->obj_stat[ndx].st_nlocks >
+ lt->obj_stat[ndx].st_maxnlocks)
+ lt->obj_stat[ndx].st_maxnlocks =
+ lt->obj_stat[ndx].st_nlocks;
+ if (++lt->part_array[part_id].part_stat.st_nlocks >
+ lt->part_array[part_id].part_stat.st_maxnlocks)
+ lt->part_array[part_id].part_stat.st_maxnlocks =
+ lt->part_array[part_id].part_stat.st_nlocks;
+#endif
+
+ newl->holder = R_OFFSET(&lt->reginfo, sh_locker);
+ newl->refcount = 1;
+ newl->mode = lock_mode;
+ newl->obj = (roff_t)SH_PTR_TO_OFF(newl, sh_obj);
+ newl->indx = sh_obj->indx;
+ newl->mtx_lock = MUTEX_INVALID;
+ /*
+ * Now, insert the lock onto its locker's list.
+ * If the locker does not currently hold any locks,
+ * there's no reason to run a deadlock
+ * detector, save that information.
+ */
+ no_dd = sh_locker->master_locker == INVALID_ROFF &&
+ SH_LIST_FIRST(
+ &sh_locker->child_locker, __db_locker) == NULL &&
+ SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL;
+
+ SH_LIST_INSERT_HEAD(
+ &sh_locker->heldby, newl, locker_links, __db_lock);
+
+ break;
+
+ case UPGRADE:
+upgrade: lp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lp->gen);
+ if (IS_WRITELOCK(lock_mode) && !IS_WRITELOCK(lp->mode))
+ sh_locker->nwrites++;
+ lp->mode = lock_mode;
+ /* If we are upgrading to a WAIT we must wait. */
+ if (lock_mode != DB_LOCK_WAIT)
+ goto done;
+ if (lp->status != DB_LSTAT_WAITING) {
+ /* We have already been granted. */
+ MUTEX_LOCK(env, lp->mtx_lock);
+ newl = lp;
+ if (lp->status == DB_LSTAT_EXPIRED)
+ goto expired;
+ DB_ASSERT(env, lp->status == DB_LSTAT_PENDING);
+ SH_TAILQ_REMOVE(
+ &sh_obj->holders, newl, links, __db_lock);
+ newl->links.stqe_prev = -1;
+ goto done;
+ }
+ COMPQUIET(action, UPGRADE);
+ }
+
+ switch (action) {
+ case GRANT:
+ newl->status = DB_LSTAT_HELD;
+ SH_TAILQ_INSERT_TAIL(&sh_obj->holders, newl, links);
+ break;
+ case UPGRADE:
+ DB_ASSERT(env, lock_mode == DB_LOCK_WAIT);
+ /* FALLTHROUGH */
+ case HEAD:
+ case TAIL:
+ case SECOND:
+ if ((lp =
+ SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock)) == NULL) {
+ LOCK_DD(env, region);
+ SH_TAILQ_INSERT_HEAD(&region->dd_objs,
+ sh_obj, dd_links, __db_lockobj);
+ UNLOCK_DD(env, region);
+ }
+ switch (action) {
+ case HEAD:
+ SH_TAILQ_INSERT_HEAD(
+ &sh_obj->waiters, newl, links, __db_lock);
+ break;
+ case SECOND:
+ SH_TAILQ_INSERT_AFTER(
+ &sh_obj->waiters, lp, newl, links, __db_lock);
+ break;
+ case TAIL:
+ SH_TAILQ_INSERT_TAIL(&sh_obj->waiters, newl, links);
+ break;
+ case UPGRADE:
+ /* The lock is already in the queue. */
+ newl = R_ADDR(&lt->reginfo, lock->off);
+ break;
+ default:
+ DB_ASSERT(env, 0);
+ }
+
+ /*
+ * First check to see if this txn has expired.
+ * If not then see if the lock timeout is past
+ * the expiration of the txn, if it is, use
+ * the txn expiration time. lk_expire is passed
+ * to avoid an extra call to get the time.
+ */
+ timespecclear(&sh_locker->lk_expire);
+ if (__clock_expired(env,
+ &sh_locker->lk_expire, &sh_locker->tx_expire)) {
+ newl->status = DB_LSTAT_EXPIRED;
+ sh_locker->lk_expire = sh_locker->tx_expire;
+
+ /* We are done. */
+ goto expired;
+ }
+
+ /*
+ * If a timeout was specified in this call then it
+ * takes priority. If a lock timeout has been specified
+ * for this transaction then use that, otherwise use
+ * the global timeout value.
+ */
+ if (!LF_ISSET(DB_LOCK_SET_TIMEOUT)) {
+ if (F_ISSET(sh_locker, DB_LOCKER_TIMEOUT))
+ timeout = sh_locker->lk_timeout;
+ else
+ timeout = region->lk_timeout;
+ }
+
+ /*
+		 * For the queue access method we insert the WAIT lock and
+		 * don't wait on it.  That way we can unpin the metadata
+		 * page first and then block.
+ */
+ if (lock_mode == DB_LOCK_WAIT && LF_ISSET(DB_LOCK_NOWAIT)) {
+ newl->mtx_lock = sh_locker->mtx_locker;
+ newl->status = DB_LSTAT_WAITING;
+ goto out;
+ }
+
+ if (timeout != 0)
+ __clock_set_expires(env,
+ &sh_locker->lk_expire, timeout);
+ else
+ timespecclear(&sh_locker->lk_expire);
+
+ if (timespecisset(&sh_locker->tx_expire) &&
+ (timeout == 0 || __clock_expired(env,
+ &sh_locker->lk_expire, &sh_locker->tx_expire)))
+ sh_locker->lk_expire = sh_locker->tx_expire;
+ if (timespecisset(&sh_locker->lk_expire) &&
+ (!timespecisset(&region->next_timeout) ||
+ timespeccmp(
+ &region->next_timeout, &sh_locker->lk_expire, >)))
+ region->next_timeout = sh_locker->lk_expire;
+
+in_abort: newl->status = DB_LSTAT_WAITING;
+ newl->mtx_lock = sh_locker->mtx_locker;
+ STAT(lt->obj_stat[ndx].st_lock_wait++);
+ /* We are about to block, deadlock detector must run. */
+ region->need_dd = 1;
+
+ OBJECT_UNLOCK(lt, region, sh_obj->indx);
+
+ /* If we are switching drop the lock we had. */
+ if (LF_ISSET(DB_LOCK_SWITCH) &&
+ (ret = __lock_put_nolock(env, lock, &ihold, 0)) != 0) {
+ OBJECT_LOCK_NDX(lt, region, sh_obj->indx);
+ (void)__lock_remove_waiter(
+ lt, sh_obj, newl, DB_LSTAT_FREE);
+ goto err;
+ }
+
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ /*
+ * Before waiting, see if the deadlock detector should run.
+ */
+ if (region->detect != DB_LOCK_NORUN && !no_dd)
+ (void)__lock_detect(env, region->detect, &did_abort);
+
+ ip = NULL;
+ if (env->thr_hashtab != NULL &&
+ (ret = __env_set_state(env, &ip, THREAD_BLOCKED)) != 0) {
+ LOCK_SYSTEM_LOCK(lt, region);
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ goto err;
+ }
+
+ PERFMON2(env, lock, suspend, (DBT *) obj, lock_mode);
+ MUTEX_LOCK(env, newl->mtx_lock);
+ PERFMON2(env, lock, resume, (DBT *) obj, lock_mode);
+
+ if (ip != NULL)
+ ip->dbth_state = THREAD_ACTIVE;
+
+ LOCK_SYSTEM_LOCK(lt, region);
+ OBJECT_LOCK_NDX(lt, region, ndx);
+
+ /* Turn off lock timeout. */
+ if (newl->status != DB_LSTAT_EXPIRED)
+ timespecclear(&sh_locker->lk_expire);
+
+ switch (newl->status) {
+ case DB_LSTAT_ABORTED:
+ /*
+ * If we raced with the deadlock detector and it
+			 * mistakenly picked this transaction to abort again,
+ * ignore the abort and request the lock again.
+ */
+ if (F_ISSET(sh_locker, DB_LOCKER_INABORT))
+ goto in_abort;
+ ret = DB_LOCK_DEADLOCK;
+ goto err;
+ case DB_LSTAT_EXPIRED:
+expired: ret = __lock_put_internal(lt, newl,
+ ndx, DB_LOCK_UNLINK | DB_LOCK_FREE);
+ newl = NULL;
+ if (ret != 0)
+ goto err;
+#ifdef HAVE_STATISTICS
+ if (timespeccmp(
+ &sh_locker->lk_expire, &sh_locker->tx_expire, ==))
+ STAT_INC(env, lock, txntimeout,
+ lt->obj_stat[ndx].st_ntxntimeouts,
+ (DBT *) obj);
+ else
+ STAT_INC(env, lock, locktimeout,
+ lt->obj_stat[ndx].st_nlocktimeouts,
+ (DBT *) obj);
+#endif
+ ret = DB_LOCK_NOTGRANTED;
+ timespecclear(&sh_locker->lk_expire);
+ goto err;
+ case DB_LSTAT_PENDING:
+ if (LF_ISSET(DB_LOCK_UPGRADE)) {
+ /*
+ * The lock just granted got put on the holders
+ * list. Since we're upgrading some other lock,
+ * we've got to remove it here.
+ */
+ SH_TAILQ_REMOVE(
+ &sh_obj->holders, newl, links, __db_lock);
+ /*
+ * Ensure the object is not believed to be on
+ * the object's lists, if we're traversing by
+ * locker.
+ */
+ newl->links.stqe_prev = -1;
+ if (newl->mode == DB_LOCK_WAIT)
+ goto done;
+ goto upgrade;
+ } else
+ newl->status = DB_LSTAT_HELD;
+ break;
+ case DB_LSTAT_FREE:
+ case DB_LSTAT_HELD:
+ case DB_LSTAT_WAITING:
+ default:
+ __db_errx(env, DB_STR_A("2038",
+ "Unexpected lock status: %d", "%d"),
+ (int)newl->status);
+ ret = __env_panic(env, EINVAL);
+ goto err;
+ }
+ }
+
+out: lock->off = R_OFFSET(&lt->reginfo, newl);
+ lock->gen = newl->gen;
+ lock->mode = newl->mode;
+ sh_locker->nlocks++;
+ if (IS_WRITELOCK(newl->mode)) {
+ sh_locker->nwrites++;
+ if (newl->mode == DB_LOCK_WWRITE)
+ F_SET(sh_locker, DB_LOCKER_DIRTY);
+ }
+
+ OBJECT_UNLOCK(lt, region, ndx);
+ return (0);
+
+err: if (!LF_ISSET(DB_LOCK_UPGRADE | DB_LOCK_SWITCH))
+ LOCK_INIT(*lock);
+
+done: if (newl != NULL &&
+ (t_ret = __lock_freelock(lt, newl, sh_locker,
+ DB_LOCK_FREE | DB_LOCK_UNLINK)) != 0 && ret == 0)
+ ret = t_ret;
+ OBJECT_UNLOCK(lt, region, ndx);
+
+ return (ret);
+}
+
+/*
+ * __lock_put_pp --
+ * ENV->lock_put pre/post processing.
+ *
+ * PUBLIC: int __lock_put_pp __P((DB_ENV *, DB_LOCK *));
+ */
+int
+__lock_put_pp(dbenv, lock)
+ DB_ENV *dbenv;
+ DB_LOCK *lock;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_LOCK->lock_put", DB_INIT_LOCK);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_put(env, lock)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_put --
+ *
+ * PUBLIC: int __lock_put __P((ENV *, DB_LOCK *));
+ * Internal lock_put interface.
+ */
+int
+__lock_put(env, lock)
+ ENV *env;
+ DB_LOCK *lock;
+{
+ DB_LOCKTAB *lt;
+ int ret, run_dd;
+
+ if (IS_RECOVERING(env))
+ return (0);
+
+ lt = env->lk_handle;
+
+ LOCK_SYSTEM_LOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+ ret = __lock_put_nolock(env, lock, &run_dd, 0);
+ LOCK_SYSTEM_UNLOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+
+ /*
+ * Only run the lock detector if put told us to AND we are running
+ * in auto-detect mode. If we are not running in auto-detect, then
+ * a call to lock_detect here will 0 the need_dd bit, but will not
+ * actually abort anything.
+ */
+ if (ret == 0 && run_dd)
+ (void)__lock_detect(env,
+ ((DB_LOCKREGION *)lt->reginfo.primary)->detect, NULL);
+ return (ret);
+}
+
+static int
+__lock_put_nolock(env, lock, runp, flags)
+ ENV *env;
+ DB_LOCK *lock;
+ int *runp;
+ u_int32_t flags;
+{
+ struct __db_lock *lockp;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ lockp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lockp->gen);
+ if (lock->gen != lockp->gen) {
+ __db_errx(env, __db_lock_invalid, "DB_LOCK->lock_put");
+ LOCK_INIT(*lock);
+ return (EINVAL);
+ }
+
+ OBJECT_LOCK_NDX(lt, region, lock->ndx);
+ ret = __lock_put_internal(lt,
+ lockp, lock->ndx, flags | DB_LOCK_UNLINK | DB_LOCK_FREE);
+ OBJECT_UNLOCK(lt, region, lock->ndx);
+
+ LOCK_INIT(*lock);
+
+ *runp = 0;
+ if (ret == 0 && region->detect != DB_LOCK_NORUN &&
+ (region->need_dd || timespecisset(&region->next_timeout)))
+ *runp = 1;
+
+ return (ret);
+}
+
+/*
+ * __lock_downgrade --
+ *
+ * Used to downgrade locks.  Currently this is used in three places: 1) by
+ * the Concurrent Data Store product to downgrade write locks back to iwrite
+ * locks, 2) to downgrade write-handle locks to read-handle locks at the end
+ * of an open/create, and 3) to downgrade write locks to was_write to support
+ * dirty reads.
+ *
+ * PUBLIC: int __lock_downgrade __P((ENV *,
+ * PUBLIC: DB_LOCK *, db_lockmode_t, u_int32_t));
+ */
+int
+__lock_downgrade(env, lock, new_mode, flags)
+ ENV *env;
+ DB_LOCK *lock;
+ db_lockmode_t new_mode;
+ u_int32_t flags;
+{
+ struct __db_lock *lockp;
+ DB_LOCKER *sh_locker;
+ DB_LOCKOBJ *obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ ret = 0;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ LOCK_SYSTEM_LOCK(lt, region);
+
+ lockp = R_ADDR(&lt->reginfo, lock->off);
+ if (lock->gen != lockp->gen) {
+ __db_errx(env, __db_lock_invalid, "lock_downgrade");
+ ret = EINVAL;
+ goto out;
+ }
+
+ sh_locker = R_ADDR(&lt->reginfo, lockp->holder);
+
+ if (IS_WRITELOCK(lockp->mode) && !IS_WRITELOCK(new_mode))
+ sh_locker->nwrites--;
+
+ lockp->mode = new_mode;
+ lock->mode = new_mode;
+
+ /* Get the object associated with this lock. */
+ obj = SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ);
+ OBJECT_LOCK_NDX(lt, region, obj->indx);
+ STAT(lt->obj_stat[obj->indx].st_ndowngrade++);
+ ret = __lock_promote(lt, obj, NULL, flags);
+ OBJECT_UNLOCK(lt, region, obj->indx);
+
+out: LOCK_SYSTEM_UNLOCK(lt, region);
+ return (ret);
+}
+
+/*
+ * __lock_put_internal -- put a lock structure
+ * We assume that we are called with the proper object locked.
+ */
+static int
+__lock_put_internal(lt, lockp, obj_ndx, flags)
+ DB_LOCKTAB *lt;
+ struct __db_lock *lockp;
+ u_int32_t obj_ndx, flags;
+{
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ ENV *env;
+ u_int32_t part_id;
+ int ret, state_changed;
+
+ COMPQUIET(env, NULL);
+ env = lt->env;
+ region = lt->reginfo.primary;
+ ret = state_changed = 0;
+
+ if (!OBJ_LINKS_VALID(lockp)) {
+ /*
+ * Someone removed this lock while we were doing a release
+ * by locker id. We are trying to free this lock, but it's
+ * already been done; all we need to do is return it to the
+ * free list.
+ */
+ (void)__lock_freelock(lt, lockp, NULL, DB_LOCK_FREE);
+ return (0);
+ }
+
+#ifdef HAVE_STATISTICS
+ if (LF_ISSET(DB_LOCK_DOALL))
+ lt->obj_stat[obj_ndx].st_nreleases += lockp->refcount;
+ else
+ lt->obj_stat[obj_ndx].st_nreleases++;
+#endif
+
+ if (!LF_ISSET(DB_LOCK_DOALL) && lockp->refcount > 1) {
+ lockp->refcount--;
+ PERFMON2(env, lock, put_reduce_count,
+ &(SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ))->lockobj,
+ flags);
+ return (0);
+ }
+
+ /* Increment generation number. */
+ lockp->gen++;
+
+ /* Get the object associated with this lock. */
+ sh_obj = SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ);
+
+ PERFMON2(env, lock, put, &sh_obj->lockobj, flags);
+ /*
+ * Remove this lock from its holders/waitlist. Set its status
+ * to ABORTED. It may get freed below, but if not then the
+ * waiter has been aborted (it will panic if the lock is
+ * free).
+ */
+ if (lockp->status != DB_LSTAT_HELD &&
+ lockp->status != DB_LSTAT_PENDING) {
+ DB_ASSERT(env, lockp !=
+ SH_TAILQ_FIRST(&sh_obj->holders, __db_lock));
+ if ((ret = __lock_remove_waiter(
+ lt, sh_obj, lockp, DB_LSTAT_ABORTED)) != 0)
+ return (ret);
+ } else {
+ DB_ASSERT(env, lockp !=
+ SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock));
+ SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock);
+ lockp->links.stqe_prev = -1;
+ }
+
+ if (LF_ISSET(DB_LOCK_NOPROMOTE))
+ state_changed = 0;
+ else if ((ret = __lock_promote(lt,
+ sh_obj, &state_changed, flags)) != 0)
+ return (ret);
+
+ /* Check if object should be reclaimed. */
+ if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL &&
+ SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
+ part_id = LOCK_PART(region, obj_ndx);
+ SH_TAILQ_REMOVE(
+ &lt->obj_tab[obj_ndx], sh_obj, links, __db_lockobj);
+ if (sh_obj->lockobj.size > sizeof(sh_obj->objdata)) {
+ if (region->part_t_size != 1)
+ LOCK_REGION_LOCK(env);
+ __env_alloc_free(&lt->reginfo,
+ SH_DBT_PTR(&sh_obj->lockobj));
+ if (region->part_t_size != 1)
+ LOCK_REGION_UNLOCK(env);
+ }
+ SH_TAILQ_INSERT_HEAD(
+ &FREE_OBJS(lt, part_id), sh_obj, links, __db_lockobj);
+ sh_obj->generation++;
+ STAT(lt->part_array[part_id].part_stat.st_nobjects--);
+ STAT(lt->obj_stat[obj_ndx].st_nobjects--);
+ state_changed = 1;
+ }
+
+ /* Free lock. */
+ if (LF_ISSET(DB_LOCK_UNLINK | DB_LOCK_FREE))
+ ret = __lock_freelock(lt, lockp,
+ R_ADDR(&lt->reginfo, lockp->holder), flags);
+
+ /*
+	 * If we did not promote anyone, we need to run the deadlock
+ * detector again.
+ */
+ if (state_changed == 0)
+ region->need_dd = 1;
+
+ return (ret);
+}
+
+/*
+ * __lock_freelock --
+ * Free a lock. Unlink it from its locker if necessary.
+ * We must hold the object lock.
+ *
+ */
+static int
+__lock_freelock(lt, lockp, sh_locker, flags)
+ DB_LOCKTAB *lt;
+ struct __db_lock *lockp;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ u_int32_t part_id;
+ int ret;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ if (LF_ISSET(DB_LOCK_UNLINK)) {
+ SH_LIST_REMOVE(lockp, locker_links, __db_lock);
+ if (lockp->status == DB_LSTAT_HELD) {
+ sh_locker->nlocks--;
+ if (IS_WRITELOCK(lockp->mode))
+ sh_locker->nwrites--;
+ }
+ }
+
+ if (LF_ISSET(DB_LOCK_FREE)) {
+ /*
+ * If the lock is not held we cannot be sure of its mutex
+ * state so we refresh it.
+ */
+ part_id = LOCK_PART(region, lockp->indx);
+ if (lockp->mtx_lock != MUTEX_INVALID &&
+ lockp->status != DB_LSTAT_HELD &&
+ lockp->status != DB_LSTAT_EXPIRED) {
+ if ((ret = __mutex_refresh(env, lockp->mtx_lock)) != 0)
+ return (ret);
+ MUTEX_LOCK(env, lockp->mtx_lock);
+ }
+
+ lockp->status = DB_LSTAT_FREE;
+ SH_TAILQ_INSERT_HEAD(&FREE_LOCKS(lt, part_id),
+ lockp, links, __db_lock);
+ STAT(lt->part_array[part_id].part_stat.st_nlocks--);
+ STAT(lt->obj_stat[lockp->indx].st_nlocks--);
+ }
+
+ return (0);
+}
+
+#undef FREE_LIST_HEAD
+#undef STRUCT_NAME
+#undef CURRENT_COUNT
+#undef MAX_COUNT
+#undef STEAL_NAME
+#undef STEAL_EVENT
+/*
+ * __lock_allocobj -- allocate an object from another partition.
+ * We assume we have the partition locked on entry and leave
+ * with the same partition locked on exit.
+ */
+static int
+__lock_allocobj(lt, part_id)
+ DB_LOCKTAB *lt;
+ u_int32_t part_id;
+{
+#define FREE_LIST_HEAD free_objs
+#define STRUCT_NAME __db_lockobj
+#define CURRENT_COUNT st_objects
+#define MAX_COUNT st_maxobjects
+#define STEAL_NAME st_objectsteals
+#define STEAL_EVENT object_steal
+
+#ifdef DEBUG
+ __db_loadme();
+#endif
+
+#include "lock_alloc.incl"
+
+}
+
+/*
+ * __lock_getobj --
+ * Get an object in the object hash table. The create parameter
+ * indicates if the object should be created if it doesn't exist in
+ * the table.
+ *
+ * This must be called with the object bucket locked.
+ */
+static int
+__lock_getobj(lt, obj, ndx, create, retp)
+ DB_LOCKTAB *lt;
+ const DBT *obj;
+ u_int32_t ndx;
+ int create;
+ DB_LOCKOBJ **retp;
+{
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+ void *p;
+ u_int32_t len, part_id;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+ len = 0;
+
+ /* Look up the object in the hash table. */
+retry: SH_TAILQ_FOREACH(sh_obj, &lt->obj_tab[ndx], links, __db_lockobj) {
+ len++;
+ if (obj->size == sh_obj->lockobj.size &&
+ memcmp(obj->data,
+ SH_DBT_PTR(&sh_obj->lockobj), obj->size) == 0)
+ break;
+ }
+
+ /*
+ * If we found the object, then we can just return it. If
+ * we didn't find the object, then we need to create it.
+ */
+ if (sh_obj == NULL && create) {
+ /* Create new object and then insert it into hash table. */
+ part_id = LOCK_PART(region, ndx);
+ if ((sh_obj = SH_TAILQ_FIRST(&FREE_OBJS(
+ lt, part_id), __db_lockobj)) == NULL) {
+ if ((ret = __lock_allocobj(lt, part_id)) == 0)
+ goto retry;
+ goto err;
+ }
+
+ /*
+ * If we can fit this object in the structure, do so instead
+ * of alloc-ing space for it.
+ */
+ if (obj->size <= sizeof(sh_obj->objdata))
+ p = sh_obj->objdata;
+ else {
+ /*
+ * If we have only one partition, the region is locked.
+ */
+ if (region->part_t_size != 1)
+ LOCK_REGION_LOCK(env);
+ ret = __env_alloc(&lt->reginfo, obj->size, &p);
+ if (region->part_t_size != 1)
+ LOCK_REGION_UNLOCK(env);
+ if (ret != 0) {
+ __db_errx(env,
+ "No space for lock object storage");
+ goto err;
+ }
+ }
+
+ memcpy(p, obj->data, obj->size);
+
+ SH_TAILQ_REMOVE(&FREE_OBJS(
+ lt, part_id), sh_obj, links, __db_lockobj);
+#ifdef HAVE_STATISTICS
+ /*
+ * Keep track of both the max number of objects allocated
+ * per partition and the max number of objects used by
+ * this bucket.
+ */
+ len++;
+ if (++lt->obj_stat[ndx].st_nobjects >
+ lt->obj_stat[ndx].st_maxnobjects)
+ lt->obj_stat[ndx].st_maxnobjects =
+ lt->obj_stat[ndx].st_nobjects;
+ if (++lt->part_array[part_id].part_stat.st_nobjects >
+ lt->part_array[part_id].part_stat.st_maxnobjects)
+ lt->part_array[part_id].part_stat.st_maxnobjects =
+ lt->part_array[part_id].part_stat.st_nobjects;
+#endif
+
+ sh_obj->indx = ndx;
+ SH_TAILQ_INIT(&sh_obj->waiters);
+ SH_TAILQ_INIT(&sh_obj->holders);
+ sh_obj->lockobj.size = obj->size;
+ sh_obj->lockobj.off =
+ (roff_t)SH_PTR_TO_OFF(&sh_obj->lockobj, p);
+ SH_TAILQ_INSERT_HEAD(
+ &lt->obj_tab[ndx], sh_obj, links, __db_lockobj);
+ }
+
+#ifdef HAVE_STATISTICS
+ if (len > lt->obj_stat[ndx].st_hash_len)
+ lt->obj_stat[ndx].st_hash_len = len;
+#endif
+
+ *retp = sh_obj;
+ return (0);
+
+err: return (ret);
+}
+
+/*
+ * __lock_same_family --
+ * Looks for compatible lockers. There are two modes:
+ *	1) If locker 2 belongs to a family transaction, then the locks are
+ * compatible if the lockers share the same last ancestor.
+ * 2) Otherwise the lockers are compatible if locker 1 is a parent of
+ * locker 2.
+ * Return 1 if the lockers are compatible.
+ *
+ * This is used to determine if we should grant locks that appear to conflict,
+ * but don't because the lock is already held by a compatible locker.
+ */
+static int
+__lock_same_family(lt, sh_locker1, sh_locker2)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker1;
+ DB_LOCKER *sh_locker2;
+{
+ while (sh_locker2->parent_locker != INVALID_ROFF) {
+ sh_locker2 = R_ADDR(&lt->reginfo, sh_locker2->parent_locker);
+ if (sh_locker2 == sh_locker1)
+ return (1);
+ }
+
+ if (!F_ISSET(sh_locker2, DB_LOCKER_FAMILY_LOCKER))
+ return (0);
+
+ /*
+ * If checking for a family locker situation, compare the last ancestor
+ * of each locker.
+ */
+ while (sh_locker1->parent_locker != INVALID_ROFF)
+ sh_locker1 =
+ R_ADDR(&lt->reginfo, sh_locker1->parent_locker);
+
+ return (sh_locker1 == sh_locker2);
+}
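+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  It restates the two modes above over
+ * a stand-in locker struct that uses plain pointers instead of region
+ * offsets: first walk up from locker 2 looking for locker 1, then, for
+ * family lockers only, compare the two lockers' last ancestors.
+ */
+#ifdef LOCK_EXAMPLE_FAMILY
+struct example_locker {
+	struct example_locker *parent;		/* NULL at the root. */
+	int is_family_locker;
+};
+
+static int
+example_same_family(l1, l2)
+	struct example_locker *l1, *l2;
+{
+	/* Mode 2: compatible if l1 is an ancestor of l2. */
+	for (; l2->parent != NULL; l2 = l2->parent)
+		if (l2->parent == l1)
+			return (1);
+	/* l2 is now its own last ancestor. */
+	if (!l2->is_family_locker)
+		return (0);
+	/* Mode 1: family lockers are compatible if the roots match. */
+	while (l1->parent != NULL)
+		l1 = l1->parent;
+	return (l1 == l2);
+}
+#endif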
+
+/*
+ * __lock_locker_same_family --
+ * Determine if "locker" is an ancestor of "child".
+ * *retp == 1 if so, 0 otherwise.
+ *
+ * PUBLIC: int __lock_locker_same_family
+ * PUBLIC: __P((ENV *, DB_LOCKER *, DB_LOCKER *, int *));
+ */
+int
+__lock_locker_same_family(env, locker1, locker2, retp)
+ ENV *env;
+ DB_LOCKER *locker1;
+ DB_LOCKER *locker2;
+ int *retp;
+{
+ DB_LOCKTAB *lt;
+
+ lt = env->lk_handle;
+
+ /*
+ * The locker may not exist for this transaction, if not then it has
+ * no parents.
+ */
+ if (locker1 == NULL)
+ *retp = 0;
+ else
+ *retp = __lock_same_family(lt, locker1, locker2);
+ return (0);
+}
+
+/*
+ * __lock_inherit_locks --
+ * Called on child commit to merge child's locks with parent's.
+ */
+static int
+__lock_inherit_locks(lt, sh_locker, flags)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+{
+ DB_LOCKER *sh_parent;
+ DB_LOCKOBJ *obj;
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+ struct __db_lock *hlp, *lp;
+ roff_t poff;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ /*
+ * Get the committing locker and mark it as deleted.
+ * This allows us to traverse the locker links without
+ * worrying that someone else is deleting locks out
+ * from under us. However, if the locker doesn't
+ * exist, that just means that the child holds no
+ * locks, so inheritance is easy!
+ */
+ if (sh_locker == NULL) {
+ __db_errx(env, __db_locker_invalid);
+ return (EINVAL);
+ }
+
+ /* Make sure we are a child transaction. */
+ if (sh_locker->parent_locker == INVALID_ROFF) {
+ __db_errx(env, DB_STR("2039", "Not a child transaction"));
+ return (EINVAL);
+ }
+ sh_parent = R_ADDR(&lt->reginfo, sh_locker->parent_locker);
+
+ /*
+ * In order to make it possible for a parent to have
+ * many, many children who lock the same objects, and
+ * not require an inordinate number of locks, we try
+ * to merge the child's locks with its parent's.
+ */
+ poff = R_OFFSET(&lt->reginfo, sh_parent);
+ for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
+ lp != NULL;
+ lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
+ SH_LIST_REMOVE(lp, locker_links, __db_lock);
+
+ /* See if the parent already has a lock. */
+ obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ OBJECT_LOCK_NDX(lt, region, obj->indx);
+ SH_TAILQ_FOREACH(hlp, &obj->holders, links, __db_lock)
+ if (hlp->holder == poff && lp->mode == hlp->mode)
+ break;
+
+ if (hlp != NULL) {
+ /* Parent already holds lock. */
+ hlp->refcount += lp->refcount;
+
+ /* Remove lock from object list and free it. */
+ DB_ASSERT(env, lp->status == DB_LSTAT_HELD);
+ SH_TAILQ_REMOVE(&obj->holders, lp, links, __db_lock);
+ (void)__lock_freelock(lt, lp, sh_locker, DB_LOCK_FREE);
+ } else {
+ /* Just move lock to parent chains. */
+ SH_LIST_INSERT_HEAD(&sh_parent->heldby,
+ lp, locker_links, __db_lock);
+ lp->holder = poff;
+ sh_parent->nlocks++;
+ if (IS_WRITELOCK(lp->mode))
+ sh_parent->nwrites++;
+ }
+
+ /*
+ * We may need to promote regardless of whether we simply
+ * moved the lock to the parent or changed the parent's
+ * reference count, because there might be a sibling waiting,
+ * who will now be allowed to make forward progress.
+ */
+ ret = __lock_promote(lt, obj, NULL, flags);
+ OBJECT_UNLOCK(lt, region, obj->indx);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
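+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  Lock inheritance is reached through
+ * nested transactions in the public API: when the child commits, its
+ * locks are merged into the parent's locker as above.  The handles and
+ * key/data pair are assumptions for the example.
+ */
+#ifdef LOCK_EXAMPLE_INHERIT
+#include <db.h>
+
+static int
+inherit_example(dbenv, dbp, key, data)
+	DB_ENV *dbenv;
+	DB *dbp;
+	DBT *key, *data;
+{
+	DB_TXN *parent, *child;
+	int ret;
+
+	if ((ret = dbenv->txn_begin(dbenv, NULL, &parent, 0)) != 0)
+		return (ret);
+	if ((ret = dbenv->txn_begin(dbenv, parent, &child, 0)) != 0)
+		goto err;
+	/* The child's write locks the page... */
+	if ((ret = dbp->put(dbp, child, key, data, 0)) != 0)
+		goto err;
+	/* ...and committing merges those locks into the parent. */
+	if ((ret = child->commit(child, 0)) != 0)
+		goto err;
+	return (parent->commit(parent, 0));
+
+err:	(void)parent->abort(parent);	/* Also aborts any open child. */
+	return (ret);
+}
+#endif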
+
+/*
+ * __lock_wakeup --
+ *
+ *	Wake up any waiters on a lock object.
+ *
+ * PUBLIC: int __lock_wakeup __P((ENV *, const DBT *));
+ */
+int
+__lock_wakeup(env, obj)
+ ENV *env;
+ const DBT *obj;
+{
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ u_int32_t ndx;
+ int ret;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ OBJECT_LOCK(lt, region, obj, ndx);
+ if ((ret = __lock_getobj(lt, obj, ndx, 0, &sh_obj)) == 0 &&
+ sh_obj != NULL)
+ ret = __lock_promote(lt, sh_obj, NULL, DB_LOCK_ONEWAITER);
+
+ OBJECT_UNLOCK(lt, region, ndx);
+ return (ret);
+}
+
+/*
+ * __lock_promote --
+ *
+ * Look through the waiters and holders lists and decide which (if any)
+ * locks can be promoted. Promote any that are eligible.
+ *
+ * PUBLIC: int __lock_promote
+ * PUBLIC: __P((DB_LOCKTAB *, DB_LOCKOBJ *, int *, u_int32_t));
+ */
+int
+__lock_promote(lt, obj, state_changedp, flags)
+ DB_LOCKTAB *lt;
+ DB_LOCKOBJ *obj;
+ int *state_changedp;
+ u_int32_t flags;
+{
+ struct __db_lock *lp_w, *lp_h, *next_waiter;
+ DB_LOCKREGION *region;
+ int had_waiters, state_changed;
+
+ region = lt->reginfo.primary;
+ had_waiters = 0;
+
+ /*
+ * We need to do lock promotion. We also need to determine if we're
+ * going to need to run the deadlock detector again. If we release
+ * locks, and there are waiters, but no one gets promoted, then we
+ * haven't fundamentally changed the lockmgr state, so we may still
+ * have a deadlock and we have to run again. However, if there were
+ * no waiters, or we actually promoted someone, then we are OK and we
+ * don't have to run it immediately.
+ *
+ * During promotion, we look for state changes so we can return this
+ * information to the caller.
+ */
+
+ for (lp_w = SH_TAILQ_FIRST(&obj->waiters, __db_lock),
+ state_changed = lp_w == NULL;
+ lp_w != NULL;
+ lp_w = next_waiter) {
+ had_waiters = 1;
+ next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock);
+
+ /* Waiter may have aborted or expired. */
+ if (lp_w->status != DB_LSTAT_WAITING)
+ continue;
+
+ SH_TAILQ_FOREACH(lp_h, &obj->holders, links, __db_lock) {
+ if (lp_h->holder != lp_w->holder &&
+ CONFLICTS(lt, region, lp_h->mode, lp_w->mode)) {
+ if (!__lock_same_family(lt,
+ R_ADDR(&lt->reginfo, lp_h->holder),
+ R_ADDR(&lt->reginfo, lp_w->holder)))
+ break;
+ }
+ }
+ if (lp_h != NULL) /* Found a conflict. */
+ break;
+
+ /* No conflict, promote the waiting lock. */
+ SH_TAILQ_REMOVE(&obj->waiters, lp_w, links, __db_lock);
+ lp_w->status = DB_LSTAT_PENDING;
+ SH_TAILQ_INSERT_TAIL(&obj->holders, lp_w, links);
+
+ /* Wake up waiter. */
+ MUTEX_UNLOCK(lt->env, lp_w->mtx_lock);
+ state_changed = 1;
+ if (LF_ISSET(DB_LOCK_ONEWAITER))
+ break;
+ }
+
+ /*
+ * If this object had waiters and doesn't any more, then we need
+ * to remove it from the dd_obj list.
+ */
+ if (had_waiters && SH_TAILQ_FIRST(&obj->waiters, __db_lock) == NULL) {
+ LOCK_DD(lt->env, region);
+ /*
+ * Bump the generation when removing an object from the
+ * queue so that the deadlock detector will retry.
+ */
+ obj->generation++;
+ SH_TAILQ_REMOVE(&region->dd_objs, obj, dd_links, __db_lockobj);
+ UNLOCK_DD(lt->env, region);
+ }
+
+ if (state_changedp != NULL)
+ *state_changedp = state_changed;
+
+ return (0);
+}
+
+/*
+ * __lock_remove_waiter --
+ * Any lock on the waitlist has a process waiting for it. Therefore,
+ * we can't return the lock to the freelist immediately. Instead, we can
+ * remove the lock from the list of waiters, set the status field of the
+ * lock, and then let the process waking up return the lock to the
+ * free list.
+ *
+ * This must be called with the object bucket locked.
+ */
+static int
+__lock_remove_waiter(lt, sh_obj, lockp, status)
+ DB_LOCKTAB *lt;
+ DB_LOCKOBJ *sh_obj;
+ struct __db_lock *lockp;
+ db_status_t status;
+{
+ DB_LOCKREGION *region;
+ int do_wakeup;
+
+ region = lt->reginfo.primary;
+
+ do_wakeup = lockp->status == DB_LSTAT_WAITING;
+
+ SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
+ lockp->links.stqe_prev = -1;
+ lockp->status = status;
+ if (SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
+ LOCK_DD(lt->env, region);
+ sh_obj->generation++;
+ SH_TAILQ_REMOVE(
+ &region->dd_objs,
+ sh_obj, dd_links, __db_lockobj);
+ UNLOCK_DD(lt->env, region);
+ }
+
+ /*
+ * Wake whoever is waiting on this lock.
+ */
+ if (do_wakeup)
+ MUTEX_UNLOCK(lt->env, lockp->mtx_lock);
+
+ return (0);
+}
+
+/*
+ * __lock_trade --
+ *
+ * Trade locker ids on a lock. This is used to reassign file locks from
+ * a transactional locker id to a long-lived locker id. This should be
+ * called with the region mutex held.
+ */
+static int
+__lock_trade(env, lock, new_locker)
+ ENV *env;
+ DB_LOCK *lock;
+ DB_LOCKER *new_locker;
+{
+ struct __db_lock *lp;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ lt = env->lk_handle;
+ lp = R_ADDR(&lt->reginfo, lock->off);
+
+ /* If the lock is already released, simply return. */
+ if (lp->gen != lock->gen)
+ return (DB_NOTFOUND);
+
+ if (new_locker == NULL) {
+ __db_errx(env, DB_STR("2040", "Locker does not exist"));
+ return (EINVAL);
+ }
+
+ /* Remove the lock from its current locker. */
+ if ((ret = __lock_freelock(lt,
+ lp, R_ADDR(&lt->reginfo, lp->holder), DB_LOCK_UNLINK)) != 0)
+ return (ret);
+
+ /* Add lock to its new locker. */
+ SH_LIST_INSERT_HEAD(&new_locker->heldby, lp, locker_links, __db_lock);
+ new_locker->nlocks++;
+ if (IS_WRITELOCK(lp->mode))
+ new_locker->nwrites++;
+ lp->holder = R_OFFSET(&lt->reginfo, new_locker);
+
+ return (0);
+}
+
+/*
+ * __lock_change --
+ *
+ * PUBLIC: int __lock_change __P((ENV *, DB_LOCK *, DB_LOCK *));
+ *
+ * Change a lock to a different object. This is used when we move a
+ * metadata page to change the handle lock. We know that the new lock
+ * has replaced the old lock, so we just delete the old lock.
+ */
+int
+__lock_change(env, old_lock, new_lock)
+ ENV *env;
+ DB_LOCK *old_lock, *new_lock;
+{
+ struct __db_lock *lp, *old_lp;
+ DB_LOCKOBJ *old_obj, *new_obj;
+ DB_LOCKTAB *lt;
+ DB_LOCKREGION *region;
+ u_int32_t old_part, new_part;
+ int ret;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ old_lp = R_ADDR(&lt->reginfo, old_lock->off);
+ DB_ASSERT(env, old_lp->gen == old_lock->gen);
+ old_obj = SH_OFF_TO_PTR(old_lp, old_lp->obj, DB_LOCKOBJ);
+
+ lp = R_ADDR(&lt->reginfo, new_lock->off);
+ DB_ASSERT(env, lp->gen == new_lock->gen);
+ new_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+
+ /* Don't deadlock on partition mutexes, order the latches. */
+ LOCK_SYSTEM_LOCK(lt, region);
+ old_part = LOCK_PART(region, old_obj->indx);
+ new_part = LOCK_PART(region, new_obj->indx);
+
+ if (old_part == new_part)
+ MUTEX_LOCK_PARTITION(lt, region, old_part);
+ else if (new_obj->indx < old_obj->indx) {
+ MUTEX_LOCK_PARTITION(lt, region, new_part);
+ MUTEX_LOCK_PARTITION(lt, region, old_part);
+ } else {
+ MUTEX_LOCK_PARTITION(lt, region, old_part);
+ MUTEX_LOCK_PARTITION(lt, region, new_part);
+ }
+
+ for (lp = SH_TAILQ_FIRST(&old_obj->waiters, __db_lock);
+ lp != NULL;
+ lp = SH_TAILQ_FIRST(&old_obj->waiters, __db_lock)) {
+ SH_TAILQ_REMOVE(&old_obj->waiters, lp, links, __db_lock);
+ SH_TAILQ_INSERT_TAIL(&new_obj->waiters, lp, links);
+ lp->indx = new_obj->indx;
+ lp->obj = (roff_t)SH_PTR_TO_OFF(lp, new_obj);
+ }
+
+ for (lp = SH_TAILQ_FIRST(&old_obj->holders, __db_lock);
+ lp != NULL;
+ lp = SH_TAILQ_FIRST(&old_obj->holders, __db_lock)) {
+ SH_TAILQ_REMOVE(&old_obj->holders, lp, links, __db_lock);
+ if (lp == old_lp)
+ continue;
+ SH_TAILQ_INSERT_TAIL(&new_obj->holders, lp, links);
+ lp->indx = new_obj->indx;
+ lp->obj = (roff_t)SH_PTR_TO_OFF(lp, new_obj);
+ }
+
+ /* Put the lock back in and call put so the object goes away too. */
+ SH_TAILQ_INSERT_TAIL(&old_obj->holders, old_lp, links);
+ ret = __lock_put_internal(lt, old_lp, old_obj->indx,
+ DB_LOCK_UNLINK | DB_LOCK_FREE | DB_LOCK_NOPROMOTE);
+
+ MUTEX_UNLOCK_PARTITION(lt, region, new_part);
+ if (new_part != old_part)
+ MUTEX_UNLOCK_PARTITION(lt, region, old_part);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ return (ret);
+}
diff --git a/src/lock/lock_alloc.incl b/src/lock/lock_alloc.incl
new file mode 100644
index 00000000..edea07d2
--- /dev/null
+++ b/src/lock/lock_alloc.incl
@@ -0,0 +1,138 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+/*
+ * This is a template for allocation in the lock region. The following
+ * macros must be defined:
+ *
+ * FREE_LIST_HEAD -- the name of the head of the free list.
+ * STRUCT_NAME -- the name of the structure in the free list.
+ * CURRENT_COUNT -- structure element for count of current objects.
+ * MAX_COUNT -- structure element for max of current objects.
+ * STEAL_NAME -- name of stat to track steals.
+ * STEAL_EVENT -- name of event to track steals.
+ */
+#define __lock_alloc() /* for ctags */
+{
+ struct STRUCT_NAME *sh_thing;
+ DB_LOCKPART *end_p, *cur_p, *orig_p;
+ DB_LOCKREGION *region;
+ int begin, locked;
+ u_int32_t i, nobjs;
+
+ region = lt->reginfo.primary;
+
+ orig_p = &lt->part_array[part_id];
+ if (region->part_t_size == 1)
+ goto alloc;
+retry: MUTEX_UNLOCK(lt->env, orig_p->mtx_part);
+ locked = 0;
+ sh_thing = NULL;
+ end_p = &lt->part_array[region->part_t_size];
+ /*
+ * Start looking at the next partition and wrap around. If
+ * we get back to our partition then raise an error.
+ */
+ begin = 0;
+ nobjs = 0;
+ cur_p = orig_p + 1;
+again: for (; sh_thing == NULL && cur_p < end_p; cur_p++) {
+ MUTEX_LOCK(lt->env, cur_p->mtx_part);
+ if ((sh_thing = SH_TAILQ_FIRST(
+ &cur_p->FREE_LIST_HEAD, STRUCT_NAME)) != NULL)
+ SH_TAILQ_REMOVE(&cur_p->FREE_LIST_HEAD,
+ sh_thing, links, STRUCT_NAME);
+ MUTEX_UNLOCK(lt->env, cur_p->mtx_part);
+ }
+ if (sh_thing != NULL) {
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ SH_TAILQ_INSERT_HEAD(&orig_p->FREE_LIST_HEAD,
+ sh_thing, links, STRUCT_NAME);
+ STAT_INC_VERB(env,
+ lock, STEAL_EVENT, orig_p->part_stat.STEAL_NAME,
+ cur_p - lt->part_array, part_id);
+ return (0);
+ }
+ if (!begin) {
+ begin = 1;
+ cur_p = lt->part_array;
+ end_p = orig_p;
+ goto again;
+ }
+ /*
+ * Try to get some more space in the region.
+ */
+ LOCK_REGION_LOCK(lt->env);
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ locked = 1;
+ nobjs = 0;
+	/* Check to see if we raced with someone. */
+ if ((region->stat.MAX_COUNT == 0 ||
+ region->stat.CURRENT_COUNT < region->stat.MAX_COUNT) &&
+ SH_TAILQ_FIRST(&orig_p->FREE_LIST_HEAD, STRUCT_NAME) == NULL) {
+ MUTEX_UNLOCK(lt->env, orig_p->mtx_part);
+alloc: locked = 0;
+ sh_thing = NULL;
+ cur_p = orig_p;
+ end_p = &lt->part_array[region->part_t_size];
+ nobjs = region->stat.CURRENT_COUNT >> 2;
+ /* Just in case. */
+ if (nobjs == 0)
+ nobjs = 1;
+ if (region->stat.MAX_COUNT != 0 &&
+ region->stat.MAX_COUNT <
+ region->stat.CURRENT_COUNT + nobjs)
+ nobjs = region->stat.MAX_COUNT -
+ region->stat.CURRENT_COUNT;
+ /*
+ * If the max memory is not sized for max objects,
+ * allocate as much as possible.
+ */
+ F_SET(&lt->reginfo, REGION_TRACKED);
+ while (__env_alloc(&lt->reginfo,
+ nobjs * sizeof(struct STRUCT_NAME), &sh_thing) != 0)
+ if ((nobjs >>= 1) == 0)
+ break;
+ F_CLR(&lt->reginfo, REGION_TRACKED);
+ region->stat.CURRENT_COUNT += nobjs;
+ if (region->part_t_size != 1)
+ LOCK_REGION_UNLOCK(lt->env);
+
+ if (nobjs == 0)
+ goto err;
+
+ for (i = 0; i < nobjs; i++) {
+ memset(sh_thing, 0, sizeof (struct STRUCT_NAME));
+ if (&cur_p->free_locks ==
+ (struct __flock *)&cur_p->FREE_LIST_HEAD)
+ ((struct __db_lock *)
+ sh_thing)->status = DB_LSTAT_FREE;
+ MUTEX_LOCK(lt->env, cur_p->mtx_part);
+ SH_TAILQ_INSERT_HEAD(&cur_p->FREE_LIST_HEAD,
+ sh_thing, links, STRUCT_NAME);
+ MUTEX_UNLOCK(lt->env, cur_p->mtx_part);
+ if (region->part_t_size != 1 && ++cur_p == end_p)
+ cur_p = lt->part_array;
+ sh_thing++;
+ }
+ if (region->part_t_size != 1)
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ locked = 1;
+ } else
+ LOCK_REGION_UNLOCK(lt->env);
+
+ if (SH_TAILQ_FIRST(&orig_p->FREE_LIST_HEAD, STRUCT_NAME) != NULL)
+ return (0);
+	/* Someone stole all the locks! */
+ if (nobjs > 0)
+ goto retry;
+
+err: if (region->part_t_size != 1 && locked == 0)
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ return (__lock_nomem(lt->env, "lock entries"));
+}
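+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  A minimal standalone version of the
+ * include-template trick used above, with hypothetical names: the
+ * includer binds the macros, then pulls in a shared block as the
+ * function body.
+ */
+#ifdef LOCK_EXAMPLE_TEMPLATE
+#include <stdlib.h>
+
+struct widget { struct widget *next; };
+
+#define STRUCT_NAME	widget			/* Bound by the includer. */
+#define FREE_LIST	widget_free_list
+
+static struct widget *widget_free_list;
+
+static int
+widget_alloc(retp)
+	struct STRUCT_NAME **retp;
+/* In the real code the block below would live in the .incl file. */
+{
+	struct STRUCT_NAME *p;
+
+	if ((p = FREE_LIST) != NULL)
+		FREE_LIST = p->next;
+	else if ((p = malloc(sizeof(struct STRUCT_NAME))) == NULL)
+		return (-1);
+	*retp = p;
+	return (0);
+}
+#undef STRUCT_NAME
+#undef FREE_LIST
+#endif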
diff --git a/src/lock/lock_deadlock.c b/src/lock/lock_deadlock.c
new file mode 100644
index 00000000..3c00d7f1
--- /dev/null
+++ b/src/lock/lock_deadlock.c
@@ -0,0 +1,1063 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+#define ISSET_MAP(M, N) ((M)[(N) / 32] & (1 << ((N) % 32)))
+
+#define CLEAR_MAP(M, N) { \
+ u_int32_t __i; \
+ for (__i = 0; __i < (N); __i++) \
+ (M)[__i] = 0; \
+}
+
+#define SET_MAP(M, B) ((M)[(B) / 32] |= (1 << ((B) % 32)))
+#define CLR_MAP(M, B) ((M)[(B) / 32] &= ~((u_int)1 << ((B) % 32)))
+
+#define OR_MAP(D, S, N) { \
+ u_int32_t __i; \
+ for (__i = 0; __i < (N); __i++) \
+ D[__i] |= S[__i]; \
+}
+#define BAD_KILLID 0xffffffff
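+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  The waits-for graph is a bit matrix:
+ * row i is the set of lockers that locker i waits on, packed 32 bits to
+ * a word, so with three lockers each row is a single word.
+ */
+#ifdef LOCK_EXAMPLE_BITMAP
+static void
+bitmap_example()
+{
+	u_int32_t row0[1], row1[1], *d, *s;
+
+	row0[0] = row1[0] = 0;
+	SET_MAP(row0, 1);		/* Locker 0 waits on locker 1. */
+	SET_MAP(row1, 2);		/* Locker 1 waits on locker 2. */
+
+	/* Fold row 1 into row 0: locker 0 transitively waits on 2. */
+	d = row0;
+	s = row1;
+	OR_MAP(d, s, 1);
+	/*
+	 * ISSET_MAP(d, 2) is now nonzero; ISSET_MAP(d, 0) becoming
+	 * nonzero would mean a cycle through locker 0.
+	 */
+}
+#endif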
+
+typedef struct {
+ int valid;
+ int self_wait;
+ int in_abort;
+ u_int32_t count;
+ u_int32_t id;
+ roff_t last_lock;
+ roff_t last_obj;
+ u_int32_t last_ndx;
+ u_int32_t last_locker_id;
+ db_pgno_t pgno;
+ u_int32_t priority;
+} locker_info;
+
+static int __dd_abort __P((ENV *, locker_info *, int *));
+static int __dd_build __P((ENV *, u_int32_t, u_int32_t **,
+ u_int32_t *, u_int32_t *, locker_info **, int*, int*));
+static int __dd_find __P((ENV *,
+ u_int32_t *, locker_info *, u_int32_t, u_int32_t, u_int32_t ***));
+static int __dd_isolder __P((u_int32_t, u_int32_t, u_int32_t, u_int32_t));
+static int __dd_verify __P((locker_info *, u_int32_t *, u_int32_t *,
+ u_int32_t *, u_int32_t, u_int32_t, u_int32_t));
+
+#ifdef DIAGNOSTIC
+static void __dd_debug
+ __P((ENV *, locker_info *, u_int32_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __lock_detect_pp --
+ * ENV->lock_detect pre/post processing.
+ *
+ * PUBLIC: int __lock_detect_pp __P((DB_ENV *, u_int32_t, u_int32_t, int *));
+ */
+int
+__lock_detect_pp(dbenv, flags, atype, rejectp)
+ DB_ENV *dbenv;
+ u_int32_t flags, atype;
+ int *rejectp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_detect", DB_INIT_LOCK);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->lock_detect", flags, 0)) != 0)
+ return (ret);
+ switch (atype) {
+ case DB_LOCK_DEFAULT:
+ case DB_LOCK_EXPIRE:
+ case DB_LOCK_MAXLOCKS:
+ case DB_LOCK_MAXWRITE:
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_OLDEST:
+ case DB_LOCK_RANDOM:
+ case DB_LOCK_YOUNGEST:
+ break;
+ default:
+ __db_errx(env, DB_STR("2048",
+ "DB_ENV->lock_detect: unknown deadlock detection mode specified"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_detect(env, atype, rejectp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
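+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  This entry point is usually driven
+ * either by a dedicated thread calling DB_ENV->lock_detect periodically
+ * or by DB_ENV->set_lk_detect, which runs the detector whenever a lock
+ * request would block.
+ */
+#ifdef LOCK_EXAMPLE_DETECT
+#include <db.h>
+
+static int
+detect_example(dbenv)
+	DB_ENV *dbenv;
+{
+	int rejected, ret;
+
+	/* One explicit pass, aborting the youngest locker in a cycle. */
+	if ((ret = dbenv->lock_detect(dbenv,
+	    0, DB_LOCK_YOUNGEST, &rejected)) != 0)
+		return (ret);
+	/* Or run it automatically whenever a lock request must wait. */
+	return (dbenv->set_lk_detect(dbenv, DB_LOCK_DEFAULT));
+}
+#endif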
+
+/*
+ * __lock_detect --
+ * ENV->lock_detect.
+ *
+ * PUBLIC: int __lock_detect __P((ENV *, u_int32_t, int *));
+ */
+int
+__lock_detect(env, atype, rejectp)
+ ENV *env;
+ u_int32_t atype;
+ int *rejectp;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ db_timespec now;
+ locker_info *idmap;
+ u_int32_t *bitmap, *copymap, **deadp, **deadlist, *tmpmap;
+ u_int32_t i, cid, keeper, killid, limit, nalloc, nlockers;
+ u_int32_t lock_max, txn_max;
+ int pri_set, ret, status;
+
+ /*
+ * If this environment is a replication client, then we must use the
+ * MINWRITE detection discipline.
+ */
+ if (IS_REP_CLIENT(env))
+ atype = DB_LOCK_MINWRITE;
+
+ copymap = tmpmap = NULL;
+ deadlist = NULL;
+
+ lt = env->lk_handle;
+ if (rejectp != NULL)
+ *rejectp = 0;
+
+ /* Check if a detector run is necessary. */
+
+ /* Make a pass only if auto-detect would run. */
+ region = lt->reginfo.primary;
+
+ timespecclear(&now);
+ if (region->need_dd == 0 &&
+ (!timespecisset(&region->next_timeout) ||
+ !__clock_expired(env, &now, &region->next_timeout))) {
+ return (0);
+ }
+ if (region->need_dd == 0)
+ atype = DB_LOCK_EXPIRE;
+
+ /* Reset need_dd, so we know we've run the detector. */
+ region->need_dd = 0;
+
+ /* Build the waits-for bitmap. */
+ ret = __dd_build(env,
+ atype, &bitmap, &nlockers, &nalloc, &idmap, rejectp, &pri_set);
+ lock_max = region->stat.st_cur_maxid;
+ if (ret != 0 || atype == DB_LOCK_EXPIRE)
+ return (ret);
+
+ /* If there are no lockers, there are no deadlocks. */
+ if (nlockers == 0)
+ return (0);
+
+#ifdef DIAGNOSTIC
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_WAITSFOR))
+ __dd_debug(env, idmap, bitmap, nlockers, nalloc);
+#endif
+
+ /* Now duplicate the bitmaps so we can verify deadlock participants. */
+ if ((ret = __os_calloc(env, (size_t)nlockers,
+ sizeof(u_int32_t) * nalloc, &copymap)) != 0)
+ goto err;
+ memcpy(copymap, bitmap, nlockers * sizeof(u_int32_t) * nalloc);
+
+ if ((ret = __os_calloc(env, sizeof(u_int32_t), nalloc, &tmpmap)) != 0)
+ goto err;
+
+ /* Find a deadlock. */
+ if ((ret =
+ __dd_find(env, bitmap, idmap, nlockers, nalloc, &deadlist)) != 0)
+		goto err;
+
+ /*
+ * We need the cur_maxid from the txn region as well. In order
+ * to avoid tricky synchronization between the lock and txn
+ * regions, we simply unlock the lock region and then lock the
+ * txn region. This introduces a small window during which the
+ * transaction system could then wrap. We're willing to return
+ * the wrong answer for "oldest" or "youngest" in those rare
+ * circumstances.
+ */
+ if (TXN_ON(env)) {
+ TXN_SYSTEM_LOCK(env);
+ txn_max = ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->cur_maxid;
+ TXN_SYSTEM_UNLOCK(env);
+ } else
+ txn_max = TXN_MAXIMUM;
+
+ killid = BAD_KILLID;
+ for (deadp = deadlist; *deadp != NULL; deadp++) {
+ if (rejectp != NULL)
+ ++*rejectp;
+ killid = (u_int32_t)(*deadp - bitmap) / nalloc;
+ limit = killid;
+
+ /*
+ * There are cases in which our general algorithm will
+ * fail. Returning 1 from verify indicates that the
+ * particular locker is not only involved in a deadlock,
+ * but that killing him will allow others to make forward
+ * progress. Unfortunately, there are cases where we need
+ * to abort someone, but killing them will not necessarily
+ * ensure forward progress (imagine N readers all trying to
+ * acquire a write lock).
+ * killid is only set to lockers that pass the db_verify test.
+ * keeper will hold the best candidate even if it does
+ * not pass db_verify. Once we fill in killid then we do
+ * not need a keeper, but we keep updating it anyway.
+ */
+
+ keeper = idmap[killid].in_abort == 0 ? killid : BAD_KILLID;
+ if (keeper == BAD_KILLID ||
+ __dd_verify(idmap, *deadp,
+ tmpmap, copymap, nlockers, nalloc, keeper) == 0)
+ killid = BAD_KILLID;
+
+ if (!pri_set && killid != BAD_KILLID &&
+ (atype == DB_LOCK_DEFAULT || atype == DB_LOCK_RANDOM))
+ goto dokill;
+
+ /*
+ * Start with the id that we know is deadlocked, then examine
+ * all other set bits and see if any are a better candidate
+ * for abortion and they are genuinely part of the deadlock.
+ * The definition of "best":
+ * MAXLOCKS: maximum count
+ * MAXWRITE: maximum write count
+ * MINLOCKS: minimum count
+ * MINWRITE: minimum write count
+ * OLDEST: smallest id
+ * YOUNGEST: largest id
+ */
+ for (i = (limit + 1) % nlockers;
+ i != limit;
+ i = (i + 1) % nlockers) {
+ if (!ISSET_MAP(*deadp, i) || idmap[i].in_abort)
+ continue;
+
+ /*
+ * Determine if we have a verified candidate
+ * in killid, if not then compare with the
+ * non-verified candidate in keeper.
+ */
+ if (killid == BAD_KILLID) {
+ if (keeper == BAD_KILLID)
+ goto use_next;
+ else
+ cid = keeper;
+ } else
+ cid = killid;
+
+ if (idmap[i].priority > idmap[cid].priority)
+ continue;
+ if (idmap[i].priority < idmap[cid].priority)
+ goto use_next;
+
+ /* Equal priorities, break ties using atype. */
+ switch (atype) {
+ case DB_LOCK_OLDEST:
+ if (__dd_isolder(idmap[cid].id,
+ idmap[i].id, lock_max, txn_max))
+ continue;
+ break;
+ case DB_LOCK_YOUNGEST:
+ if (__dd_isolder(idmap[i].id,
+ idmap[cid].id, lock_max, txn_max))
+ continue;
+ break;
+ case DB_LOCK_MAXLOCKS:
+ if (idmap[i].count < idmap[cid].count)
+ continue;
+ break;
+ case DB_LOCK_MAXWRITE:
+ if (idmap[i].count < idmap[cid].count)
+ continue;
+ break;
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MINWRITE:
+ if (idmap[i].count > idmap[cid].count)
+ continue;
+ break;
+ case DB_LOCK_DEFAULT:
+ case DB_LOCK_RANDOM:
+ continue;
+
+ default:
+ killid = BAD_KILLID;
+ ret = EINVAL;
+ goto dokill;
+ }
+
+use_next: keeper = i;
+ if (__dd_verify(idmap, *deadp,
+ tmpmap, copymap, nlockers, nalloc, i))
+ killid = i;
+ }
+
+dokill: if (killid == BAD_KILLID) {
+ if (keeper == BAD_KILLID)
+ continue;
+ else {
+ /*
+ * Removing a single locker will not
+ * break the deadlock, signal to run
+ * detection again.
+ */
+ region->need_dd = 1;
+ killid = keeper;
+ }
+ }
+
+ /* Kill the locker with lockid idmap[killid]. */
+ if ((ret = __dd_abort(env, &idmap[killid], &status)) != 0)
+ break;
+
+ /*
+ * It's possible that the lock was already aborted; this isn't
+ * necessarily a problem, so do not treat it as an error. If
+ * the txn was aborting and deadlocked trying to upgrade
+ * a was_write lock, the detector should be run again or
+ * the deadlock might persist.
+ */
+ if (status != 0) {
+ if (status != DB_ALREADY_ABORTED)
+ __db_errx(env, DB_STR_A("2049",
+ "warning: unable to abort locker %lx",
+ "%lx"), (u_long)idmap[killid].id);
+ else
+ region->need_dd = 1;
+ } else if (FLD_ISSET(env->dbenv->verbose, DB_VERB_DEADLOCK))
+ __db_msg(env, DB_STR_A("2050", "Aborting locker %lx",
+ "%lx"), (u_long)idmap[killid].id);
+ }
+err: if (copymap != NULL)
+ __os_free(env, copymap);
+ if (deadlist != NULL)
+ __os_free(env, deadlist);
+ if (tmpmap != NULL)
+ __os_free(env, tmpmap);
+ __os_free(env, bitmap);
+ __os_free(env, idmap);
+
+ return (ret);
+}
+
+/*
+ * ========================================================================
+ * Utilities
+ */
+
+#define DD_INVALID_ID ((u_int32_t) -1)
+
+/*
+ * __dd_build --
+ * Build the lock dependency bit maps.
+ * Notes on synchronization:
+ * LOCK_SYSTEM_LOCK is used to hold objects locked when we have
+ * a single partition.
+ * LOCK_LOCKERS is held while we are walking the lockers list and
+ * to single thread the use of lockerp->dd_id.
+ * LOCK_DD protects the DD list of objects.
+ */
+
+static int
+__dd_build(env, atype, bmp, nlockers, allocp, idmap, rejectp, pri_set)
+ ENV *env;
+ u_int32_t atype, **bmp, *nlockers, *allocp;
+ locker_info **idmap;
+ int *pri_set, *rejectp;
+{
+ struct __db_lock *lp;
+ DB_LOCKER *lip, *lockerp, *child;
+ DB_LOCKOBJ *op, *lo, *np;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ locker_info *id_array;
+ db_timespec now, min_timeout;
+ u_int32_t *bitmap, count, dd;
+ u_int32_t *entryp, gen, id, indx, ndx, nentries, *tmpmap;
+ u_int8_t *pptr;
+ int is_first, ret;
+
+ COMPQUIET(indx, 0);
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ timespecclear(&now);
+ timespecclear(&min_timeout);
+
+ /*
+ * While we always check for expired timeouts, if we are called with
+ * DB_LOCK_EXPIRE, then we are only checking for timeouts (i.e., not
+ * doing deadlock detection at all). If we aren't doing real deadlock
+ * detection, then we can skip a significant amount of the processing.
+ * In particular we do not build the conflict array and our caller
+ * needs to expect this.
+ */
+ LOCK_SYSTEM_LOCK(lt, region);
+ if (atype == DB_LOCK_EXPIRE) {
+skip: LOCK_DD(env, region);
+ op = SH_TAILQ_FIRST(&region->dd_objs, __db_lockobj);
+ for (; op != NULL; op = np) {
+ indx = op->indx;
+ gen = op->generation;
+ UNLOCK_DD(env, region);
+ OBJECT_LOCK_NDX(lt, region, indx);
+ if (op->generation != gen) {
+ OBJECT_UNLOCK(lt, region, indx);
+ goto skip;
+ }
+ SH_TAILQ_FOREACH(lp, &op->waiters, links, __db_lock) {
+ lockerp = (DB_LOCKER *)
+ R_ADDR(&lt->reginfo, lp->holder);
+ if (lp->status == DB_LSTAT_WAITING) {
+ if (__clock_expired(env,
+ &now, &lockerp->lk_expire)) {
+ lp->status = DB_LSTAT_EXPIRED;
+ MUTEX_UNLOCK(
+ env, lp->mtx_lock);
+ if (rejectp != NULL)
+ ++*rejectp;
+ continue;
+ }
+ if (timespecisset(
+ &lockerp->lk_expire) &&
+ (!timespecisset(&min_timeout) ||
+ timespeccmp(&min_timeout,
+ &lockerp->lk_expire, >)))
+ min_timeout =
+ lockerp->lk_expire;
+ }
+ }
+ LOCK_DD(env, region);
+ np = SH_TAILQ_NEXT(op, dd_links, __db_lockobj);
+ OBJECT_UNLOCK(lt, region, indx);
+ }
+ UNLOCK_DD(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ goto done;
+ }
+
+ /*
+ * Allocate after locking the region
+ * to make sure the structures are large enough.
+ */
+ LOCK_LOCKERS(env, region);
+ count = region->nlockers;
+ if (count == 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ *nlockers = 0;
+ return (0);
+ }
+
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_DEADLOCK))
+ __db_msg(env, DB_STR_A("2051", "%lu lockers",
+ "%lu"), (u_long)count);
+
+ nentries = (u_int32_t)DB_ALIGN(count, 32) / 32;
+
+ /* Allocate enough space for a count by count bitmap matrix. */
+ if ((ret = __os_calloc(env, (size_t)count,
+ sizeof(u_int32_t) * nentries, &bitmap)) != 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ return (ret);
+ }
+
+ if ((ret = __os_calloc(env,
+ sizeof(u_int32_t), nentries, &tmpmap)) != 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ __os_free(env, bitmap);
+ return (ret);
+ }
+
+ if ((ret = __os_calloc(env,
+ (size_t)count, sizeof(locker_info), &id_array)) != 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ __os_free(env, bitmap);
+ __os_free(env, tmpmap);
+ return (ret);
+ }
+
+ /*
+ * First we go through and assign each locker a deadlock detector id.
+ */
+ id = 0;
+ *pri_set = 0;
+ SH_TAILQ_FOREACH(lip, &region->lockers, ulinks, __db_locker) {
+ if (lip->master_locker == INVALID_ROFF) {
+ DB_ASSERT(env, id < count);
+ lip->dd_id = id++;
+ id_array[lip->dd_id].id = lip->id;
+ id_array[lip->dd_id].priority = lip->priority;
+ if (lip->dd_id > 0 &&
+ id_array[lip->dd_id-1].priority != lip->priority)
+ *pri_set = 1;
+
+ switch (atype) {
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MAXLOCKS:
+ id_array[lip->dd_id].count = lip->nlocks;
+ break;
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_MAXWRITE:
+ id_array[lip->dd_id].count = lip->nwrites;
+ break;
+ default:
+ break;
+ }
+ } else
+ lip->dd_id = DD_INVALID_ID;
+
+ }
+
+ /*
+ * We only need consider objects that have waiters, so we use
+ * the list of objects with waiters (dd_objs) instead of traversing
+ * the entire hash table. For each object, we traverse the waiters
+ * list and add an entry in the waitsfor matrix for each waiter/holder
+ * combination. We don't want to lock from the DD mutex to the
+ * hash mutex, so we drop the deadlock mutex and get the hash mutex. Then
+ * we check to see if the object has changed. Once we have the object
+ * locked, locks cannot be removed and lockers cannot go away.
+ */
+ if (0) {
+ /* If an object has changed state, start over. */
+again: memset(bitmap, 0, count * sizeof(u_int32_t) * nentries);
+ }
+ LOCK_DD(env, region);
+ op = SH_TAILQ_FIRST(&region->dd_objs, __db_lockobj);
+ for (; op != NULL; op = np) {
+ indx = op->indx;
+ gen = op->generation;
+ UNLOCK_DD(env, region);
+
+ OBJECT_LOCK_NDX(lt, region, indx);
+ if (gen != op->generation) {
+ OBJECT_UNLOCK(lt, region, indx);
+ goto again;
+ }
+
+ /*
+ * First we go through and create a bit map that
+ * represents all the holders of this object.
+ */
+
+ CLEAR_MAP(tmpmap, nentries);
+ SH_TAILQ_FOREACH(lp, &op->holders, links, __db_lock) {
+ lockerp = (DB_LOCKER *)R_ADDR(&lt->reginfo, lp->holder);
+
+ if (lockerp->dd_id == DD_INVALID_ID) {
+ /*
+ * If the locker was not here when we started,
+ * then it was not deadlocked at that time.
+ */
+ if (lockerp->master_locker == INVALID_ROFF)
+ continue;
+ dd = ((DB_LOCKER *)R_ADDR(&lt->reginfo,
+ lockerp->master_locker))->dd_id;
+ if (dd == DD_INVALID_ID)
+ continue;
+ lockerp->dd_id = dd;
+ switch (atype) {
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MAXLOCKS:
+ id_array[dd].count += lockerp->nlocks;
+ break;
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_MAXWRITE:
+ id_array[dd].count += lockerp->nwrites;
+ break;
+ default:
+ break;
+ }
+
+ } else
+ dd = lockerp->dd_id;
+ id_array[dd].valid = 1;
+
+ /*
+ * If the holder has already been aborted, then
+ * we should ignore it for now.
+ */
+ if (lp->status == DB_LSTAT_HELD)
+ SET_MAP(tmpmap, dd);
+ }
+
+ /*
+ * Next, for each waiter, we set its row in the matrix
+ * equal to the map of holders we set up above.
+ */
+ for (is_first = 1,
+ lp = SH_TAILQ_FIRST(&op->waiters, __db_lock);
+ lp != NULL;
+ is_first = 0,
+ lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
+ lockerp = (DB_LOCKER *)R_ADDR(&lt->reginfo, lp->holder);
+ if (lp->status == DB_LSTAT_WAITING) {
+ if (__clock_expired(env,
+ &now, &lockerp->lk_expire)) {
+ lp->status = DB_LSTAT_EXPIRED;
+ MUTEX_UNLOCK(env, lp->mtx_lock);
+ if (rejectp != NULL)
+ ++*rejectp;
+ continue;
+ }
+ if (timespecisset(&lockerp->lk_expire) &&
+ (!timespecisset(&min_timeout) ||
+ timespeccmp(
+ &min_timeout, &lockerp->lk_expire, >)))
+ min_timeout = lockerp->lk_expire;
+ }
+
+ if (lockerp->dd_id == DD_INVALID_ID) {
+ dd = ((DB_LOCKER *)R_ADDR(&lt->reginfo,
+ lockerp->master_locker))->dd_id;
+ lockerp->dd_id = dd;
+ switch (atype) {
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MAXLOCKS:
+ id_array[dd].count += lockerp->nlocks;
+ break;
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_MAXWRITE:
+ id_array[dd].count += lockerp->nwrites;
+ break;
+ default:
+ break;
+ }
+ } else
+ dd = lockerp->dd_id;
+ id_array[dd].valid = 1;
+
+ /*
+ * If the transaction is pending abortion, then
+ * ignore it on this iteration.
+ */
+ if (lp->status != DB_LSTAT_WAITING)
+ continue;
+
+ entryp = bitmap + (nentries * dd);
+ OR_MAP(entryp, tmpmap, nentries);
+ /*
+ * If this is the first waiter on the queue,
+ * then we remove the waitsfor relationship
+ * with oneself. However, if it's anywhere
+ * else on the queue, then we have to keep
+ * it and we have an automatic deadlock.
+ */
+ if (is_first) {
+ if (ISSET_MAP(entryp, dd))
+ id_array[dd].self_wait = 1;
+ CLR_MAP(entryp, dd);
+ }
+ }
+ LOCK_DD(env, region);
+ np = SH_TAILQ_NEXT(op, dd_links, __db_lockobj);
+ OBJECT_UNLOCK(lt, region, indx);
+ }
+ UNLOCK_DD(env, region);
+
+ /*
+ * Now for each locker, record its last lock and set abort status.
+ * We need to look at the heldby list carefully. We have the LOCKERS
+ * locked so they cannot go away. The lock at the head of the
+ * list can be removed by locking the object it points at.
+ * Since lock memory is not freed if we get a lock we can look
+ * at it safely but SH_LIST_FIRST is not atomic, so we check that
+ * the list has not gone empty during that macro. We check abort
+ * status after building the bit maps so that we will not detect
+ * a blocked transaction without noting that it is already aborting.
+ */
+ for (id = 0; id < count; id++) {
+ if (!id_array[id].valid)
+ continue;
+ if ((ret = __lock_getlocker_int(lt,
+ id_array[id].id, 0, &lockerp)) != 0 || lockerp == NULL)
+ continue;
+
+ /*
+ * If this is a master transaction, try to
+ * find one of its children's locks first,
+ * as they are probably more recent.
+ */
+ child = SH_LIST_FIRST(&lockerp->child_locker, __db_locker);
+ if (child != NULL) {
+ do {
+c_retry: lp = SH_LIST_FIRST(&child->heldby, __db_lock);
+ if (SH_LIST_EMPTY(&child->heldby) || lp == NULL)
+ goto c_next;
+
+ if (F_ISSET(child, DB_LOCKER_INABORT))
+ id_array[id].in_abort = 1;
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ if (lp != SH_LIST_FIRST(
+ &child->heldby, __db_lock) ||
+ ndx != lp->indx) {
+ OBJECT_UNLOCK(lt, region, ndx);
+ goto c_retry;
+ }
+
+ if (lp != NULL &&
+ lp->status == DB_LSTAT_WAITING) {
+ id_array[id].last_locker_id = child->id;
+ goto get_lock;
+ } else {
+ OBJECT_UNLOCK(lt, region, ndx);
+ }
+c_next: child = SH_LIST_NEXT(
+ child, child_link, __db_locker);
+ } while (child != NULL);
+ }
+
+l_retry: lp = SH_LIST_FIRST(&lockerp->heldby, __db_lock);
+ if (!SH_LIST_EMPTY(&lockerp->heldby) && lp != NULL) {
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ if (lp != SH_LIST_FIRST(&lockerp->heldby, __db_lock) ||
+ lp->indx != ndx) {
+ OBJECT_UNLOCK(lt, region, ndx);
+ goto l_retry;
+ }
+ id_array[id].last_locker_id = lockerp->id;
+get_lock: id_array[id].last_lock = R_OFFSET(&lt->reginfo, lp);
+ id_array[id].last_obj = lp->obj;
+ lo = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ id_array[id].last_ndx = lo->indx;
+ pptr = SH_DBT_PTR(&lo->lockobj);
+ if (lo->lockobj.size >= sizeof(db_pgno_t))
+ memcpy(&id_array[id].pgno,
+ pptr, sizeof(db_pgno_t));
+ else
+ id_array[id].pgno = 0;
+ OBJECT_UNLOCK(lt, region, ndx);
+ }
+ if (F_ISSET(lockerp, DB_LOCKER_INABORT))
+ id_array[id].in_abort = 1;
+ }
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ /*
+ * Now we can release everything except the bitmap matrix that we
+ * created.
+ */
+ *nlockers = id;
+ *idmap = id_array;
+ *bmp = bitmap;
+ *allocp = nentries;
+ __os_free(env, tmpmap);
+done: if (timespecisset(&region->next_timeout))
+ region->next_timeout = min_timeout;
+ return (0);
+}
+
+static int
+__dd_find(env, bmp, idmap, nlockers, nalloc, deadp)
+ ENV *env;
+ u_int32_t *bmp, nlockers, nalloc;
+ locker_info *idmap;
+ u_int32_t ***deadp;
+{
+ u_int32_t i, j, k, *mymap, *tmpmap, **retp;
+ u_int ndead, ndeadalloc;
+ int ret;
+
+#undef INITIAL_DEAD_ALLOC
+#define INITIAL_DEAD_ALLOC 8
+
+ ndeadalloc = INITIAL_DEAD_ALLOC;
+ ndead = 0;
+ if ((ret = __os_malloc(env,
+ ndeadalloc * sizeof(u_int32_t *), &retp)) != 0)
+ return (ret);
+
+ /*
+ * For each locker, OR in the bits from the lockers on which that
+ * locker is waiting.
+ */
+ for (mymap = bmp, i = 0; i < nlockers; i++, mymap += nalloc) {
+ if (!idmap[i].valid)
+ continue;
+ for (j = 0; j < nlockers; j++) {
+ if (!ISSET_MAP(mymap, j))
+ continue;
+
+ /* Find the map for this bit. */
+ tmpmap = bmp + (nalloc * j);
+ OR_MAP(mymap, tmpmap, nalloc);
+ if (!ISSET_MAP(mymap, i))
+ continue;
+
+ /* Make sure we leave room for NULL. */
+ if (ndead + 2 >= ndeadalloc) {
+ ndeadalloc <<= 1;
+ /*
+ * If the alloc fails, then simply return the
+ * deadlocks that we already have.
+ */
+ if (__os_realloc(env,
+ ndeadalloc * sizeof(u_int32_t *),
+ &retp) != 0) {
+ retp[ndead] = NULL;
+ *deadp = retp;
+ return (0);
+ }
+ }
+ retp[ndead++] = mymap;
+
+ /* Mark all participants in this deadlock invalid. */
+ for (k = 0; k < nlockers; k++)
+ if (ISSET_MAP(mymap, k))
+ idmap[k].valid = 0;
+ break;
+ }
+ }
+ retp[ndead] = NULL;
+ *deadp = retp;
+ return (0);
+}
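+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  The closure computation above on a
+ * three-locker cycle, one 32-bit word per row: OR each waited-on row
+ * into the waiter's row; a locker whose own bit shows up in its row is
+ * part of a deadlock.
+ */
+#ifdef LOCK_EXAMPLE_FIND
+static int
+find_example()
+{
+	u_int32_t map[3];
+	u_int32_t i, j;
+
+	map[0] = 1 << 1;	/* Locker 0 waits on locker 1. */
+	map[1] = 1 << 2;	/* Locker 1 waits on locker 2. */
+	map[2] = 1 << 0;	/* Locker 2 waits on locker 0. */
+
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < 3; j++) {
+			if (!(map[i] & (1 << j)))
+				continue;
+			map[i] |= map[j];
+			if (map[i] & (1 << i))
+				return ((int)i);	/* Deadlocked. */
+		}
+	return (-1);		/* No cycle. */
+}
+#endif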
+
+static int
+__dd_abort(env, info, statusp)
+ ENV *env;
+ locker_info *info;
+ int *statusp;
+{
+ struct __db_lock *lockp;
+ DB_LOCKER *lockerp;
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ *statusp = 0;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ret = 0;
+
+ /* We must lock so this locker cannot go away while we abort it. */
+ LOCK_SYSTEM_LOCK(lt, region);
+ LOCK_LOCKERS(env, region);
+
+ /*
+ * Get the locker. If it's gone or was aborted while we were
+ * detecting, return that.
+ */
+ if ((ret = __lock_getlocker_int(lt,
+ info->last_locker_id, 0, &lockerp)) != 0)
+ goto err;
+ if (lockerp == NULL || F_ISSET(lockerp, DB_LOCKER_INABORT)) {
+ *statusp = DB_ALREADY_ABORTED;
+ goto err;
+ }
+
+ /*
+ * Find the locker's last lock. It is possible for this lock to have
+ * been freed, either though a timeout or another detector run.
+ * First lock the lock object so it is stable.
+ */
+
+ OBJECT_LOCK_NDX(lt, region, info->last_ndx);
+ if ((lockp = SH_LIST_FIRST(&lockerp->heldby, __db_lock)) == NULL) {
+ *statusp = DB_ALREADY_ABORTED;
+ goto done;
+ }
+ if (R_OFFSET(&lt->reginfo, lockp) != info->last_lock ||
+ lockp->holder != R_OFFSET(&lt->reginfo, lockerp) ||
+ F_ISSET(lockerp, DB_LOCKER_INABORT) ||
+ lockp->obj != info->last_obj || lockp->status != DB_LSTAT_WAITING) {
+ *statusp = DB_ALREADY_ABORTED;
+ goto done;
+ }
+
+ sh_obj = SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ);
+
+ STAT_INC_VERB(env, lock, deadlock,
+ region->stat.st_ndeadlocks, lockerp->id, &sh_obj->lockobj);
+ /* Abort lock, take it off list, and wake up this lock. */
+ lockp->status = DB_LSTAT_ABORTED;
+ SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
+
+ /*
+ * Either the waiters list is now empty, in which case we remove
+ * it from dd_objs, or it is not empty, in which case we need to
+ * do promotion.
+ */
+ if (SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
+ LOCK_DD(env, region);
+ SH_TAILQ_REMOVE(&region->dd_objs,
+ sh_obj, dd_links, __db_lockobj);
+ UNLOCK_DD(env, region);
+ } else
+ ret = __lock_promote(lt, sh_obj, NULL, 0);
+ MUTEX_UNLOCK(env, lockp->mtx_lock);
+
+done: OBJECT_UNLOCK(lt, region, info->last_ndx);
+err: UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ return (ret);
+}
+
+#ifdef DIAGNOSTIC
+static void
+__dd_debug(env, idmap, bitmap, nlockers, nalloc)
+ ENV *env;
+ locker_info *idmap;
+ u_int32_t *bitmap, nlockers, nalloc;
+{
+ DB_MSGBUF mb;
+ u_int32_t i, j, *mymap;
+
+ DB_MSGBUF_INIT(&mb);
+
+ __db_msg(env, "Waitsfor array\nWaiter:\tWaiting on:");
+ for (mymap = bitmap, i = 0; i < nlockers; i++, mymap += nalloc) {
+ if (!idmap[i].valid)
+ continue;
+
+ __db_msgadd(env, &mb, /* Waiter. */
+ "%lx/%lu:\t", (u_long)idmap[i].id, (u_long)idmap[i].pgno);
+ for (j = 0; j < nlockers; j++)
+ if (ISSET_MAP(mymap, j))
+ __db_msgadd(env,
+ &mb, " %lx", (u_long)idmap[j].id);
+ __db_msgadd(env, &mb, " %lu", (u_long)idmap[i].last_lock);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+}
+#endif
+
+/*
+ * Given a bitmap that contains a deadlock, verify that the bit
+ * specified in the which parameter indicates a transaction that
+ * is actually deadlocked. Return 1 if really deadlocked, 0 otherwise.
+ * deadmap -- the array that identified the deadlock.
+ * tmpmap -- a temporary bit map into which we can OR things.
+ * origmap -- a copy of the initial bitmaps from the dd_build phase.
+ * nlockers -- the number of actual lockers under consideration.
+ * nalloc -- the number of words allocated for the bitmap.
+ * which -- the locker in question.
+ */
+static int
+__dd_verify(idmap, deadmap, tmpmap, origmap, nlockers, nalloc, which)
+ locker_info *idmap;
+ u_int32_t *deadmap, *tmpmap, *origmap;
+ u_int32_t nlockers, nalloc, which;
+{
+ u_int32_t *tmap;
+ u_int32_t j;
+ int count;
+
+ memset(tmpmap, 0, sizeof(u_int32_t) * nalloc);
+
+ /*
+ * In order for "which" to be actively involved in
+ * the deadlock, removing him from the evaluation
+ * must remove the deadlock. So, we OR together everyone
+ * except which; if all the participants still have their
+ * bits set, then the deadlock persists and which does
+ * not participate. If the deadlock does not persist
+ * then "which" does participate.
+ */
+ count = 0;
+ for (j = 0; j < nlockers; j++) {
+ if (!ISSET_MAP(deadmap, j) || j == which)
+ continue;
+
+ /* Find the map for this bit. */
+ tmap = origmap + (nalloc * j);
+
+ /*
+ * We special case the first waiter who is also a holder, so
+ * we don't automatically call that a deadlock. However, if
+ * it really is a deadlock, we need the bit set now so that
+ * we treat the first waiter like other waiters.
+ */
+ if (idmap[j].self_wait)
+ SET_MAP(tmap, j);
+ OR_MAP(tmpmap, tmap, nalloc);
+ count++;
+ }
+
+ if (count == 1)
+ return (1);
+
+ /*
+ * Now check the resulting map and see whether
+ * all participants still have their bit set.
+ */
+ for (j = 0; j < nlockers; j++) {
+ if (!ISSET_MAP(deadmap, j) || j == which)
+ continue;
+ if (!ISSET_MAP(tmpmap, j))
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __dd_isolder --
+ *
+ * Figure out the relative age of two lockers. We make all lockers
+ * older than all transactions, because that's how it's worked
+ * historically (because lockers are lower ids).
+ */
+static int
+__dd_isolder(a, b, lock_max, txn_max)
+ u_int32_t a, b;
+ u_int32_t lock_max, txn_max;
+{
+ u_int32_t max;
+
+ /* Check for comparing lock-id and txnid. */
+ if (a <= DB_LOCK_MAXID && b > DB_LOCK_MAXID)
+ return (1);
+ if (b <= DB_LOCK_MAXID && a > DB_LOCK_MAXID)
+ return (0);
+
+ /* In the same space; figure out which one. */
+ max = txn_max;
+ if (a <= DB_LOCK_MAXID)
+ max = lock_max;
+
+ /*
+ * We can't get a 100% correct ordering, because we don't know
+ * where the current interval started and if there were older
+ * lockers outside the interval. We do the best we can.
+ */
+
+ /*
+ * Check for a wrapped case with ids above max.
+ */
+ if (a > max && b < max)
+ return (1);
+ if (b > max && a < max)
+ return (0);
+
+ return (a < b);
+}
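+
+/*
+ * Worked examples for __dd_isolder (illustration only), assuming lock
+ * ids are allocated at or below DB_LOCK_MAXID and txn ids above it:
+ *	a lock id vs. a txn id:		the lock id is always older;
+ *	a = 5,  b = 9,  max = 100:	a is older (plain comparison);
+ *	a = 99, b = 3,  max = 90:	a is older -- a was allocated
+ *	before the id space wrapped, b after.
+ */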
diff --git a/src/lock/lock_failchk.c b/src/lock/lock_failchk.c
new file mode 100644
index 00000000..59fb010f
--- /dev/null
+++ b/src/lock/lock_failchk.c
@@ -0,0 +1,114 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+/*
+ * __lock_failchk --
+ * Check for locks held by dead threads of control and release
+ * read locks. If any write locks were held by dead non-transactional
+ * lockers then we must abort and run recovery. Otherwise we release
+ * read locks for lockers owned by dead threads. Write locks for
+ * dead transactional lockers will be freed when we abort the transaction.
+ *
+ * PUBLIC: int __lock_failchk __P((ENV *));
+ */
+int
+__lock_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOCKER *lip;
+ DB_LOCKREGION *lrp;
+ DB_LOCKREQ request;
+ DB_LOCKTAB *lt;
+ u_int32_t i;
+ int ret;
+ char buf[DB_THREADID_STRLEN];
+
+ dbenv = env->dbenv;
+ lt = env->lk_handle;
+ lrp = lt->reginfo.primary;
+
+retry: LOCK_LOCKERS(env, lrp);
+
+ ret = 0;
+ for (i = 0; i < lrp->locker_t_size; i++)
+ SH_TAILQ_FOREACH(lip, &lt->locker_tab[i], links, __db_locker) {
+ /*
+ * If the locker is transactional, we can ignore it if
+ * it has no read locks or has no locks at all. Check
+			 * the heldby list rather than nlocks since a lock may
+			 * be PENDING.  __txn_failchk aborts any transactional
+			 * lockers.  Non-transactional lockers proceed to the
+			 * is_alive test.
+ */
+ if ((lip->id >= TXN_MINIMUM) &&
+ (SH_LIST_EMPTY(&lip->heldby) ||
+ lip->nlocks == lip->nwrites))
+ continue;
+
+ /* If the locker is still alive, it's not a problem. */
+ if (dbenv->is_alive(dbenv, lip->pid, lip->tid,
+ F_ISSET(lip, DB_LOCKER_HANDLE_LOCKER) ?
+ DB_MUTEX_PROCESS_ONLY : 0))
+ continue;
+
+ /*
+ * We can only deal with read locks. If a
+ * non-transactional locker holds write locks we
+ * have to assume a Berkeley DB operation was
+ * interrupted with only 1-of-N pages modified.
+ */
+ if (lip->id < TXN_MINIMUM && lip->nwrites != 0) {
+ ret = __db_failed(env, DB_STR_A("2052",
+ "locker has write locks", ""),
+ lip->pid, lip->tid);
+ break;
+ }
+
+ /*
+ * Discard the locker and its read locks.
+ */
+ if (!SH_LIST_EMPTY(&lip->heldby)) {
+ __db_msg(env, DB_STR_A("2053",
+ "Freeing read locks for locker %#lx: %s",
+ "%#lx %s"), (u_long)lip->id,
+ dbenv->thread_id_string(
+ dbenv, lip->pid, lip->tid, buf));
+ UNLOCK_LOCKERS(env, lrp);
+ memset(&request, 0, sizeof(request));
+ request.op = DB_LOCK_PUT_READ;
+ if ((ret = __lock_vec(env,
+ lip, 0, &request, 1, NULL)) != 0)
+ return (ret);
+			} else
+				UNLOCK_LOCKERS(env, lrp);
+
+ /*
+ * This locker is most likely referenced by a cursor
+ * which is owned by a dead thread. Normally the
+ * cursor would be available for other threads
+ * but we assume the dead thread will never release
+ * it.
+ */
+ if (lip->id < TXN_MINIMUM &&
+ (ret = __lock_freelocker(lt, lip)) != 0)
+ return (ret);
+ goto retry;
+ }
+
+ UNLOCK_LOCKERS(env, lrp);
+
+ return (ret);
+}
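+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  __lock_failchk runs under
+ * DB_ENV->failchk, which requires an is_alive callback; the callback
+ * below is a stand-in that treats process existence as liveness.
+ */
+#ifdef LOCK_EXAMPLE_FAILCHK
+#include <signal.h>
+#include <db.h>
+
+static int
+example_is_alive(dbenv, pid, tid, flags)
+	DB_ENV *dbenv;
+	pid_t pid;
+	db_threadid_t tid;
+	u_int32_t flags;
+{
+	(void)dbenv;
+	(void)tid;
+	(void)flags;
+	/* Signal 0 probes for existence without delivering anything. */
+	return (kill(pid, 0) == 0);
+}
+
+static int
+failchk_example(dbenv)
+	DB_ENV *dbenv;
+{
+	int ret;
+
+	if ((ret = dbenv->set_isalive(dbenv, example_is_alive)) != 0)
+		return (ret);
+	return (dbenv->failchk(dbenv, 0));
+}
+#endif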
diff --git a/src/lock/lock_id.c b/src/lock/lock_id.c
new file mode 100644
index 00000000..24b545d1
--- /dev/null
+++ b/src/lock/lock_id.c
@@ -0,0 +1,572 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __lock_freelocker_int
+ __P((DB_LOCKTAB *, DB_LOCKREGION *, DB_LOCKER *, int));
+
+/*
+ * __lock_id_pp --
+ * ENV->lock_id pre/post processing.
+ *
+ * PUBLIC: int __lock_id_pp __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_id_pp(dbenv, idp)
+ DB_ENV *dbenv;
+ u_int32_t *idp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_id", DB_INIT_LOCK);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_id(env, idp, NULL)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_id --
+ * ENV->lock_id.
+ *
+ * PUBLIC: int __lock_id __P((ENV *, u_int32_t *, DB_LOCKER **));
+ */
+int
+__lock_id(env, idp, lkp)
+ ENV *env;
+ u_int32_t *idp;
+ DB_LOCKER **lkp;
+{
+ DB_LOCKER *lk;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ u_int32_t id, *ids;
+ int nids, ret;
+
+ lk = NULL;
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ id = DB_LOCK_INVALIDID;
+ ret = 0;
+
+ LOCK_LOCKERS(env, region);
+
+ /*
+ * Allocate a new lock id. If we wrap around then we find the minimum
+ * currently in use and make sure we can stay below that. This code is
+ * similar to code in __txn_begin_int for recovering txn ids.
+ *
+ * Our current valid range can span the maximum valid value, so check
+ * for it and wrap manually.
+ */
+ if (region->lock_id == DB_LOCK_MAXID &&
+ region->cur_maxid != DB_LOCK_MAXID)
+ region->lock_id = DB_LOCK_INVALIDID;
+ if (region->lock_id == region->cur_maxid) {
+ if ((ret = __os_malloc(env,
+ sizeof(u_int32_t) * region->nlockers, &ids)) != 0)
+ goto err;
+ nids = 0;
+ SH_TAILQ_FOREACH(lk, &region->lockers, ulinks, __db_locker)
+ ids[nids++] = lk->id;
+ region->lock_id = DB_LOCK_INVALIDID;
+ region->cur_maxid = DB_LOCK_MAXID;
+ if (nids != 0)
+ __db_idspace(ids, nids,
+ &region->lock_id, &region->cur_maxid);
+ __os_free(env, ids);
+ }
+ id = ++region->lock_id;
+
+ /* Allocate a locker for this id. */
+ ret = __lock_getlocker_int(lt, id, 1, &lk);
+
+err: UNLOCK_LOCKERS(env, region);
+
+ if (idp != NULL)
+ *idp = id;
+ if (lkp != NULL)
+ *lkp = lk;
+
+ return (ret);
+}
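+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  Applications reach this through
+ * DB_ENV->lock_id and give the id back with DB_ENV->lock_id_free once
+ * all locks held under it have been released.
+ */
+#ifdef LOCK_EXAMPLE_ID
+#include <db.h>
+
+static int
+locker_id_example(dbenv)
+	DB_ENV *dbenv;
+{
+	u_int32_t id;
+	int ret;
+
+	if ((ret = dbenv->lock_id(dbenv, &id)) != 0)
+		return (ret);
+	/* ... acquire and release locks under "id" ... */
+	return (dbenv->lock_id_free(dbenv, id));
+}
+#endif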
+
+/*
+ * __lock_set_thread_id --
+ * Set the thread_id in an existing locker.
+ * PUBLIC: void __lock_set_thread_id __P((void *, pid_t, db_threadid_t));
+ */
+void
+__lock_set_thread_id(lref_arg, pid, tid)
+ void *lref_arg;
+ pid_t pid;
+ db_threadid_t tid;
+{
+ DB_LOCKER *lref;
+
+ lref = lref_arg;
+ lref->pid = pid;
+ lref->tid = tid;
+}
+
+/*
+ * __lock_id_free_pp --
+ * ENV->lock_id_free pre/post processing.
+ *
+ * PUBLIC: int __lock_id_free_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_id_free_pp(dbenv, id)
+ DB_ENV *dbenv;
+ u_int32_t id;
+{
+ DB_LOCKER *sh_locker;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_id_free", DB_INIT_LOCK);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __env_rep_enter(env, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ LOCK_LOCKERS(env, region);
+ if ((ret =
+ __lock_getlocker_int(env->lk_handle, id, 0, &sh_locker)) == 0) {
+ if (sh_locker != NULL)
+ ret = __lock_freelocker_int(lt, region, sh_locker, 1);
+ else {
+ __db_errx(env, DB_STR_A("2045",
+ "Unknown locker id: %lx", "%lx"), (u_long)id);
+ ret = EINVAL;
+ }
+ }
+ UNLOCK_LOCKERS(env, region);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_id_free --
+ * Free a locker id.
+ *
+ * PUBLIC: int __lock_id_free __P((ENV *, DB_LOCKER *));
+ */
+int
+__lock_id_free(env, sh_locker)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ret = 0;
+
+ if (sh_locker->nlocks != 0) {
+ __db_errx(env, DB_STR("2046",
+ "Locker still has locks"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_freelocker_int(lt, region, sh_locker, 1);
+ UNLOCK_LOCKERS(env, region);
+
+err: return (ret);
+}
+
+/*
+ * __lock_id_set --
+ * Set the current locker ID and current maximum unused ID (for
+ * testing purposes only).
+ *
+ * PUBLIC: int __lock_id_set __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__lock_id_set(env, cur_id, max_id)
+ ENV *env;
+ u_int32_t cur_id, max_id;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "lock_id_set", DB_INIT_LOCK);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ region->lock_id = cur_id;
+ region->cur_maxid = max_id;
+
+ return (0);
+}
+
+/*
+ * __lock_getlocker --
+ * Get a locker in the locker hash table. The create parameter
+ * indicates if the locker should be created if it doesn't exist in
+ * the table.
+ *
+ * This must be called with the locker mutex held if create == 1.
+ *
+ * PUBLIC: int __lock_getlocker __P((DB_LOCKTAB *,
+ * PUBLIC: u_int32_t, int, DB_LOCKER **));
+ * PUBLIC: int __lock_getlocker_int __P((DB_LOCKTAB *,
+ * PUBLIC: u_int32_t, int, DB_LOCKER **));
+ */
+int
+__lock_getlocker(lt, locker, create, retp)
+ DB_LOCKTAB *lt;
+ u_int32_t locker;
+ int create;
+ DB_LOCKER **retp;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+
+ COMPQUIET(region, NULL);
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_getlocker_int(lt, locker, create, retp);
+ UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
+
+int
+__lock_getlocker_int(lt, locker, create, retp)
+ DB_LOCKTAB *lt;
+ u_int32_t locker;
+ int create;
+ DB_LOCKER **retp;
+{
+ DB_LOCKER *sh_locker;
+ DB_LOCKREGION *region;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t i, indx, nlockers;
+ int ret;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ LOCKER_HASH(lt, region, locker, indx);
+
+ /*
+ * If we find the locker, then we can just return it. If we don't find
+ * the locker, then we need to create it.
+ */
+ SH_TAILQ_FOREACH(sh_locker, &lt->locker_tab[indx], links, __db_locker)
+ if (sh_locker->id == locker)
+ break;
+ if (sh_locker == NULL && create) {
+ nlockers = 0;
+ /* Create new locker and then insert it into hash table. */
+ if ((ret = __mutex_alloc(env, MTX_LOGICAL_LOCK,
+ DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_SELF_BLOCK,
+ &mutex)) != 0)
+ return (ret);
+ else
+ MUTEX_LOCK(env, mutex);
+ if ((sh_locker = SH_TAILQ_FIRST(
+ &region->free_lockers, __db_locker)) == NULL) {
+ nlockers = region->stat.st_lockers >> 2;
+ /* Just in case. */
+ if (nlockers == 0)
+ nlockers = 1;
+ if (region->stat.st_maxlockers != 0 &&
+ region->stat.st_maxlockers <
+ region->stat.st_lockers + nlockers)
+ nlockers = region->stat.st_maxlockers -
+ region->stat.st_lockers;
+ /*
+ * Don't hold the lockers mutex while allocating from the
+ * region; we could deadlock. There is no race when creating
+ * a locker, since the id allocation is synchronized.
+ */
+ UNLOCK_LOCKERS(env, region);
+ LOCK_REGION_LOCK(env);
+ /*
+ * If the max memory is not sized for max objects,
+ * allocate as much as possible.
+ */
+ F_SET(&lt->reginfo, REGION_TRACKED);
+ while (__env_alloc(&lt->reginfo, nlockers *
+ sizeof(struct __db_locker), &sh_locker) != 0)
+ if ((nlockers >>= 1) == 0)
+ break;
+ F_CLR(&lt->reginfo, REGION_TRACKED);
+ LOCK_REGION_UNLOCK(lt->env);
+ LOCK_LOCKERS(env, region);
+ for (i = 0; i < nlockers; i++) {
+ SH_TAILQ_INSERT_HEAD(&region->free_lockers,
+ sh_locker, links, __db_locker);
+ sh_locker++;
+ }
+ if (nlockers == 0)
+ return (__lock_nomem(env, "locker entries"));
+ region->stat.st_lockers += nlockers;
+ sh_locker = SH_TAILQ_FIRST(
+ &region->free_lockers, __db_locker);
+ }
+ SH_TAILQ_REMOVE(
+ &region->free_lockers, sh_locker, links, __db_locker);
+ ++region->nlockers;
+#ifdef HAVE_STATISTICS
+ STAT_PERFMON2(env, lock, nlockers, region->nlockers, locker);
+ if (region->nlockers > region->stat.st_maxnlockers)
+ STAT_SET(env, lock, maxnlockers,
+ region->stat.st_maxnlockers,
+ region->nlockers, locker);
+#endif
+ sh_locker->id = locker;
+ env->dbenv->thread_id(
+ env->dbenv, &sh_locker->pid, &sh_locker->tid);
+ sh_locker->mtx_locker = mutex;
+ sh_locker->dd_id = 0;
+ sh_locker->master_locker = INVALID_ROFF;
+ sh_locker->parent_locker = INVALID_ROFF;
+ SH_LIST_INIT(&sh_locker->child_locker);
+ sh_locker->flags = 0;
+ SH_LIST_INIT(&sh_locker->heldby);
+ sh_locker->nlocks = 0;
+ sh_locker->nwrites = 0;
+ sh_locker->priority = DB_LOCK_DEFPRIORITY;
+ sh_locker->lk_timeout = 0;
+ timespecclear(&sh_locker->tx_expire);
+ timespecclear(&sh_locker->lk_expire);
+
+ SH_TAILQ_INSERT_HEAD(
+ &lt->locker_tab[indx], sh_locker, links, __db_locker);
+ SH_TAILQ_INSERT_HEAD(&region->lockers,
+ sh_locker, ulinks, __db_locker);
+ ENV_GET_THREAD_INFO(env, ip);
+#ifdef DIAGNOSTIC
+ if (ip != NULL)
+ ip->dbth_locker = R_OFFSET(&lt->reginfo, sh_locker);
+#endif
+ }
+
+ *retp = sh_locker;
+ return (0);
+}
+
+/*
+ * __lock_addfamilylocker
+ * Put a locker entry in for a child transaction.
+ *
+ * PUBLIC: int __lock_addfamilylocker __P((ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__lock_addfamilylocker(env, pid, id, is_family)
+ ENV *env;
+ u_int32_t pid, id, is_family;
+{
+ DB_LOCKER *lockerp, *mlockerp;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ COMPQUIET(region, NULL);
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ LOCK_LOCKERS(env, region);
+
+ /* get/create the parent locker info */
+ if ((ret = __lock_getlocker_int(lt, pid, 1, &mlockerp)) != 0)
+ goto err;
+
+ /*
+ * We assume that only one thread can manipulate
+ * a single transaction family.
+ * Therefore the master locker cannot go away while
+ * we manipulate it, nor can another child in the
+ * family be created at the same time.
+ */
+ if ((ret = __lock_getlocker_int(lt, id, 1, &lockerp)) != 0)
+ goto err;
+
+ /* Point to our parent. */
+ lockerp->parent_locker = R_OFFSET(&lt->reginfo, mlockerp);
+
+ /* See if this locker is the family master. */
+ if (mlockerp->master_locker == INVALID_ROFF)
+ lockerp->master_locker = R_OFFSET(&lt->reginfo, mlockerp);
+ else {
+ lockerp->master_locker = mlockerp->master_locker;
+ mlockerp = R_ADDR(&lt->reginfo, mlockerp->master_locker);
+ }
+
+ /*
+ * Set the family locker flag, so it is possible to distinguish
+ * between locks held by subtransactions and those with compatible
+ * lockers.
+ */
+ if (is_family)
+ F_SET(mlockerp, DB_LOCKER_FAMILY_LOCKER);
+
+ /*
+ * Link the child at the head of the master's list.
+ * The guess is when looking for deadlock that
+ * the most recent child is the one that's blocked.
+ */
+ SH_LIST_INSERT_HEAD(
+ &mlockerp->child_locker, lockerp, child_link, __db_locker);
+
+err: UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
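+
+/*
+ * __lock_family_example --
+ * Illustrative sketch only, not part of this change: the family linkage
+ * built above is what makes a nested transaction's locks compatible with
+ * its parent's. An application reaches this path through the public
+ * transaction API; assumes DB_INIT_TXN and DB_INIT_LOCK.
+ */
+static int
+__lock_family_example(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_TXN *parent, *child;
+ int ret;
+
+ if ((ret = dbenv->txn_begin(dbenv, NULL, &parent, 0)) != 0)
+ return (ret);
+ /* The child's locker is linked into the parent's family. */
+ if ((ret = dbenv->txn_begin(dbenv, parent, &child, 0)) != 0) {
+ (void)parent->abort(parent);
+ return (ret);
+ }
+ if ((ret = child->commit(child, 0)) != 0) {
+ (void)parent->abort(parent);
+ return (ret);
+ }
+ return (parent->commit(parent, 0));
+}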
+
+/*
+ * __lock_freelocker_int
+ * Common code for deleting a locker; must be called with the
+ * locker bucket locked.
+ */
+static int
+__lock_freelocker_int(lt, region, sh_locker, reallyfree)
+ DB_LOCKTAB *lt;
+ DB_LOCKREGION *region;
+ DB_LOCKER *sh_locker;
+ int reallyfree;
+{
+ ENV *env;
+ u_int32_t indx;
+ int ret;
+
+ env = lt->env;
+
+ if (SH_LIST_FIRST(&sh_locker->heldby, __db_lock) != NULL) {
+ __db_errx(env, DB_STR("2047",
+ "Freeing locker with locks"));
+ return (EINVAL);
+ }
+
+ /* If this is part of a family, we must fix up its links. */
+ if (sh_locker->master_locker != INVALID_ROFF) {
+ SH_LIST_REMOVE(sh_locker, child_link, __db_locker);
+ sh_locker->master_locker = INVALID_ROFF;
+ }
+
+ if (reallyfree) {
+ LOCKER_HASH(lt, region, sh_locker->id, indx);
+ SH_TAILQ_REMOVE(&lt->locker_tab[indx], sh_locker,
+ links, __db_locker);
+ if (sh_locker->mtx_locker != MUTEX_INVALID &&
+ (ret = __mutex_free(env, &sh_locker->mtx_locker)) != 0)
+ return (ret);
+ SH_TAILQ_INSERT_HEAD(&region->free_lockers, sh_locker,
+ links, __db_locker);
+ SH_TAILQ_REMOVE(&region->lockers, sh_locker,
+ ulinks, __db_locker);
+ region->nlockers--;
+ STAT_PERFMON2(env,
+ lock, nlockers, region->nlockers, sh_locker->id);
+ }
+
+ return (0);
+}
+
+/*
+ * __lock_freelocker
+ * Remove a locker and its family links from the hash table.
+ *
+ * This must be called without the locker bucket locked.
+ *
+ * PUBLIC: int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *));
+ */
+int
+__lock_freelocker(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+
+ region = lt->reginfo.primary;
+ env = lt->env;
+
+ if (sh_locker == NULL)
+ return (0);
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_freelocker_int(lt, region, sh_locker, 1);
+ UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
+
+/*
+ * __lock_familyremove
+ * Remove a locker from its family.
+ *
+ * This must be called without the locker bucket locked.
+ *
+ * PUBLIC: int __lock_familyremove __P((DB_LOCKTAB *, DB_LOCKER *));
+ */
+int
+__lock_familyremove(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+
+ region = lt->reginfo.primary;
+ env = lt->env;
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_freelocker_int(lt, region, sh_locker, 0);
+ UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
diff --git a/src/lock/lock_list.c b/src/lock/lock_list.c
new file mode 100644
index 00000000..1e3d2a55
--- /dev/null
+++ b/src/lock/lock_list.c
@@ -0,0 +1,365 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __lock_sort_cmp __P((const void *, const void *));
+
+/*
+ * Lock list routines.
+ * The list is composed of a 32-bit count of locks followed by
+ * each lock. A lock is represented by a 16-bit page-count, a lock
+ * object and a page list. A lock object consists of a 16-bit size
+ * and the object itself. In a pseudo BNF notation, you get:
+ *
+ * LIST = COUNT32 LOCK*
+ * LOCK = COUNT16 LOCKOBJ PAGELIST
+ * LOCKOBJ = COUNT16 OBJ
+ * PAGELIST = COUNT32*
+ *
+ * (Recall that X* means "0 or more X's")
+ *
+ * In most cases, the OBJ is a struct __db_ilock and the page list is
+ * a series of (32-bit) page numbers that should get written into the
+ * pgno field of the __db_ilock. So, the actual number of pages locked
+ * is the number of items in the PAGELIST plus 1. If this is an application-
+ * specific lock, then we cannot interpret obj and the pagelist must
+ * be empty.
+ *
+ * Consider a lock list for: File A, pages 1&2, File B pages 3-5, Applock
+ * This would be represented as:
+ * 3 1 [fid=A;page=1] 2 2 [fid=B;page=3] 4 5 0 APPLOCK
+ * ------------------ -------------------- ---------
+ * LOCK for file A LOCK for file B application-specific lock
+ */
+
+#define MAX_PGNOS 0xffff
+
+/*
+ * These macros are bigger than one might expect because some compilers say a
+ * cast does not return an lvalue, so constructs like *(u_int32_t*)dp = count;
+ * generate warnings.
+ */
+#define RET_SIZE(size, count) ((size) + \
+ sizeof(u_int32_t) + (count) * 2 * sizeof(u_int16_t))
+
+#define PUT_COUNT(dp, count) do { u_int32_t __c = (count); \
+ LOGCOPY_32(env, dp, &__c); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int32_t); \
+ } while (0)
+#define PUT_PCOUNT(dp, count) do { u_int16_t __c = (count); \
+ LOGCOPY_16(env, dp, &__c); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define PUT_SIZE(dp, size) do { u_int16_t __s = (size); \
+ LOGCOPY_16(env, dp, &__s); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define PUT_PGNO(dp, pgno) do { db_pgno_t __pg = (pgno); \
+ LOGCOPY_32(env, dp, &__pg); \
+ dp = (u_int8_t *)dp + \
+ sizeof(db_pgno_t); \
+ } while (0)
+#define COPY_OBJ(dp, obj) do { \
+ memcpy(dp, \
+ (obj)->data, (obj)->size); \
+ dp = (u_int8_t *)dp + \
+ DB_ALIGN((obj)->size, \
+ sizeof(u_int32_t)); \
+ } while (0)
+#define GET_COUNT(dp, count) do { LOGCOPY_32(env, &count, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int32_t); \
+ } while (0)
+#define GET_PCOUNT(dp, count) do { LOGCOPY_16(env, &count, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define GET_SIZE(dp, size) do { LOGCOPY_16(env, &size, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define GET_PGNO(dp, pgno) do { LOGCOPY_32(env, &pgno, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(db_pgno_t); \
+ } while (0)
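+
+/*
+ * __lock_list_npages --
+ * Illustrative sketch only, not part of this change: walk a serialized
+ * lock list with the GET_* macros above and count the pages it covers,
+ * one per PAGELIST entry plus one for the page number embedded in each
+ * lock object (see the layout comment above). Application-specific
+ * locks are counted as a single entry.
+ */
+static u_int32_t
+__lock_list_npages(env, list)
+ ENV *env;
+ DBT *list;
+{
+ u_int16_t npgno, size;
+ u_int32_t i, nlocks, total;
+ void *dp;
+
+ if (list->size == 0)
+ return (0);
+ dp = list->data;
+
+ GET_COUNT(dp, nlocks);
+ for (total = 0, i = 0; i < nlocks; i++) {
+ GET_PCOUNT(dp, npgno);
+ GET_SIZE(dp, size);
+ total += (u_int32_t)npgno + 1;
+ /* Skip the lock object and its page list. */
+ dp = (u_int8_t *)dp + DB_ALIGN(size, sizeof(u_int32_t)) +
+ npgno * sizeof(db_pgno_t);
+ }
+ return (total);
+}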
+
+/*
+ * __lock_fix_list --
+ *
+ * PUBLIC: int __lock_fix_list __P((ENV *, DBT *, u_int32_t));
+ */
+int
+__lock_fix_list(env, list_dbt, nlocks)
+ ENV *env;
+ DBT *list_dbt;
+ u_int32_t nlocks;
+{
+ DBT *obj;
+ DB_LOCK_ILOCK *lock, *plock;
+ u_int32_t i, j, nfid, npgno, size;
+ u_int8_t *data, *dp;
+ int ret;
+
+ if ((size = list_dbt->size) == 0)
+ return (0);
+
+ obj = (DBT *)list_dbt->data;
+
+ /*
+ * If necessary sort the list of locks so that locks on the same fileid
+ * are together. We do not sort 1 or 2 locks because by definition if
+ * there are locks on the same fileid they will be together. The sort
+ * will also move any locks that do not look like page locks to the end
+ * of the list so we can stop looking for locks we can combine when we
+ * hit one.
+ */
+ switch (nlocks) {
+ case 1:
+ size = RET_SIZE(obj->size, 1);
+ if ((ret = __os_malloc(env, size, &data)) != 0)
+ return (ret);
+
+ dp = data;
+ PUT_COUNT(dp, 1);
+ PUT_PCOUNT(dp, 0);
+ PUT_SIZE(dp, obj->size);
+ COPY_OBJ(dp, obj);
+ break;
+ default:
+ /* Sort so that all locks with same fileid are together. */
+ qsort(list_dbt->data, nlocks, sizeof(DBT), __lock_sort_cmp);
+ /* FALLTHROUGH */
+ case 2:
+ nfid = npgno = 0;
+ i = 0;
+ if (obj->size != sizeof(DB_LOCK_ILOCK))
+ goto not_ilock;
+
+ nfid = 1;
+ plock = (DB_LOCK_ILOCK *)obj->data;
+
+ /* We use ulen to keep track of the number of pages. */
+ j = 0;
+ obj[0].ulen = 0;
+ for (i = 1; i < nlocks; i++) {
+ if (obj[i].size != sizeof(DB_LOCK_ILOCK))
+ break;
+ lock = (DB_LOCK_ILOCK *)obj[i].data;
+ if (obj[j].ulen < MAX_PGNOS &&
+ lock->type == plock->type &&
+ memcmp(lock->fileid,
+ plock->fileid, DB_FILE_ID_LEN) == 0) {
+ obj[j].ulen++;
+ npgno++;
+ } else {
+ nfid++;
+ plock = lock;
+ j = i;
+ obj[j].ulen = 0;
+ }
+ }
+
+not_ilock: size = nfid * sizeof(DB_LOCK_ILOCK);
+ size += npgno * sizeof(db_pgno_t);
+ /* Add the number of nonstandard locks and get their size. */
+ nfid += nlocks - i;
+ for (; i < nlocks; i++) {
+ size += obj[i].size;
+ obj[i].ulen = 0;
+ }
+
+ size = RET_SIZE(size, nfid);
+ if ((ret = __os_malloc(env, size, &data)) != 0)
+ return (ret);
+
+ dp = data;
+ PUT_COUNT(dp, nfid);
+
+ for (i = 0; i < nlocks; i = j) {
+ PUT_PCOUNT(dp, obj[i].ulen);
+ PUT_SIZE(dp, obj[i].size);
+ COPY_OBJ(dp, &obj[i]);
+ lock = (DB_LOCK_ILOCK *)obj[i].data;
+ for (j = i + 1; j <= i + obj[i].ulen; j++) {
+ lock = (DB_LOCK_ILOCK *)obj[j].data;
+ PUT_PGNO(dp, lock->pgno);
+ }
+ }
+ }
+
+ __os_free(env, list_dbt->data);
+
+ list_dbt->data = data;
+ list_dbt->size = size;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_list __P((ENV *, DB_LOCKER *, u_int32_t,
+ * PUBLIC: db_lockmode_t, DBT *));
+ */
+int
+__lock_get_list(env, locker, flags, lock_mode, list)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ db_lockmode_t lock_mode;
+ DBT *list;
+{
+ DBT obj_dbt;
+ DB_LOCK ret_lock;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_LOCK_ILOCK *lock;
+ db_pgno_t save_pgno;
+ u_int16_t npgno, size;
+ u_int32_t i, nlocks;
+ int ret;
+ void *data, *dp;
+
+ if (list->size == 0)
+ return (0);
+ ret = 0;
+ data = NULL;
+
+ lt = env->lk_handle;
+ dp = list->data;
+
+ /*
+ * There is no assurance log records will be aligned. If not, then
+ * copy the data to an aligned region so the rest of the code does
+ * not have to worry about it.
+ */
+ if ((uintptr_t)dp != DB_ALIGN((uintptr_t)dp, sizeof(u_int32_t))) {
+ if ((ret = __os_malloc(env, list->size, &data)) != 0)
+ return (ret);
+ memcpy(data, list->data, list->size);
+ dp = data;
+ }
+
+ region = lt->reginfo.primary;
+ LOCK_SYSTEM_LOCK(lt, region);
+ GET_COUNT(dp, nlocks);
+
+ for (i = 0; i < nlocks; i++) {
+ GET_PCOUNT(dp, npgno);
+ GET_SIZE(dp, size);
+ lock = (DB_LOCK_ILOCK *) dp;
+ save_pgno = lock->pgno;
+ obj_dbt.data = dp;
+ obj_dbt.size = size;
+ dp = ((u_int8_t *)dp) + DB_ALIGN(size, sizeof(u_int32_t));
+ do {
+ if ((ret = __lock_get_internal(lt, locker,
+ flags, &obj_dbt, lock_mode, 0, &ret_lock)) != 0) {
+ lock->pgno = save_pgno;
+ goto err;
+ }
+ if (npgno != 0)
+ GET_PGNO(dp, lock->pgno);
+ } while (npgno-- != 0);
+ lock->pgno = save_pgno;
+ }
+
+err: LOCK_SYSTEM_UNLOCK(lt, region);
+ if (data != NULL)
+ __os_free(env, data);
+ return (ret);
+}
+
+#define UINT32_CMP(A, B) ((A) == (B) ? 0 : ((A) > (B) ? 1 : -1))
+static int
+__lock_sort_cmp(a, b)
+ const void *a, *b;
+{
+ const DBT *d1, *d2;
+ DB_LOCK_ILOCK *l1, *l2;
+
+ d1 = a;
+ d2 = b;
+
+ /* Force all non-standard locks to sort at end. */
+ if (d1->size != sizeof(DB_LOCK_ILOCK)) {
+ if (d2->size != sizeof(DB_LOCK_ILOCK))
+ return (UINT32_CMP(d1->size, d2->size));
+ else
+ return (1);
+ } else if (d2->size != sizeof(DB_LOCK_ILOCK))
+ return (-1);
+
+ l1 = d1->data;
+ l2 = d2->data;
+ if (l1->type != l2->type)
+ return (UINT32_CMP(l1->type, l2->type));
+ return (memcmp(l1->fileid, l2->fileid, DB_FILE_ID_LEN));
+}
+
+/*
+ * PUBLIC: void __lock_list_print __P((ENV *, DB_MSGBUF *, DBT *));
+ */
+void
+__lock_list_print(env, mbp, list)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DBT *list;
+{
+ DB_LOCK_ILOCK *lock;
+ db_pgno_t pgno;
+ u_int16_t npgno, size;
+ u_int32_t i, nlocks;
+ u_int8_t *fidp;
+ char *fname, *dname, *p, namebuf[26];
+ void *dp;
+
+ if (list->size == 0)
+ return;
+ dp = list->data;
+
+ GET_COUNT(dp, nlocks);
+
+ for (i = 0; i < nlocks; i++) {
+ GET_PCOUNT(dp, npgno);
+ GET_SIZE(dp, size);
+ lock = (DB_LOCK_ILOCK *) dp;
+ fidp = lock->fileid;
+ (void)__dbreg_get_name(env, fidp, &fname, &dname);
+ __db_msgadd(env, mbp, "\t");
+ if (fname == NULL && dname == NULL)
+ __db_msgadd(env, mbp, "(%lx %lx %lx %lx %lx)",
+ (u_long)fidp[0], (u_long)fidp[1], (u_long)fidp[2],
+ (u_long)fidp[3], (u_long)fidp[4]);
+ else {
+ if (fname != NULL && dname != NULL) {
+ (void)snprintf(namebuf, sizeof(namebuf),
+ "%14s.%-10s", fname, dname);
+ p = namebuf;
+ } else if (fname != NULL)
+ p = fname;
+ else
+ p = dname;
+ __db_msgadd(env, mbp, "%-25s", p);
+ }
+ dp = ((u_int8_t *)dp) + DB_ALIGN(size, sizeof(u_int32_t));
+ LOGCOPY_32(env, &pgno, &lock->pgno);
+ do {
+ __db_msgadd(env, mbp, " %d", pgno);
+ if (npgno != 0)
+ GET_PGNO(dp, pgno);
+ } while (npgno-- != 0);
+ __db_msgadd(env, mbp, "\n");
+ }
+}
diff --git a/src/lock/lock_method.c b/src/lock/lock_method.c
new file mode 100644
index 00000000..0cc2e19d
--- /dev/null
+++ b/src/lock/lock_method.c
@@ -0,0 +1,630 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * __lock_env_create --
+ * Lock specific creation of the DB_ENV structure.
+ *
+ * PUBLIC: int __lock_env_create __P((DB_ENV *));
+ */
+int
+__lock_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ u_int32_t cpu;
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state nor acquire a mutex in the DB_ENV create path.
+ */
+ dbenv->lk_init = 0;
+ dbenv->lk_init_lockers = 0;
+ dbenv->lk_init_objects = 0;
+
+ /*
+ * Default to 10 partitions per CPU. This seems to be near
+ * the point of diminishing returns on Xeon-type processors.
+ * The CPU count often includes hyperthreads, and if there is
+ * only one CPU you probably do not want partitions at all.
+ */
+ cpu = __os_cpu_count();
+ dbenv->lk_partitions = cpu > 1 ? 10 * cpu : 1;
+
+ return (0);
+}
+
+/*
+ * __lock_env_destroy --
+ * Lock specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __lock_env_destroy __P((DB_ENV *));
+ */
+void
+__lock_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (dbenv->lk_conflicts != NULL) {
+ __os_free(env, dbenv->lk_conflicts);
+ dbenv->lk_conflicts = NULL;
+ }
+}
+
+/*
+ * __lock_get_lk_conflicts
+ * Get the conflicts matrix.
+ *
+ * PUBLIC: int __lock_get_lk_conflicts
+ * PUBLIC: __P((DB_ENV *, const u_int8_t **, int *));
+ */
+int
+__lock_get_lk_conflicts(dbenv, lk_conflictsp, lk_modesp)
+ DB_ENV *dbenv;
+ const u_int8_t **lk_conflictsp;
+ int *lk_modesp;
+{
+ DB_LOCKTAB *lt;
+ ENV *env;
+
+ env = dbenv->env;
+ lt = env->lk_handle;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_conflicts", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ if (lk_conflictsp != NULL)
+ *lk_conflictsp = lt->conflicts;
+ if (lk_modesp != NULL)
+ *lk_modesp = ((DB_LOCKREGION *)
+ (lt->reginfo.primary))->nmodes;
+ } else {
+ if (lk_conflictsp != NULL)
+ *lk_conflictsp = dbenv->lk_conflicts;
+ if (lk_modesp != NULL)
+ *lk_modesp = dbenv->lk_modes;
+ }
+ return (0);
+}
+
+/*
+ * __lock_set_lk_conflicts
+ * Set the conflicts matrix.
+ *
+ * PUBLIC: int __lock_set_lk_conflicts __P((DB_ENV *, u_int8_t *, int));
+ */
+int
+__lock_set_lk_conflicts(dbenv, lk_conflicts, lk_modes)
+ DB_ENV *dbenv;
+ u_int8_t *lk_conflicts;
+ int lk_modes;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_conflicts");
+
+ if (dbenv->lk_conflicts != NULL) {
+ __os_free(env, dbenv->lk_conflicts);
+ dbenv->lk_conflicts = NULL;
+ }
+ if ((ret = __os_malloc(env,
+ (size_t)(lk_modes * lk_modes), &dbenv->lk_conflicts)) != 0)
+ return (ret);
+ memcpy(
+ dbenv->lk_conflicts, lk_conflicts, (size_t)(lk_modes * lk_modes));
+ dbenv->lk_modes = lk_modes;
+
+ return (0);
+}
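+
+/*
+ * __lock_conflicts_example --
+ * Illustrative sketch only, not part of this change: an application
+ * supplying its own conflict matrix before DB_ENV->open. The matrix is
+ * flattened row-major with the held mode as the row and the requested
+ * mode as the column; this 3-mode example mimics N/R/W.
+ */
+static int
+__lock_conflicts_example(dbenv)
+ DB_ENV *dbenv;
+{
+ /* N R W */
+ static u_int8_t conflicts[] = {
+ /* N */ 0, 0, 0,
+ /* R */ 0, 0, 1,
+ /* W */ 0, 1, 1
+ };
+
+ /* Must be called before the environment is opened. */
+ return (dbenv->set_lk_conflicts(dbenv, conflicts, 3));
+}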
+
+/*
+ * PUBLIC: int __lock_get_lk_detect __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_detect(dbenv, lk_detectp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_detectp;
+{
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_detect", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ ENV_ENTER(env, ip);
+ LOCK_REGION_LOCK(env);
+ *lk_detectp = ((DB_LOCKREGION *)lt->reginfo.primary)->detect;
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *lk_detectp = dbenv->lk_detect;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_detect
+ * DB_ENV->set_lk_detect.
+ *
+ * PUBLIC: int __lock_set_lk_detect __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_detect(dbenv, lk_detect)
+ DB_ENV *dbenv;
+ u_int32_t lk_detect;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->set_lk_detect", DB_INIT_LOCK);
+
+ switch (lk_detect) {
+ case DB_LOCK_DEFAULT:
+ case DB_LOCK_EXPIRE:
+ case DB_LOCK_MAXLOCKS:
+ case DB_LOCK_MAXWRITE:
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_OLDEST:
+ case DB_LOCK_RANDOM:
+ case DB_LOCK_YOUNGEST:
+ break;
+ default:
+ __db_errx(env, DB_STR("2043",
+ "DB_ENV->set_lk_detect: unknown deadlock detection mode specified"));
+ return (EINVAL);
+ }
+
+ ret = 0;
+ if (LOCKING_ON(env)) {
+ ENV_ENTER(env, ip);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ LOCK_REGION_LOCK(env);
+ /*
+ * Check for incompatible automatic deadlock detection requests.
+ * There are scenarios where changing the detector configuration
+ * is reasonable, but we disallow them, guessing that such a
+ * change is more likely to be an application error.
+ *
+ * We allow applications to turn on the lock detector, and we
+ * ignore attempts to set it to the default or current value.
+ */
+ if (region->detect != DB_LOCK_NORUN &&
+ lk_detect != DB_LOCK_DEFAULT &&
+ region->detect != lk_detect) {
+ __db_errx(env, DB_STR("2044",
+ "DB_ENV->set_lk_detect: incompatible deadlock detector mode"));
+ ret = EINVAL;
+ } else
+ if (region->detect == DB_LOCK_NORUN)
+ region->detect = lk_detect;
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->lk_detect = lk_detect;
+
+ return (ret);
+}
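+
+/*
+ * __lock_detect_example --
+ * Illustrative sketch only, not part of this change: turning on automatic
+ * deadlock detection from an application. DB_LOCK_DEFAULT lets the
+ * library pick the victim-selection policy.
+ */
+static int
+__lock_detect_example(dbenv)
+ DB_ENV *dbenv;
+{
+ /* May be called before or after DB_ENV->open, per the code above. */
+ return (dbenv->set_lk_detect(dbenv, DB_LOCK_DEFAULT));
+}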
+
+/*
+ * PUBLIC: int __lock_get_lk_max_locks __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_max_locks(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_maxlocks", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_maxp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_maxlocks;
+ } else
+ *lk_maxp = dbenv->lk_max;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_max_locks
+ * DB_ENV->set_lk_max_locks.
+ *
+ * PUBLIC: int __lock_set_lk_max_locks __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_max_locks(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_max_locks");
+
+ dbenv->lk_max = lk_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_max_lockers __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_max_lockers(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_max_lockers", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_maxp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_maxlockers;
+ } else
+ *lk_maxp = dbenv->lk_max_lockers;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_max_lockers
+ * DB_ENV->set_lk_max_lockers.
+ *
+ * PUBLIC: int __lock_set_lk_max_lockers __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_max_lockers(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_max_lockers");
+
+ dbenv->lk_max_lockers = lk_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_max_objects __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_max_objects(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_max_objects", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_maxp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_maxobjects;
+ } else
+ *lk_maxp = dbenv->lk_max_objects;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_max_objects
+ * DB_ENV->set_lk_max_objects.
+ *
+ * PUBLIC: int __lock_set_lk_max_objects __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_max_objects(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_max_objects");
+
+ dbenv->lk_max_objects = lk_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_partitions __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_partitions(dbenv, lk_partitionp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_partitionp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_partitions", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_partitionp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_partitions;
+ } else
+ *lk_partitionp = dbenv->lk_partitions;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_partitions
+ * DB_ENV->set_lk_partitions.
+ *
+ * PUBLIC: int __lock_set_lk_partitions __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_partitions(dbenv, lk_partitions)
+ DB_ENV *dbenv;
+ u_int32_t lk_partitions;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_partitions");
+
+ dbenv->lk_partitions = lk_partitions;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_tablesize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_tablesize(dbenv, lk_tablesizep)
+ DB_ENV *dbenv;
+ u_int32_t *lk_tablesizep;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_tablesize", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_tablesizep = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_tablesize;
+ } else
+ *lk_tablesizep = dbenv->object_t_size;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_tablesize
+ * DB_ENV->set_lk_tablesize.
+ *
+ * PUBLIC: int __lock_set_lk_tablesize __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_tablesize(dbenv, lk_tablesize)
+ DB_ENV *dbenv;
+ u_int32_t lk_tablesize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_tablesize");
+
+ dbenv->object_t_size = lk_tablesize;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_priority --
+ * Set a locker's priority.
+ *
+ * PUBLIC: int __lock_set_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__lock_set_lk_priority(dbenv, lockid, priority)
+ DB_ENV *dbenv;
+ u_int32_t lockid, priority;
+{
+ DB_LOCKER *locker;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (!LOCKING_ON(env))
+ return (EINVAL);
+
+ if ((ret = __lock_getlocker(env->lk_handle, lockid, 0, &locker)) == 0)
+ locker->priority = priority;
+ return (ret);
+}
+
+/*
+ * __lock_get_lk_priority --
+ * Get a locker's priority.
+ *
+ * PUBLIC: int __lock_get_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t *));
+ */
+int
+__lock_get_lk_priority(dbenv, lockid, priorityp)
+ DB_ENV *dbenv;
+ u_int32_t lockid, *priorityp;
+{
+ DB_LOCKER *locker;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (!LOCKING_ON(env))
+ return (EINVAL);
+
+ if ((ret = __lock_getlocker(env->lk_handle, lockid, 0, &locker)) == 0)
+ *priorityp = locker->priority;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __lock_get_env_timeout
+ * PUBLIC: __P((DB_ENV *, db_timeout_t *, u_int32_t));
+ */
+int
+__lock_get_env_timeout(dbenv, timeoutp, flag)
+ DB_ENV *dbenv;
+ db_timeout_t *timeoutp;
+ u_int32_t flag;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_env_timeout", DB_INIT_LOCK);
+
+ ret = 0;
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ENV_ENTER(env, ip);
+ LOCK_REGION_LOCK(env);
+ switch (flag) {
+ case DB_SET_LOCK_TIMEOUT:
+ *timeoutp = region->lk_timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ *timeoutp = region->tx_timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ switch (flag) {
+ case DB_SET_LOCK_TIMEOUT:
+ *timeoutp = dbenv->lk_timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ *timeoutp = dbenv->tx_timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+
+ if (ret)
+ ret = __db_ferr(env, "DB_ENV->get_timeout", 0);
+
+ return (ret);
+}
+
+/*
+ * __lock_set_env_timeout
+ * DB_ENV->set_lock_timeout.
+ *
+ * PUBLIC: int __lock_set_env_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+ */
+int
+__lock_set_env_timeout(dbenv, timeout, flags)
+ DB_ENV *dbenv;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->set_env_timeout", DB_INIT_LOCK);
+
+ ret = 0;
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ENV_ENTER(env, ip);
+ LOCK_REGION_LOCK(env);
+ switch (flags) {
+ case DB_SET_LOCK_TIMEOUT:
+ region->lk_timeout = timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ region->tx_timeout = timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ switch (flags) {
+ case DB_SET_LOCK_TIMEOUT:
+ dbenv->lk_timeout = timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ dbenv->tx_timeout = timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+
+ if (ret)
+ ret = __db_ferr(env, "DB_ENV->set_timeout", 0);
+
+ return (ret);
+}
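+
+/*
+ * __lock_timeout_example --
+ * Illustrative sketch only, not part of this change: setting the lock and
+ * transaction timeouts through the public DB_ENV->set_timeout method,
+ * which ultimately reaches __lock_set_env_timeout above. Timeouts are
+ * expressed in microseconds.
+ */
+static int
+__lock_timeout_example(dbenv)
+ DB_ENV *dbenv;
+{
+ int ret;
+
+ /* Time out individual lock requests after one second. */
+ if ((ret = dbenv->set_timeout(dbenv, 1000000, DB_SET_LOCK_TIMEOUT)) != 0)
+ return (ret);
+ /* Time out transactions after five seconds. */
+ return (dbenv->set_timeout(dbenv, 5000000, DB_SET_TXN_TIMEOUT));
+}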
diff --git a/src/lock/lock_region.c b/src/lock/lock_region.c
new file mode 100644
index 00000000..1aae1815
--- /dev/null
+++ b/src/lock/lock_region.c
@@ -0,0 +1,578 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+static int __lock_region_init __P((ENV *, DB_LOCKTAB *));
+
+/*
+ * The conflict arrays are set up such that the row is the lock you are
+ * holding and the column is the lock that is desired.
+ */
+#define DB_LOCK_RIW_N 9
+static const u_int8_t db_riw_conflicts[] = {
+/* N R W WT IW IR RIW DR WW */
+/* N */ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* R */ 0, 0, 1, 0, 1, 0, 1, 0, 1,
+/* W */ 0, 1, 1, 1, 1, 1, 1, 1, 1,
+/* WT */ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* IW */ 0, 1, 1, 0, 0, 0, 0, 1, 1,
+/* IR */ 0, 0, 1, 0, 0, 0, 0, 0, 1,
+/* RIW */ 0, 1, 1, 0, 0, 0, 0, 1, 1,
+/* DR */ 0, 0, 1, 0, 1, 0, 1, 0, 0,
+/* WW */ 0, 1, 1, 0, 1, 1, 1, 0, 1
+};
+
+/*
+ * This conflict array is used for concurrent db access (CDB). It uses
+ * the first five lock modes of the db_riw_conflicts array; the IW mode
+ * is used for write cursors.
+ */
+#define DB_LOCK_CDB_N 5
+static const u_int8_t db_cdb_conflicts[] = {
+ /* N R W WT IW */
+ /* N */ 0, 0, 0, 0, 0,
+ /* R */ 0, 0, 1, 0, 0,
+ /* W */ 0, 1, 1, 1, 1,
+ /* WT */ 0, 0, 0, 0, 0,
+ /* IW */ 0, 0, 1, 0, 1
+};
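+
+/*
+ * __lock_conflicts_check --
+ * Illustrative sketch only, not part of this change: the arrays above are
+ * flattened row-major, so deciding whether a held mode blocks a request
+ * is a single table lookup. For example, in db_riw_conflicts a held
+ * write (row W) conflicts with a read request (column R).
+ */
+static int
+__lock_conflicts_check(conflicts, nmodes, held, requested)
+ const u_int8_t *conflicts;
+ int nmodes;
+ db_lockmode_t held, requested;
+{
+ /* The row is the mode held, the column is the mode desired. */
+ return (conflicts[(int)held * nmodes + (int)requested] != 0);
+}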
+
+/*
+ * __lock_open --
+ * Internal version of lock_open: only called from ENV->open.
+ *
+ * PUBLIC: int __lock_open __P((ENV *));
+ */
+int
+__lock_open(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int region_locked, ret;
+
+ dbenv = env->dbenv;
+ region_locked = 0;
+
+ /* Create the lock table structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_LOCKTAB), &lt)) != 0)
+ return (ret);
+ lt->env = env;
+
+ /* Join/create the lock region. */
+ if ((ret = __env_region_share(env, &lt->reginfo)) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&lt->reginfo, REGION_CREATE))
+ if ((ret = __lock_region_init(env, lt)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ region = lt->reginfo.primary =
+ R_ADDR(&lt->reginfo, ((REGENV *)env->reginfo->primary)->lt_primary);
+
+ /* Set remaining pointers into region. */
+ lt->conflicts = R_ADDR(&lt->reginfo, region->conf_off);
+ lt->obj_tab = R_ADDR(&lt->reginfo, region->obj_off);
+#ifdef HAVE_STATISTICS
+ lt->obj_stat = R_ADDR(&lt->reginfo, region->stat_off);
+#endif
+ lt->part_array = R_ADDR(&lt->reginfo, region->part_off);
+ lt->locker_tab = R_ADDR(&lt->reginfo, region->locker_off);
+
+ env->lk_handle = lt;
+ lt->reginfo.mtx_alloc = region->mtx_region;
+
+ LOCK_REGION_LOCK(env);
+ region_locked = 1;
+
+ if (dbenv->lk_detect != DB_LOCK_NORUN) {
+ /*
+ * Check for incompatible automatic deadlock detection requests.
+ * There are scenarios where changing the detector configuration
+ * is reasonable, but we disallow them, guessing that such a
+ * change is more likely to be an application error.
+ *
+ * We allow applications to turn on the lock detector, and we
+ * ignore attempts to set it to the default or current value.
+ */
+ if (region->detect != DB_LOCK_NORUN &&
+ dbenv->lk_detect != DB_LOCK_DEFAULT &&
+ region->detect != dbenv->lk_detect) {
+ __db_errx(env, DB_STR("2041",
+ "lock_open: incompatible deadlock detector mode"));
+ ret = EINVAL;
+ goto err;
+ }
+ if (region->detect == DB_LOCK_NORUN)
+ region->detect = dbenv->lk_detect;
+ }
+
+ /*
+ * A process joining the region may have reset the lock and transaction
+ * timeouts.
+ */
+ if (dbenv->lk_timeout != 0)
+ region->lk_timeout = dbenv->lk_timeout;
+ if (dbenv->tx_timeout != 0)
+ region->tx_timeout = dbenv->tx_timeout;
+
+ LOCK_REGION_UNLOCK(env);
+ region_locked = 0;
+
+ return (0);
+
+err: if (lt->reginfo.addr != NULL) {
+ if (region_locked)
+ LOCK_REGION_UNLOCK(env);
+ (void)__env_region_detach(env, &lt->reginfo, 0);
+ }
+ env->lk_handle = NULL;
+
+ __os_free(env, lt);
+ return (ret);
+}
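+
+/*
+ * Illustrative sketch only, not part of this change: __lock_open runs as
+ * part of DB_ENV->open when DB_INIT_LOCK is requested. A minimal
+ * application setup, with error handling trimmed and a placeholder home
+ * directory:
+ *
+ * DB_ENV *dbenv;
+ *
+ * (void)db_env_create(&dbenv, 0);
+ * (void)dbenv->set_lk_partitions(dbenv, 10);
+ * (void)dbenv->open(dbenv, "/path/to/env",
+ *     DB_CREATE | DB_INIT_LOCK | DB_INIT_MPOOL | DB_THREAD, 0);
+ */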
+
+/*
+ * __lock_region_init --
+ * Initialize the lock region.
+ */
+static int
+__lock_region_init(env, lt)
+ ENV *env;
+ DB_LOCKTAB *lt;
+{
+ const u_int8_t *lk_conflicts;
+ struct __db_lock *lp;
+ DB_ENV *dbenv;
+ DB_LOCKER *lidp;
+ DB_LOCKOBJ *op;
+ DB_LOCKREGION *region;
+ DB_LOCKPART *part;
+ u_int32_t extra_locks, extra_objects, i, j, max;
+ u_int8_t *addr;
+ int lk_modes, ret;
+
+ dbenv = env->dbenv;
+
+ if ((ret = __env_alloc(&lt->reginfo,
+ sizeof(DB_LOCKREGION), &lt->reginfo.primary)) != 0)
+ goto mem_err;
+ ((REGENV *)env->reginfo->primary)->lt_primary =
+ R_OFFSET(&lt->reginfo, lt->reginfo.primary);
+ region = lt->reginfo.primary;
+ memset(region, 0, sizeof(*region));
+
+ /* We share the region so we need the same mutex. */
+ region->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
+
+ /* Select a conflict matrix if none specified. */
+ if (dbenv->lk_modes == 0)
+ if (CDB_LOCKING(env)) {
+ lk_modes = DB_LOCK_CDB_N;
+ lk_conflicts = db_cdb_conflicts;
+ } else {
+ lk_modes = DB_LOCK_RIW_N;
+ lk_conflicts = db_riw_conflicts;
+ }
+ else {
+ lk_modes = dbenv->lk_modes;
+ lk_conflicts = dbenv->lk_conflicts;
+ }
+
+ region->need_dd = 0;
+ timespecclear(&region->next_timeout);
+ region->detect = DB_LOCK_NORUN;
+ region->lk_timeout = dbenv->lk_timeout;
+ region->tx_timeout = dbenv->tx_timeout;
+ region->locker_t_size = dbenv->locker_t_size;
+ region->object_t_size = dbenv->object_t_size;
+ region->part_t_size = dbenv->lk_partitions;
+ region->lock_id = 0;
+ region->cur_maxid = DB_LOCK_MAXID;
+ region->nmodes = lk_modes;
+ memset(&region->stat, 0, sizeof(region->stat));
+ region->stat.st_maxlocks = dbenv->lk_max;
+ region->stat.st_maxlockers = dbenv->lk_max_lockers;
+ region->stat.st_maxobjects = dbenv->lk_max_objects;
+ region->stat.st_initlocks = region->stat.st_locks = dbenv->lk_init;
+ region->stat.st_initlockers =
+ region->stat.st_lockers = dbenv->lk_init_lockers;
+ region->stat.st_initobjects =
+ region->stat.st_objects = dbenv->lk_init_objects;
+ region->stat.st_partitions = dbenv->lk_partitions;
+ region->stat.st_tablesize = dbenv->object_t_size;
+
+ /* Allocate room for the conflict matrix and initialize it. */
+ if ((ret = __env_alloc(
+ &lt->reginfo, (size_t)(lk_modes * lk_modes), &addr)) != 0)
+ goto mem_err;
+ memcpy(addr, lk_conflicts, (size_t)(lk_modes * lk_modes));
+ region->conf_off = R_OFFSET(&lt->reginfo, addr);
+
+ /* Allocate room for the object hash table and initialize it. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->object_t_size * sizeof(DB_HASHTAB), &addr)) != 0)
+ goto mem_err;
+ __db_hashinit(addr, region->object_t_size);
+ region->obj_off = R_OFFSET(&lt->reginfo, addr);
+
+#ifdef HAVE_STATISTICS
+ /* Allocate room for the object hash stats table and initialize it. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->object_t_size * sizeof(DB_LOCK_HSTAT), &addr)) != 0)
+ goto mem_err;
+ memset(addr, 0, region->object_t_size * sizeof(DB_LOCK_HSTAT));
+ region->stat_off = R_OFFSET(&lt->reginfo, addr);
+#endif
+
+ /* Allocate room for the partition table and initialize its mutexes. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->part_t_size * sizeof(DB_LOCKPART), &part)) != 0)
+ goto mem_err;
+ memset(part, 0, region->part_t_size * sizeof(DB_LOCKPART));
+ region->part_off = R_OFFSET(&lt->reginfo, part);
+ for (i = 0; i < region->part_t_size; i++) {
+ if ((ret = __mutex_alloc(
+ env, MTX_LOCK_REGION, 0, &part[i].mtx_part)) != 0)
+ return (ret);
+ }
+ if ((ret = __mutex_alloc(
+ env, MTX_LOCK_REGION, 0, &region->mtx_dd)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_LOCK_REGION, 0, &region->mtx_lockers)) != 0)
+ return (ret);
+
+ /* Allocate room for the locker hash table and initialize it. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->locker_t_size * sizeof(DB_HASHTAB), &addr)) != 0)
+ goto mem_err;
+ __db_hashinit(addr, region->locker_t_size);
+ region->locker_off = R_OFFSET(&lt->reginfo, addr);
+
+ SH_TAILQ_INIT(&region->dd_objs);
+
+ /*
+ * If the locks and objects don't divide evenly, spread them around.
+ */
+ extra_locks = region->stat.st_locks -
+ ((region->stat.st_locks / region->part_t_size) *
+ region->part_t_size);
+ extra_objects = region->stat.st_objects -
+ ((region->stat.st_objects / region->part_t_size) *
+ region->part_t_size);
+ for (j = 0; j < region->part_t_size; j++) {
+ /* Initialize locks onto a free list. */
+ SH_TAILQ_INIT(&part[j].free_locks);
+ max = region->stat.st_locks / region->part_t_size;
+ if (extra_locks > 0) {
+ max++;
+ extra_locks--;
+ }
+
+ if ((ret =
+ __env_alloc(&lt->reginfo,
+ sizeof(struct __db_lock) * max,
+ &lp)) != 0)
+ goto mem_err;
+ part[j].lock_mem_off = R_OFFSET(&lt->reginfo, lp);
+ for (i = 0; i < max; ++i) {
+ memset(lp, 0, sizeof(*lp));
+ lp->status = DB_LSTAT_FREE;
+ SH_TAILQ_INSERT_HEAD(
+ &part[j].free_locks, lp, links, __db_lock);
+ ++lp;
+ }
+
+ /* Initialize objects onto a free list. */
+ max = region->stat.st_objects / region->part_t_size;
+ if (extra_objects > 0) {
+ max++;
+ extra_objects--;
+ }
+ SH_TAILQ_INIT(&part[j].free_objs);
+
+ if ((ret =
+ __env_alloc(&lt->reginfo,
+ sizeof(DB_LOCKOBJ) * max,
+ &op)) != 0)
+ goto mem_err;
+ part[j].lockobj_mem_off = R_OFFSET(&lt->reginfo, op);
+ for (i = 0; i < max; ++i) {
+ memset(op, 0, sizeof(*op));
+ SH_TAILQ_INSERT_HEAD(
+ &part[j].free_objs, op, links, __db_lockobj);
+ ++op;
+ }
+ }
+
+ /* Initialize lockers onto a free list. */
+ SH_TAILQ_INIT(&region->lockers);
+ SH_TAILQ_INIT(&region->free_lockers);
+ if ((ret =
+ __env_alloc(&lt->reginfo,
+ sizeof(DB_LOCKER) * region->stat.st_lockers,
+ &lidp)) != 0)
+ goto mem_err;
+
+ region->locker_mem_off = R_OFFSET(&lt->reginfo, lidp);
+ for (i = 0; i < region->stat.st_lockers; ++i) {
+ SH_TAILQ_INSERT_HEAD(
+ &region->free_lockers, lidp, links, __db_locker);
+ ++lidp;
+ }
+ return (0);
+mem_err: __db_errx(env, DB_STR("2042",
+ "unable to allocate memory for the lock table"));
+ return (ret);
+}
+
+/*
+ * __lock_env_refresh --
+ * Clean up after the lock system on a close or failed open.
+ *
+ * PUBLIC: int __lock_env_refresh __P((ENV *));
+ */
+int
+__lock_env_refresh(env)
+ ENV *env;
+{
+ DB_LOCKREGION *lr;
+ DB_LOCKTAB *lt;
+ REGINFO *reginfo;
+ u_int32_t j;
+ int ret;
+
+ lt = env->lk_handle;
+ reginfo = &lt->reginfo;
+ lr = reginfo->primary;
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+ * filesystem-backed or system shared memory regions; that memory isn't
+ * owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ /* Discard the conflict matrix. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->conf_off));
+
+ /* Discard the object hash table. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->obj_off));
+
+ /* Discard the locker hash table. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->locker_off));
+
+ /* Discard the object hash stat table. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->stat_off));
+ for (j = 0; j < lr->part_t_size; j++) {
+ SH_TAILQ_INIT(&FREE_OBJS(lt, j));
+ SH_TAILQ_INIT(&FREE_LOCKS(lt, j));
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo,
+ lt->part_array[j].lock_mem_off));
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo,
+ lt->part_array[j].lockobj_mem_off));
+ }
+
+ /* Discard the object partition array. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->part_off));
+ SH_TAILQ_INIT(&lr->free_lockers);
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lr->locker_mem_off));
+ }
+
+ /* Detach from the region. */
+ ret = __env_region_detach(env, reginfo, 0);
+
+ /* Discard DB_LOCKTAB. */
+ __os_free(env, lt);
+ env->lk_handle = NULL;
+
+ return (ret);
+}
+
+/*
+ * __lock_region_mutex_count --
+ * Return the number of mutexes the lock region will need.
+ *
+ * PUBLIC: u_int32_t __lock_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__lock_region_mutex_count(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ /*
+ * We need one mutex per locker for it to block on, one per lock
+ * partition, and a few for the lock region itself.
+ */
+ return (dbenv->lk_init_lockers + dbenv->lk_partitions + 3);
+}
+
+/*
+ * __lock_region_mutex_max --
+ * Return the number of additional mutexes the lock region will need.
+ *
+ * PUBLIC: u_int32_t __lock_region_mutex_max __P((ENV *));
+ */
+u_int32_t
+__lock_region_mutex_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ /*
+ * For backward compatibility, ensure enough mutexes.
+ * These might actually get used by other things.
+ */
+ if ((count = dbenv->lk_max_lockers) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init_lockers)
+ return (count - dbenv->lk_init_lockers);
+ else
+ return (0);
+}
+
+/*
+ * __lock_region_max --
+ * Return the amount of extra memory to allocate for locking information.
+ * PUBLIC: size_t __lock_region_max __P((ENV *));
+ */
+size_t
+__lock_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t retval;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ retval = 0;
+ if ((count = dbenv->lk_max) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init)
+ retval += __env_alloc_size(sizeof(struct __db_lock)) *
+ (count - dbenv->lk_init);
+ if ((count = dbenv->lk_max_objects) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init_objects)
+ retval += __env_alloc_size(sizeof(DB_LOCKOBJ)) *
+ (count - dbenv->lk_init_objects);
+ if ((count = dbenv->lk_max_lockers) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init_lockers)
+ retval += __env_alloc_size(sizeof(DB_LOCKER)) *
+ (count - dbenv->lk_init_lockers);
+
+ /* And we keep getting this wrong, let's be generous. */
+ retval += retval / 4;
+
+ return (retval);
+}
+
+/*
+ * __lock_region_size --
+ * Return the initial region size.
+ * PUBLIC: size_t __lock_region_size __P((ENV *, size_t));
+ */
+size_t
+__lock_region_size(env, other_alloc)
+ ENV *env;
+ size_t other_alloc;
+{
+ DB_ENV *dbenv;
+ size_t retval;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ /* Make sure there are at least 5 objects and locks per partition. */
+ if (dbenv->lk_init_objects < dbenv->lk_partitions * 5)
+ dbenv->lk_init_objects = dbenv->lk_partitions * 5;
+ if (dbenv->lk_init < dbenv->lk_partitions * 5)
+ dbenv->lk_init = dbenv->lk_partitions * 5;
+ /*
+ * Figure out how much space we're going to need. This list should
+ * map one-to-one with the __env_alloc calls in __lock_region_init.
+ */
+ retval = 0;
+ retval += __env_alloc_size(sizeof(DB_LOCKREGION));
+ retval += __env_alloc_size((size_t)(dbenv->lk_modes * dbenv->lk_modes));
+ /*
+ * Try to figure out the size of the locker hash table.
+ */
+ if (dbenv->lk_max_lockers != 0)
+ dbenv->locker_t_size = __db_tablesize(dbenv->lk_max_lockers);
+ else if (dbenv->tx_max != 0)
+ dbenv->locker_t_size = __db_tablesize(dbenv->tx_max);
+ else {
+ if (dbenv->memory_max != 0)
+ count = (u_int32_t)
+ (((dbenv->memory_max - other_alloc) / 10) /
+ sizeof(DB_LOCKER));
+ else
+ count = DB_LOCK_DEFAULT_N / 10;
+ if (count < dbenv->lk_init_lockers)
+ count = dbenv->lk_init_lockers;
+ dbenv->locker_t_size = __db_tablesize(count);
+ }
+ retval += __env_alloc_size(dbenv->locker_t_size * (sizeof(DB_HASHTAB)));
+ retval += __env_alloc_size(sizeof(DB_LOCKER)) * dbenv->lk_init_lockers;
+ retval += __env_alloc_size(sizeof(struct __db_lock) * dbenv->lk_init);
+ other_alloc += retval;
+ /*
+ * We want to allocate an object hash table that is big enough to
+ * avoid many collisions, but not too big for starters. Arbitrarily
+ * pick the point two-thirds of the way to the max size. If the max
+ * is not stated, then guess that objects will fill half the memory.
+ * If we don't know how much memory there might be, we just wind up
+ * using the default value. If this winds up being less than the
+ * init value, then we just make the table fit the init value.
+ */
+ if ((count = dbenv->lk_max_objects) == 0) {
+ if (dbenv->memory_max != 0)
+ count = (u_int32_t)(
+ ((dbenv->memory_max - other_alloc) / 2)
+ / sizeof(DB_LOCKOBJ));
+ else
+ count = DB_LOCK_DEFAULT_N;
+ if (count < dbenv->lk_init_objects)
+ count = dbenv->lk_init_objects;
+ }
+ count *= 2;
+ count += dbenv->lk_init_objects;
+ count /= 3;
+ if (dbenv->object_t_size == 0)
+ dbenv->object_t_size = __db_tablesize(count);
+ retval += __env_alloc_size(
+ __db_tablesize(dbenv->object_t_size) * (sizeof(DB_HASHTAB)));
+#ifdef HAVE_STATISTICS
+ retval += __env_alloc_size(
+ __db_tablesize(dbenv->object_t_size) * (sizeof(DB_LOCK_HSTAT)));
+#endif
+ retval +=
+ __env_alloc_size(dbenv->lk_partitions * (sizeof(DB_LOCKPART)));
+ retval += __env_alloc_size(sizeof(DB_LOCKOBJ) * dbenv->lk_init_objects);
+
+ return (retval);
+}
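+
+/*
+ * Worked example, illustrative only and not part of this change: with
+ * lk_max_objects == 10000 and lk_init_objects == 1000, the sizing above
+ * computes (2 * 10000 + 1000) / 3 == 7000, i.e. the point two-thirds of
+ * the way from the initial to the maximum object count, which is then
+ * rounded to a hash-friendly size by __db_tablesize().
+ */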
diff --git a/src/lock/lock_stat.c b/src/lock/lock_stat.c
new file mode 100644
index 00000000..11b934aa
--- /dev/null
+++ b/src/lock/lock_stat.c
@@ -0,0 +1,770 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __lock_dump_locker
+ __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *));
+static int __lock_dump_object __P((DB_LOCKTAB *, DB_MSGBUF *, DB_LOCKOBJ *));
+static int __lock_print_all __P((ENV *, u_int32_t));
+static int __lock_print_stats __P((ENV *, u_int32_t));
+static void __lock_print_header __P((ENV *));
+static int __lock_stat __P((ENV *, DB_LOCK_STAT **, u_int32_t));
+
+/*
+ * __lock_stat_pp --
+ * ENV->lock_stat pre/post processing.
+ *
+ * PUBLIC: int __lock_stat_pp __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
+ */
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_stat", DB_INIT_LOCK);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->lock_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
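+
+/*
+ * __lock_stat_example --
+ * Illustrative sketch only, not part of this change: fetching and
+ * releasing lock statistics from an application. The stats block is
+ * allocated by the library and, with the default allocator, is freed by
+ * the caller with free(3).
+ */
+static int
+__lock_stat_example(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_LOCK_STAT *sp;
+ int ret;
+
+ if ((ret = dbenv->lock_stat(dbenv, &sp, 0)) != 0)
+ return (ret);
+ printf("current lockers: %lu, current locks: %lu\n",
+ (u_long)sp->st_nlockers, (u_long)sp->st_nlocks);
+ free(sp);
+ return (0);
+}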
+
+/*
+ * __lock_stat --
+ * ENV->lock_stat.
+ */
+static int
+__lock_stat(env, statp, flags)
+ ENV *env;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_LOCK_STAT *stats, tmp;
+ DB_LOCK_HSTAT htmp;
+ DB_LOCK_PSTAT ptmp;
+ int ret;
+ u_int32_t i;
+ uintmax_t tmp_wait, tmp_nowait;
+
+ *statp = NULL;
+ lt = env->lk_handle;
+
+ if ((ret = __os_umalloc(env, sizeof(*stats), &stats)) != 0)
+ return (ret);
+
+ /* Copy out the global statistics. */
+ LOCK_REGION_LOCK(env);
+
+ region = lt->reginfo.primary;
+ memcpy(stats, &region->stat, sizeof(*stats));
+ stats->st_locktimeout = region->lk_timeout;
+ stats->st_txntimeout = region->tx_timeout;
+ stats->st_id = region->lock_id;
+ stats->st_cur_maxid = region->cur_maxid;
+ stats->st_nlockers = region->nlockers;
+ stats->st_nmodes = region->nmodes;
+
+ for (i = 0; i < region->object_t_size; i++) {
+ stats->st_nrequests += lt->obj_stat[i].st_nrequests;
+ stats->st_nreleases += lt->obj_stat[i].st_nreleases;
+ stats->st_nupgrade += lt->obj_stat[i].st_nupgrade;
+ stats->st_ndowngrade += lt->obj_stat[i].st_ndowngrade;
+ stats->st_lock_wait += lt->obj_stat[i].st_lock_wait;
+ stats->st_lock_nowait += lt->obj_stat[i].st_lock_nowait;
+ stats->st_nlocktimeouts += lt->obj_stat[i].st_nlocktimeouts;
+ stats->st_ntxntimeouts += lt->obj_stat[i].st_ntxntimeouts;
+ if (stats->st_maxhlocks < lt->obj_stat[i].st_maxnlocks)
+ stats->st_maxhlocks = lt->obj_stat[i].st_maxnlocks;
+ if (stats->st_maxhobjects < lt->obj_stat[i].st_maxnobjects)
+ stats->st_maxhobjects = lt->obj_stat[i].st_maxnobjects;
+ if (stats->st_hash_len < lt->obj_stat[i].st_hash_len)
+ stats->st_hash_len = lt->obj_stat[i].st_hash_len;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ htmp = lt->obj_stat[i];
+ memset(&lt->obj_stat[i], 0, sizeof(lt->obj_stat[i]));
+ lt->obj_stat[i].st_nlocks = htmp.st_nlocks;
+ lt->obj_stat[i].st_maxnlocks = htmp.st_nlocks;
+ lt->obj_stat[i].st_nobjects = htmp.st_nobjects;
+ lt->obj_stat[i].st_maxnobjects = htmp.st_nobjects;
+ }
+ }
+
+ for (i = 0; i < region->part_t_size; i++) {
+ stats->st_nlocks += lt->part_array[i].part_stat.st_nlocks;
+ stats->st_maxnlocks +=
+ lt->part_array[i].part_stat.st_maxnlocks;
+ stats->st_nobjects += lt->part_array[i].part_stat.st_nobjects;
+ stats->st_maxnobjects +=
+ lt->part_array[i].part_stat.st_maxnobjects;
+ stats->st_locksteals +=
+ lt->part_array[i].part_stat.st_locksteals;
+ if (stats->st_maxlsteals <
+ lt->part_array[i].part_stat.st_locksteals)
+ stats->st_maxlsteals =
+ lt->part_array[i].part_stat.st_locksteals;
+ stats->st_objectsteals +=
+ lt->part_array[i].part_stat.st_objectsteals;
+ if (stats->st_maxosteals <
+ lt->part_array[i].part_stat.st_objectsteals)
+ stats->st_maxosteals =
+ lt->part_array[i].part_stat.st_objectsteals;
+ __mutex_set_wait_info(env,
+ lt->part_array[i].mtx_part, &tmp_wait, &tmp_nowait);
+ stats->st_part_nowait += tmp_nowait;
+ stats->st_part_wait += tmp_wait;
+ if (tmp_wait > stats->st_part_max_wait) {
+ stats->st_part_max_nowait = tmp_nowait;
+ stats->st_part_max_wait = tmp_wait;
+ }
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ ptmp = lt->part_array[i].part_stat;
+ memset(&lt->part_array[i].part_stat,
+ 0, sizeof(lt->part_array[i].part_stat));
+ lt->part_array[i].part_stat.st_nlocks =
+ ptmp.st_nlocks;
+ lt->part_array[i].part_stat.st_maxnlocks =
+ ptmp.st_nlocks;
+ lt->part_array[i].part_stat.st_nobjects =
+ ptmp.st_nobjects;
+ lt->part_array[i].part_stat.st_maxnobjects =
+ ptmp.st_nobjects;
+ }
+ }
+
+ __mutex_set_wait_info(env, region->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ __mutex_set_wait_info(env, region->mtx_dd,
+ &stats->st_objs_wait, &stats->st_objs_nowait);
+ __mutex_set_wait_info(env, region->mtx_lockers,
+ &stats->st_lockers_wait, &stats->st_lockers_nowait);
+ stats->st_regsize = lt->reginfo.rp->size;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ tmp = region->stat;
+ memset(&region->stat, 0, sizeof(region->stat));
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM)) {
+ __mutex_clear(env, region->mtx_region);
+ __mutex_clear(env, region->mtx_dd);
+ __mutex_clear(env, region->mtx_lockers);
+ for (i = 0; i < region->part_t_size; i++)
+ __mutex_clear(env, lt->part_array[i].mtx_part);
+ }
+
+ region->stat.st_maxlocks = tmp.st_maxlocks;
+ region->stat.st_maxlockers = tmp.st_maxlockers;
+ region->stat.st_maxobjects = tmp.st_maxobjects;
+ region->stat.st_nlocks =
+ region->stat.st_maxnlocks = tmp.st_nlocks;
+ region->stat.st_maxnlockers = region->nlockers;
+ region->stat.st_nobjects =
+ region->stat.st_maxnobjects = tmp.st_nobjects;
+ region->stat.st_partitions = tmp.st_partitions;
+ region->stat.st_tablesize = tmp.st_tablesize;
+ }
+
+ LOCK_REGION_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __lock_stat_print_pp --
+ * ENV->lock_stat_print pre/post processing.
+ *
+ * PUBLIC: int __lock_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_stat_print", DB_INIT_LOCK);
+
+#define DB_STAT_LOCK_FLAGS \
+ (DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR | DB_STAT_LOCK_CONF |\
+ DB_STAT_LOCK_LOCKERS | DB_STAT_LOCK_OBJECTS | DB_STAT_LOCK_PARAMS)
+ if ((ret = __db_fchk(env, "DB_ENV->lock_stat_print",
+ flags, DB_STAT_CLEAR | DB_STAT_LOCK_FLAGS)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_stat_print --
+ * ENV->lock_stat_print method.
+ *
+ * PUBLIC: int __lock_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__lock_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __lock_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_CONF | DB_STAT_LOCK_LOCKERS |
+ DB_STAT_LOCK_OBJECTS | DB_STAT_LOCK_PARAMS) &&
+ (ret = __lock_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __lock_print_stats --
+ * Display default lock region statistics.
+ */
+static int
+__lock_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_LOCK_STAT *sp;
+ int ret;
+
+#ifdef LOCK_DIAGNOSTIC
+ DB_LOCKTAB *lt;
+ DB_LOCKREGION *region;
+ u_int32_t i;
+ uintmax_t wait, nowait;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ for (i = 0; i < region->object_t_size; i++) {
+ if (lt->obj_stat[i].st_hash_len == 0)
+ continue;
+ __db_dl(env,
+ "Hash bucket", (u_long)i);
+ __db_dl(env, "Partition", (u_long)LOCK_PART(region, i));
+ __mutex_set_wait_info(env,
+ lt->part_array[LOCK_PART(region, i)].mtx_part,
+ &wait, &nowait);
+ __db_dl_pct(env,
+ "The number of partition mutex requests that required waiting",
+ (u_long)wait, DB_PCT(wait, wait + nowait), NULL);
+ __db_dl(env,
+ "Maximum hash bucket length",
+ (u_long)lt->obj_stat[i].st_hash_len);
+ __db_dl(env,
+ "Total number of locks requested",
+ (u_long)lt->obj_stat[i].st_nrequests);
+ __db_dl(env,
+ "Total number of locks released",
+ (u_long)lt->obj_stat[i].st_nreleases);
+ __db_dl(env,
+ "Total number of locks upgraded",
+ (u_long)lt->obj_stat[i].st_nupgrade);
+ __db_dl(env,
+ "Total number of locks downgraded",
+ (u_long)lt->obj_stat[i].st_ndowngrade);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we waited",
+ (u_long)lt->obj_stat[i].st_lock_wait);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we did not wait",
+ (u_long)lt->obj_stat[i].st_lock_nowait);
+ __db_dl(env, "Number of locks that have timed out",
+ (u_long)lt->obj_stat[i].st_nlocktimeouts);
+ __db_dl(env, "Number of transactions that have timed out",
+ (u_long)lt->obj_stat[i].st_ntxntimeouts);
+ }
+#endif
+ if ((ret = __lock_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default locking region information:");
+ __db_dl(env, "Last allocated locker ID", (u_long)sp->st_id);
+ __db_msg(env, "%#lx\tCurrent maximum unused locker ID",
+ (u_long)sp->st_cur_maxid);
+ __db_dl(env, "Number of lock modes", (u_long)sp->st_nmodes);
+ __db_dl(env,
+ "Initial number of locks allocated", (u_long)sp->st_initlocks);
+ __db_dl(env,
+ "Initial number of lockers allocated", (u_long)sp->st_initlockers);
+ __db_dl(env, "Initial number of lock objects allocated",
+ (u_long)sp->st_initobjects);
+ __db_dl(env,
+ "Maximum number of locks possible", (u_long)sp->st_maxlocks);
+ __db_dl(env,
+ "Maximum number of lockers possible", (u_long)sp->st_maxlockers);
+ __db_dl(env, "Maximum number of lock objects possible",
+ (u_long)sp->st_maxobjects);
+ __db_dl(env,
+ "Current number of locks allocated", (u_long)sp->st_locks);
+ __db_dl(env,
+ "Current number of lockers allocated", (u_long)sp->st_lockers);
+ __db_dl(env, "Current number of lock objects allocated",
+ (u_long)sp->st_objects);
+ __db_dl(env, "Number of lock object partitions",
+ (u_long)sp->st_partitions);
+ __db_dl(env, "Size of object hash table",
+ (u_long)sp->st_tablesize);
+ __db_dl(env, "Number of current locks", (u_long)sp->st_nlocks);
+ __db_dl(env, "Maximum number of locks at any one time",
+ (u_long)sp->st_maxnlocks);
+ __db_dl(env, "Maximum number of locks in any one bucket",
+ (u_long)sp->st_maxhlocks);
+	__db_dl(env, "Maximum number of locks stolen for an empty partition",
+ (u_long)sp->st_locksteals);
+ __db_dl(env, "Maximum number of locks stolen for any one partition",
+ (u_long)sp->st_maxlsteals);
+ __db_dl(env, "Number of current lockers", (u_long)sp->st_nlockers);
+ __db_dl(env, "Maximum number of lockers at any one time",
+ (u_long)sp->st_maxnlockers);
+ __db_dl(env,
+ "Number of current lock objects", (u_long)sp->st_nobjects);
+ __db_dl(env, "Maximum number of lock objects at any one time",
+ (u_long)sp->st_maxnobjects);
+ __db_dl(env, "Maximum number of lock objects in any one bucket",
+ (u_long)sp->st_maxhobjects);
+ __db_dl(env,
+	    "Maximum number of objects stolen for an empty partition",
+ (u_long)sp->st_objectsteals);
+ __db_dl(env, "Maximum number of objects stolen for any one partition",
+ (u_long)sp->st_maxosteals);
+ __db_dl(env,
+ "Total number of locks requested", (u_long)sp->st_nrequests);
+ __db_dl(env,
+ "Total number of locks released", (u_long)sp->st_nreleases);
+ __db_dl(env,
+ "Total number of locks upgraded", (u_long)sp->st_nupgrade);
+ __db_dl(env,
+ "Total number of locks downgraded", (u_long)sp->st_ndowngrade);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we waited",
+ (u_long)sp->st_lock_wait);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we did not wait",
+ (u_long)sp->st_lock_nowait);
+ __db_dl(env, "Number of deadlocks", (u_long)sp->st_ndeadlocks);
+ __db_dl(env, "Lock timeout value", (u_long)sp->st_locktimeout);
+ __db_dl(env, "Number of locks that have timed out",
+ (u_long)sp->st_nlocktimeouts);
+ __db_dl(env,
+ "Transaction timeout value", (u_long)sp->st_txntimeout);
+ __db_dl(env, "Number of transactions that have timed out",
+ (u_long)sp->st_ntxntimeouts);
+
+ __db_dlbytes(env, "Region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dl_pct(env,
+ "The number of partition locks that required waiting",
+ (u_long)sp->st_part_wait, DB_PCT(
+ sp->st_part_wait, sp->st_part_wait + sp->st_part_nowait), NULL);
+ __db_dl_pct(env,
+ "The maximum number of times any partition lock was waited for",
+ (u_long)sp->st_part_max_wait, DB_PCT(sp->st_part_max_wait,
+ sp->st_part_max_wait + sp->st_part_max_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of object queue operations that required waiting",
+ (u_long)sp->st_objs_wait, DB_PCT(sp->st_objs_wait,
+ sp->st_objs_wait + sp->st_objs_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of locker allocations that required waiting",
+ (u_long)sp->st_lockers_wait, DB_PCT(sp->st_lockers_wait,
+ sp->st_lockers_wait + sp->st_lockers_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+ __db_dl(env, "Maximum hash bucket length",
+ (u_long)sp->st_hash_len);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
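+
+/*
+ * Illustrative sketch, not part of this file: the numbers printed above
+ * are also available programmatically through DB_ENV->lock_stat, which
+ * returns an allocated DB_LOCK_STAT the application is expected to free:
+ *
+ *	DB_LOCK_STAT *sp;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->lock_stat(dbenv, &sp, 0)) == 0) {
+ *		printf("locks now/max: %lu/%lu\n",
+ *		    (u_long)sp->st_nlocks, (u_long)sp->st_maxnlocks);
+ *		free(sp);
+ *	}
+ */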
+
+/*
+ * __lock_print_all --
+ * Display debugging lock region statistics.
+ */
+static int
+__lock_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_LOCKER *lip;
+ DB_LOCKOBJ *op;
+ DB_LOCKREGION *lrp;
+ DB_LOCKTAB *lt;
+ DB_MSGBUF mb;
+ int i, j;
+ u_int32_t k;
+
+ lt = env->lk_handle;
+ lrp = lt->reginfo.primary;
+ DB_MSGBUF_INIT(&mb);
+
+ LOCK_REGION_LOCK(env);
+ __db_print_reginfo(env, &lt->reginfo, "Lock", flags);
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_PARAMS)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Lock region parameters:");
+ __mutex_print_debug_single(env,
+ "Lock region region mutex", lrp->mtx_region, flags);
+ STAT_ULONG("locker table size", lrp->locker_t_size);
+ STAT_ULONG("object table size", lrp->object_t_size);
+ STAT_ULONG("obj_off", lrp->obj_off);
+ STAT_ULONG("locker_off", lrp->locker_off);
+ STAT_ULONG("need_dd", lrp->need_dd);
+ if (timespecisset(&lrp->next_timeout)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lrp->next_timeout.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf, sizeof(tbuf),
+ "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msg(env, "next_timeout: %s.%09lu",
+ tbuf, (u_long)lrp->next_timeout.tv_nsec);
+ else
+#endif
+ __db_msg(env, "next_timeout: %lu.%09lu",
+ (u_long)lrp->next_timeout.tv_sec,
+ (u_long)lrp->next_timeout.tv_nsec);
+ }
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_CONF)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Lock conflict matrix:");
+ for (i = 0; i < lrp->stat.st_nmodes; i++) {
+ for (j = 0; j < lrp->stat.st_nmodes; j++)
+ __db_msgadd(env, &mb, "%lu\t", (u_long)
+ lt->conflicts[i * lrp->stat.st_nmodes + j]);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+ }
+ LOCK_REGION_UNLOCK(env);
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_LOCKERS)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Locks grouped by lockers:");
+ __lock_print_header(env);
+ LOCK_LOCKERS(env, lrp);
+ for (k = 0; k < lrp->locker_t_size; k++)
+ SH_TAILQ_FOREACH(
+ lip, &lt->locker_tab[k], links, __db_locker)
+ (void)__lock_dump_locker(env, &mb, lt, lip);
+ UNLOCK_LOCKERS(env, lrp);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_OBJECTS)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Locks grouped by object:");
+ __lock_print_header(env);
+ for (k = 0; k < lrp->object_t_size; k++) {
+ OBJECT_LOCK_NDX(lt, lrp, k);
+ SH_TAILQ_FOREACH(
+ op, &lt->obj_tab[k], links, __db_lockobj) {
+ (void)__lock_dump_object(lt, &mb, op);
+ __db_msg(env, "%s", "");
+ }
+ OBJECT_UNLOCK(lt, lrp, k);
+ }
+ }
+
+ return (0);
+}
+
+static int
+__lock_dump_locker(env, mbp, lt, lip)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB_LOCKTAB *lt;
+ DB_LOCKER *lip;
+{
+ DB_LOCKREGION *lrp;
+ struct __db_lock *lp;
+ char buf[DB_THREADID_STRLEN];
+ u_int32_t ndx;
+
+ lrp = lt->reginfo.primary;
+
+ __db_msgadd(env,
+ mbp, "%8lx dd=%2ld locks held %-4d write locks %-4d pid/thread %s",
+ (u_long)lip->id, (long)lip->dd_id, lip->nlocks, lip->nwrites,
+ env->dbenv->thread_id_string(env->dbenv, lip->pid, lip->tid, buf));
+ __db_msgadd(env, mbp,
+ " flags %-4x priority %-10u", lip->flags, lip->priority);
+
+ if (timespecisset(&lip->tx_expire)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lip->tx_expire.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf, sizeof(tbuf),
+ "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msgadd(env, mbp, "expires %s.%09lu",
+ tbuf, (u_long)lip->tx_expire.tv_nsec);
+ else
+#endif
+ __db_msgadd(env, mbp, "expires %lu.%09lu",
+ (u_long)lip->tx_expire.tv_sec,
+ (u_long)lip->tx_expire.tv_nsec);
+ }
+ if (F_ISSET(lip, DB_LOCKER_TIMEOUT))
+ __db_msgadd(
+ env, mbp, " lk timeout %lu", (u_long)lip->lk_timeout);
+ if (timespecisset(&lip->lk_expire)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lip->lk_expire.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf,
+ sizeof(tbuf), "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msgadd(env, mbp, " lk expires %s.%09lu",
+ tbuf, (u_long)lip->lk_expire.tv_nsec);
+ else
+#endif
+ __db_msgadd(env, mbp, " lk expires %lu.%09lu",
+ (u_long)lip->lk_expire.tv_sec,
+ (u_long)lip->lk_expire.tv_nsec);
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ /*
+ * We need some care here since the list may change while we
+ * look.
+ */
+retry: SH_LIST_FOREACH(lp, &lip->heldby, locker_links, __db_lock) {
+ if (!SH_LIST_EMPTY(&lip->heldby) && lp != NULL) {
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, lrp, ndx);
+ if (lp->indx == ndx)
+ __lock_printlock(lt, mbp, lp, 1);
+ else {
+ OBJECT_UNLOCK(lt, lrp, ndx);
+ goto retry;
+ }
+ OBJECT_UNLOCK(lt, lrp, ndx);
+ }
+ }
+ return (0);
+}
+
+static int
+__lock_dump_object(lt, mbp, op)
+ DB_LOCKTAB *lt;
+ DB_MSGBUF *mbp;
+ DB_LOCKOBJ *op;
+{
+ struct __db_lock *lp;
+
+ SH_TAILQ_FOREACH(lp, &op->holders, links, __db_lock)
+ __lock_printlock(lt, mbp, lp, 1);
+ SH_TAILQ_FOREACH(lp, &op->waiters, links, __db_lock)
+ __lock_printlock(lt, mbp, lp, 1);
+ return (0);
+}
+
+/*
+ * __lock_print_header --
+ */
+static void
+__lock_print_header(env)
+ ENV *env;
+{
+ __db_msg(env, "%-8s %-10s%-4s %-7s %s",
+ "Locker", "Mode",
+ "Count", "Status", "----------------- Object ---------------");
+}
+
+/*
+ * __lock_printlock --
+ *
+ * PUBLIC: void __lock_printlock
+ * PUBLIC: __P((DB_LOCKTAB *, DB_MSGBUF *mbp, struct __db_lock *, int));
+ */
+void
+__lock_printlock(lt, mbp, lp, ispgno)
+ DB_LOCKTAB *lt;
+ DB_MSGBUF *mbp;
+ struct __db_lock *lp;
+ int ispgno;
+{
+ DB_LOCKOBJ *lockobj;
+ DB_MSGBUF mb;
+ ENV *env;
+ db_pgno_t pgno;
+ u_int32_t *fidp, type;
+ u_int8_t *ptr;
+ char *fname, *dname, *p, namebuf[26];
+ const char *mode, *status;
+
+ env = lt->env;
+
+ if (mbp == NULL) {
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+ }
+
+ switch (lp->mode) {
+ case DB_LOCK_IREAD:
+ mode = "IREAD";
+ break;
+ case DB_LOCK_IWR:
+ mode = "IWR";
+ break;
+ case DB_LOCK_IWRITE:
+ mode = "IWRITE";
+ break;
+ case DB_LOCK_NG:
+ mode = "NG";
+ break;
+ case DB_LOCK_READ:
+ mode = "READ";
+ break;
+ case DB_LOCK_READ_UNCOMMITTED:
+ mode = "READ_UNCOMMITTED";
+ break;
+ case DB_LOCK_WRITE:
+ mode = "WRITE";
+ break;
+ case DB_LOCK_WWRITE:
+ mode = "WAS_WRITE";
+ break;
+ case DB_LOCK_WAIT:
+ mode = "WAIT";
+ break;
+ default:
+ mode = "UNKNOWN";
+ break;
+ }
+ switch (lp->status) {
+ case DB_LSTAT_ABORTED:
+ status = "ABORT";
+ break;
+ case DB_LSTAT_EXPIRED:
+ status = "EXPIRED";
+ break;
+ case DB_LSTAT_FREE:
+ status = "FREE";
+ break;
+ case DB_LSTAT_HELD:
+ status = "HELD";
+ break;
+ case DB_LSTAT_PENDING:
+ status = "PENDING";
+ break;
+ case DB_LSTAT_WAITING:
+ status = "WAIT";
+ break;
+ default:
+ status = "UNKNOWN";
+ break;
+ }
+ __db_msgadd(env, mbp, "%8lx %-10s %4lu %-7s ",
+ (u_long)((DB_LOCKER *)R_ADDR(&lt->reginfo, lp->holder))->id,
+ mode, (u_long)lp->refcount, status);
+
+ lockobj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ ptr = SH_DBT_PTR(&lockobj->lockobj);
+ if (ispgno && lockobj->lockobj.size == sizeof(struct __db_ilock)) {
+		/* Assume this is a DB page/record lock (an ILOCK). */
+ memcpy(&pgno, ptr, sizeof(db_pgno_t));
+ fidp = (u_int32_t *)(ptr + sizeof(db_pgno_t));
+ type = *(u_int32_t *)(ptr + sizeof(db_pgno_t) + DB_FILE_ID_LEN);
+ (void)__dbreg_get_name(
+ lt->env, (u_int8_t *)fidp, &fname, &dname);
+ if (fname == NULL && dname == NULL)
+ __db_msgadd(env, mbp, "(%lx %lx %lx %lx %lx) ",
+ (u_long)fidp[0], (u_long)fidp[1], (u_long)fidp[2],
+ (u_long)fidp[3], (u_long)fidp[4]);
+ else {
+ if (fname != NULL && dname != NULL) {
+ (void)snprintf(namebuf, sizeof(namebuf),
+ "%14s:%-10s", fname, dname);
+ p = namebuf;
+ } else if (fname != NULL)
+ p = fname;
+ else
+ p = dname;
+ __db_msgadd(env, mbp, "%-25s ", p);
+ }
+ __db_msgadd(env, mbp, "%-7s %7lu",
+ type == DB_PAGE_LOCK ? "page" :
+ type == DB_RECORD_LOCK ? "record" :
+ type == DB_DATABASE_LOCK ? "database" : "handle",
+ (u_long)pgno);
+ } else {
+ __db_msgadd(env, mbp, "0x%lx ",
+ (u_long)R_OFFSET(&lt->reginfo, lockobj));
+ __db_prbytes(env, mbp, ptr, lockobj->lockobj.size);
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+}
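+
+/*
+ * For reference, the ispgno branch above is decoding a DB_LOCK_ILOCK,
+ * which is laid out roughly as:
+ *
+ *	struct __db_ilock {
+ *		db_pgno_t pgno;				(page number)
+ *		u_int8_t  fileid[DB_FILE_ID_LEN];	(unique file ID)
+ *		u_int32_t type;				(DB_PAGE_LOCK, ...)
+ *	};
+ *
+ * hence the page number is read from the front of the object, the file
+ * ID immediately after it, and the lock type last.
+ */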
+
+#else /* !HAVE_STATISTICS */
+
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/lock/lock_stub.c b/src/lock/lock_stub.c
new file mode 100644
index 00000000..3875af55
--- /dev/null
+++ b/src/lock/lock_stub.c
@@ -0,0 +1,631 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * If the library wasn't compiled with locking support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_nolocking __P((ENV *));
+
+/*
+ * __db_nolocking --
+ * Error when a Berkeley DB build doesn't include the locking subsystem.
+ */
+static int
+__db_nolocking(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("2054",
+ "library build did not include support for locking"));
+ return (DB_OPNOTSUP);
+}
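+
+/*
+ * Illustrative sketch, not part of this file: in a build without the
+ * locking subsystem, every stubbed DB_ENV locking method below fails in
+ * the same way.  For example:
+ *
+ *	u_int32_t id;
+ *
+ *	ret = dbenv->lock_id(dbenv, &id);
+ *
+ * returns DB_OPNOTSUP after reporting the message above through the
+ * environment's error stream.
+ */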
+
+int
+__lock_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, 0);
+ return (0);
+}
+
+void
+__lock_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, 0);
+}
+
+int
+__lock_get_lk_conflicts(dbenv, lk_conflictsp, lk_modesp)
+ DB_ENV *dbenv;
+ const u_int8_t **lk_conflictsp;
+ int *lk_modesp;
+{
+ COMPQUIET(lk_conflictsp, NULL);
+ COMPQUIET(lk_modesp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_detect(dbenv, lk_detectp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_detectp;
+{
+ COMPQUIET(lk_detectp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_init_lockers(dbenv, lk_initp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_initp;
+{
+ COMPQUIET(lk_initp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_init_locks(dbenv, lk_initp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_initp;
+{
+ COMPQUIET(lk_initp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_init_objects(dbenv, lk_initp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_initp;
+{
+ COMPQUIET(lk_initp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_max_lockers(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_max_locks(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_max_objects(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_partitions(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_tablesize(dbenv, lk_tablesizep)
+ DB_ENV *dbenv;
+ u_int32_t *lk_tablesizep;
+{
+ COMPQUIET(lk_tablesizep, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_tablesize(dbenv, lk_tablesize)
+ DB_ENV *dbenv;
+ u_int32_t lk_tablesize;
+{
+ COMPQUIET(lk_tablesize, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_priority(dbenv, lockid, priorityp)
+ DB_ENV *dbenv;
+ u_int32_t lockid, *priorityp;
+{
+ COMPQUIET(lockid, 0);
+ COMPQUIET(priorityp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_priority(dbenv, lockid, priority)
+ DB_ENV *dbenv;
+ u_int32_t lockid, priority;
+{
+ COMPQUIET(lockid, 0);
+ COMPQUIET(priority, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_env_timeout(dbenv, timeoutp, flag)
+ DB_ENV *dbenv;
+ db_timeout_t *timeoutp;
+ u_int32_t flag;
+{
+ COMPQUIET(timeoutp, NULL);
+ COMPQUIET(flag, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_detect_pp(dbenv, flags, atype, abortp)
+ DB_ENV *dbenv;
+ u_int32_t flags, atype;
+ int *abortp;
+{
+ COMPQUIET(flags, 0);
+ COMPQUIET(atype, 0);
+ COMPQUIET(abortp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_pp(dbenv, locker, flags, obj, lock_mode, lock)
+ DB_ENV *dbenv;
+ u_int32_t locker, flags;
+ DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ COMPQUIET(locker, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(obj, NULL);
+ COMPQUIET(lock_mode, 0);
+ COMPQUIET(lock, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_id_pp(dbenv, idp)
+ DB_ENV *dbenv;
+ u_int32_t *idp;
+{
+ COMPQUIET(idp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_id_free_pp(dbenv, id)
+ DB_ENV *dbenv;
+ u_int32_t id;
+{
+ COMPQUIET(id, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_put_pp(dbenv, lock)
+ DB_ENV *dbenv;
+ DB_LOCK *lock;
+{
+ COMPQUIET(lock, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_vec_pp(dbenv, locker, flags, list, nlist, elistp)
+ DB_ENV *dbenv;
+ u_int32_t locker, flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ COMPQUIET(locker, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(list, NULL);
+ COMPQUIET(nlist, 0);
+ COMPQUIET(elistp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_conflicts(dbenv, lk_conflicts, lk_modes)
+ DB_ENV *dbenv;
+ u_int8_t *lk_conflicts;
+ int lk_modes;
+{
+ COMPQUIET(lk_conflicts, NULL);
+ COMPQUIET(lk_modes, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_detect(dbenv, lk_detect)
+ DB_ENV *dbenv;
+ u_int32_t lk_detect;
+{
+ COMPQUIET(lk_detect, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_max_locks(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_max_lockers(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_max_objects(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_partitions(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_env_timeout(dbenv, timeout, flags)
+ DB_ENV *dbenv;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_open(env)
+ ENV *env;
+{
+ return (__db_nolocking(env));
+}
+
+u_int32_t
+__lock_region_mutex_count(env)
+	ENV *env;
+{
+	COMPQUIET(env, NULL);
+	return (0);
+}
+
+u_int32_t
+__lock_region_mutex_max(env)
+	ENV *env;
+{
+	COMPQUIET(env, NULL);
+	return (0);
+}
+
+size_t
+__lock_region_max(env)
+	ENV *env;
+{
+	COMPQUIET(env, NULL);
+	return (0);
+}
+
+size_t
+__lock_region_size(env, other_alloc)
+	ENV *env;
+	size_t other_alloc;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(other_alloc, 0);
+	return (0);
+}
+
+int
+__lock_id_free(env, sh_locker)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(sh_locker, 0);
+ return (0);
+}
+
+int
+__lock_env_refresh(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__lock_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__lock_put(env, lock)
+ ENV *env;
+ DB_LOCK *lock;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(lock, NULL);
+ return (0);
+}
+
+int
+__lock_vec(env, sh_locker, flags, list, nlist, elistp)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(sh_locker, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(list, NULL);
+ COMPQUIET(nlist, 0);
+ COMPQUIET(elistp, NULL);
+ return (0);
+}
+
+int
+__lock_get(env, locker, flags, obj, lock_mode, lock)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(obj, NULL);
+ COMPQUIET(lock_mode, 0);
+ COMPQUIET(lock, NULL);
+ return (0);
+}
+
+int
+__lock_id(env, idp, lkp)
+ ENV *env;
+ u_int32_t *idp;
+ DB_LOCKER **lkp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(idp, NULL);
+ COMPQUIET(lkp, NULL);
+ return (0);
+}
+
+int
+__lock_inherit_timeout(env, parent, locker)
+ ENV *env;
+ DB_LOCKER *parent, *locker;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(parent, NULL);
+ COMPQUIET(locker, NULL);
+ return (0);
+}
+
+int
+__lock_set_timeout(env, locker, timeout, op)
+ ENV *env;
+ DB_LOCKER *locker;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker, NULL);
+ COMPQUIET(timeout, 0);
+ COMPQUIET(op, 0);
+ return (0);
+}
+
+int
+__lock_addfamilylocker(env, pid, id, is_family)
+ ENV *env;
+ u_int32_t pid, id, is_family;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(pid, 0);
+ COMPQUIET(id, 0);
+ COMPQUIET(is_family, 0);
+ return (0);
+}
+
+int
+__lock_freelocker(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ COMPQUIET(lt, NULL);
+ COMPQUIET(sh_locker, NULL);
+ return (0);
+}
+
+int
+__lock_familyremove(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ COMPQUIET(lt, NULL);
+ COMPQUIET(sh_locker, NULL);
+ return (0);
+}
+
+int
+__lock_downgrade(env, lock, new_mode, flags)
+ ENV *env;
+ DB_LOCK *lock;
+ db_lockmode_t new_mode;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(lock, NULL);
+ COMPQUIET(new_mode, 0);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__lock_locker_same_family(env, locker1, locker2, retp)
+ ENV *env;
+ DB_LOCKER *locker1;
+ DB_LOCKER *locker2;
+ int *retp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker1, NULL);
+ COMPQUIET(locker2, NULL);
+
+ *retp = 1;
+ return (0);
+}
+
+void
+__lock_set_thread_id(lref, pid, tid)
+ void *lref;
+ pid_t pid;
+ db_threadid_t tid;
+{
+ COMPQUIET(lref, NULL);
+ COMPQUIET(pid, 0);
+ COMPQUIET(tid, 0);
+}
+
+int
+__lock_failchk(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__lock_get_list(env, locker, flags, lock_mode, list)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ db_lockmode_t lock_mode;
+ DBT *list;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(lock_mode, 0);
+ COMPQUIET(list, NULL);
+ return (0);
+}
+
+void
+__lock_list_print(env, mbp, list)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DBT *list;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(mbp, NULL);
+	COMPQUIET(list, NULL);
+}
+
+int
+__lock_getlocker(lt, locker, create, retp)
+ DB_LOCKTAB *lt;
+ u_int32_t locker;
+ int create;
+ DB_LOCKER **retp;
+{
+ COMPQUIET(locker, 0);
+ COMPQUIET(create, 0);
+ COMPQUIET(retp, NULL);
+ return (__db_nolocking(lt->env));
+}
+
+int
+__lock_id_set(env, cur_id, max_id)
+ ENV *env;
+ u_int32_t cur_id, max_id;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(cur_id, 0);
+ COMPQUIET(max_id, 0);
+ return (0);
+}
+
+int
+__lock_wakeup(env, obj)
+ ENV *env;
+ const DBT *obj;
+{
+ COMPQUIET(obj, NULL);
+ return (__db_nolocking(env));
+}
+
+int
+__lock_change(env, old_lock, new_lock)
+ ENV *env;
+ DB_LOCK *old_lock, *new_lock;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(old_lock, NULL);
+	COMPQUIET(new_lock, NULL);
+	return (0);
+}
diff --git a/src/lock/lock_timer.c b/src/lock/lock_timer.c
new file mode 100644
index 00000000..943047f0
--- /dev/null
+++ b/src/lock/lock_timer.c
@@ -0,0 +1,128 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * __lock_set_timeout --
+ * Set timeout values in shared memory.
+ *
+ * This is called from the transaction system. We either set the time that
+ * this transaction expires or the amount of time a lock for this transaction
+ * is permitted to wait.
+ *
+ * PUBLIC: int __lock_set_timeout __P((ENV *,
+ * PUBLIC: DB_LOCKER *, db_timeout_t, u_int32_t));
+ */
+int
+__lock_set_timeout(env, locker, timeout, op)
+ ENV *env;
+ DB_LOCKER *locker;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ int ret;
+
+ if (locker == NULL)
+ return (0);
+ LOCK_REGION_LOCK(env);
+ ret = __lock_set_timeout_internal(env, locker, timeout, op);
+ LOCK_REGION_UNLOCK(env);
+ return (ret);
+}
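+
+/*
+ * Illustrative sketch, not part of this file: applications normally reach
+ * this code through DB_ENV->set_timeout or DB_TXN->set_timeout, with the
+ * timeout expressed in microseconds:
+ *
+ *	ret = dbenv->set_timeout(dbenv, 1000000, DB_SET_LOCK_TIMEOUT);
+ *	ret = dbenv->set_timeout(dbenv, 5000000, DB_SET_TXN_TIMEOUT);
+ */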
+
+/*
+ * __lock_set_timeout_internal
+ * -- set timeout values in shared memory.
+ *
+ * This is the internal version called from the lock system. We either set
+ * the time that this transaction expires or the amount of time that a lock
+ * for this transaction is permitted to wait.
+ *
+ * PUBLIC: int __lock_set_timeout_internal
+ * PUBLIC: __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
+ */
+int
+__lock_set_timeout_internal(env, sh_locker, timeout, op)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ DB_LOCKREGION *region;
+ region = env->lk_handle->reginfo.primary;
+
+ if (op == DB_SET_TXN_TIMEOUT) {
+ if (timeout == 0)
+ timespecclear(&sh_locker->tx_expire);
+ else
+ __clock_set_expires(env,
+ &sh_locker->tx_expire, timeout);
+ } else if (op == DB_SET_LOCK_TIMEOUT) {
+ sh_locker->lk_timeout = timeout;
+ F_SET(sh_locker, DB_LOCKER_TIMEOUT);
+ } else if (op == DB_SET_TXN_NOW) {
+ timespecclear(&sh_locker->tx_expire);
+ __clock_set_expires(env, &sh_locker->tx_expire, 0);
+ sh_locker->lk_expire = sh_locker->tx_expire;
+ if (!timespecisset(&region->next_timeout) ||
+ timespeccmp(
+ &region->next_timeout, &sh_locker->lk_expire, >))
+ region->next_timeout = sh_locker->lk_expire;
+ } else
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * __lock_inherit_timeout
+ * -- inherit timeout values from parent locker.
+ * This is called from the transaction system. This will
+ * return EINVAL if the parent does not exist or did not
+ * have a current txn timeout set.
+ *
+ * PUBLIC: int __lock_inherit_timeout __P((ENV *, DB_LOCKER *, DB_LOCKER *));
+ */
+int
+__lock_inherit_timeout(env, parent, locker)
+ ENV *env;
+ DB_LOCKER *parent, *locker;
+{
+ int ret;
+
+ ret = 0;
+ LOCK_REGION_LOCK(env);
+
+	/*
+	 * If the parent is not there yet, that's ok.  If it
+	 * does not have any timeouts set, then avoid creating
+	 * the child locker at this point.
+	 */
+	if (parent == NULL ||
+	    (!timespecisset(&parent->tx_expire) &&
+	    !F_ISSET(parent, DB_LOCKER_TIMEOUT))) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ locker->tx_expire = parent->tx_expire;
+
+ if (F_ISSET(parent, DB_LOCKER_TIMEOUT)) {
+ locker->lk_timeout = parent->lk_timeout;
+ F_SET(locker, DB_LOCKER_TIMEOUT);
+ if (!timespecisset(&parent->tx_expire))
+ ret = EINVAL;
+ }
+
+err: LOCK_REGION_UNLOCK(env);
+ return (ret);
+}
diff --git a/src/lock/lock_util.c b/src/lock/lock_util.c
new file mode 100644
index 00000000..f7029cd7
--- /dev/null
+++ b/src/lock/lock_util.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+
+/*
+ * The next two functions are the hash functions used to store objects in the
+ * lock hash tables. They are hashing the same items, but one (__lock_ohash)
+ * takes a DBT (used for hashing a parameter passed from the user) and the
+ * other (__lock_lhash) takes a DB_LOCKOBJ (used for hashing something that is
+ * already in the lock manager). In both cases, we have a special check to
+ * fast path the case where we think we are doing a hash on a DB page/fileid
+ * pair. If the size is right, then we do the fast hash.
+ *
+ * We know that DB uses DB_LOCK_ILOCK types for its lock objects. The first
+ * four bytes are the 4-byte page number and the next DB_FILE_ID_LEN bytes
+ * are a unique file id, where the first 4 bytes on UNIX systems are the file
+ * inode number, and the first 4 bytes on Windows systems are the FileIndexLow
+ * bytes. This is followed by a random number. The inode values tend
+ * to increment fairly slowly and are not good for hashing. So, we use
+ * the XOR of the page number and the four bytes of the file id random
+ * number to produce a 32-bit hash value.
+ *
+ * We have no particular reason to believe that this algorithm will produce
+ * a good hash, but we want a fast hash more than we want a good one, when
+ * we're coming through this code path.
+ */
+#define FAST_HASH(P) { \
+ u_int32_t __h; \
+ u_int8_t *__cp, *__hp; \
+ __hp = (u_int8_t *)&__h; \
+ __cp = (u_int8_t *)(P); \
+ __hp[0] = __cp[0] ^ __cp[12]; \
+ __hp[1] = __cp[1] ^ __cp[13]; \
+ __hp[2] = __cp[2] ^ __cp[14]; \
+ __hp[3] = __cp[3] ^ __cp[15]; \
+ return (__h); \
+}
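+
+/*
+ * A worked illustration of FAST_HASH: for a DB_LOCK_ILOCK, bytes 0-3 of
+ * the object hold the page number and bytes 12-15 fall inside the file
+ * ID, so the macro computes, byte by byte,
+ *
+ *	hash[0] = pgno[0] ^ fileid[8]
+ *	hash[1] = pgno[1] ^ fileid[9]
+ *	hash[2] = pgno[2] ^ fileid[10]
+ *	hash[3] = pgno[3] ^ fileid[11]
+ *
+ * and returns the four bytes reassembled as a u_int32_t, so locks on
+ * different pages of the same file differ in the page-number bytes alone.
+ */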
+
+/*
+ * __lock_ohash --
+ *
+ * PUBLIC: u_int32_t __lock_ohash __P((const DBT *));
+ */
+u_int32_t
+__lock_ohash(dbt)
+ const DBT *dbt;
+{
+ if (dbt->size == sizeof(DB_LOCK_ILOCK))
+ FAST_HASH(dbt->data);
+
+ return (__ham_func5(NULL, dbt->data, dbt->size));
+}
+
+/*
+ * __lock_lhash --
+ *
+ * PUBLIC: u_int32_t __lock_lhash __P((DB_LOCKOBJ *));
+ */
+u_int32_t
+__lock_lhash(lock_obj)
+ DB_LOCKOBJ *lock_obj;
+{
+ void *obj_data;
+
+ obj_data = SH_DBT_PTR(&lock_obj->lockobj);
+
+ if (lock_obj->lockobj.size == sizeof(DB_LOCK_ILOCK))
+ FAST_HASH(obj_data);
+
+ return (__ham_func5(NULL, obj_data, lock_obj->lockobj.size));
+}
+
+/*
+ * __lock_nomem --
+ * Report a lack of some resource.
+ *
+ * PUBLIC: int __lock_nomem __P((ENV *, const char *));
+ */
+int
+__lock_nomem(env, res)
+ ENV *env;
+ const char *res;
+{
+ __db_errx(env, DB_STR_A("2055", "Lock table is out of available %s",
+ "%s"), res);
+ return (ENOMEM);
+}
diff --git a/src/log/log.c b/src/log/log.c
new file mode 100644
index 00000000..5808145f
--- /dev/null
+++ b/src/log/log.c
@@ -0,0 +1,1727 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+static int __log_init __P((ENV *, DB_LOG *));
+static int __log_recover __P((DB_LOG *));
+
+/*
+ * __log_open --
+ * Internal version of log_open: only called from ENV->open.
+ *
+ * PUBLIC: int __log_open __P((ENV *));
+ */
+int
+__log_open(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ LOG *lp;
+ u_int8_t *bulk;
+ int region_locked, ret;
+
+ dbenv = env->dbenv;
+ region_locked = 0;
+
+ /* Create/initialize the DB_LOG structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_LOG), &dblp)) != 0)
+ return (ret);
+ dblp->env = env;
+
+ /* Join/create the log region. */
+ if ((ret = __env_region_share(env, &dblp->reginfo)) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&dblp->reginfo, REGION_CREATE))
+ if ((ret = __log_init(env, dblp)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ lp = dblp->reginfo.primary = R_ADDR(&dblp->reginfo,
+ ((REGENV *)env->reginfo->primary)->lg_primary);
+ dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
+
+ /*
+ * If the region is threaded, we have to lock the DBREG list, and we
+ * need to allocate a mutex for that purpose.
+ */
+ if ((ret = __mutex_alloc(env,
+ MTX_LOG_REGION, DB_MUTEX_PROCESS_ONLY, &dblp->mtx_dbreg)) != 0)
+ goto err;
+
+ /*
+ * Set the handle -- we may be about to run recovery, which allocates
+ * log cursors. Log cursors require logging be already configured,
+ * and the handle being set is what demonstrates that.
+ *
+ * If we created the region, run recovery. If that fails, make sure
+ * we reset the log handle before cleaning up, otherwise we will try
+ * and clean up again in the mainline ENV initialization code.
+ */
+ env->lg_handle = dblp;
+
+ if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
+ /*
+ * We first take the log file size from the environment, if
+ * specified. If that wasn't set, default it. Regardless,
+ * recovery may set it from the persistent information in a
+ * log file header.
+ */
+ if (lp->log_size == 0)
+ lp->log_size =
+ FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
+ LG_MAX_INMEM : LG_MAX_DEFAULT;
+
+ if ((ret = __log_recover(dblp)) != 0)
+ goto err;
+
+ /*
+ * If the next log file size hasn't been set yet, default it
+ * to the current log file size.
+ */
+ if (lp->log_nsize == 0)
+ lp->log_nsize = lp->log_size;
+
+ /*
+ * If we haven't written any log files, write the first one
+ * so that checkpoint gets a valid ckp_lsn value.
+ */
+ if (IS_INIT_LSN(lp->lsn) &&
+ (ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
+ goto err;
+
+ /*
+ * Initialize replication's next-expected LSN value
+ * and replication's bulk buffer. In __env_open, we
+ * always create/open the replication region before
+ * the log region so we're assured that our rep_handle
+ * is valid at this point, if replication is being used.
+ */
+ lp->ready_lsn = lp->lsn;
+ if (IS_ENV_REPLICATED(env)) {
+ if ((ret =
+ __env_alloc(&dblp->reginfo, MEGABYTE, &bulk)) != 0)
+ goto err;
+ lp->bulk_buf = R_OFFSET(&dblp->reginfo, bulk);
+ lp->bulk_len = MEGABYTE;
+ lp->bulk_off = 0;
+ lp->wait_ts = env->rep_handle->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ } else {
+ lp->bulk_buf = INVALID_ROFF;
+ lp->bulk_len = 0;
+ lp->bulk_off = 0;
+ }
+ } else {
+ /*
+ * A process joining the region may have reset the log file
+ * size, too. If so, it only affects the next log file we
+ * create. We need to check that the size is reasonable given
+ * the buffer size in the region.
+ */
+ LOG_SYSTEM_LOCK(env);
+ region_locked = 1;
+
+ if (dbenv->lg_size != 0) {
+ if ((ret =
+ __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
+ goto err;
+
+ lp->log_nsize = dbenv->lg_size;
+ }
+
+ LOG_SYSTEM_UNLOCK(env);
+ region_locked = 0;
+
+ if (dbenv->lg_flags != 0 && (ret =
+ __log_set_config_int(dbenv, dbenv->lg_flags, 1, 0)) != 0)
+ return (ret);
+ }
+ dblp->reginfo.mtx_alloc = lp->mtx_region;
+
+ return (0);
+
+err: if (dblp->reginfo.addr != NULL) {
+ if (region_locked)
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__env_region_detach(env, &dblp->reginfo, 0);
+ }
+ env->lg_handle = NULL;
+
+ (void)__mutex_free(env, &dblp->mtx_dbreg);
+ __os_free(env, dblp);
+
+ return (ret);
+}
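+
+/*
+ * Illustrative sketch, not part of this file: the sizes and flags
+ * consulted above come from the DB_ENV configuration methods, which must
+ * be called before DB_ENV->open.  For example:
+ *
+ *	ret = dbenv->set_lg_bsize(dbenv, 64 * 1024);
+ *	ret = dbenv->set_lg_max(dbenv, 10 * 1024 * 1024);
+ *	ret = dbenv->log_set_config(dbenv, DB_LOG_IN_MEMORY, 1);
+ */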
+
+/*
+ * __log_init --
+ * Initialize a log region in shared memory.
+ */
+static int
+__log_init(env, dblp)
+ ENV *env;
+ DB_LOG *dblp;
+{
+ DB_ENV *dbenv;
+ LOG *lp;
+ int ret;
+ void *p;
+
+ dbenv = env->dbenv;
+
+ /*
+ * This is the first point where we can validate the buffer size,
+ * because we know all three settings have been configured (file size,
+ * buffer size and the in-memory flag).
+ */
+ if ((ret =
+ __log_check_sizes(env, dbenv->lg_size, dbenv->lg_bsize)) != 0)
+ return (ret);
+
+ if ((ret = __env_alloc(&dblp->reginfo,
+ sizeof(*lp), &dblp->reginfo.primary)) != 0)
+ goto mem_err;
+
+ ((REGENV *)env->reginfo->primary)->lg_primary =
+ R_OFFSET(&dblp->reginfo, dblp->reginfo.primary);
+
+ lp = dblp->reginfo.primary;
+ memset(lp, 0, sizeof(*lp));
+
+ /* We share the region so we need the same mutex. */
+ lp->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
+
+ lp->fid_max = 0;
+ SH_TAILQ_INIT(&lp->fq);
+ lp->free_fid_stack = INVALID_ROFF;
+ lp->free_fids = lp->free_fids_alloced = 0;
+
+ /* Initialize LOG LSNs. */
+ INIT_LSN(lp->lsn);
+ INIT_LSN(lp->t_lsn);
+
+ /*
+ * It's possible to be waiting for an LSN of [1][0], if a replication
+ * client gets the first log record out of order. An LSN of [0][0]
+ * signifies that we're not waiting.
+ */
+ ZERO_LSN(lp->waiting_lsn);
+
+ /*
+	 * As a recovery optimization, the log notes whether it ran into a
+	 * checkpoint on startup.  A zero LSN signifies that it hasn't
+	 * found one [yet].
+ */
+ ZERO_LSN(lp->cached_ckp_lsn);
+
+ if ((ret =
+ __mutex_alloc(env, MTX_LOG_FILENAME, 0, &lp->mtx_filelist)) != 0)
+ return (ret);
+ if ((ret = __mutex_alloc(env, MTX_LOG_FLUSH, 0, &lp->mtx_flush)) != 0)
+ return (ret);
+
+ /* Initialize the buffer. */
+ if ((ret = __env_alloc(&dblp->reginfo, dbenv->lg_bsize, &p)) != 0) {
+mem_err:	__db_errx(env, DB_STR("2524",
+ "unable to allocate log region memory"));
+ return (ret);
+ }
+ lp->regionmax = dbenv->lg_regionmax;
+ lp->buffer_off = R_OFFSET(&dblp->reginfo, p);
+ lp->buffer_size = dbenv->lg_bsize;
+ lp->filemode = dbenv->lg_filemode;
+ lp->log_size = lp->log_nsize = dbenv->lg_size;
+ lp->stat.st_fileid_init = dbenv->lg_fileid_init;
+
+ /* Initialize the commit Queue. */
+ SH_TAILQ_INIT(&lp->free_commits);
+ SH_TAILQ_INIT(&lp->commits);
+ lp->ncommit = 0;
+
+ /* Initialize the logfiles list for in-memory logs. */
+ SH_TAILQ_INIT(&lp->logfiles);
+ SH_TAILQ_INIT(&lp->free_logfiles);
+
+ /*
+ * Fill in the log's persistent header. Don't fill in the log file
+ * sizes, as they may change at any time and so have to be filled in
+ * as each log file is created.
+ */
+ lp->persist.magic = DB_LOGMAGIC;
+ /*
+ * Don't use __log_set_version because env->dblp isn't set up yet.
+ */
+ lp->persist.version = DB_LOGVERSION;
+ lp->persist.notused = 0;
+ env->lg_handle = dblp;
+
+ /* Migrate persistent flags from the ENV into the region. */
+ if (dbenv->lg_flags != 0 &&
+ (ret = __log_set_config_int(dbenv, dbenv->lg_flags, 1, 1)) != 0)
+ return (ret);
+
+ (void)time(&lp->timestamp);
+ return (0);
+}
+
+/*
+ * __log_recover --
+ * Recover a log.
+ */
+static int
+__log_recover(dblp)
+ DB_LOG *dblp;
+{
+ DBT dbt;
+ DB_ENV *dbenv;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ ENV *env;
+ LOG *lp;
+ u_int32_t cnt, rectype;
+ int ret;
+ logfile_validity status;
+
+ env = dblp->env;
+ dbenv = env->dbenv;
+ logc = NULL;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * Find a log file. If none exist, we simply return, leaving
+ * everything initialized to a new log.
+ */
+ if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0)
+ return (ret);
+ if (cnt == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR("2525", "No log files found"));
+ return (0);
+ }
+
+ /*
+ * If the last file is an old, unreadable version, start a new
+ * file. Don't bother finding the end of the last log file;
+ * we assume that it's valid in its entirety, since the user
+ * should have shut down cleanly or run recovery before upgrading.
+ */
+ if (status == DB_LV_OLD_UNREADABLE) {
+ lp->lsn.file = lp->s_lsn.file = cnt + 1;
+ lp->lsn.offset = lp->s_lsn.offset = 0;
+ goto skipsearch;
+ }
+ DB_ASSERT(env,
+ (status == DB_LV_NORMAL || status == DB_LV_OLD_READABLE));
+
+ /*
+ * We have the last useful log file and we've loaded any persistent
+ * information. Set the end point of the log past the end of the last
+ * file. Read the last file, looking for the last checkpoint and
+ * the log's end.
+ */
+ lp->lsn.file = cnt + 1;
+ lp->lsn.offset = 0;
+ lsn.file = cnt;
+ lsn.offset = 0;
+
+ /*
+ * Allocate a cursor and set it to the first record. This shouldn't
+	 * fail; leave error messages on.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ F_SET(logc, DB_LOG_LOCKED);
+ memset(&dbt, 0, sizeof(dbt));
+ if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
+ goto err;
+
+ /*
+ * Read to the end of the file. This may fail at some point, so
+ * turn off error messages.
+ */
+ F_SET(logc, DB_LOG_SILENT_ERR);
+ while (__logc_get(logc, &lsn, &dbt, DB_NEXT) == 0) {
+ if (dbt.size < sizeof(u_int32_t))
+ continue;
+ LOGCOPY_32(env, &rectype, dbt.data);
+ if (rectype == DB___txn_ckp)
+ /*
+ * If we happen to run into a checkpoint, cache its
+ * LSN so that the transaction system doesn't have
+ * to walk this log file again looking for it.
+ */
+ lp->cached_ckp_lsn = lsn;
+ }
+ F_CLR(logc, DB_LOG_SILENT_ERR);
+
+ /*
+ * We now know where the end of the log is. Set the first LSN that
+ * we want to return to an application and the LSN of the last known
+ * record on disk.
+ */
+ lp->lsn = lsn;
+ lp->s_lsn = lsn;
+ lp->lsn.offset += logc->len;
+ lp->s_lsn.offset += logc->len;
+
+ /* Set up the current buffer information, too. */
+ lp->len = logc->len;
+ lp->a_off = 0;
+ lp->b_off = 0;
+ lp->w_off = lp->lsn.offset;
+
+skipsearch:
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR_A("2526",
+ "Finding last valid log LSN: file: %lu offset %lu",
+ "%lu %lu"), (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+
+err: if (logc != NULL)
+ (void)__logc_close(logc);
+
+ return (ret);
+}
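+
+/*
+ * Illustrative sketch, not part of this file: the scan-to-end loop above
+ * follows the same pattern an application would use with the public log
+ * cursor interface:
+ *
+ *	DB_LOGC *logc;
+ *	DB_LSN lsn;
+ *	DBT dbt;
+ *
+ *	memset(&dbt, 0, sizeof(dbt));
+ *	if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) == 0) {
+ *		while (logc->get(logc, &lsn, &dbt, DB_NEXT) == 0)
+ *			;	(each record's type is in its first 4 bytes)
+ *		(void)logc->close(logc, 0);
+ *	}
+ */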
+
+/*
+ * __log_find --
+ * Try to find a log file. If find_first is set, valp will contain
+ * the number of the first readable log file, else it will contain the number
+ * of the last log file (which may be too old to read).
+ *
+ * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
+ */
+int
+__log_find(dblp, find_first, valp, statusp)
+ DB_LOG *dblp;
+ int find_first;
+ u_int32_t *valp;
+ logfile_validity *statusp;
+{
+ ENV *env;
+ LOG *lp;
+ logfile_validity logval_status, status;
+ struct __db_filestart *filestart;
+ u_int32_t clv, logval;
+ int cnt, fcnt, ret;
+ const char *dir;
+ char *c, **names, *p, *q;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+ logval_status = status = DB_LV_NONEXISTENT;
+
+ /* Return a value of 0 as the log file number on failure. */
+ *valp = 0;
+
+ if (lp->db_log_inmemory) {
+ filestart = find_first ?
+ SH_TAILQ_FIRST(&lp->logfiles, __db_filestart) :
+ SH_TAILQ_LAST(&lp->logfiles, links, __db_filestart);
+ if (filestart != NULL) {
+ *valp = filestart->file;
+ logval_status = DB_LV_NORMAL;
+ }
+ *statusp = logval_status;
+ return (0);
+ }
+
+ /* Find the directory name. */
+ if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) {
+ __os_free(env, p);
+ return (ret);
+ }
+ if ((q = __db_rpath(p)) == NULL)
+ dir = PATH_DOT;
+ else {
+ *q = '\0';
+ dir = p;
+ }
+
+ /* Get the list of file names. */
+retry: if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
+ __db_err(env, ret, "%s", dir);
+ __os_free(env, p);
+ return (ret);
+ }
+
+ /* Search for a valid log file name. */
+ for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
+ if (!IS_LOG_FILE(names[cnt]))
+ continue;
+
+ /*
+ * Names of the form log\.[0-9]* are reserved for DB. Other
+ * names sharing LFPREFIX, such as "log.db", are legal.
+ */
+ for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
+ if (!isdigit((int)*c))
+ break;
+ if (*c != '\0')
+ continue;
+
+ /*
+ * Use atol, not atoi; if an "int" is 16-bits, the largest
+ * log file name won't fit.
+ */
+ clv = (u_int32_t)atol(names[cnt] + (sizeof(LFPREFIX) - 1));
+
+ /*
+ * If searching for the first log file, we want to return the
+ * oldest log file we can read, or, if no readable log files
+ * exist, the newest log file we can't read (the crossover
+ * point between the old and new versions of the log file).
+ *
+ * If we're searching for the last log file, we want to return
+ * the newest log file, period.
+ *
+		 * files; that would mean the admin seriously screwed up.
+ * files, that would mean the admin seriously screwed up.
+ */
+ if (find_first) {
+ if (logval != 0 &&
+ status != DB_LV_OLD_UNREADABLE && clv > logval)
+ continue;
+ } else
+ if (logval != 0 && clv < logval)
+ continue;
+
+ if ((ret = __log_valid(dblp, clv, 1, NULL, 0,
+ &status, NULL)) != 0) {
+ /*
+ * If we have raced with removal of a log file since
+ * the call to __os_dirlist, it may no longer exist.
+ * In that case, just go on to the next one. If we're
+ * at the end of the list, all of the log files we saw
+ * initially are gone and we need to get the list again.
+ */
+ if (ret == ENOENT) {
+ ret = 0;
+ if (cnt == 0) {
+ __os_dirfree(env, names, fcnt);
+ goto retry;
+ }
+ continue;
+ }
+ __db_err(env, ret, DB_STR_A("2527",
+ "Invalid log file: %s", "%s"), names[cnt]);
+ goto err;
+ }
+ switch (status) {
+ case DB_LV_NONEXISTENT:
+ /* __log_valid never returns DB_LV_NONEXISTENT. */
+ DB_ASSERT(env, 0);
+ break;
+ case DB_LV_INCOMPLETE:
+ /*
+ * The last log file may not have been initialized --
+ * it's possible to create a log file but not write
+ * anything to it. If performing recovery (that is,
+			 * if find_first isn't set), ignore the file; it's
+ * not interesting. If we're searching for the first
+ * log record, return the file (assuming we don't find
+ * something better), as the "real" first log record
+ * is likely to be in the log buffer, and we want to
+ * set the file LSN for our return.
+ */
+ if (find_first)
+ goto found;
+ break;
+ case DB_LV_OLD_UNREADABLE:
+ /*
+ * If we're searching for the first log file, then we
+ * only want this file if we don't yet have a file or
+ * already have an unreadable file and this one is
+ * newer than that one. If we're searching for the
+ * last log file, we always want this file because we
+ * wouldn't be here if it wasn't newer than our current
+ * choice.
+ */
+ if (!find_first || logval == 0 ||
+ (status == DB_LV_OLD_UNREADABLE && clv > logval))
+ goto found;
+ break;
+ case DB_LV_NORMAL:
+ case DB_LV_OLD_READABLE:
+found: logval = clv;
+ logval_status = status;
+ break;
+ }
+ }
+
+ *valp = logval;
+
+err: __os_dirfree(env, names, fcnt);
+ __os_free(env, p);
+ *statusp = logval_status;
+
+ return (ret);
+}
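+
+/*
+ * For reference: on-disk log files are named with a ten-digit file number
+ * ("log.0000000001", "log.0000000002", ...), so the loop above accepts
+ * those names but skips a name such as "log.db", which shares the prefix
+ * without being all digits.
+ */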
+
+/*
+ * __log_valid --
+ *	Validate a log file.  Returns an error code in the event of
+ *	a fatal flaw in the specified log file; returns success with
+ * a code indicating the currentness and completeness of the specified
+ * log file if it is not unexpectedly flawed (that is, if it's perfectly
+ * normal, if it's zero-length, or if it's an old version).
+ *
+ * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int,
+ * PUBLIC: DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
+ */
+int
+__log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
+ DB_LOG *dblp;
+ u_int32_t number;
+ int set_persist;
+ DB_FH **fhpp;
+ u_int32_t flags;
+ logfile_validity *statusp;
+ u_int32_t *versionp;
+{
+ DB_CIPHER *db_cipher;
+ DB_FH *fhp;
+ ENV *env;
+ HDR *hdr;
+ LOG *lp;
+ LOGP *persist;
+ logfile_validity status;
+ size_t hdrsize, nr, recsize;
+ int chksum_includes_hdr, is_hmac, ret;
+ u_int32_t logversion;
+ u_int8_t *tmp;
+ char *fname;
+
+ env = dblp->env;
+ db_cipher = env->crypto_handle;
+ fhp = NULL;
+ persist = NULL;
+ status = DB_LV_NORMAL;
+ tmp = NULL;
+#if defined(HAVE_LOG_CHECKSUM)
+ /* Most log versions include the hdr in the checksum. */
+ chksum_includes_hdr = 1;
+#else
+ COMPQUIET(chksum_includes_hdr, 0);
+#endif
+
+ /* Return the file handle to our caller, on request */
+ if (fhpp != NULL)
+ *fhpp = NULL;
+
+ if (flags == 0)
+ flags = DB_OSO_RDONLY | DB_OSO_SEQ;
+ /* Try to open the log file. */
+ if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) {
+ __os_free(env, fname);
+ return (ret);
+ }
+
+ hdrsize = HDR_NORMAL_SZ;
+ is_hmac = 0;
+ recsize = sizeof(LOGP);
+ if (CRYPTO_ON(env)) {
+ hdrsize = HDR_CRYPTO_SZ;
+ recsize = sizeof(LOGP);
+ recsize += db_cipher->adj_size(recsize);
+ is_hmac = 1;
+ }
+ if ((ret = __os_calloc(env, 1, recsize + hdrsize, &tmp)) != 0)
+ goto err;
+
+ hdr = (HDR *)tmp;
+ persist = (LOGP *)(tmp + hdrsize);
+
+ /*
+ * Try to read the header. This can fail if the log is truncated, or
+ * if we find a preallocated log file where the header has not yet been
+ * written, so we need to check whether the header is zero-filled.
+ */
+ if ((ret = __os_read(env, fhp, tmp, recsize + hdrsize, &nr)) != 0 ||
+ nr != recsize + hdrsize ||
+ (hdr->len == 0 && persist->magic == 0 && persist->log_size == 0)) {
+ if (ret == 0)
+ status = DB_LV_INCOMPLETE;
+ else
+ /*
+ * The error was a fatal read error, not just an
+ * incompletely initialized log file.
+ */
+ __db_err(env, ret, DB_STR_A("2528",
+ "ignoring log file: %s", "%s"), fname);
+ goto err;
+ }
+
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ /*
+ * Now we have to validate the persistent record. We have
+ * several scenarios we have to deal with:
+ *
+ * 1. User has crypto turned on:
+ * - They're reading an old, unencrypted log file
+ * . We will fail the record size match check below.
+ * - They're reading a current, unencrypted log file
+ * . We will fail the record size match check below.
+ * - They're reading an old, encrypted log file [NOT YET]
+ * . After decryption we'll fail the version check. [NOT YET]
+ * - They're reading a current, encrypted log file
+ * . We should proceed as usual.
+ * 2. User has crypto turned off:
+ * - They're reading an old, unencrypted log file
+ * . We will fail the version check.
+ * - They're reading a current, unencrypted log file
+ * . We should proceed as usual.
+ * - They're reading an old, encrypted log file [NOT YET]
+ * . We'll fail the magic number check (it is encrypted).
+ * - They're reading a current, encrypted log file
+ * . We'll fail the magic number check (it is encrypted).
+ */
+ if (CRYPTO_ON(env)) {
+ /*
+ * If we are trying to decrypt an unencrypted log
+ * we can only detect that by having an unreasonable
+ * data length for our persistent data.
+ */
+		if ((hdr->len - hdrsize) != sizeof(LOGP)) {
+			__db_errx(env, "log record size mismatch");
+			ret = EINVAL;
+			goto err;
+		}
+ /*
+ * The checksum is calculated from the encrypted data, and,
+ * for recent logs, the fields hdr->{prev,len}.
+ */
+#ifdef HAVE_LOG_CHECKSUM
+ if ((ret = __db_check_chksum(env, hdr, db_cipher,
+ &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0) {
+ /*
+ * The checksum doesn't verify when the header fields
+ * are included; try without the header.
+ */
+
+ if ((ret = __db_check_chksum(env, NULL, db_cipher,
+ &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0)
+ goto bad_checksum;
+ /*
+ * The checksum verifies without the header. Make note
+ * of that, because it is only acceptable when the log
+ * version < DB_LOGCHKSUM. Later, when we determine log
+ * version, we will confirm this.
+ */
+ chksum_includes_hdr = 0;
+ }
+#endif
+
+ if ((ret = db_cipher->decrypt(env, db_cipher->data,
+ &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
+ goto err;
+ }
+
+ /* Swap the header, if necessary. */
+ if (LOG_SWAPPED(env)) {
+ /*
+ * If the magic number is not byte-swapped, we're looking at an
+ * old log that we can no longer read.
+ */
+ if (persist->magic == DB_LOGMAGIC) {
+ __db_errx(env, DB_STR_A("2529",
+ "Ignoring log file: %s historic byte order",
+ "%s"), fname);
+ status = DB_LV_OLD_UNREADABLE;
+ goto err;
+ }
+
+ __log_persistswap(persist);
+ }
+
+ /* Validate the header. */
+ if (persist->magic != DB_LOGMAGIC) {
+ __db_errx(env, DB_STR_A("2530",
+ "Ignoring log file: %s: magic number %lx, not %lx",
+ "%s %lx %lx"), fname,
+ (u_long)persist->magic, (u_long)DB_LOGMAGIC);
+ ret = EINVAL;
+ goto err;
+ }
+
+ logversion = persist->version;
+ /*
+ * Set our status code to indicate whether the log file belongs to an
+ * unreadable or readable old version; leave it alone if and only if
+ * the log file version is the current one.
+ */
+ if (logversion > DB_LOGVERSION) {
+ /* This is a fatal error--the log file is newer than DB. */
+ __db_errx(env, DB_STR_A("2531",
+ "Unacceptable log file %s: unsupported log version %lu",
+ "%s %lu"), fname, (u_long)logversion);
+ ret = EINVAL;
+ goto err;
+ } else if (logversion < DB_LOGOLDVER) {
+ status = DB_LV_OLD_UNREADABLE;
+ /* This is a non-fatal error, but give some feedback. */
+ __db_errx(env, DB_STR_A("2532",
+ "Skipping log file %s: historic log version %lu", "%s %lu"),
+ fname, (u_long)logversion);
+ /*
+ * We don't want to set persistent info based on an unreadable
+ * region, so jump to "err".
+ */
+ goto err;
+ } else if (logversion < DB_LOGVERSION)
+ status = DB_LV_OLD_READABLE;
+
+ /*
+ * We could not check the checksum before checking the magic and version
+ * because old log headers put the length and checksum in a different
+ * location.
+ */
+#ifdef HAVE_LOG_CHECKSUM
+ if (CRYPTO_ON(env)) {
+ /*
+ * We might have to declare a checksum failure here, if:
+ * - the checksum verified only by ignoring the header, and
+ * - the log version indicates that the header should have
+ * been included.
+ */
+		if (!chksum_includes_hdr && logversion >= DB_LOGCHKSUM) {
+			ret = EINVAL;
+			goto bad_checksum;
+		}
+ } else {
+ /*
+ * The checksum was calculated with the swapped byte order. We
+ * might need to swap them back; the check needs the same bytes.
+ */
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ /*
+ * We have the logversion here, so we know whether to include
+ * the hdr or not.
+ */
+ if ((ret = __db_check_chksum(env,
+ logversion >= DB_LOGCHKSUM ? hdr : NULL, db_cipher,
+ &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0) {
+bad_checksum:
+ __db_errx(env, DB_STR("2533",
+ "log record checksum mismatch"));
+ goto err;
+ }
+
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ }
+#endif
+
+ /*
+ * If the log is readable so far and we're doing system initialization,
+ * set the region's persistent information based on the headers.
+ *
+ * Override the current log file size.
+ */
+ if (set_persist) {
+ lp = dblp->reginfo.primary;
+ lp->log_size = persist->log_size;
+ lp->persist.version = logversion;
+ }
+ if (versionp != NULL)
+ *versionp = logversion;
+
+err: if (fname != NULL)
+ __os_free(env, fname);
+ if (ret == 0 && fhpp != NULL)
+ *fhpp = fhp;
+ else
+ /* Must close on error or if we only used it locally. */
+ (void)__os_closehandle(env, fhp);
+ if (tmp != NULL)
+ __os_free(env, tmp);
+
+ if (statusp != NULL)
+ *statusp = status;
+
+ return (ret);
+}
+
+/*
+ * __log_env_refresh --
+ * Clean up after the log system on a close or failed open.
+ *
+ * PUBLIC: int __log_env_refresh __P((ENV *));
+ */
+int
+__log_env_refresh(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ REGINFO *reginfo;
+ struct __fname *fnp;
+ struct __db_commit *commit;
+ struct __db_filestart *filestart;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ reginfo = &dblp->reginfo;
+ lp = reginfo->primary;
+ ret = 0;
+
+ /*
+ * Flush the log if it's private -- there's no Berkeley DB guarantee
+ * that this gets done, but in case the application has forgotten to
+ * flush for durability, it's the polite thing to do.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (t_ret = __log_flush(env, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbreg_close_files(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * After we close the files, check for any unlogged closes left in
+ * the shared memory queue. If we find any, try to log it, otherwise
+ * return the error. We cannot say the environment was closed
+ * cleanly.
+ */
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
+ (t_ret = __dbreg_close_id_int(
+ env, fnp, DBREG_CLOSE, 1)) != 0)
+ ret = t_ret;
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+	 * filesystem-backed or system shared memory regions; that memory isn't
+ * owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ /* Discard the flush mutex. */
+ if ((t_ret =
+ __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the buffer. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
+
+ /* Discard stack of free file IDs. */
+ if (lp->free_fid_stack != INVALID_ROFF)
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lp->free_fid_stack));
+
+ /* Discard the list of in-memory log file markers. */
+ while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
+ __db_filestart)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
+ __db_filestart);
+ __env_alloc_free(reginfo, filestart);
+ }
+
+ while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
+ __db_filestart)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
+ __db_filestart);
+ __env_alloc_free(reginfo, filestart);
+ }
+
+ /* Discard commit queue elements. */
+ while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
+ __db_commit)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
+ __db_commit);
+ __env_alloc_free(reginfo, commit);
+ }
+
+ /* Discard replication bulk buffer. */
+ if (lp->bulk_buf != INVALID_ROFF) {
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lp->bulk_buf));
+ lp->bulk_buf = INVALID_ROFF;
+ }
+ }
+
+ /* Discard the per-thread DBREG mutex. */
+ if ((t_ret = __mutex_free(env, &dblp->mtx_dbreg)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Detach from the region. */
+ if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Close open files, release allocated memory. */
+ if (dblp->lfhp != NULL) {
+ if ((t_ret =
+ __os_closehandle(env, dblp->lfhp)) != 0 && ret == 0)
+ ret = t_ret;
+ dblp->lfhp = NULL;
+ }
+ if (dblp->dbentry != NULL)
+ __os_free(env, dblp->dbentry);
+
+ __os_free(env, dblp);
+
+ env->lg_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __log_get_cached_ckp_lsn --
+ * Retrieve any last checkpoint LSN that we may have found on startup.
+ *
+ * PUBLIC: int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
+ */
+int
+__log_get_cached_ckp_lsn(env, ckp_lsnp)
+ ENV *env;
+ DB_LSN *ckp_lsnp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+ *ckp_lsnp = lp->cached_ckp_lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __log_region_mutex_count --
+ * Return the number of mutexes the log region will need.
+ *
+ * PUBLIC: u_int32_t __log_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__log_region_mutex_count(env)
+ ENV *env;
+{
+ /*
+ * We need a few assorted mutexes, and one per transaction waiting
+ * on the group commit list. We can't know how many that will be,
+ * but it should be bounded by the maximum active transactions.
+ */
+ return (env->dbenv->tx_init + 5);
+}
+
+/*
+ * __log_region_mutex_max --
+ * Return the number of additional mutexes the log region will need.
+ *
+ * PUBLIC: u_int32_t __log_region_mutex_max __P((ENV *));
+ */
+u_int32_t
+__log_region_mutex_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ if ((count = dbenv->tx_max) == 0)
+ count = DEF_MAX_TXNS;
+ if (count < dbenv->tx_init)
+ return (0);
+ return (count - dbenv->tx_init);
+}
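+
+/*
+ * A worked example (editor's sketch, numbers hypothetical): with tx_init
+ * set to 50 and tx_max set to 100, __log_region_mutex_count reserves 55
+ * mutexes up front and __log_region_mutex_max allows 50 more to be
+ * allocated on demand.  If tx_max is unset, DEF_MAX_TXNS stands in for it.
+ */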
+
+/*
+ * __log_region_size --
+ *	Return the amount of space needed for the log region.
+ *	Make the region large enough to hold the log buffer, the
+ *	initial file-name (FNAME) allocation and anything we need
+ *	for mutex system resource recording.
+ *
+ * PUBLIC: size_t __log_region_size __P((ENV *));
+ */
+size_t
+__log_region_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+
+ /* Set the default buffer size, if not otherwise configured. */
+ if (dbenv->lg_bsize == 0)
+ dbenv->lg_bsize = FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
+ LG_BSIZE_INMEM : LG_BSIZE_DEFAULT;
+
+ s = dbenv->lg_bsize;
+ /* Allocate the initial fileid allocation, plus some path name space. */
+ s += dbenv->lg_fileid_init * __env_alloc_size((sizeof(FNAME)) + 16);
+
+ return (s);
+}
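+
+/*
+ * A worked example (editor's sketch, numbers hypothetical): with a 64KB
+ * lg_bsize and lg_fileid_init of 100, the region must hold the 64KB
+ * buffer plus 100 allocations of sizeof(FNAME) + 16 bytes of path-name
+ * space each, as rounded up by __env_alloc_size.
+ */
+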
+/*
+ * __log_region_max --
+ *	Return the amount of extra memory to allocate for logging information.
+ *
+ * PUBLIC: size_t __log_region_max __P((ENV *));
+ */
+size_t
+__log_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+ if (dbenv->lg_fileid_init == 0) {
+ if ((s = dbenv->lg_regionmax) == 0)
+ s = LG_BASE_REGION_SIZE;
+ } else if ((s = dbenv->lg_regionmax) != 0 &&
+ s < dbenv->lg_fileid_init * (__env_alloc_size(sizeof(FNAME)) + 16))
+ s = 0;
+ else if (s != 0)
+ s -= dbenv->lg_fileid_init *
+ (__env_alloc_size(sizeof(FNAME)) + 16);
+
+ return (s);
+}
+
+/*
+ * __log_vtruncate --
+ * This is a virtual truncate. We set up the log indicators to
+ * make everyone believe that the given record is the last one in the
+ * log. Returns with the next valid LSN (i.e., the LSN of the next
+ * record to be written). This is used in replication to discard records
+ * in the log file that do not agree with the master.
+ *
+ * PUBLIC: int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
+ */
+int
+__log_vtruncate(env, lsn, ckplsn, trunclsn)
+ ENV *env;
+ DB_LSN *lsn, *ckplsn, *trunclsn;
+{
+ DBT log_dbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ LOG *lp;
+ u_int32_t bytes, len;
+ size_t offset;
+ int ret, t_ret;
+
+ /* Need to find out the length of this soon-to-be-last record. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ ret = __logc_get(logc, lsn, &log_dbt, DB_SET);
+ len = logc->len;
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ /* Now do the truncate. */
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+
+ /*
+ * Flush the log so we can simply initialize the in-memory buffer
+ * after the truncate.
+ */
+ if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
+ goto err;
+
+ lp->lsn = *lsn;
+ lp->len = len;
+ lp->lsn.offset += lp->len;
+
+ offset = lp->b_off;
+ if (lp->db_log_inmemory && (ret =
+	    __log_inmem_lsnoff(dblp, &lp->lsn, &offset)) != 0)
+		goto err;
+	lp->b_off = (db_size_t)offset;
+
+ /*
+ * I am going to assume that the number of bytes written since
+ * the last checkpoint doesn't exceed a 32-bit number.
+ */
+ DB_ASSERT(env, lp->lsn.file >= ckplsn->file);
+ bytes = 0;
+ if (ckplsn->file != lp->lsn.file) {
+ bytes = lp->log_size - ckplsn->offset;
+ if (lp->lsn.file > ckplsn->file + 1)
+ bytes += lp->log_size *
+ ((lp->lsn.file - ckplsn->file) - 1);
+ bytes += lp->lsn.offset;
+ } else
+ bytes = lp->lsn.offset - ckplsn->offset;
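+	/*
+	 * A worked example (editor's sketch): with 10MB log files
+	 * (log_size == 10000000), ckplsn 3/800000 and a new lsn of
+	 * 5/200000, bytes is (10000000 - 800000) for the rest of file 3,
+	 * plus 10000000 for all of file 4, plus 200000 into file 5.
+	 */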
+
+ lp->stat.st_wc_mbytes += bytes / MEGABYTE;
+ lp->stat.st_wc_bytes += bytes % MEGABYTE;
+
+ /*
+ * If the synced lsn is greater than our new end of log, reset it
+ * to our current end of log.
+ */
+ MUTEX_LOCK(env, lp->mtx_flush);
+ if (LOG_COMPARE(&lp->s_lsn, lsn) > 0)
+ lp->s_lsn = lp->lsn;
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+
+ /* Initialize the in-region buffer to a pristine state. */
+ ZERO_LSN(lp->f_lsn);
+ lp->w_off = lp->lsn.offset;
+
+ if (trunclsn != NULL)
+ *trunclsn = lp->lsn;
+
+ /* Truncate the log to the new point. */
+ if ((ret = __log_zero(env, &lp->lsn)) != 0)
+ goto err;
+
+err: LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __log_is_outdated --
+ * Used by the replication system to identify if a client's logs are too
+ * old.
+ *
+ * PUBLIC: int __log_is_outdated __P((ENV *, u_int32_t, int *));
+ */
+int
+__log_is_outdated(env, fnum, outdatedp)
+ ENV *env;
+ u_int32_t fnum;
+ int *outdatedp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ char *name;
+ int ret;
+ u_int32_t cfile;
+ struct __db_filestart *filestart;
+
+ dblp = env->lg_handle;
+
+ /*
+ * The log represented by env is compared to the file number passed
+ * in fnum. If the log file fnum does not exist and is lower-numbered
+ * than the current logs, return *outdatedp non-zero, else we return 0.
+ */
+ if (FLD_ISSET(env->dbenv->lg_flags, DB_LOG_IN_MEMORY)) {
+ LOG_SYSTEM_LOCK(env);
+ lp = (LOG *)dblp->reginfo.primary;
+ filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ *outdatedp = filestart == NULL ? 0 : (fnum < filestart->file);
+ LOG_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+
+ *outdatedp = 0;
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
+ __os_free(env, name);
+ return (ret);
+ }
+
+ /* If the file exists, we're just fine. */
+ if (__os_exists(env, name, NULL) == 0)
+ goto out;
+
+ /*
+	 * It didn't exist; decide if the file number is too big or
+	 * too small.  If it's too small, we need to indicate that
+	 * the LSN is outdated.
+ */
+ LOG_SYSTEM_LOCK(env);
+ lp = (LOG *)dblp->reginfo.primary;
+ cfile = lp->lsn.file;
+ LOG_SYSTEM_UNLOCK(env);
+
+ if (cfile > fnum)
+ *outdatedp = 1;
+out: __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __log_zero --
+ * Zero out the tail of a log after a truncate.
+ *
+ * PUBLIC: int __log_zero __P((ENV *, DB_LSN *));
+ */
+int
+__log_zero(env, from_lsn)
+ ENV *env;
+ DB_LSN *from_lsn;
+{
+ DB_FH *fhp;
+ DB_LOG *dblp;
+ LOG *lp;
+ struct __db_filestart *filestart, *nextstart;
+ size_t nbytes, len, nw;
+ u_int32_t fn, mbytes, bytes;
+ u_int8_t buf[4096];
+ int ret;
+ char *fname;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ DB_ASSERT(env, LOG_COMPARE(from_lsn, &lp->lsn) <= 0);
+ if (LOG_COMPARE(from_lsn, &lp->lsn) > 0) {
+ __db_errx(env, DB_STR("2534",
+ "Warning: truncating to point beyond end of log"));
+ return (0);
+ }
+
+ if (lp->db_log_inmemory) {
+ /*
+ * Remove the files that are invalidated by this truncate.
+ */
+ for (filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ filestart != NULL; filestart = nextstart) {
+ nextstart = SH_TAILQ_NEXT(filestart,
+ links, __db_filestart);
+ if (filestart->file > from_lsn->file) {
+ SH_TAILQ_REMOVE(&lp->logfiles,
+ filestart, links, __db_filestart);
+ SH_TAILQ_INSERT_HEAD(&lp->free_logfiles,
+ filestart, links, __db_filestart);
+ }
+ }
+
+ return (0);
+ }
+
+ /* Close any open file handles so unlinks don't fail. */
+ if (dblp->lfhp != NULL) {
+ (void)__os_closehandle(env, dblp->lfhp);
+ dblp->lfhp = NULL;
+ }
+
+ /* Throw away any extra log files that we have around. */
+ for (fn = from_lsn->file + 1;; fn++) {
+ if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) {
+ __os_free(env, fname);
+ break;
+ }
+ (void)__os_closehandle(env, fhp);
+ (void)time(&lp->timestamp);
+ ret = __os_unlink(env, fname, 0);
+ __os_free(env, fname);
+ if (ret != 0)
+ return (ret);
+ }
+
+	/* We removed some log files; now zero to the end of the file. */
+ if ((ret =
+ __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) {
+ __os_free(env, fname);
+ return (ret);
+ }
+ __os_free(env, fname);
+ if ((ret = __os_ioinfo(env,
+ NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0)
+ goto err;
+ DB_ASSERT(env, (mbytes * MEGABYTE + bytes) >= from_lsn->offset);
+ len = (mbytes * MEGABYTE + bytes) - from_lsn->offset;
+
+ memset(buf, 0, sizeof(buf));
+
+ /* Initialize the write position. */
+ if ((ret = __os_seek(env, dblp->lfhp, 0, 0, from_lsn->offset)) != 0)
+ goto err;
+
+ while (len > 0) {
+ nbytes = len > sizeof(buf) ? sizeof(buf) : len;
+ if ((ret =
+ __os_write(env, dblp->lfhp, buf, nbytes, &nw)) != 0)
+ goto err;
+ len -= nbytes;
+ }
+
+err: (void)__os_closehandle(env, dblp->lfhp);
+ dblp->lfhp = NULL;
+
+ return (ret);
+}
+
+/*
+ * __log_inmem_lsnoff --
+ * Find the offset in the buffer of a given LSN.
+ *
+ * PUBLIC: int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
+ */
+int
+__log_inmem_lsnoff(dblp, lsnp, offsetp)
+ DB_LOG *dblp;
+ DB_LSN *lsnp;
+ size_t *offsetp;
+{
+ LOG *lp;
+ struct __db_filestart *filestart;
+
+ lp = (LOG *)dblp->reginfo.primary;
+
+ SH_TAILQ_FOREACH(filestart, &lp->logfiles, links, __db_filestart)
+ if (filestart->file == lsnp->file) {
+ *offsetp = (u_int32_t)
+ (filestart->b_off + lsnp->offset) % lp->buffer_size;
+ return (0);
+ }
+
+ return (DB_NOTFOUND);
+}
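+
+/*
+ * A worked example (editor's sketch, numbers hypothetical): with a 1MB
+ * buffer (buffer_size == 1048576), a file whose first record lives at
+ * b_off 900000 and an LSN offset of 200000, the record starts at
+ * (900000 + 200000) % 1048576 == 51424, i.e., the ring buffer wrapped.
+ */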
+
+/*
+ * __log_inmem_newfile --
+ * Records the offset of the beginning of a new file in the in-memory
+ * buffer.
+ *
+ * PUBLIC: int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
+ */
+int
+__log_inmem_newfile(dblp, file)
+ DB_LOG *dblp;
+ u_int32_t file;
+{
+ HDR hdr;
+ LOG *lp;
+ struct __db_filestart *filestart;
+ int ret;
+#ifdef DIAGNOSTIC
+ struct __db_filestart *first, *last;
+#endif
+
+ lp = (LOG *)dblp->reginfo.primary;
+
+ /*
+ * If the log buffer is empty, reuse the filestart entry.
+ */
+ filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ if (filestart != NULL &&
+ RINGBUF_LEN(lp, filestart->b_off, lp->b_off) <=
+ sizeof(HDR) + sizeof(LOGP)) {
+ filestart->file = file;
+ filestart->b_off = lp->b_off;
+ return (0);
+ }
+
+ /*
+ * We write an empty header at the end of every in-memory log file.
+ * This is used during cursor traversal to indicate when to switch the
+ * LSN to the next file.
+ */
+ if (file > 1) {
+ memset(&hdr, 0, sizeof(HDR));
+ __log_inmem_copyin(dblp, lp->b_off, &hdr, sizeof(HDR));
+ lp->b_off = (lp->b_off + sizeof(HDR)) % lp->buffer_size;
+ }
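+	/*
+	 * (Editor's note) A cursor that later reads this zeroed HDR sees
+	 * hdr.len == 0 and treats it as end-of-file, advancing to the
+	 * next file's LSN rather than returning an empty record.
+	 */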
+
+ filestart = SH_TAILQ_FIRST(&lp->free_logfiles, __db_filestart);
+ if (filestart == NULL) {
+ if ((ret = __env_alloc(&dblp->reginfo,
+ sizeof(struct __db_filestart), &filestart)) != 0)
+ return (ret);
+ memset(filestart, 0, sizeof(*filestart));
+ } else
+ SH_TAILQ_REMOVE(&lp->free_logfiles, filestart,
+ links, __db_filestart);
+
+ filestart->file = file;
+ filestart->b_off = lp->b_off;
+
+#ifdef DIAGNOSTIC
+ first = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ last = SH_TAILQ_LAST(&(lp)->logfiles, links, __db_filestart);
+
+ /* Check that we don't wrap. */
+ DB_ASSERT(dblp->env, !first || first == last ||
+ RINGBUF_LEN(lp, first->b_off, lp->b_off) ==
+ RINGBUF_LEN(lp, first->b_off, last->b_off) +
+ RINGBUF_LEN(lp, last->b_off, lp->b_off));
+#endif
+
+ SH_TAILQ_INSERT_TAIL(&lp->logfiles, filestart, links);
+ return (0);
+}
+
+/*
+ * __log_inmem_chkspace --
+ * Ensure that the requested amount of space is available in the buffer,
+ * and invalidate the region.
+ * Note: assumes that the region lock is held on entry.
+ *
+ * PUBLIC: int __log_inmem_chkspace __P((DB_LOG *, size_t));
+ */
+int
+__log_inmem_chkspace(dblp, len)
+ DB_LOG *dblp;
+ size_t len;
+{
+ DB_LSN active_lsn, old_active_lsn;
+ ENV *env;
+ LOG *lp;
+ struct __db_filestart *filestart;
+ size_t offset;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ DB_ASSERT(env, lp->db_log_inmemory);
+
+ /*
+ * Allow room for an extra header so that we don't need to check for
+ * space when switching files.
+ */
+ len += sizeof(HDR);
+
+ /*
+ * If transactions are enabled and we're about to fill available space,
+ * update the active LSN and recheck. If transactions aren't enabled,
+ * don't even bother checking: in that case we can always overwrite old
+ * log records, because we're never going to abort.
+ */
+ while (TXN_ON(env) &&
+ RINGBUF_LEN(lp, lp->b_off, lp->a_off) <= len) {
+ old_active_lsn = lp->active_lsn;
+ active_lsn = lp->lsn;
+
+ /*
+ * Drop the log region lock so we don't hold it while
+ * taking the transaction region lock.
+ */
+ LOG_SYSTEM_UNLOCK(env);
+ ret = __txn_getactive(env, &active_lsn);
+ LOG_SYSTEM_LOCK(env);
+ if (ret != 0)
+ return (ret);
+ active_lsn.offset = 0;
+
+ /* If we didn't make any progress, give up. */
+ if (LOG_COMPARE(&active_lsn, &old_active_lsn) == 0) {
+ __db_errx(env, DB_STR("2535",
+"In-memory log buffer is full (an active transaction spans the buffer)"));
+ return (DB_LOG_BUFFER_FULL);
+ }
+
+ /* Make sure we're moving the region LSN forwards. */
+ if (LOG_COMPARE(&active_lsn, &lp->active_lsn) > 0) {
+ lp->active_lsn = active_lsn;
+ offset = lp->a_off;
+ (void)__log_inmem_lsnoff(dblp, &active_lsn, &offset);
+ lp->a_off = (db_size_t)offset;
+ }
+ }
+
+ /*
+ * Remove the first file if it is invalidated by this write.
+ * Log records can't be bigger than a file, so we only need to
+ * check the first file.
+ */
+ filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ if (filestart != NULL &&
+ RINGBUF_LEN(lp, lp->b_off, filestart->b_off) <= len) {
+ SH_TAILQ_REMOVE(&lp->logfiles, filestart,
+ links, __db_filestart);
+ SH_TAILQ_INSERT_HEAD(&lp->free_logfiles, filestart,
+ links, __db_filestart);
+ lp->f_lsn.file = filestart->file + 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __log_inmem_copyout --
+ * Copies the given number of bytes from the buffer -- no checking.
+ * Note: assumes that the region lock is held on entry.
+ *
+ * PUBLIC: void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
+ */
+void
+__log_inmem_copyout(dblp, offset, buf, size)
+ DB_LOG *dblp;
+ size_t offset;
+ void *buf;
+ size_t size;
+{
+ LOG *lp;
+ size_t nbytes;
+
+ lp = (LOG *)dblp->reginfo.primary;
+ nbytes = (offset + size < lp->buffer_size) ?
+ size : lp->buffer_size - offset;
+ memcpy(buf, dblp->bufp + offset, nbytes);
+ if (nbytes < size)
+ memcpy((u_int8_t *)buf + nbytes, dblp->bufp, size - nbytes);
+}
+
+/*
+ * __log_inmem_copyin --
+ * Copies the given number of bytes into the buffer -- no checking.
+ * Note: assumes that the region lock is held on entry.
+ *
+ * PUBLIC: void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
+ */
+void
+__log_inmem_copyin(dblp, offset, buf, size)
+ DB_LOG *dblp;
+ size_t offset;
+ void *buf;
+ size_t size;
+{
+ LOG *lp;
+ size_t nbytes;
+
+ lp = (LOG *)dblp->reginfo.primary;
+ nbytes = (offset + size < lp->buffer_size) ?
+ size : lp->buffer_size - offset;
+ memcpy(dblp->bufp + offset, buf, nbytes);
+ if (nbytes < size)
+ memcpy(dblp->bufp, (u_int8_t *)buf + nbytes, size - nbytes);
+}
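+
+/*
+ * A worked example of the wraparound (editor's sketch): with
+ * buffer_size == 100, offset == 90 and size == 25, both routines move
+ * 10 bytes at offsets 90..99, then the remaining 15 bytes starting at
+ * offset 0.
+ */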
+
+/*
+ * __log_set_version --
+ * Sets the current version of the log subsystem to the given version.
+ * Essentially this modifies the lp->persist.version field in the
+ * shared memory region. Called when region is initially created
+ * and when replication is starting up or finds a new master.
+ *
+ * PUBLIC: void __log_set_version __P((ENV *, u_int32_t));
+ */
+void
+__log_set_version(env, newver)
+ ENV *env;
+ u_int32_t newver;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ /*
+ * We should be able to update this atomically without locking.
+ */
+ lp->persist.version = newver;
+}
+
+/*
+ * __log_get_oldversion --
+ *	Returns the last log version this environment was working with.
+ *	Since there could be several versions of log files if the user
+ *	upgraded without archiving the logs, we check the version of the
+ *	first log file and compare it to that of the last log file.  If
+ *	they differ, an older log exists, and we walk backward through
+ *	the log files looking for the version of the most recent older
+ *	log file.
+ *
+ * PUBLIC: int __log_get_oldversion __P((ENV *, u_int32_t *));
+ */
+int
+__log_get_oldversion(env, ver)
+ ENV *env;
+ u_int32_t *ver;
+{
+ DBT rec;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ LOG *lp;
+ u_int32_t firstfnum, fnum, lastver, oldver;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ logc = NULL;
+ ret = 0;
+ oldver = DB_LOGVERSION;
+ /*
+	 * If we're using in-memory logs, we're always at the current version.
+ */
+ if (lp->db_log_inmemory) {
+ *ver = oldver;
+ return (0);
+ }
+ memset(&rec, 0, sizeof(rec));
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ /*
+ * Get the version numbers of the first and last log files.
+ */
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ /*
+ * If there is no log file, we'll get DB_NOTFOUND.
+		 * If we get that, return the current version.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ firstfnum = lsn.file;
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
+ goto err;
+ if ((ret = __log_valid(dblp, firstfnum, 0, NULL, 0,
+ NULL, &oldver)) != 0)
+ goto err;
+ /*
+ * If the first and last LSN are in the same file, then we
+ * already have the version in oldver. Return it.
+ */
+ if (firstfnum == lsn.file)
+ goto err;
+
+ /*
+ * Otherwise they're in different files and we call __log_valid
+ * to get the version numbers in both files.
+ */
+ if ((ret = __log_valid(dblp, lsn.file, 0, NULL, 0,
+ NULL, &lastver)) != 0)
+ goto err;
+ /*
+ * If the version numbers are different, walk backward getting
+ * the version of each log file until we find one that is
+ * different than the last.
+ */
+ if (oldver != lastver) {
+ for (fnum = lsn.file - 1; fnum >= firstfnum; fnum--) {
+ if ((ret = __log_valid(dblp, fnum, 0, NULL, 0,
+ NULL, &oldver)) != 0)
+ goto err;
+ if (oldver != lastver)
+ break;
+ }
+ }
+err: if (logc != NULL && ((t_ret = __logc_close(logc)) != 0) && ret == 0)
+ ret = t_ret;
+ if (ret == 0 && ver != NULL)
+ *ver = oldver;
+ return (ret);
+}
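+
+/*
+ * A worked example (editor's sketch, versions hypothetical): with log
+ * files 1..5 where files 1-2 are version 14 and files 3-5 are version
+ * 17, oldver starts as 14 and lastver is 17; the loop walks back from
+ * file 4 and stops at file 2, returning 14 as the most recent older
+ * version.
+ */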
diff --git a/src/log/log_archive.c b/src/log/log_archive.c
new file mode 100644
index 00000000..280a2071
--- /dev/null
+++ b/src/log/log_archive.c
@@ -0,0 +1,643 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __absname __P((ENV *, char *, char *, char **));
+static int __build_data __P((ENV *, char *, char ***));
+static int __cmpfunc __P((const void *, const void *));
+static int __usermem __P((ENV *, char ***));
+
+/*
+ * __log_archive_pp --
+ * ENV->log_archive pre/post processing.
+ *
+ * PUBLIC: int __log_archive_pp __P((DB_ENV *, char **[], u_int32_t));
+ */
+int
+__log_archive_pp(dbenv, listp, flags)
+ DB_ENV *dbenv;
+ char ***listp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_archive", DB_INIT_LOG);
+
+#undef OKFLAGS
+#define OKFLAGS (DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG | DB_ARCH_REMOVE)
+ if (flags != 0) {
+ if ((ret = __db_fchk(
+ env, "DB_ENV->log_archive", flags, OKFLAGS)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "DB_ENV->log_archive",
+ flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "DB_ENV->log_archive",
+ flags, DB_ARCH_REMOVE,
+ DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG)) != 0)
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_archive(env, listp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
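+
+/*
+ * A minimal usage sketch (editor's example; error handling omitted):
+ * list the absolute pathnames of the log files that are no longer
+ * needed, then release the list, which was handed back as a single
+ * chunk of user-allocated memory.
+ *
+ *	char **list, **p;
+ *
+ *	if (dbenv->log_archive(dbenv, &list, DB_ARCH_ABS) == 0 &&
+ *	    list != NULL) {
+ *		for (p = list; *p != NULL; ++p)
+ *			printf("%s\n", *p);
+ *		free(list);
+ *	}
+ */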
+
+/*
+ * __log_archive --
+ * ENV->log_archive. Internal.
+ * PUBLIC: int __log_archive __P((ENV *, char **[], u_int32_t));
+ */
+int
+__log_archive(env, listp, flags)
+ ENV *env;
+ char ***listp;
+ u_int32_t flags;
+{
+ DBT rec;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN stable_lsn;
+ LOG *lp;
+ u_int array_size, n;
+ u_int32_t fnum;
+ int handle_check, ret, t_ret;
+ char **array, **arrayp, *name, *p, *pref;
+#ifdef HAVE_GETCWD
+ char path[DB_MAXPATHLEN];
+#endif
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ array = NULL;
+ name = NULL;
+ ret = 0;
+ COMPQUIET(fnum, 0);
+
+ if (flags != DB_ARCH_REMOVE)
+ *listp = NULL;
+
+ /* There are no log files if logs are in memory. */
+ if (lp->db_log_inmemory) {
+ LF_CLR(~DB_ARCH_DATA);
+ if (flags == 0)
+ return (0);
+ }
+
+ /*
+ * Check if the user wants the list of log files to remove and we're
+ * at a bad time in replication initialization.
+ */
+ handle_check = 0;
+ if (!LF_ISSET(DB_ARCH_DATA) &&
+ !LF_ISSET(DB_ARCH_LOG)) {
+ /*
+		 * If we're locked out, just return success; no files
+		 * can be archived right now.  Pass any other error
+		 * back to the caller.
+ */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __archive_rep_enter(env)) != 0) {
+ if (ret == DB_REP_LOCKOUT)
+ ret = 0;
+ return (ret);
+ }
+ }
+
+ /*
+ * Prepend the original absolute pathname if the user wants an
+ * absolute path to the database environment directory.
+ */
+#ifdef HAVE_GETCWD
+ if (LF_ISSET(DB_ARCH_ABS)) {
+ /*
+ * XXX
+ * Can't trust getcwd(3) to set a valid errno, so don't display
+ * one unless we know it's good. It's likely a permissions
+ * problem: use something bland and useless in the default
+ * return value, so we don't send somebody off in the wrong
+ * direction.
+ */
+ __os_set_errno(0);
+ if (getcwd(path, sizeof(path)) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, DB_STR("2570",
+ "no absolute path for the current directory"));
+ goto err;
+ }
+ pref = path;
+ } else
+#endif
+ pref = NULL;
+
+ LF_CLR(DB_ARCH_ABS);
+ switch (flags) {
+ case DB_ARCH_DATA:
+ ret = __build_data(env, pref, listp);
+ goto err;
+ case DB_ARCH_LOG:
+ memset(&rec, 0, sizeof(rec));
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+#ifdef UMRW
+ ZERO_LSN(stable_lsn);
+#endif
+ ret = __logc_get(logc, &stable_lsn, &rec, DB_LAST);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ fnum = stable_lsn.file;
+ break;
+ case DB_ARCH_REMOVE:
+ __log_autoremove(env);
+ goto err;
+ case 0:
+ ret = __log_get_stable_lsn(env, &stable_lsn, 1);
+ /*
+ * A return of DB_NOTFOUND means the checkpoint LSN
+ * is before the beginning of the log files we have.
+ * This is not an error; it just means we're done.
+ */
+ if (ret != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ /* Remove any log files before the last stable LSN. */
+ fnum = stable_lsn.file - 1;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__log_archive");
+ goto err;
+ }
+
+#define LIST_INCREMENT 64
+ /* Get some initial space. */
+	array_size = LIST_INCREMENT;
+ if ((ret = __os_malloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ goto err;
+ array[0] = NULL;
+
+ /* Build an array of the file names. */
+ for (n = 0; fnum > 0; --fnum) {
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
+ __os_free(env, name);
+ goto err;
+ }
+ if (__os_exists(env, name, NULL) != 0) {
+ __os_free(env, name);
+ name = NULL;
+ if (LF_ISSET(DB_ARCH_LOG) && fnum == stable_lsn.file)
+ continue;
+ break;
+ }
+
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ goto err;
+ }
+
+ if (pref != NULL) {
+ if ((ret =
+ __absname(env, pref, name, &array[n])) != 0)
+ goto err;
+ __os_free(env, name);
+ } else if ((p = __db_rpath(name)) != NULL) {
+ if ((ret = __os_strdup(env, p + 1, &array[n])) != 0)
+ goto err;
+ __os_free(env, name);
+ } else
+ array[n] = name;
+
+ name = NULL;
+ array[++n] = NULL;
+ }
+
+ /* If there's nothing to return, we're done. */
+ if (n == 0)
+ goto err;
+
+ /* Sort the list. */
+ qsort(array, (size_t)n, sizeof(char *), __cmpfunc);
+
+ /* Rework the memory. */
+ if ((ret = __usermem(env, &array)) != 0)
+ goto err;
+
+ if (listp != NULL)
+ *listp = array;
+
+ if (0) {
+err: if (array != NULL) {
+ for (arrayp = array; *arrayp != NULL; ++arrayp)
+ __os_free(env, *arrayp);
+ __os_free(env, array);
+ }
+ if (name != NULL)
+ __os_free(env, name);
+ }
+ if (handle_check && (t_ret = __archive_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __log_get_stable_lsn --
+ * Get the stable lsn based on where checkpoints are.
+ *
+ * PUBLIC: int __log_get_stable_lsn __P((ENV *, DB_LSN *, int));
+ */
+int
+__log_get_stable_lsn(env, stable_lsn, group_wide)
+ ENV *env;
+ DB_LSN *stable_lsn;
+ int group_wide;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ LOG *lp;
+ __txn_ckp_args *ckp_args;
+ int ret, t_ret;
+
+ lp = env->lg_handle->reginfo.primary;
+
+ ret = 0;
+ memset(&rec, 0, sizeof(rec));
+ if (!TXN_ON(env)) {
+ if ((ret = __log_get_cached_ckp_lsn(env, stable_lsn)) != 0)
+ goto err;
+ /*
+ * No need to check for a return value of DB_NOTFOUND;
+ * __txn_findlastckp returns 0 if no checkpoint record
+ * is found. Instead of checking the return value, we
+ * check to see if the return LSN has been filled in.
+ */
+ if (IS_ZERO_LSN(*stable_lsn) && (ret =
+ __txn_findlastckp(env, stable_lsn, NULL)) != 0)
+ goto err;
+ /*
+		 * If the LSN has not been filled in, return DB_NOTFOUND
+ * so that the caller knows it may be done.
+ */
+ if (IS_ZERO_LSN(*stable_lsn)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ } else if ((ret = __txn_getckp(env, stable_lsn)) != 0)
+ goto err;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ /*
+ * Read checkpoint records until we find one that is on disk,
+	 * then copy the ckp_lsn to the stable_lsn.
+ */
+ while ((ret = __logc_get(logc, stable_lsn, &rec, DB_SET)) == 0 &&
+ (ret = __txn_ckp_read(env, rec.data, &ckp_args)) == 0) {
+ if (stable_lsn->file < lp->s_lsn.file ||
+ (stable_lsn->file == lp->s_lsn.file &&
+ stable_lsn->offset < lp->s_lsn.offset)) {
+ *stable_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ break;
+ }
+ *stable_lsn = ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * If we have RepMgr, get the minimum group-aware LSN.
+ */
+ if (group_wide && ret == 0 && REP_ON(env) && APP_IS_REPMGR(env) &&
+ (t_ret = __repmgr_stable_lsn(env, stable_lsn)) != 0)
+ ret = t_ret;
+#else
+ COMPQUIET(group_wide, 0);
+#endif
+err:
+ return (ret);
+}
+
+/*
+ * __log_autoremove --
+ * Delete any non-essential log files.
+ *
+ * PUBLIC: void __log_autoremove __P((ENV *));
+ */
+void
+__log_autoremove(env)
+ ENV *env;
+{
+ int ret;
+ char **begin, **list;
+
+ /*
+ * Complain if there's an error, but don't return the error to our
+ * caller. Auto-remove is done when writing a log record, and we
+ * don't want to fail a write, which could fail the corresponding
+ * committing transaction, for a permissions error.
+ */
+ if ((ret = __log_archive(env, &list, DB_ARCH_ABS)) != 0) {
+ if (ret != DB_NOTFOUND)
+ __db_err(env, ret, DB_STR("2571",
+ "log file auto-remove"));
+ return;
+ }
+
+ /* Remove the files. */
+ if (list != NULL) {
+ for (begin = list; *list != NULL; ++list)
+ (void)__os_unlink(env, *list, 0);
+ __os_ufree(env, begin);
+ }
+}
+
+/*
+ * __build_data --
+ * Build a list of datafiles for return.
+ */
+static int
+__build_data(env, pref, listp)
+ ENV *env;
+ char *pref, ***listp;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ __dbreg_register_args *argp;
+ u_int array_size, last, n, nxt;
+ u_int32_t rectype;
+ int ret, t_ret;
+ char **array, **arrayp, **list, **lp, *p, *real_name;
+
+ /* Get some initial space. */
+	array_size = LIST_INCREMENT;
+ if ((ret = __os_malloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ return (ret);
+ array[0] = NULL;
+
+ memset(&rec, 0, sizeof(rec));
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ for (n = 0; (ret = __logc_get(logc, &lsn, &rec, DB_PREV)) == 0;) {
+ if (rec.size < sizeof(rectype)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("2572",
+ "DB_ENV->log_archive: bad log record"));
+ break;
+ }
+
+ LOGCOPY_32(env, &rectype, rec.data);
+ if (rectype != DB___dbreg_register)
+ continue;
+ if ((ret =
+ __dbreg_register_read(env, rec.data, &argp)) != 0) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("2573",
+ "DB_ENV->log_archive: unable to read log record"));
+ break;
+ }
+
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ goto free_continue;
+ }
+
+ if ((ret = __os_strdup(env,
+ argp->name.data, &array[n++])) != 0)
+ goto free_continue;
+ array[n] = NULL;
+
+ if (argp->ftype == DB_QUEUE) {
+ if ((ret = __qam_extent_names(env,
+ argp->name.data, &list)) != 0)
+ goto q_err;
+ for (lp = list;
+ lp != NULL && *lp != NULL; lp++) {
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(env,
+ sizeof(char *) *
+ array_size, &array)) != 0)
+ goto q_err;
+ }
+ if ((ret =
+ __os_strdup(env, *lp, &array[n++])) != 0)
+ goto q_err;
+ array[n] = NULL;
+ }
+q_err: if (list != NULL)
+ __os_free(env, list);
+ }
+free_continue: __os_free(env, argp);
+ if (ret != 0)
+ break;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err1;
+
+ /* If there's nothing to return, we're done. */
+ if (n == 0) {
+ ret = 0;
+ *listp = NULL;
+ goto err1;
+ }
+
+ /* Sort the list. */
+ qsort(array, (size_t)n, sizeof(char *), __cmpfunc);
+
+ /*
+ * Build the real pathnames, discarding nonexistent files and
+ * duplicates.
+ */
+ for (last = nxt = 0; nxt < n;) {
+ /*
+ * Discard duplicates. Last is the next slot we're going
+ * to return to the user, nxt is the next slot that we're
+ * going to consider.
+ */
+ if (last != nxt) {
+ array[last] = array[nxt];
+ array[nxt] = NULL;
+ }
+ for (++nxt; nxt < n &&
+ strcmp(array[last], array[nxt]) == 0; ++nxt) {
+ __os_free(env, array[nxt]);
+ array[nxt] = NULL;
+ }
+
+ /* Get the real name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, array[last], NULL, &real_name)) != 0)
+ goto err2;
+
+ /* If the file doesn't exist, ignore it. */
+ if (__os_exists(env, real_name, NULL) != 0) {
+ __os_free(env, real_name);
+ __os_free(env, array[last]);
+ array[last] = NULL;
+ continue;
+ }
+
+ /* Rework the name as requested by the user. */
+ __os_free(env, array[last]);
+ array[last] = NULL;
+ if (pref != NULL) {
+ ret = __absname(env, pref, real_name, &array[last]);
+ __os_free(env, real_name);
+ if (ret != 0)
+ goto err2;
+ } else if ((p = __db_rpath(real_name)) != NULL) {
+ ret = __os_strdup(env, p + 1, &array[last]);
+ __os_free(env, real_name);
+ if (ret != 0)
+ goto err2;
+ } else
+ array[last] = real_name;
+ ++last;
+ }
+
+ /* NULL-terminate the list. */
+ array[last] = NULL;
+
+ /* Rework the memory. */
+ if ((ret = __usermem(env, &array)) != 0)
+ goto err1;
+
+ *listp = array;
+ return (0);
+
+err2: /*
+ * XXX
+ * We've possibly inserted NULLs into the array list, so clean up a
+ * bit so that the other error processing works.
+ */
+ if (array != NULL)
+ for (; nxt < n; ++nxt)
+ __os_free(env, array[nxt]);
+ /* FALLTHROUGH */
+
+err1: if (array != NULL) {
+ for (arrayp = array; *arrayp != NULL; ++arrayp)
+ __os_free(env, *arrayp);
+ __os_free(env, array);
+ }
+ return (ret);
+}
+
+/*
+ * __absname --
+ * Return an absolute path name for the file.
+ */
+static int
+__absname(env, pref, name, newnamep)
+ ENV *env;
+ char *pref, *name, **newnamep;
+{
+ size_t l_pref, l_name;
+ int isabspath, ret;
+ char *newname;
+
+ l_name = strlen(name);
+ isabspath = __os_abspath(name);
+ l_pref = isabspath ? 0 : strlen(pref);
+
+ /* Malloc space for concatenating the two. */
+ if ((ret = __os_malloc(env,
+ l_pref + l_name + 2, &newname)) != 0)
+ return (ret);
+ *newnamep = newname;
+
+ /* Build the name. If `name' is an absolute path, ignore any prefix. */
+ if (!isabspath) {
+ memcpy(newname, pref, l_pref);
+ if (strchr(PATH_SEPARATOR, newname[l_pref - 1]) == NULL)
+ newname[l_pref++] = PATH_SEPARATOR[0];
+ }
+ memcpy(newname + l_pref, name, l_name + 1);
+
+ return (0);
+}
+
+/*
+ * __usermem --
+ * Create a single chunk of memory that holds the returned information.
+ * If the user has their own malloc routine, use it.
+ */
+static int
+__usermem(env, listp)
+ ENV *env;
+ char ***listp;
+{
+ size_t len;
+ int ret;
+ char **array, **arrayp, **orig, *strp;
+
+ /* Find out how much space we need. */
+ for (len = 0, orig = *listp; *orig != NULL; ++orig)
+ len += sizeof(char *) + strlen(*orig) + 1;
+ len += sizeof(char *);
+
+ /* Allocate it and set up the pointers. */
+ if ((ret = __os_umalloc(env, len, &array)) != 0)
+ return (ret);
+
+ strp = (char *)(array + (orig - *listp) + 1);
+
+ /* Copy the original information into the new memory. */
+ for (orig = *listp, arrayp = array; *orig != NULL; ++orig, ++arrayp) {
+ len = strlen(*orig);
+ memcpy(strp, *orig, len + 1);
+ *arrayp = strp;
+ strp += len + 1;
+
+ __os_free(env, *orig);
+ }
+
+ /* NULL-terminate the list. */
+ *arrayp = NULL;
+
+ __os_free(env, *listp);
+ *listp = array;
+
+ return (0);
+}
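+
+/*
+ * Layout sketch (editor's note): for the input list {"aa", "b"}, the
+ * single allocation returned to the user holds, in order,
+ *
+ *	[ptr to "aa"] [ptr to "b"] [NULL] "aa\0" "b\0"
+ *
+ * so the caller can release the pointer array and every string with a
+ * single call to free() (or the allocator set via DB_ENV->set_alloc).
+ */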
+
+static int
+__cmpfunc(p1, p2)
+ const void *p1, *p2;
+{
+ return (strcmp(*((char * const *)p1), *((char * const *)p2)));
+}
diff --git a/src/log/log_compare.c b/src/log/log_compare.c
new file mode 100644
index 00000000..97b59338
--- /dev/null
+++ b/src/log/log_compare.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+/*
+ * log_compare --
+ *	Compare two LSNs; return 1, 0 or -1 if the first is >, == or < the second.
+ *
+ * EXTERN: int log_compare __P((const DB_LSN *, const DB_LSN *));
+ */
+int
+log_compare(lsn0, lsn1)
+ const DB_LSN *lsn0, *lsn1;
+{
+ return (LOG_COMPARE(lsn0, lsn1));
+}
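+
+/*
+ * A minimal sketch (editor's example): LOG_COMPARE orders LSNs first by
+ * file, then by offset within the file, so:
+ *
+ *	DB_LSN a, b;
+ *	a.file = 2; a.offset = 900;
+ *	b.file = 3; b.offset = 0;
+ *	log_compare(&a, &b);	returns -1, file 2 precedes file 3.
+ */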
+
+/*
+ * __log_check_page_lsn --
+ *	Panic if the page's LSN is past the end of the current log.
+ *
+ * PUBLIC: int __log_check_page_lsn __P((ENV *, DB *, DB_LSN *));
+ */
+int
+__log_check_page_lsn(env, dbp, lsnp)
+ ENV *env;
+ DB *dbp;
+ DB_LSN *lsnp;
+{
+ LOG *lp;
+ int ret;
+
+ lp = env->lg_handle->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+
+ ret = LOG_COMPARE(lsnp, &lp->lsn);
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ if (ret < 0)
+ return (0);
+
+ __db_errx(env, DB_STR_A("2506",
+ "file %s has LSN %lu/%lu, past end of log at %lu/%lu",
+ "%s %lu %lu %lu %lu"),
+ dbp == NULL ||
+ dbp->fname == NULL ? DB_STR_P("unknown") : dbp->fname,
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+ __db_errx(env, DB_STR("2507",
+ "Commonly caused by moving a database from one database environment"));
+ __db_errx(env, DB_STR("2508",
+ "to another without clearing the database LSNs, or by removing all of"));
+ __db_errx(env, DB_STR("2509",
+ "the log files from a database environment"));
+ return (EINVAL);
+}
diff --git a/src/log/log_debug.c b/src/log/log_debug.c
new file mode 100644
index 00000000..32fb2542
--- /dev/null
+++ b/src/log/log_debug.c
@@ -0,0 +1,146 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+static int __log_printf_int __P((ENV *, DB_TXN *, const char *, va_list));
+
+/*
+ * __log_printf_capi --
+ * Write a printf-style format string into the DB log.
+ *
+ * PUBLIC: int __log_printf_capi __P((DB_ENV *, DB_TXN *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__log_printf_capi(DB_ENV *dbenv, DB_TXN *txnid, const char *fmt, ...)
+#else
+__log_printf_capi(dbenv, txnid, fmt, va_alist)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __log_printf_pp(dbenv, txnid, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
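+
+/*
+ * A minimal usage sketch (editor's example; "dbname" and "now" are
+ * hypothetical variables): write a diagnostic record into the log
+ * outside any transaction; the string is formatted into a DIAGNOSTIC
+ * debug record by __log_printf_int below.
+ *
+ *	(void)dbenv->log_printf(dbenv, NULL,
+ *	    "backup of %s finished at %lu", dbname, (u_long)now);
+ */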
+
+/*
+ * __log_printf_pp --
+ * Handle the arguments and call an internal routine to do the work.
+ *
+ * The reason this routine isn't just folded into __log_printf_capi
+ * is that the C++ API has to call a C API routine, and you can
+ * only pass variadic arguments to a single routine.
+ *
+ * PUBLIC: int __log_printf_pp
+ * PUBLIC: __P((DB_ENV *, DB_TXN *, const char *, va_list));
+ */
+int
+__log_printf_pp(dbenv, txnid, fmt, ap)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_list ap;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_printf", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_printf_int(env, txnid, fmt, ap)), 0, ret);
+ va_end(ap);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_printf --
+ * Write a printf-style format string into the DB log.
+ *
+ * PUBLIC: int __log_printf __P((ENV *, DB_TXN *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__log_printf(ENV *env, DB_TXN *txnid, const char *fmt, ...)
+#else
+__log_printf(env, txnid, fmt, va_alist)
+ ENV *env;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __log_printf_int(env, txnid, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __log_printf_int --
+ * Write a printf-style format string into the DB log (internal).
+ */
+static int
+__log_printf_int(env, txnid, fmt, ap)
+ ENV *env;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_list ap;
+{
+ DBT opdbt, msgdbt;
+ DB_LSN lsn;
+ char __logbuf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ if (!DBENV_LOGGING(env)) {
+ __db_errx(env, DB_STR("2510",
+ "Logging not currently permitted"));
+ return (EAGAIN);
+ }
+
+ memset(&opdbt, 0, sizeof(opdbt));
+ opdbt.data = "DIAGNOSTIC";
+ opdbt.size = sizeof("DIAGNOSTIC") - 1;
+
+ memset(&msgdbt, 0, sizeof(msgdbt));
+ msgdbt.data = __logbuf;
+ msgdbt.size = (u_int32_t)vsnprintf(__logbuf, sizeof(__logbuf), fmt, ap);
+
+ return (__db_debug_log(
+ env, txnid, &lsn, 0, &opdbt, -1, &msgdbt, NULL, 0));
+}
diff --git a/src/log/log_get.c b/src/log/log_get.c
new file mode 100644
index 00000000..db30c969
--- /dev/null
+++ b/src/log/log_get.c
@@ -0,0 +1,1626 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/hash.h"
+
+typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK;
+
+static int __logc_close_pp __P((DB_LOGC *, u_int32_t));
+static int __logc_get_pp __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __logc_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __logc_hdrchk __P((DB_LOGC *, DB_LSN *, HDR *, int *));
+static int __logc_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **));
+static int __logc_inregion __P((DB_LOGC *,
+ DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **, int *));
+static int __logc_io __P((DB_LOGC *,
+ u_int32_t, u_int32_t, void *, size_t *, int *));
+static int __logc_ondisk __P((DB_LOGC *,
+ DB_LSN *, DB_LSN *, u_int32_t, HDR *, u_int8_t **, int *));
+static int __logc_set_maxrec __P((DB_LOGC *, char *));
+static int __logc_shortread __P((DB_LOGC *, DB_LSN *, int));
+static int __logc_version_pp __P((DB_LOGC *, u_int32_t *, u_int32_t));
+
+/*
+ * __log_cursor_pp --
+ * ENV->log_cursor
+ *
+ * PUBLIC: int __log_cursor_pp __P((DB_ENV *, DB_LOGC **, u_int32_t));
+ */
+int
+__log_cursor_pp(dbenv, logcp, flags)
+ DB_ENV *dbenv;
+ DB_LOGC **logcp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->log_cursor", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_cursor(env, logcp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_cursor --
+ * Create a log cursor.
+ *
+ * PUBLIC: int __log_cursor __P((ENV *, DB_LOGC **));
+ */
+int
+__log_cursor(env, logcp)
+ ENV *env;
+ DB_LOGC **logcp;
+{
+ DB_LOGC *logc;
+ int ret;
+
+ *logcp = NULL;
+
+ /* Allocate memory for the cursor. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_LOGC), &logc)) != 0)
+ return (ret);
+
+ logc->bp_size = LG_CURSOR_BUF_SIZE;
+ /*
+	 * Start with a positive maximum record size; __logc_set_maxrec
+	 * will refine it once a log file has been examined.
+ */
+ logc->bp_maxrec = MEGABYTE;
+ if ((ret = __os_malloc(env, logc->bp_size, &logc->bp)) != 0) {
+ __os_free(env, logc);
+ return (ret);
+ }
+
+ logc->env = env;
+ logc->close = __logc_close_pp;
+ logc->get = __logc_get_pp;
+ logc->version = __logc_version_pp;
+
+ *logcp = logc;
+ return (0);
+}
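+
+/*
+ * A minimal usage sketch (editor's example; error handling omitted):
+ * walk every record in the log from oldest to newest.  On DB_NOTFOUND
+ * the LSN still holds the last record successfully returned, as the
+ * comment in __logc_get below explains.
+ *
+ *	DB_LOGC *logc;
+ *	DB_LSN lsn;
+ *	DBT rec;
+ *
+ *	memset(&rec, 0, sizeof(rec));
+ *	if (dbenv->log_cursor(dbenv, &logc, 0) == 0) {
+ *		while (logc->get(logc, &lsn, &rec, DB_NEXT) == 0)
+ *			printf("[%lu][%lu] %lu bytes\n", (u_long)lsn.file,
+ *			    (u_long)lsn.offset, (u_long)rec.size);
+ *		(void)logc->close(logc, 0);
+ *	}
+ */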
+
+/*
+ * __logc_close_pp --
+ * DB_LOGC->close pre/post processing.
+ */
+static int
+__logc_close_pp(logc, flags)
+ DB_LOGC *logc;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ if ((ret = __db_fchk(env, "DB_LOGC->close", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__logc_close(logc)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __logc_close --
+ * DB_LOGC->close.
+ *
+ * PUBLIC: int __logc_close __P((DB_LOGC *));
+ */
+int
+__logc_close(logc)
+ DB_LOGC *logc;
+{
+ ENV *env;
+
+ env = logc->env;
+
+ if (logc->fhp != NULL) {
+ (void)__os_closehandle(env, logc->fhp);
+ logc->fhp = NULL;
+ }
+
+ if (logc->dbt.data != NULL)
+ __os_free(env, logc->dbt.data);
+
+ __os_free(env, logc->bp);
+ __os_free(env, logc);
+
+ return (0);
+}
+
+/*
+ * __logc_version_pp --
+ * DB_LOGC->version.
+ */
+static int
+__logc_version_pp(logc, versionp, flags)
+ DB_LOGC *logc;
+ u_int32_t *versionp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ if ((ret = __db_fchk(env, "DB_LOGC->version", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__logc_version(logc, versionp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __logc_version --
+ * DB_LOGC->version.
+ *
+ * PUBLIC: int __logc_version __P((DB_LOGC *, u_int32_t *));
+ */
+int
+__logc_version(logc, versionp)
+ DB_LOGC *logc;
+ u_int32_t *versionp;
+{
+ DBT hdrdbt;
+ DB_LOGC *plogc;
+ DB_LSN plsn;
+ ENV *env;
+ LOGP *persist;
+ int ret, t_ret;
+
+ env = logc->env;
+ if (IS_ZERO_LSN(logc->lsn)) {
+ __db_errx(env, DB_STR("2574", "DB_LOGC->get: unset cursor"));
+ return (EINVAL);
+ }
+ ret = 0;
+ /*
+ * Check if the persist info we have is for the same file
+ * as the current cursor position. If we already have the
+ * information, then we're done. If not, we open a new
+ * log cursor and get the header.
+ *
+ * Since most users walk forward through the log when
+	 * using this feature (e.g., printlog), we're likely to
+ * have the information we need.
+ */
+ if (logc->lsn.file != logc->p_lsn.file) {
+ if ((ret = __log_cursor(env, &plogc)) != 0)
+ return (ret);
+ plsn.file = logc->lsn.file;
+ plsn.offset = 0;
+ plogc->lsn = plsn;
+ memset(&hdrdbt, 0, sizeof(DBT));
+ if ((ret = __logc_get_int(plogc,
+ &plsn, &hdrdbt, DB_SET)) == 0) {
+ persist = (LOGP *)hdrdbt.data;
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ logc->p_lsn = logc->lsn;
+ logc->p_version = persist->version;
+ }
+ if ((t_ret = __logc_close(plogc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ /* Return the version. */
+ if (ret == 0)
+ *versionp = logc->p_version;
+ return (ret);
+}
+
+/*
+ * __logc_get_pp --
+ * DB_LOGC->get pre/post processing.
+ */
+static int
+__logc_get_pp(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
+ DB_LSN *alsn;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ /* Validate arguments. */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_FIRST:
+ case DB_LAST:
+ case DB_NEXT:
+ case DB_PREV:
+ break;
+ case DB_SET:
+ if (IS_ZERO_LSN(*alsn)) {
+ __db_errx(env, DB_STR_A("2575",
+ "DB_LOGC->get: invalid LSN: %lu/%lu", "%lu %lu"),
+ (u_long)alsn->file, (u_long)alsn->offset);
+ return (EINVAL);
+ }
+ break;
+ default:
+ return (__db_ferr(env, "DB_LOGC->get", 1));
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__logc_get(logc, alsn, dbt, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __logc_get --
+ * DB_LOGC->get.
+ *
+ * PUBLIC: int __logc_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+ */
+int
+__logc_get(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
+ DB_LSN *alsn;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_LSN saved_lsn;
+ ENV *env;
+ LOGP *persist;
+ int ret;
+
+ env = logc->env;
+
+ /*
+ * On error, we take care not to overwrite the caller's LSN. This
+ * is because callers looking for the end of the log loop using the
+ * DB_NEXT flag, and expect to take the last successful lsn out of
+ * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND.
+ *
+ * !!!
+ * This line is often flagged an uninitialized memory read during a
+ * Purify or similar tool run, as the application didn't initialize
+ * *alsn. If the application isn't setting the DB_SET flag, there is
+ * no reason it should have initialized *alsn, but we can't know that
+ * and we want to make sure we never overwrite whatever the application
+ * put in there.
+ */
+ saved_lsn = *alsn;
+ /*
+ * If we get one of the log's header records as a result of doing a
+	 * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation; log
+ * file header records aren't useful to applications.
+ */
+ if ((ret = __logc_get_int(logc, alsn, dbt, flags)) != 0) {
+ *alsn = saved_lsn;
+ return (ret);
+ }
+ /*
+	 * The DBT was populated by the call to __logc_get_int; copy the data
+ * out of DB_DBT_USERMEM space if it is there.
+ */
+ if ((ret = __dbt_usercopy(env, dbt)) != 0)
+ return (ret);
+
+ if (alsn->offset == 0 && (flags == DB_FIRST ||
+ flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) {
+ switch (flags) {
+ case DB_FIRST:
+ flags = DB_NEXT;
+ break;
+ case DB_LAST:
+ flags = DB_PREV;
+ break;
+ case DB_NEXT:
+ case DB_PREV:
+ default:
+ break;
+ }
+ /*
+		 * If we're walking the log and we find a persist header,
+		 * store it so that we may use it later if needed.
+ */
+ persist = (LOGP *)dbt->data;
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ logc->p_lsn = *alsn;
+ logc->p_version = persist->version;
+ if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+ __os_free(env, dbt->data);
+ dbt->data = NULL;
+ }
+ if ((ret = __logc_get_int(logc, alsn, dbt, flags)) != 0) {
+ *alsn = saved_lsn;
+ goto err;
+ }
+ }
+
+err: __dbt_userfree(env, dbt, NULL, NULL);
+ return (ret);
+}
+
+/*
+ * __logc_get_int --
+ * Get a log record; internal version.
+ */
+static int
+__logc_get_int(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
+ DB_LSN *alsn;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_CIPHER *db_cipher;
+ DB_LOG *dblp;
+ DB_LSN last_lsn, nlsn;
+ ENV *env;
+ HDR hdr;
+ LOG *lp;
+ RLOCK rlock;
+ logfile_validity status;
+ u_int32_t cnt, logfsz, orig_flags;
+ u_int8_t *rp;
+ int eof, is_hmac, need_cksum, ret;
+ size_t blen;
+#ifdef HAVE_LOG_CHECKSUM
+ u_int32_t i, logtype, version;
+ char chksumbuf[256];
+ u_int8_t ch;
+#endif
+
+ env = logc->env;
+ db_cipher = env->crypto_handle;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ eof = is_hmac = 0;
+ orig_flags = flags; /* flags may be altered later. */
+ blen = 0;
+ logfsz = lp->persist.log_size;
+
+ /*
+ * We don't acquire the log region lock until we need it, and we
+ * release it as soon as we're done.
+ */
+ rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE;
+
+#ifdef HAVE_LOG_CHECKSUM
+nextrec:
+#endif
+ nlsn = logc->lsn;
+ switch (flags) {
+ case DB_NEXT: /* Next log record. */
+ if (!IS_ZERO_LSN(nlsn)) {
+ /* Increment the cursor by the cursor record size. */
+ nlsn.offset += logc->len;
+ break;
+ }
+ flags = DB_FIRST;
+ /* FALLTHROUGH */
+ case DB_FIRST: /* First log record. */
+ /* Find the first log file. */
+ if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0)
+ goto err;
+
+ /*
+ * DB_LV_INCOMPLETE:
+ * Theoretically, the log file we want could be created
+ * but not yet written, the "first" log record must be
+ * in the log buffer.
+ * DB_LV_NORMAL:
+ * DB_LV_OLD_READABLE:
+ * We found a log file we can read.
+ * DB_LV_NONEXISTENT:
+ * No log files exist, the "first" log record must be in
+ * the log buffer.
+ * DB_LV_OLD_UNREADABLE:
+ * No readable log files exist, we're at the cross-over
+ * point between two versions. The "first" log record
+ * must be in the log buffer.
+ */
+ switch (status) {
+ case DB_LV_INCOMPLETE:
+ DB_ASSERT(env, lp->lsn.file == cnt);
+ /* FALLTHROUGH */
+ case DB_LV_NORMAL:
+ case DB_LV_OLD_READABLE:
+ nlsn.file = cnt;
+ break;
+ case DB_LV_NONEXISTENT:
+ nlsn.file = 1;
+ DB_ASSERT(env, lp->lsn.file == nlsn.file);
+ break;
+ case DB_LV_OLD_UNREADABLE:
+ nlsn.file = cnt + 1;
+ DB_ASSERT(env, lp->lsn.file == nlsn.file);
+ break;
+ }
+ nlsn.offset = 0;
+ break;
+ case DB_CURRENT: /* Current log record. */
+ break;
+ case DB_PREV: /* Previous log record. */
+ if (!IS_ZERO_LSN(nlsn)) {
+ /* If at start-of-file, move to the previous file. */
+ if (nlsn.offset == 0) {
+ if (nlsn.file == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ if ((!lp->db_log_inmemory &&
+ (__log_valid(dblp, nlsn.file - 1, 0, NULL,
+ 0, &status, NULL) != 0 ||
+ (status != DB_LV_NORMAL &&
+ status != DB_LV_OLD_READABLE)))) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ --nlsn.file;
+ }
+ nlsn.offset = logc->prev;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST: /* Last log record. */
+ if (rlock == L_NONE) {
+ rlock = L_ACQUIRED;
+ LOG_SYSTEM_LOCK(env);
+ }
+ nlsn.file = lp->lsn.file;
+ nlsn.offset = lp->lsn.offset - lp->len;
+ break;
+ case DB_SET: /* Set log record. */
+ nlsn = *alsn;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__logc_get_int");
+ goto err;
+ }
+
+ if (0) { /* Move to the next file. */
+next_file: ++nlsn.file;
+ nlsn.offset = 0;
+ }
+
+ /*
+ * The above switch statement should have set nlsn to the lsn of
+ * the requested record.
+ */
+
+ if (CRYPTO_ON(env)) {
+ hdr.size = HDR_CRYPTO_SZ;
+ is_hmac = 1;
+ } else {
+ hdr.size = HDR_NORMAL_SZ;
+ is_hmac = 0;
+ }
+
+ /*
+ * Check to see if the record is in the cursor's buffer -- if so,
+ * we'll need to checksum it.
+ */
+ if ((ret = __logc_incursor(logc, &nlsn, &hdr, &rp)) != 0)
+ goto err;
+ if (rp != NULL)
+ goto cksum;
+
+ /*
+ * Look to see if we're moving backward in the log with the last record
+ * coming from the disk -- it means the record can't be in the region's
+ * buffer. Else, check the region's buffer.
+ *
+ * If the record isn't in the region's buffer, then either logs are
+ * in-memory, and we're done, or we're going to have to read the
+ * record from disk. We want to make a point of not reading past the
+ * end of the logical log (after recovery, there may be data after the
+ * end of the logical log, not to mention the log file may have been
+ * pre-allocated). So, zero out last_lsn, and initialize it inside
+ * __logc_inregion -- if it's still zero when we check it in
+ * __logc_ondisk, that's OK, it just means the logical end of the log
+ * isn't an issue for this request.
+ */
+ ZERO_LSN(last_lsn);
+ if (!F_ISSET(logc, DB_LOG_DISK) ||
+ LOG_COMPARE(&nlsn, &logc->lsn) > 0) {
+ F_CLR(logc, DB_LOG_DISK);
+
+ if ((ret = __logc_inregion(logc,
+ &nlsn, &rlock, &last_lsn, &hdr, &rp, &need_cksum)) != 0)
+ goto err;
+ if (rp != NULL) {
+ /*
+ * If we read the entire record from the in-memory log
+ * buffer, we don't need to checksum it, nor do we need
+ * to worry about vtruncate issues.
+ */
+ if (need_cksum)
+ goto cksum;
+ goto from_memory;
+ }
+ if (lp->db_log_inmemory)
+ goto nohdr;
+ }
+
+ /*
+ * We have to read from an on-disk file to retrieve the record.
+ * If we ever can't retrieve the record at offset 0, we're done,
+ * return EOF/DB_NOTFOUND.
+ *
+ * Discard the region lock if we're still holding it, the on-disk
+ * reading routines don't need it.
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+ if ((ret = __logc_ondisk(
+ logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0)
+ goto err;
+
+ /*
+ * If we got a 0-length record, that means we're in the midst of some
+	 * bytes that got 0'd as the result of a vtruncate.  In that case, or
+	 * at the end of a file with DB_NEXT, we're going to have to retry.
+ */
+ if (eof || hdr.len == 0) {
+nohdr: switch (flags) {
+ case DB_LAST:
+ case DB_PREV:
+ /*
+ * We should never get here. If we recover a log
+ * file with 0's at the end, we'll treat the 0'd
+ * headers as the end of log and ignore them. If
+ * we're reading backwards from another file, then
+ * the first record in that new file should have its
+ * prev field set correctly.
+ * First check that the file exists.
+ */
+ if (eof && logc->bp_lsn.file != nlsn.file)
+ __db_errx(env, DB_STR_A("2583",
+ "Log file %d not found, check log directory configuration", "%d"),
+ nlsn.file);
+ else
+ __db_errx(env, DB_STR("2576",
+ "Encountered zero length records while traversing backwards"));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ goto err;
+ case DB_FIRST:
+ case DB_NEXT:
+ /*
+ * Zero'd records always indicate the end of a file,
+ * but only go to the next file once.
+ */
+ if (nlsn.offset != 0)
+ goto next_file;
+ /* FALLTHROUGH */
+ case DB_SET:
+ default:
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+
+ F_SET(logc, DB_LOG_DISK);
+
+cksum: /*
+ * Discard the region lock if we're still holding it. (The path to
+ * get here is we acquired the region lock because of the caller's
+ * flag argument, but we found the record in the in-memory or cursor
+ * buffers. Improbable, but it's easy to avoid.)
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+#ifdef HAVE_LOG_CHECKSUM
+ /*
+ * Checksum: there are two types of errors -- a configuration error
+ * or a checksum mismatch. The former is always bad. The latter is
+ * OK if we're searching for the end of the log, and very, very bad
+ * if we're reading random log records.
+ */
+ if ((ret = __db_check_chksum(env, &hdr, db_cipher,
+ hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) {
+ /*
+ * This might be a log whose checksum does not include the hdr.
+ * Try again without the header, either for logs whose version
+ * is pre-DB_LOGCHKSUM, or for the persist record which contains
+ * the log version. Check for the zero offset first to avoid
+ * unwanted recursion in __logc_version().
+ *
+ * Set the cursor to the LSN we are trying to look at.
+ */
+ last_lsn = logc->lsn;
+ logc->lsn = nlsn;
+ if ((logc->lsn.offset == 0 ||
+ (__logc_version(logc, &version) == 0 &&
+ version < DB_LOGCHKSUM)) &&
+ __db_check_chksum(env, NULL, db_cipher, hdr.chksum,
+ rp + hdr.size, hdr.len - hdr.size, is_hmac) == 0) {
+ logc->lsn = last_lsn;
+ goto from_memory;
+ }
+
+ /*
+ * If we are iterating logs during log verification and basic
+		 * header info is correct, we can skip the failed log record
+		 * and go on to the next one.
+ */
+ if (F_ISSET(logc->env->lg_handle, DBLOG_VERIFYING) &&
+ (orig_flags == DB_FIRST || orig_flags == DB_LAST ||
+ orig_flags == DB_PREV || orig_flags == DB_NEXT) &&
+ hdr.size > 0 && hdr.len > hdr.size && hdr.len < logfsz &&
+ (((flags == DB_FIRST || flags == DB_NEXT) &&
+ hdr.prev == last_lsn.offset) ||
+ ((flags == DB_PREV || flags == DB_LAST) &&
+ last_lsn.offset - hdr.len == nlsn.offset))) {
+
+ flags = orig_flags;
+
+ logc->lsn = nlsn;
+ logc->len = hdr.len;
+ logc->prev = hdr.prev;
+
+ if (flags == DB_LAST)
+ flags = DB_PREV;
+ else if (flags == DB_FIRST)
+ flags = DB_NEXT;
+
+ memset(chksumbuf, 0, 256);
+ blen = 0;
+ for (i = 0; i < DB_MAC_KEY && blen < 256; i++) {
+ ch = hdr.chksum[i];
+ blen = strlen(chksumbuf);
+ snprintf(chksumbuf + blen, 255 - blen,
+ isprint(ch) ||
+ ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ /* Type field is always the first one in the record. */
+ memcpy(&logtype, rp + hdr.size, sizeof(logtype));
+ __db_errx(env, DB_STR_A("2577",
+ "DB_LOGC->get: log record LSN %lu/%lu: "
+ "checksum mismatch, hdr.chksum: %s, hdr.prev: %u, "
+ "hdr.len: %u, log type: %u. Skipping it and "
+ "continuing with the %s one",
+ "%lu %lu %s %u %u %u %s"),
+ (u_long)nlsn.file, (u_long)nlsn.offset, chksumbuf,
+ hdr.prev, hdr.len, logtype, flags == DB_NEXT ?
+ DB_STR_P("next") : DB_STR_P("previous"));
+ goto nextrec;
+ }
+
+ if (F_ISSET(logc, DB_LOG_SILENT_ERR)) {
+ if (ret == -1)
+ ret = EIO;
+ } else if (ret == -1) {
+ __db_errx(env, DB_STR_A("2578",
+ "DB_LOGC->get: log record LSN %lu/%lu: checksum mismatch",
+ "%lu %lu"), (u_long)nlsn.file, (u_long)nlsn.offset);
+ __db_errx(env, DB_STR("2579",
+ "DB_LOGC->get: catastrophic recovery may be required"));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ }
+ logc->lsn = last_lsn;
+ goto err;
+ }
+#endif
+
+from_memory:
+ /*
+ * Discard the region lock if we're still holding it. (The path to
+ * get here is we acquired the region lock because of the caller's
+ * flag argument, but we found the record in the in-memory or cursor
+ * buffers. Improbable, but it's easy to avoid.)
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
+ /* Copy the record into the user's DBT. */
+ if ((ret = __db_retcopy(env, dbt, rp + hdr.size,
+ (u_int32_t)(hdr.len - hdr.size),
+ &logc->dbt.data, &logc->dbt.ulen)) != 0)
+ goto err;
+
+ if (CRYPTO_ON(env)) {
+ if ((ret = db_cipher->decrypt(env, db_cipher->data,
+ hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) {
+ ret = EAGAIN;
+ goto err;
+ }
+ /*
+ * Return the original log record size to the user,
+ * even though we've allocated more than that, possibly.
+ * The log record is decrypted in the user dbt, not in
+ * the buffer, so we must do this here after decryption,
+ * not adjust the len passed to the __db_retcopy call.
+ */
+ dbt->size = hdr.orig_size;
+ }
+
+ /* Update the cursor and the returned LSN. */
+ *alsn = nlsn;
+ logc->lsn = nlsn;
+ logc->len = hdr.len;
+ logc->prev = hdr.prev;
+
+err: if (rlock == L_ACQUIRED)
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __logc_incursor --
+ * Check to see if the requested record is in the cursor's buffer.
+ */
+static int
+__logc_incursor(logc, lsn, hdr, pp)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ ENV *env;
+ u_int8_t *p;
+ int eof;
+
+ env = logc->env;
+ *pp = NULL;
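+	/*
+	 * On a buffer hit, *pp is set to the record; a 0 return with *pp
+	 * still NULL means the record is not entirely in the cursor's
+	 * buffer and the caller must look elsewhere.
+	 */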
+
+ /*
+ * Test to see if the requested LSN could be part of the cursor's
+ * buffer.
+ *
+ * The record must be part of the same file as the cursor's buffer.
+ * The record must start at a byte offset equal to or greater than
+ * the cursor buffer.
+ * The record must not start at a byte offset after the cursor
+ * buffer's end.
+ */
+ if (logc->bp_lsn.file != lsn->file)
+ return (0);
+ if (logc->bp_lsn.offset > lsn->offset)
+ return (0);
+ if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size)
+ return (0);
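+	/*
+	 * In interval terms (an editorial sketch, not in the original
+	 * comments): the buffer covers [bp_lsn.offset, bp_lsn.offset +
+	 * bp_rlen), and [lsn->offset, lsn->offset + hdr->size) must lie
+	 * inside it before we trust the header bytes read below.
+	 */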
+
+ /*
+ * Read the record's header and check if the record is entirely held
+ * in the buffer. If the record is not entirely held, get it again.
+ * (The only advantage in having part of the record locally is that
+ * we might avoid a system call because we already have the HDR in
+ * memory.)
+ *
+ * If the header check fails for any reason, it must be because the
+ * LSN is bogus. Fail hard.
+ */
+ p = logc->bp + (lsn->offset - logc->bp_lsn.offset);
+ memcpy(hdr, p, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+ if (__logc_hdrchk(logc, lsn, hdr, &eof))
+ return (DB_NOTFOUND);
+ if (eof || logc->bp_lsn.offset + logc->bp_rlen < lsn->offset + hdr->len)
+ return (0);
+
+ *pp = p; /* Success. */
+
+ return (0);
+}
+
+/*
+ * __logc_inregion --
+ * Check to see if the requested record is in the region's buffer.
+ */
+static int
+__logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump)
+ DB_LOGC *logc;
+ DB_LSN *lsn, *last_lsn;
+ RLOCK *rlockp;
+ HDR *hdr;
+ u_int8_t **pp;
+ int *need_cksump;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ size_t b_region, len, nr;
+ u_int32_t b_disk;
+ int eof, ret;
+ u_int8_t *p;
+
+ env = logc->env;
+ dblp = env->lg_handle;
+ lp = env->lg_handle->reginfo.primary;
+
+ ret = 0;
+ b_region = 0;
+ *pp = NULL;
+ *need_cksump = 0;
+
+ /* If we haven't yet acquired the log region lock, do so. */
+ if (*rlockp == L_NONE) {
+ *rlockp = L_ACQUIRED;
+ LOG_SYSTEM_LOCK(env);
+ }
+
+ /*
+	 * The routines that read from disk must avoid reading past the
+	 * logical end of the log, so pass that information back to them.
+ *
+ * Since they're reading directly from the disk, they must also avoid
+ * reading past the offset we've written out. If the log was
+ * truncated, it's possible that there are zeroes or garbage on
+ * disk after this offset, and the logical end of the log can
+ * come later than this point if the log buffer isn't empty.
+ */
+ *last_lsn = lp->lsn;
+ if (!lp->db_log_inmemory && last_lsn->offset > lp->w_off)
+ last_lsn->offset = lp->w_off;
+
+ /*
+ * Test to see if the requested LSN could be part of the region's
+ * buffer.
+ *
+	 * During recovery, we read the log files to get the information
+	 * needed to initialize the region.  In that case, the region's lsn
+	 * field will not yet have been filled in, so use only the disk.
+ *
+ * The record must not start at a byte offset after the region buffer's
+ * end, since that means the request is for a record after the end of
+ * the log. Do this test even if the region's buffer is empty -- after
+ * recovery, the log files may continue past the declared end-of-log,
+ * and the disk reading routine will incorrectly attempt to read the
+ * remainder of the log.
+ *
+ * Otherwise, test to see if the region's buffer actually has what we
+ * want:
+ *
+ * The buffer must have some useful content.
+ * The record must be in the same file as the region's buffer and must
+ * start at a byte offset equal to or greater than the region's buffer.
+ */
+ if (IS_ZERO_LSN(lp->lsn))
+ return (0);
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0)
+ return (DB_NOTFOUND);
+ else if (lp->db_log_inmemory) {
+ if ((ret = __log_inmem_lsnoff(dblp, lsn, &b_region)) != 0)
+ return (ret);
+ } else if (lp->b_off == 0 || LOG_COMPARE(lsn, &lp->f_lsn) < 0)
+ return (0);
+
+ /*
+	 * The current contents of the cursor's buffer will be useless for a
+	 * future call: we're about to overwrite it.  Trash it rather than
+	 * trying to make it look correct.
+ */
+ logc->bp_rlen = 0;
+
+ /*
+	 * If the requested LSN is greater than the region buffer's first
+	 * byte, then for a good LSN we know the entire record is in the
+	 * buffer.
+	 *
+	 * If we're given a bad LSN, the "entire" record might not be in
+	 * the buffer, in which case the checksum check will fail.
+	 * __logc_hdrchk made sure our destination buffer is big enough,
+	 * via bp_maxrec, but we also need to make sure we don't run off
+	 * the end of this buffer, the source.
+ *
+ * There is one case where the header check can fail: on a scan through
+ * in-memory logs, when we reach the end of a file we can read an empty
+ * header. In that case, it's safe to return zero, here: it will be
+ * caught in our caller. Otherwise, the LSN is bogus. Fail hard.
+ */
+ if (lp->db_log_inmemory || LOG_COMPARE(lsn, &lp->f_lsn) > 0) {
+ if (!lp->db_log_inmemory)
+ b_region = lsn->offset - lp->w_off;
+ __log_inmem_copyout(dblp, b_region, hdr, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+ if (__logc_hdrchk(logc, lsn, hdr, &eof) != 0)
+ return (DB_NOTFOUND);
+ if (eof)
+ return (0);
+ if (lp->db_log_inmemory) {
+ if (RINGBUF_LEN(lp, b_region, lp->b_off) < hdr->len)
+ return (DB_NOTFOUND);
+ } else if (lsn->offset + hdr->len > lp->w_off + lp->buffer_size)
+ return (DB_NOTFOUND);
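+		/*
+		 * Grow the cursor buffer to twice the record length,
+		 * rounded up to a 128-byte boundary.
+		 */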
+ if (logc->bp_size <= hdr->len) {
+ len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
+ if ((ret =
+ __os_realloc(logc->env, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+ __log_inmem_copyout(dblp, b_region, logc->bp, hdr->len);
+ *pp = logc->bp;
+ return (0);
+ }
+
+ DB_ASSERT(env, !lp->db_log_inmemory);
+
+ /*
+ * There's a partial record, that is, the requested record starts
+ * in a log file and finishes in the region buffer. We have to
+ * find out how many bytes of the record are in the region buffer
+ * so we can copy them out into the cursor buffer. First, check
+ * to see if the requested record is the only record in the region
+ * buffer, in which case we should copy the entire region buffer.
+ *
+ * Else, walk back through the region's buffer to find the first LSN
+ * after the record that crosses the buffer boundary -- we can detect
+ * that LSN, because its "prev" field will reference the record we
+ * want. The bytes we need to copy from the region buffer are the
+ * bytes up to the record we find. The bytes we'll need to allocate
+ * to hold the log record are the bytes between the two offsets.
+ */
+ b_disk = lp->w_off - lsn->offset;
+ if (lp->b_off <= lp->len)
+ b_region = (u_int32_t)lp->b_off;
+ else
+ for (p = dblp->bufp + (lp->b_off - lp->len);;) {
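+			/*
+			 * p starts at the header of the last record in the
+			 * buffer; each hdr.prev link steps one record back
+			 * until we find the record following our target.
+			 */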
+ memcpy(hdr, p, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+ if (hdr->prev == lsn->offset) {
+ b_region = (u_int32_t)(p - dblp->bufp);
+ break;
+ }
+ p = dblp->bufp + (hdr->prev - lp->w_off);
+ }
+
+ /*
+ * If we don't have enough room for the record, we have to allocate
+ * space. We have to do it while holding the region lock, which is
+ * truly annoying, but there's no way around it. This call is why
+ * we allocate cursor buffer space when allocating the cursor instead
+ * of waiting.
+ */
+ if (logc->bp_size <= b_region + b_disk) {
+ len = (size_t)DB_ALIGN((uintmax_t)(b_region + b_disk) * 2, 128);
+ if ((ret = __os_realloc(logc->env, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+
+ /* Copy the region's bytes to the end of the cursor's buffer. */
+ p = (logc->bp + logc->bp_size) - b_region;
+ memcpy(p, dblp->bufp, b_region);
+
+ /* Release the region lock. */
+ if (*rlockp == L_ACQUIRED) {
+ *rlockp = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
+ /*
+	 * Read the rest of the information from disk.  Neither a short read
+	 * nor EOF is acceptable; the bytes we want had better be there.
+ */
+ if (b_disk != 0) {
+ p -= b_disk;
+ nr = b_disk;
+ if ((ret = __logc_io(
+ logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0)
+ return (ret);
+ if (nr < b_disk)
+ return (__logc_shortread(logc, lsn, 0));
+
+ /* We read bytes from the disk, we'll need to checksum them. */
+ *need_cksump = 1;
+ }
+
+ /* Copy the header information into the caller's structure. */
+ memcpy(hdr, p, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __log_hdrswap --
+ * Swap the bytes in a log header from machines with different endianness.
+ *
+ * PUBLIC: void __log_hdrswap __P((HDR *, int));
+ */
+void
+__log_hdrswap(hdr, is_hmac)
+ HDR *hdr;
+ int is_hmac;
+{
+ M_32_SWAP(hdr->prev);
+ M_32_SWAP(hdr->len);
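+	/*
+	 * An HMAC checksum is an opaque byte string and is never swapped;
+	 * only the plain 4-byte checksum is byte-order dependent.
+	 */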
+ if (!is_hmac)
+ P_32_SWAP(hdr->chksum);
+}
+
+/*
+ * __log_persistswap --
+ * Swap the bytes in a log file persistent header from machines with
+ * different endianness.
+ *
+ * PUBLIC: void __log_persistswap __P((LOGP *));
+ */
+void
+__log_persistswap(persist)
+ LOGP *persist;
+{
+ M_32_SWAP(persist->magic);
+ M_32_SWAP(persist->version);
+ M_32_SWAP(persist->log_size);
+ M_32_SWAP(persist->notused);
+}
+
+/*
+ * __logc_ondisk --
+ * Read a record off disk.
+ */
+static int
+__logc_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp)
+ DB_LOGC *logc;
+ DB_LSN *lsn, *last_lsn;
+ u_int32_t flags;
+ int *eofp;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ ENV *env;
+ size_t len, nr;
+ u_int32_t offset;
+ int ret;
+
+ env = logc->env;
+ *eofp = 0;
+
+ nr = hdr->size;
+ if ((ret =
+ __logc_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0)
+ return (ret);
+ if (*eofp)
+ return (0);
+
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ /*
+	 * If the read was successful but we can't read a full header, assume
+	 * we've hit EOF.  We can't verify that the header was partially
+	 * zeroed out, but a write failure is an unlikely cause: the header
+	 * is written with a single write call and is smaller than a sector.
+ */
+ if (nr < hdr->size) {
+ *eofp = 1;
+ return (0);
+ }
+
+ /* Check the HDR. */
+ if ((ret = __logc_hdrchk(logc, lsn, hdr, eofp)) != 0)
+ return (ret);
+ if (*eofp)
+ return (0);
+
+ /*
+ * Regardless of how we return, the previous contents of the cursor's
+ * buffer are useless -- trash it.
+ */
+ logc->bp_rlen = 0;
+
+ /*
+ * Otherwise, we now (finally!) know how big the record is. (Maybe
+ * we should have just stuck the length of the record into the LSN!?)
+ * Make sure we have enough space.
+ */
+ if (logc->bp_size <= hdr->len) {
+ len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
+ if ((ret = __os_realloc(env, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+
+ /*
+ * If we're moving forward in the log file, read this record in at the
+ * beginning of the buffer. Otherwise, read this record in at the end
+ * of the buffer, making sure we don't try and read before the start
+ * of the file. (We prefer positioning at the end because transaction
+ * aborts use DB_SET to move backward through the log and we might get
+ * lucky.)
+ *
+ * Read a buffer's worth, without reading past the logical EOF. The
+ * last_lsn may be a zero LSN, but that's OK, the test works anyway.
+ */
+ if (flags == DB_FIRST || flags == DB_NEXT)
+ offset = lsn->offset;
+ else if (lsn->offset + hdr->len < logc->bp_size)
+ offset = 0;
+ else
+ offset = (lsn->offset + hdr->len) - logc->bp_size;
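+	/*
+	 * A worked example (editorial, assuming a 64KB bp_size): reading
+	 * backward to a record at offset 100000 with hdr->len 200 gives
+	 * offset = (100000 + 200) - 65536 = 34664, so the buffer covers
+	 * [34664, 100200) and ends exactly at the end of the record.
+	 */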
+
+ nr = logc->bp_size;
+ if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset)
+ nr = last_lsn->offset - offset;
+
+ if ((ret =
+ __logc_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0)
+ return (ret);
+
+ /*
+ * We should have at least gotten the bytes up-to-and-including the
+ * record we're reading.
+ */
+ if (nr < (lsn->offset + hdr->len) - offset)
+ return (__logc_shortread(logc, lsn, 1));
+
+ /*
+ * Set up the return information.
+ *
+ * !!!
+ * No need to set the bp_lsn.file field, __logc_io set it for us.
+ */
+ logc->bp_rlen = (u_int32_t)nr;
+ logc->bp_lsn.offset = offset;
+
+ *pp = logc->bp + (lsn->offset - offset);
+
+ return (0);
+}
+
+/*
+ * __logc_hdrchk --
+ *
+ * Check for corrupted HDRs before we use them to allocate memory or find
+ * records.
+ *
+ * If the log files were pre-allocated, a zero-filled HDR structure is the
+ * logical file end. However, we can see buffers filled with 0's during
+ * recovery, too (because multiple log buffers were written asynchronously,
+ * and one made it to disk before a different one that logically precedes
+ * it in the log file).
+ *
+ * Check for impossibly large records. The malloc should fail later, but we
+ * have customers that run mallocs that treat all allocation failures as fatal
+ * errors.
+ *
+ * Note that none of this is necessarily something awful happening. We let
+ * the application hand us any LSN they want, and it could be a pointer into
+ * the middle of a log record, there's no way to tell.
+ */
+static int
+__logc_hdrchk(logc, lsn, hdr, eofp)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ HDR *hdr;
+ int *eofp;
+{
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ /*
+ * Check EOF before we do any other processing.
+ */
+ if (eofp != NULL) {
+ if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) {
+ *eofp = 1;
+ return (0);
+ }
+ *eofp = 0;
+ }
+
+ /*
+ * Sanity check the log record's size.
+ * We must check it after "virtual" EOF above.
+ */
+ if (hdr->len <= hdr->size)
+ goto err;
+
+ /*
+ * If the cursor's max-record value isn't yet set, it means we aren't
+ * reading these records from a log file and no check is necessary.
+ */
+ if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) {
+ /*
+ * If we fail the check, there's the pathological case that
+ * we're reading the last file, it's growing, and our initial
+ * check information was wrong. Get it again, to be sure.
+ */
+ if ((ret = __logc_set_maxrec(logc, NULL)) != 0) {
+ __db_err(env, ret, "DB_LOGC->get");
+ return (ret);
+ }
+ if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec)
+ goto err;
+ }
+ return (0);
+
+err: if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_errx(env, DB_STR_A("2580",
+ "DB_LOGC->get: LSN %lu/%lu: invalid log record header",
+ "%lu %lu"), (u_long)lsn->file, (u_long)lsn->offset);
+ return (EIO);
+}
+
+/*
+ * __logc_io --
+ * Read records from a log file.
+ */
+static int
+__logc_io(logc, fnum, offset, p, nrp, eofp)
+ DB_LOGC *logc;
+ u_int32_t fnum, offset;
+ void *p;
+ size_t *nrp;
+ int *eofp;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ int ret;
+ char *np;
+
+ env = logc->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we've switched files, discard the current file handle and acquire
+ * a new one.
+ */
+ if (logc->fhp != NULL && logc->bp_lsn.file != fnum) {
+ ret = __os_closehandle(env, logc->fhp);
+ logc->fhp = NULL;
+ logc->bp_lsn.file = 0;
+
+ if (ret != 0)
+ return (ret);
+ }
+ if (logc->fhp == NULL) {
+ if ((ret = __log_name(dblp, fnum,
+ &np, &logc->fhp, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
+ /*
+ * If we're allowed to return EOF, assume that's the
+ * problem, set the EOF status flag and return 0.
+ */
+ if (eofp != NULL) {
+ *eofp = 1;
+ ret = 0;
+ } else if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(env, ret, "DB_LOGC->get: %s",
+ np == NULL ? "__log_name failed" : np);
+ __os_free(env, np);
+ return (ret);
+ }
+
+ if ((ret = __logc_set_maxrec(logc, np)) != 0) {
+ __db_err(env, ret, "DB_LOGC->get: %s", np);
+ __os_free(env, np);
+ return (ret);
+ }
+ __os_free(env, np);
+
+ logc->bp_lsn.file = fnum;
+ }
+
+ STAT_INC(env, log, read, lp->stat.st_rcount, fnum);
+ /* Seek to the record's offset and read the data. */
+ if ((ret = __os_io(env, DB_IO_READ,
+ logc->fhp, 0, 0, offset, (u_int32_t)*nrp, p, nrp)) != 0) {
+ if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(env, ret, DB_STR_A("2581",
+ "DB_LOGC->get: LSN: %lu/%lu: read", "%lu %lu"),
+ (u_long)fnum, (u_long)offset);
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __logc_shortread --
+ * Read was short -- return a consistent error message and error.
+ */
+static int
+__logc_shortread(logc, lsn, check_silent)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ int check_silent;
+{
+ if (!check_silent || !F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_errx(logc->env, DB_STR_A("2582",
+ "DB_LOGC->get: LSN: %lu/%lu: short read", "%lu %lu"),
+ (u_long)lsn->file, (u_long)lsn->offset);
+ return (EIO);
+}
+
+/*
+ * __logc_set_maxrec --
+ * Bound the maximum log record size in a log file.
+ */
+static int
+__logc_set_maxrec(logc, np)
+ DB_LOGC *logc;
+ char *np;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ env = logc->env;
+ dblp = env->lg_handle;
+
+ /*
+	 * We don't want to try to allocate huge chunks of memory because
+	 * applications with error-checking mallocs often consider that a
+ * hard failure. If we're about to look at a corrupted record with
+ * a bizarre size, we need to know before trying to allocate space
+ * to hold it. We could read the persistent data at the beginning
+ * of the file but that's hard -- we may have to decrypt it, checksum
+ * it and so on. Stat the file instead.
+ */
+ if (logc->fhp != NULL) {
+ if ((ret = __os_ioinfo(env, np, logc->fhp,
+ &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ if (logc->bp_maxrec < (mbytes * MEGABYTE + bytes))
+ logc->bp_maxrec = mbytes * MEGABYTE + bytes;
+ }
+
+ /*
+ * If reading from the log file currently being written, we could get
+ * an incorrect size, that is, if the cursor was opened on the file
+ * when it had only a few hundred bytes, and then the cursor used to
+ * move forward in the file, after more log records were written, the
+ * original stat value would be wrong. Use the maximum of the current
+ * log file size and the size of the buffer -- that should represent
+ * the max of any log record currently in the file.
+ *
+ * The log buffer size is set when the environment is opened and never
+ * changed, we don't need a lock on it.
+ */
+ lp = dblp->reginfo.primary;
+ if (logc->bp_maxrec < lp->buffer_size)
+ logc->bp_maxrec = lp->buffer_size;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_read_record_pp __P((DB_ENV *, DB **, void *, void *,
+ * PUBLIC: DB_LOG_RECSPEC *, u_int32_t, void **));
+ */
+int
+__log_read_record_pp(dbenv, dbpp, td, recbuf, spec, size, argpp)
+ DB_ENV *dbenv;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ DB_LOG_RECSPEC *spec;
+ u_int32_t size;
+ void **argpp;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ENV_REQUIRES_CONFIG(dbenv->env,
+ dbenv->env->lg_handle, "DB_ENV->log_read_record", DB_INIT_LOG);
+
+ *argpp = NULL;
+ ENV_ENTER(dbenv->env, ip);
+ if ((ret = __os_umalloc(dbenv->env, size + sizeof(DB_TXN), argpp)) != 0)
+ goto done;
+ REPLICATION_WRAP(dbenv->env, (__log_read_record(dbenv->env, dbpp,
+ td, recbuf, spec, size, argpp)), 0, ret);
+ if (ret != 0) {
+ __os_ufree(dbenv->env, *argpp);
+ *argpp = NULL;
+ }
+done: ENV_LEAVE(dbenv->env, ip);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __log_read_record __P((ENV *, DB **, void *, void *,
+ * PUBLIC: DB_LOG_RECSPEC *, u_int32_t, void **));
+ */
+int
+__log_read_record(env, dbpp, td, recbuf, spec, size, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ DB_LOG_RECSPEC *spec;
+ u_int32_t size;
+ void **argpp;
+{
+ DB_LOG_RECSPEC *sp, *np;
+ DB_TXN *txnp;
+ LOG *lp;
+ PAGE *hdrstart;
+ u_int32_t hdrsize, op, uinttmp;
+ u_int8_t *ap, *bp;
+ int has_data, ret, downrev;
+
+ COMPQUIET(has_data, 0);
+ COMPQUIET(hdrsize, 0);
+ COMPQUIET(hdrstart, NULL);
+ COMPQUIET(op, 0);
+ ap = *argpp;
+ /*
+	 * Allocate space for the arg structure and for a transaction
+	 * structure, which will immediately follow it.
+ */
+ if (ap == NULL &&
+ (ret = __os_malloc(env, size + sizeof(DB_TXN), &ap)) != 0)
+ return (ret);
+ txnp = (DB_TXN *)(ap + size);
+ memset(txnp, 0, sizeof(DB_TXN));
+ txnp->td = td;
+ lp = env->lg_handle->reginfo.primary;
+ downrev = lp->persist.version < DB_LOGVERSION_50;
+
+ bp = recbuf;
+
+ /*
+ * The first three fields are always the same in every arg
+ * struct so we know their offsets.
+ */
+ /* type */
+ LOGCOPY_32(env, ap + SSZ(LOG_REC_HEADER, type), bp);
+ bp += sizeof(u_int32_t);
+
+ /* txnp */
+ LOGCOPY_32(env, &txnp->txnid, bp);
+ *(DB_TXN **)(ap + SSZ(LOG_REC_HEADER, txnp)) = txnp;
+ bp += sizeof(txnp->txnid);
+
+ /* Previous LSN */
+ LOGCOPY_TOLSN(env,
+ (DB_LSN *)(ap + SSZ(LOG_REC_HEADER, prev_lsn)), bp);
+ bp += sizeof(DB_LSN);
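+	/*
+	 * Layout so far (a sketch): a 4-byte type, a 4-byte txnid and an
+	 * 8-byte previous LSN; the spec array drives the decoding of all
+	 * fields that follow.
+	 */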
+
+ ret = 0;
+ for (sp = spec; sp->type != LOGREC_Done; sp++) {
+ switch (sp->type) {
+ case LOGREC_DB:
+ LOGCOPY_32(env, &uinttmp, bp);
+ *(u_int32_t*)(ap + sp->offset) = uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(env,
+ txnp, dbpp, (int32_t)uinttmp, 1);
+ }
+ break;
+
+ case LOGREC_ARG:
+ case LOGREC_TIME:
+ case LOGREC_DBOP:
+ LOGCOPY_32(env, ap + sp->offset, bp);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_OP:
+ LOGCOPY_32(env, &op, bp);
+ *(u_int32_t *)(ap + sp->offset) = op;
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_DBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ case LOGREC_HDR:
+ case LOGREC_DATA:
+ case LOGREC_PGDBT:
+ case LOGREC_PGDDBT:
+ memset(ap + sp->offset, 0, sizeof(DBT));
+ LOGCOPY_32(env, &uinttmp, bp);
+ *(u_int32_t*)
+ (ap + sp->offset + SSZ(DBT, size)) = uinttmp;
+ bp += sizeof(u_int32_t);
+ *(void **)(ap + sp->offset + SSZ(DBT, data)) = bp;
+
+ /* Process fields that need to be byte swapped. */
+ switch (sp->type) {
+ case LOGREC_DBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ break;
+ case LOGREC_HDR:
+ if (uinttmp == 0)
+ break;
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_DATA) {
+ has_data = 1;
+ break;
+ }
+ hdrstart = (PAGE *)bp;
+ hdrsize = uinttmp;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_DATA:
+ if (downrev ? LOG_SWAPPED(env) :
+ (dbpp != NULL && *dbpp != NULL &&
+ F_ISSET(*dbpp, DB_AM_SWAP)))
+ __db_recordswap(op, hdrsize,
+ hdrstart, has_data ?
+ ap + sp->offset : NULL, 1);
+ break;
+ case LOGREC_PGDBT:
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_PGDDBT) {
+ has_data = 1;
+ break;
+ }
+
+ hdrstart = (PAGE *)bp;
+ hdrsize = uinttmp;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_PGDDBT:
+ if (dbpp != NULL && *dbpp != NULL &&
+ (downrev ? LOG_SWAPPED(env) :
+ F_ISSET(*dbpp, DB_AM_SWAP)) &&
+ (ret = __db_pageswap(env, *dbpp, hdrstart,
+ hdrsize, has_data == 0 ? NULL :
+ (DBT *)(ap + sp->offset), 1)) != 0)
+ return (ret);
+ break;
+ default:
+ DB_ASSERT(env, sp->type != sp->type);
+ }
+
+ bp += uinttmp;
+ break;
+
+ case LOGREC_POINTER:
+ LOGCOPY_TOLSN(env, (DB_LSN *)(ap + sp->offset), bp);
+ bp += sizeof(DB_LSN);
+ break;
+
+ default:
+ DB_ASSERT(env, sp->type != sp->type);
+ }
+ }
+
+ *argpp = ap;
+ return (ret);
+}
diff --git a/src/log/log_method.c b/src/log/log_method.c
new file mode 100644
index 00000000..d5aec116
--- /dev/null
+++ b/src/log/log_method.c
@@ -0,0 +1,533 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+/*
+ * __log_env_create --
+ * Log specific initialization of the DB_ENV structure.
+ *
+ * PUBLIC: int __log_env_create __P((DB_ENV *));
+ */
+int
+__log_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+ */
+ dbenv->lg_bsize = 0;
+ dbenv->lg_regionmax = 0;
+
+ return (0);
+}
+
+/*
+ * __log_env_destroy --
+ * Log specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __log_env_destroy __P((DB_ENV *));
+ */
+void
+__log_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, NULL);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_bsize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__log_get_lg_bsize(dbenv, lg_bsizep)
+ DB_ENV *dbenv;
+ u_int32_t *lg_bsizep;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_bsize", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lg_bsizep =
+ ((LOG *)env->lg_handle->reginfo.primary)->buffer_size;
+ } else
+ *lg_bsizep = dbenv->lg_bsize;
+ return (0);
+}
+
+/*
+ * __log_set_lg_bsize --
+ * DB_ENV->set_lg_bsize.
+ *
+ * PUBLIC: int __log_set_lg_bsize __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_set_lg_bsize(dbenv, lg_bsize)
+ DB_ENV *dbenv;
+ u_int32_t lg_bsize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lg_bsize");
+
+ dbenv->lg_bsize = lg_bsize;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_filemode __P((DB_ENV *, int *));
+ */
+int
+__log_get_lg_filemode(dbenv, lg_modep)
+ DB_ENV *dbenv;
+ int *lg_modep;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_filemode", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ ENV_ENTER(env, ip);
+ LOG_SYSTEM_LOCK(env);
+ *lg_modep = ((LOG *)dblp->reginfo.primary)->filemode;
+ LOG_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *lg_modep = dbenv->lg_filemode;
+
+ return (0);
+}
+
+/*
+ * __log_set_lg_filemode --
+ * DB_ENV->set_lg_filemode.
+ *
+ * PUBLIC: int __log_set_lg_filemode __P((DB_ENV *, int));
+ */
+int
+__log_set_lg_filemode(dbenv, lg_mode)
+ DB_ENV *dbenv;
+ int lg_mode;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->set_lg_filemode", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_ENTER(env, ip);
+ LOG_SYSTEM_LOCK(env);
+ lp->filemode = lg_mode;
+ LOG_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->lg_filemode = lg_mode;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_max __P((DB_ENV *, u_int32_t *));
+ */
+int
+__log_get_lg_max(dbenv, lg_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lg_maxp;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_max", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ ENV_ENTER(env, ip);
+ LOG_SYSTEM_LOCK(env);
+ *lg_maxp = ((LOG *)dblp->reginfo.primary)->log_nsize;
+ LOG_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *lg_maxp = dbenv->lg_size;
+
+ return (0);
+}
+
+/*
+ * __log_set_lg_max --
+ * DB_ENV->set_lg_max.
+ *
+ * PUBLIC: int __log_set_lg_max __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_set_lg_max(dbenv, lg_max)
+ DB_ENV *dbenv;
+ u_int32_t lg_max;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->set_lg_max", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_ENTER(env, ip);
+ if ((ret = __log_check_sizes(env, lg_max, 0)) == 0) {
+ LOG_SYSTEM_LOCK(env);
+ lp->log_nsize = lg_max;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->lg_size = lg_max;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_regionmax __P((DB_ENV *, u_int32_t *));
+ */
+int
+__log_get_lg_regionmax(dbenv, lg_regionmaxp)
+ DB_ENV *dbenv;
+ u_int32_t *lg_regionmaxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_regionmax", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lg_regionmaxp =
+ ((LOG *)env->lg_handle->reginfo.primary)->regionmax;
+ } else
+ *lg_regionmaxp = dbenv->lg_regionmax;
+ return (0);
+}
+
+/*
+ * __log_set_lg_regionmax --
+ * DB_ENV->set_lg_regionmax.
+ *
+ * PUBLIC: int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_set_lg_regionmax(dbenv, lg_regionmax)
+ DB_ENV *dbenv;
+ u_int32_t lg_regionmax;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lg_regionmax");
+
+ /* Let's not be silly. */
+ if (lg_regionmax != 0 && lg_regionmax < LG_BASE_REGION_SIZE) {
+ __db_errx(env, DB_STR_A("2569",
+ "log region size must be >= %d",
+ "%d"), LG_BASE_REGION_SIZE);
+ return (EINVAL);
+ }
+
+ dbenv->lg_regionmax = lg_regionmax;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_dir __P((DB_ENV *, const char **));
+ */
+int
+__log_get_lg_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_log_dir;
+ return (0);
+}
+
+/*
+ * __log_set_lg_dir --
+ * DB_ENV->set_lg_dir.
+ *
+ * PUBLIC: int __log_set_lg_dir __P((DB_ENV *, const char *));
+ */
+int
+__log_set_lg_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (dbenv->db_log_dir != NULL)
+ __os_free(env, dbenv->db_log_dir);
+ return (__os_strdup(env, dir, &dbenv->db_log_dir));
+}
+
+/*
+ * __log_get_flags --
+ * DB_ENV->get_flags.
+ *
+ * PUBLIC: void __log_get_flags __P((DB_ENV *, u_int32_t *));
+ */
+void
+__log_get_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ u_int32_t flags;
+
+ env = dbenv->env;
+
+ if ((dblp = env->lg_handle) == NULL)
+ return;
+
+ lp = dblp->reginfo.primary;
+
+ flags = *flagsp;
+ if (lp->db_log_autoremove)
+ LF_SET(DB_LOG_AUTO_REMOVE);
+ else
+ LF_CLR(DB_LOG_AUTO_REMOVE);
+ if (lp->db_log_inmemory)
+ LF_SET(DB_LOG_IN_MEMORY);
+ else
+ LF_CLR(DB_LOG_IN_MEMORY);
+ *flagsp = flags;
+}
+
+/*
+ * __log_set_flags --
+ * DB_ENV->set_flags.
+ *
+ * PUBLIC: void __log_set_flags __P((ENV *, u_int32_t, int));
+ */
+void
+__log_set_flags(env, flags, on)
+ ENV *env;
+ u_int32_t flags;
+ int on;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ if ((dblp = env->lg_handle) == NULL)
+ return;
+
+ lp = dblp->reginfo.primary;
+
+ if (LF_ISSET(DB_LOG_AUTO_REMOVE))
+ lp->db_log_autoremove = on ? 1 : 0;
+ if (LF_ISSET(DB_LOG_IN_MEMORY))
+ lp->db_log_inmemory = on ? 1 : 0;
+}
+
+/*
+ * List of flags we can handle here.  DB_LOG_IN_MEMORY must be
+ * processed before the region is created, so it gets special
+ * treatment in __log_set_config_int below.
+ */
+#undef OK_FLAGS
+#define OK_FLAGS \
+ (DB_LOG_AUTO_REMOVE | DB_LOG_DIRECT | \
+ DB_LOG_DSYNC | DB_LOG_IN_MEMORY | DB_LOG_ZERO)
+static const FLAG_MAP LogMap[] = {
+ { DB_LOG_AUTO_REMOVE, DBLOG_AUTOREMOVE},
+ { DB_LOG_DIRECT, DBLOG_DIRECT},
+ { DB_LOG_DSYNC, DBLOG_DSYNC},
+ { DB_LOG_IN_MEMORY, DBLOG_INMEMORY},
+ { DB_LOG_ZERO, DBLOG_ZERO}
+};
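+/*
+ * Each LogMap entry pairs a public DB_LOG_* flag with the internal
+ * DBLOG_* bit kept in the DB_LOG handle, letting __env_map_flags and
+ * __env_fetch_flags translate between the two name spaces.
+ */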
+/*
+ * __log_get_config --
+ * Configure the logging subsystem.
+ *
+ * PUBLIC: int __log_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__log_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ ENV *env;
+ DB_LOG *dblp;
+ u_int32_t flags;
+
+ env = dbenv->env;
+ if (FLD_ISSET(which, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->log_get_config", 0));
+ dblp = env->lg_handle;
+ ENV_REQUIRES_CONFIG(env, dblp, "DB_ENV->log_get_config", DB_INIT_LOG);
+
+ __env_fetch_flags(LogMap, sizeof(LogMap), &dblp->flags, &flags);
+ __log_get_flags(dbenv, &flags);
+ if (LF_ISSET(which))
+ *onp = 1;
+ else
+ *onp = 0;
+
+ return (0);
+}
+
+/*
+ * __log_set_config --
+ * Configure the logging subsystem.
+ *
+ * PUBLIC: int __log_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__log_set_config(dbenv, flags, on)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ int on;
+{
+ return (__log_set_config_int(dbenv, flags, on, 0));
+}
+/*
+ * __log_set_config_int --
+ * Configure the logging subsystem.
+ *
+ * PUBLIC: int __log_set_config_int __P((DB_ENV *, u_int32_t, int, int));
+ */
+int
+__log_set_config_int(dbenv, flags, on, in_open)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ int on;
+ int in_open;
+{
+ ENV *env;
+ DB_LOG *dblp;
+ u_int32_t mapped_flags;
+
+ env = dbenv->env;
+ dblp = env->lg_handle;
+ if (FLD_ISSET(flags, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->log_set_config", 0));
+ ENV_NOT_CONFIGURED(env, dblp, "DB_ENV->log_set_config", DB_INIT_LOG);
+ if (LF_ISSET(DB_LOG_DIRECT) && __os_support_direct_io() == 0) {
+ __db_errx(env,
+"DB_ENV->log_set_config: direct I/O either not configured or not supported");
+ return (EINVAL);
+ }
+
+ if (LOGGING_ON(env)) {
+ if (!in_open && LF_ISSET(DB_LOG_IN_MEMORY) &&
+ ((LOG *)dblp->reginfo.primary)->db_log_inmemory == 0)
+ ENV_ILLEGAL_AFTER_OPEN(env,
+ "DB_ENV->log_set_config: DB_LOG_IN_MEMORY");
+ __log_set_flags(env, flags, on);
+ mapped_flags = 0;
+ __env_map_flags(LogMap, sizeof(LogMap), &flags, &mapped_flags);
+ if (on)
+ F_SET(dblp, mapped_flags);
+ else
+ F_CLR(dblp, mapped_flags);
+ } else {
+ /*
+ * DB_LOG_IN_MEMORY, DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC
+ * are mutually incompatible. If we're setting one of them,
+ * clear all current settings.
+ */
+ if (on && LF_ISSET(DB_LOG_IN_MEMORY))
+ F_CLR(dbenv,
+ DB_ENV_TXN_NOSYNC | DB_ENV_TXN_WRITE_NOSYNC);
+
+ if (on)
+ FLD_SET(dbenv->lg_flags, flags);
+ else
+ FLD_CLR(dbenv->lg_flags, flags);
+ }
+
+ return (0);
+}
+
+/*
+ * __log_check_sizes --
+ * Makes sure that the log file size and log buffer size are compatible.
+ *
+ * PUBLIC: int __log_check_sizes __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__log_check_sizes(env, lg_max, lg_bsize)
+ ENV *env;
+ u_int32_t lg_max;
+ u_int32_t lg_bsize;
+{
+ DB_ENV *dbenv;
+ LOG *lp;
+ int inmem;
+
+ dbenv = env->dbenv;
+
+ if (LOGGING_ON(env)) {
+ lp = env->lg_handle->reginfo.primary;
+ inmem = lp->db_log_inmemory;
+ lg_bsize = lp->buffer_size;
+ } else
+ inmem = (FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) != 0);
+
+ if (inmem) {
+ if (lg_bsize == 0)
+ lg_bsize = LG_BSIZE_INMEM;
+ if (lg_max == 0)
+ lg_max = LG_MAX_INMEM;
+
+ if (lg_bsize <= lg_max) {
+ __db_errx(env,
+ "in-memory log buffer must be larger than the log file size");
+ return (EINVAL);
+ }
+ }
+
+ return (0);
+}
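+
+/*
+ * A minimal configuration sketch (editorial, not part of the original
+ * source): for an in-memory log the buffer must be larger than the log
+ * file size, so an application might set, for example:
+ *
+ *	dbenv->log_set_config(dbenv, DB_LOG_IN_MEMORY, 1);
+ *	dbenv->set_lg_bsize(dbenv, 10 * 1024 * 1024);	(10MB buffer)
+ *	dbenv->set_lg_max(dbenv, 1024 * 1024);		(1MB "file" size)
+ */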
diff --git a/src/log/log_print.c b/src/log/log_print.c
new file mode 100644
index 00000000..d2cda519
--- /dev/null
+++ b/src/log/log_print.c
@@ -0,0 +1,380 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+
+static int __log_print_dbregister __P((ENV *, DBT *, DB_LOG *));
+
+/*
+ * PUBLIC: int __log_print_record __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, char *, DB_LOG_RECSPEC *, void *));
+ */
+int
+__log_print_record(env, recbuf, lsnp, name, spec, info)
+ ENV *env;
+ DBT *recbuf;
+ DB_LSN *lsnp;
+ char *name;
+ DB_LOG_RECSPEC *spec;
+ void *info;
+{
+ DB *dbp;
+ DBT dbt;
+ DB_LOG_RECSPEC *sp, *np;
+ DB_LOG *dblp;
+ DB_LSN prev_lsn;
+ DB_MSGBUF msgbuf;
+ LOG *lp;
+ PAGE *hdrstart, *hdrtmp;
+ int32_t inttmp;
+ u_int32_t hdrsize, op, uinttmp;
+ u_int32_t type, txnid;
+ u_int8_t *bp, *datatmp;
+ int has_data, ret, downrev;
+ struct tm *lt;
+ time_t timeval;
+ char time_buf[CTIME_BUFLEN], *s;
+ const char *hdrname;
+
+ COMPQUIET(hdrstart, NULL);
+ COMPQUIET(hdrname, NULL);
+ COMPQUIET(hdrsize, 0);
+ COMPQUIET(has_data, 0);
+ COMPQUIET(op, 0);
+
+ bp = recbuf->data;
+ dblp = info;
+ dbp = NULL;
+ lp = env->lg_handle->reginfo.primary;
+ downrev = lp->persist.version < DB_LOGVERSION_50;
+ DB_MSGBUF_INIT(&msgbuf);
+
+ /*
+ * The first three fields are always the same in every arg
+ * struct so we know their offsets.
+ */
+ /* type */
+ LOGCOPY_32(env, &type, bp);
+ bp += sizeof(u_int32_t);
+
+ /* txnp */
+ LOGCOPY_32(env, &txnid, bp);
+ bp += sizeof(txnid);
+
+ /* Previous LSN */
+ LOGCOPY_TOLSN(env,&prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+ __db_msgadd(env, &msgbuf,
+ "[%lu][%lu]%s%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ name, (type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)type,
+ (u_long)txnid,
+ (u_long)prev_lsn.file, (u_long)prev_lsn.offset);
+
+ for (sp = spec; sp->type != LOGREC_Done; sp++) {
+ switch (sp->type) {
+ case LOGREC_OP:
+ LOGCOPY_32(env, &op, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ __db_msgadd(env, &msgbuf, sp->fmt, OP_MODE_GET(op));
+ __db_msgadd(env, &msgbuf, " ptype: %s\n",
+ __db_pagetype_to_string(OP_PAGE_GET(op)));
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_DB:
+ LOGCOPY_32(env, &inttmp, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: %lu\n",
+ sp->name, (unsigned long)inttmp);
+ bp += sizeof(inttmp);
+ if (dblp != NULL && inttmp < dblp->dbentry_cnt)
+ dbp = dblp->dbentry[inttmp].dbp;
+ break;
+
+ case LOGREC_DBOP:
+ /* Special op for dbreg_register records. */
+ if (dblp != NULL && (ret =
+ __log_print_dbregister(env, recbuf, dblp)) != 0)
+ return (ret);
+ LOGCOPY_32(env, &uinttmp, bp);
+ switch (FLD_ISSET(uinttmp, DBREG_OP_MASK)) {
+ case DBREG_CHKPNT:
+ s = "CHKPNT";
+ break;
+ case DBREG_CLOSE:
+ s = "CLOSE";
+ break;
+ case DBREG_OPEN:
+ s = "OPEN";
+ break;
+ case DBREG_PREOPEN:
+ s = "PREOPEN";
+ break;
+ case DBREG_RCLOSE:
+ s = "RCLOSE";
+ break;
+ case DBREG_REOPEN:
+ s = "REOPEN";
+ break;
+ case DBREG_XCHKPNT:
+ s = "XCHKPNT";
+ break;
+ case DBREG_XOPEN:
+ s = "XOPEN";
+ break;
+ case DBREG_XREOPEN:
+ s = "XREOPEN";
+ break;
+ default:
+ s = "UNKNOWN";
+ break;
+ }
+ __db_msgadd(env, &msgbuf, "\t%s: %s %lx\n", sp->name,
+ s, (unsigned long)(uinttmp & ~DBREG_OP_MASK));
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_ARG:
+ LOGCOPY_32(env, &uinttmp, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ __db_msgadd(env, &msgbuf, sp->fmt, uinttmp);
+ __db_msgadd(env, &msgbuf, "\n");
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_TIME:
+ /* time_t is long but we only store 32 bits. */
+ LOGCOPY_32(env, &uinttmp, bp);
+ timeval = uinttmp;
+ lt = localtime(&timeval);
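+			/*
+			 * Note (editorial): the "20%02lu" format below
+			 * assumes a year of 2000 or later, hence the
+			 * tm_year - 100 arithmetic.
+			 */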
+ __db_msgadd(env, &msgbuf,
+ "\t%s: %ld (%.24s, 20%02lu%02lu%02lu%02lu%02lu.%02lu)\n",
+ sp->name, (long)timeval,
+ __os_ctime(&timeval, time_buf),
+ (u_long)lt->tm_year - 100, (u_long)lt->tm_mon+1,
+ (u_long)lt->tm_mday, (u_long)lt->tm_hour,
+ (u_long)lt->tm_min, (u_long)lt->tm_sec);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_PGDBT:
+ case LOGREC_PGDDBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ case LOGREC_HDR:
+ case LOGREC_DATA:
+ case LOGREC_DBT:
+ LOGCOPY_32(env, &uinttmp, bp);
+ bp += sizeof(u_int32_t);
+ switch (sp->type) {
+ case LOGREC_HDR:
+ if (uinttmp == 0)
+ break;
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_DATA) {
+ has_data = 1;
+ break;
+ }
+
+ hdrstart = (PAGE*)bp;
+ hdrsize = uinttmp;
+ hdrname = sp->name;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_DATA:
+ if (downrev ? LOG_SWAPPED(env) :
+ (dbp != NULL && F_ISSET(dbp, DB_AM_SWAP)))
+ __db_recordswap(op, hdrsize, hdrstart,
+ (has_data && uinttmp != 0) ?
+ bp : NULL, 1);
+ __db_msgadd(env, &msgbuf, "\t%s: ", hdrname);
+ __db_prbytes(env, &msgbuf,
+ (u_int8_t *)hdrstart, hdrsize);
+ if (has_data == 0 || uinttmp == 0)
+ break;
+ /* FALLTHROUGH */
+ default:
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ pr_data:
+ __db_prbytes(env, &msgbuf, bp, uinttmp);
+ has_data = 0;
+ break;
+ case LOGREC_PGDBT:
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_PGDDBT) {
+ has_data = 1;
+ break;
+ }
+
+ hdrstart = (PAGE*)bp;
+ hdrsize = uinttmp;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_PGDDBT:
+ DB_ASSERT(env, hdrstart != NULL);
+ if (dbp != NULL && (downrev ? LOG_SWAPPED(env) :
+ F_ISSET(dbp, DB_AM_SWAP))) {
+ dbt.data = bp;
+ dbt.size = uinttmp;
+ if ((ret = __db_pageswap(env, dbp,
+ hdrstart, hdrsize, has_data == 0 ?
+ NULL : &dbt, 1)) != 0)
+ return (ret);
+ }
+ if (downrev)
+ goto pr_data;
+ if (ALIGNP_INC(hdrstart,
+ sizeof(u_int32_t)) != hdrstart) {
+ if ((ret = __os_malloc(env,
+ hdrsize, &hdrtmp)) != 0)
+ return (ret);
+ memcpy(hdrtmp, hdrstart, hdrsize);
+ } else
+ hdrtmp = hdrstart;
+ if (has_data == 1 && ALIGNP_INC(bp,
+ sizeof(u_int32_t)) != bp) {
+ if ((ret = __os_malloc(env,
+ uinttmp, &datatmp)) != 0)
+ return (ret);
+ memcpy(datatmp, bp, uinttmp);
+ } else if (has_data == 1)
+ datatmp = bp;
+ else
+ datatmp = NULL;
+ if ((ret = __db_prpage_int(env, &msgbuf,
+ dbp, "\t", hdrtmp,
+ uinttmp, datatmp, DB_PR_PAGE)) != 0)
+ return (ret);
+ has_data = 0;
+ if (hdrtmp != hdrstart)
+ __os_free(env, hdrtmp);
+ if (datatmp != bp && datatmp != NULL)
+ __os_free(env, datatmp);
+ break;
+ case LOGREC_PGLIST:
+ dbt.data = bp;
+ dbt.size = uinttmp;
+ __db_pglist_print(env, &msgbuf, &dbt);
+ break;
+ case LOGREC_LOCKS:
+ dbt.data = bp;
+ dbt.size = uinttmp;
+ __lock_list_print(env, &msgbuf, &dbt);
+ break;
+ }
+ bp += uinttmp;
+ break;
+
+ case LOGREC_POINTER:
+ LOGCOPY_TOLSN(env, &prev_lsn, bp);
+ __db_msgadd(env, &msgbuf,
+ "\t%s: [%lu][%lu]\n", sp->name,
+ (u_long)prev_lsn.file, (u_long)prev_lsn.offset);
+ bp += sizeof(DB_LSN);
+ break;
+ case LOGREC_Done:
+ DB_ASSERT(env, sp->type != LOGREC_Done);
+ }
+ }
+ if (msgbuf.buf != NULL)
+ DB_MSGBUF_FLUSH(env, &msgbuf);
+ else
+ __db_msg(env, "%s", "");
+ return (0);
+}
+
+/*
+ * __log_print_dbregister --
+ * So that we can properly swap and print information from databases
+ * we generate dummy DB handles here. These are real handles that are never
+ * opened but their fileid, meta_pgno and some flags are set properly.
+ * This code uses parallel structures to those in the dbregister code.
+ * The DB_LOG handle passed in must NOT be the real environment handle
+ * since this would confuse actual running transactions if printing is
+ * done while the environment is active.
+ */
+static int
+__log_print_dbregister(env, recbuf, dblp)
+ ENV *env;
+ DBT *recbuf;
+ DB_LOG *dblp;
+{
+ __dbreg_register_args *argp;
+ DB *dbp;
+ DB_ENTRY *dbe;
+ int ret;
+
+ if ((ret = __dbreg_register_read(env, recbuf->data, &argp)) != 0)
+ return (ret);
+
+ if (dblp->dbentry_cnt <= argp->fileid &&
+ (ret = __dbreg_add_dbentry(env, dblp, NULL, argp->fileid)) != 0)
+ goto err;
+ dbe = &dblp->dbentry[argp->fileid];
+ dbp = dbe->dbp;
+
+ switch (FLD_ISSET(argp->opcode, DBREG_OP_MASK)) {
+ case DBREG_CHKPNT:
+ case DBREG_OPEN:
+ case DBREG_REOPEN:
+ case DBREG_XCHKPNT:
+ case DBREG_XOPEN:
+ case DBREG_XREOPEN:
+ if (dbp != NULL) {
+ if (memcmp(dbp->fileid,
+ argp->uid.data, DB_FILE_ID_LEN) == 0 &&
+ dbp->meta_pgno == argp->meta_pgno)
+ goto done;
+ if ((__db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ goto err;
+ dbe->dbp = dbp = NULL;
+ }
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ memcpy(dbp->fileid, argp->uid.data, DB_FILE_ID_LEN);
+ dbp->meta_pgno = argp->meta_pgno;
+ F_SET(dbp, DB_AM_RECOVER);
+ /*
+ * We need to swap bytes if we are on a BIGEND machine XOR
+ * we have a BIGEND database.
+ */
+ if ((F_ISSET(env, ENV_LITTLEENDIAN) == 0) ^
+ (FLD_ISSET(argp->opcode, DBREG_BIGEND) != 0))
+ F_SET(dbp, DB_AM_SWAP);
+ if (FLD_ISSET(argp->opcode, DBREG_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ if (FLD_ISSET(argp->opcode, DBREG_ENCRYPT))
+ F_SET(dbp, DB_AM_ENCRYPT);
+ if (FLD_ISSET(argp->opcode, DBREG_EXCL))
+ F2_SET(dbp, DB2_AM_EXCL);
+ dbe->dbp = dbp;
+ break;
+ case DBREG_CLOSE:
+ case DBREG_RCLOSE:
+ if (dbp == NULL)
+ goto err;
+ if ((__db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ goto err;
+ dbe->dbp = dbp = NULL;
+ break;
+ case DBREG_PREOPEN:
+ break;
+ default:
+ DB_ASSERT(env, argp->opcode != argp->opcode);
+ }
+done:
+err:
+ __os_free(env, argp);
+ return (ret);
+}
diff --git a/src/log/log_put.c b/src/log/log_put.c
new file mode 100644
index 00000000..8f7e23d8
--- /dev/null
+++ b/src/log/log_put.c
@@ -0,0 +1,2041 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc_auto/db_ext.h"
+
+static int __log_encrypt_record __P((ENV *, DBT *, HDR *, u_int32_t));
+static int __log_file __P((ENV *, const DB_LSN *, char *, size_t));
+static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
+static int __log_flush_commit __P((ENV *, const DB_LSN *, u_int32_t));
+static int __log_newfh __P((DB_LOG *, int));
+static int __log_put_next __P((ENV *,
+ DB_LSN *, const DBT *, HDR *, DB_LSN *));
+static int __log_put_record_int __P((ENV *, DB *, DB_TXN *, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, va_list));
+static int __log_putr __P((DB_LOG *,
+ DB_LSN *, const DBT *, u_int32_t, HDR *));
+static int __log_write __P((DB_LOG *, void *, u_int32_t));
+
+/*
+ * __log_put_pp --
+ * ENV->log_put pre/post processing.
+ *
+ * PUBLIC: int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__log_put_pp(dbenv, lsnp, udbt, flags)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+ const DBT *udbt;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
+
+ /* Validate arguments: check for allowed flags. */
+ if ((ret = __db_fchk(env, "DB_ENV->log_put", flags,
+ DB_LOG_CHKPNT | DB_LOG_COMMIT |
+ DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
+ return (ret);
+
+ /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
+ if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
+ return (__db_ferr(env, "DB_ENV->log_put", 1));
+
+ /* Replication clients should never write log records. */
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("2511",
+ "DB_ENV->log_put is illegal on replication clients"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_put(env, lsnp, udbt, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_put --
+ * ENV->log_put.
+ *
+ * PUBLIC: int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__log_put(env, lsnp, udbt, flags)
+ ENV *env;
+ DB_LSN *lsnp;
+ const DBT *udbt;
+ u_int32_t flags;
+{
+ DBT *dbt, t;
+ DB_CIPHER *db_cipher;
+ DB_LOG *dblp;
+ DB_LSN lsn, old_lsn;
+ DB_REP *db_rep;
+ HDR hdr;
+ LOG *lp;
+ REP *rep;
+ int lock_held, need_free, ret;
+ u_int8_t *key;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ db_cipher = env->crypto_handle;
+ db_rep = env->rep_handle;
+ if (db_rep != NULL)
+ rep = db_rep->region;
+ else
+ rep = NULL;
+
+ dbt = &t;
+ t = *udbt;
+ lock_held = need_free = 0;
+ ZERO_LSN(old_lsn);
+ hdr.len = hdr.prev = 0;
+
+ /*
+ * In general, if we are not a rep application, but are sharing a master
+ * rep env, we should not be writing log records. However, we can allow
+ * a non-replication-aware process to join a pre-existing repmgr
+	 * environment, if the env handle meets repmgr's DB_THREAD requirement.
+ */
+
+ if (IS_REP_MASTER(env) && db_rep->send == NULL) {
+#ifdef HAVE_REPLICATION_THREADS
+ if (F_ISSET(env, ENV_THREAD) && APP_IS_REPMGR(env)) {
+ if ((ret = __repmgr_autostart(env)) != 0)
+ return (ret);
+ } else
+#endif
+ {
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP)
+ __db_errx(env, DB_STR("2512",
+ "Non-replication DB_ENV handle attempting "
+ "to modify a replicated environment"));
+ return (EINVAL);
+#endif
+ }
+ }
+ DB_ASSERT(env, !IS_REP_CLIENT(env));
+
+ /*
+ * If we are coming from the logging code, we use an internal flag,
+ * DB_LOG_NOCOPY, because we know we can overwrite/encrypt the log
+ * record in place. Otherwise, if a user called log_put then we
+ * must copy it to new memory so that we know we can write it.
+ *
+ * We also must copy it to new memory if we are a replication master
+ * so that we retain an unencrypted copy of the log record to send
+ * to clients.
+ */
+ if (!LF_ISSET(DB_LOG_NOCOPY) || IS_REP_MASTER(env)) {
+ if (CRYPTO_ON(env))
+ t.size += db_cipher->adj_size(udbt->size);
+ if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
+ goto err;
+ need_free = 1;
+ memcpy(t.data, udbt->data, udbt->size);
+ }
+ if ((ret = __log_encrypt_record(env, dbt, &hdr, udbt->size)) != 0)
+ goto err;
+ if (CRYPTO_ON(env))
+ key = db_cipher->mac_key;
+ else
+ key = NULL;
+#ifdef HAVE_LOG_CHECKSUM
+ __db_chksum(&hdr, dbt->data, dbt->size, key, hdr.chksum);
+#endif
+
+ LOG_SYSTEM_LOCK(env);
+ lock_held = 1;
+
+ if ((ret = __log_put_next(env, &lsn, dbt, &hdr, &old_lsn)) != 0)
+ goto panic_check;
+
+ /*
+ * Assign the return LSN before dropping the region lock. Necessary
+ * in case the lsn is a begin_lsn from a TXN_DETAIL structure passed in
+ * by the logging routines. We use atomic 32-bit operations because
+ * during commit this will be a TXN_DETAIL visible_lsn field, and MVCC
+ * relies on reading the fields atomically.
+ */
+ lsnp->file = lsn.file;
+ lsnp->offset = lsn.offset;
+
+#ifdef HAVE_REPLICATION
+ if (IS_REP_MASTER(env)) {
+ __rep_newfile_args nf_args;
+ DBT newfiledbt;
+ REP_BULK bulk;
+ size_t len;
+ u_int32_t ctlflags;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+
+ /*
+ * Replication masters need to drop the lock to send messages,
+ * but want to drop and reacquire it a minimal number of times.
+ */
+ ctlflags = LF_ISSET(DB_LOG_COMMIT | DB_LOG_CHKPNT) ?
+ REPCTL_PERM : 0;
+ LOG_SYSTEM_UNLOCK(env);
+ lock_held = 0;
+ if (LF_ISSET(DB_FLUSH))
+ ctlflags |= REPCTL_FLUSH;
+
+ /*
+ * If we changed files and we're in a replicated environment,
+ * we need to inform our clients now that we've dropped the
+ * region lock.
+ *
+ * Note that a failed NEWFILE send is a dropped message that
+ * our client can handle, so we can ignore it. It's possible
+ * that the record we already put is a commit, so we don't just
+ * want to return failure.
+ */
+ if (!IS_ZERO_LSN(old_lsn)) {
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ nf_args.version = lp->persist.version;
+ (void)__rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len);
+ DB_INIT_DBT(newfiledbt, buf, len);
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_NEWFILE, &old_lsn, &newfiledbt, 0, 0);
+ }
+
+ /*
+ * If we're doing bulk processing put it in the bulk buffer.
+ */
+ ret = 0;
+ if (FLD_ISSET(rep->config, REP_C_BULK)) {
+ /*
+ * Bulk could have been turned on by another process.
+ * If so, set the address into the bulk region now.
+ */
+ if (db_rep->bulk == NULL)
+ db_rep->bulk = R_ADDR(&dblp->reginfo,
+ lp->bulk_buf);
+ memset(&bulk, 0, sizeof(bulk));
+ bulk.addr = db_rep->bulk;
+ bulk.offp = &lp->bulk_off;
+ bulk.len = lp->bulk_len;
+ bulk.lsn = lsn;
+ bulk.type = REP_BULK_LOG;
+ bulk.eid = DB_EID_BROADCAST;
+ bulk.flagsp = &lp->bulk_flags;
+ ret = __rep_bulk_message(env, &bulk, NULL,
+ &lsn, udbt, ctlflags);
+ }
+ if (!FLD_ISSET(rep->config, REP_C_BULK) ||
+ ret == DB_REP_BULKOVF) {
+ /*
+ * Then send the log record itself on to our clients.
+			 *
+ * !!!
+ * In the crypto case, we MUST send the udbt, not the
+ * now-encrypted dbt. Clients have no way to decrypt
+ * without the header.
+ */
+ ret = __rep_send_message(env, DB_EID_BROADCAST,
+ REP_LOG, &lsn, udbt, ctlflags, 0);
+ }
+ if (FLD_ISSET(ctlflags, REPCTL_PERM)) {
+ LOG_SYSTEM_LOCK(env);
+#ifdef HAVE_STATISTICS
+ if (IS_USING_LEASES(env))
+ rep->stat.st_lease_sends++;
+#endif
+ /*
+ * Keep track of our last PERM lsn. Set this on a
+ * master under the log lock. When using leases, if
+ * we set max_perm_lsn too early (before the send)
+ * then we hit a lot of false invalid lease checks
+ * which all try to refresh and hurt performance.
+ */
+ if (LOG_COMPARE(&lp->max_perm_lsn, &lsn) < 0)
+ lp->max_perm_lsn = lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+ /*
+ * If the send fails and we're a commit or checkpoint,
+ * there's nothing we can do; the record's in the log.
+ * Flush it, even if we're running with TXN_NOSYNC,
+ * on the grounds that it should be in durable
+ * form somewhere.
+ */
+ if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM))
+ LF_SET(DB_FLUSH);
+ /*
+		 * We ignore send failures, so reset 'ret' to 0 here.  We
+		 * had to check the special return values from bulk transfer,
+		 * and errors from either bulk or normal message sending
+		 * require a flush on perm records; otherwise the error is
+		 * ignored, so reset 'ret' now.
+ */
+ ret = 0;
+ }
+#endif
+
+ /*
+ * If needed, do a flush. Note that failures at this point
+ * are only permissible if we know we haven't written a commit
+ * record; __log_flush_commit is responsible for enforcing this.
+ *
+ * If a flush is not needed, see if WRITE_NOSYNC was set and we
+ * need to write out the log buffer.
+ */
+ if (LF_ISSET(DB_FLUSH | DB_LOG_WRNOSYNC)) {
+ if (!lock_held) {
+ LOG_SYSTEM_LOCK(env);
+ lock_held = 1;
+ }
+ if ((ret = __log_flush_commit(env, &lsn, flags)) != 0)
+ goto panic_check;
+ }
+
+ /*
+	 * If we flushed a checkpoint record, reset the "bytes since the last
+ * checkpoint" counters.
+ */
+ if (LF_ISSET(DB_LOG_CHKPNT))
+ lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
+
+ /* Increment count of records added to the log. */
+ STAT(++lp->stat.st_record);
+
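+	/*
+	 * The "if (0)" block below is never entered in the normal flow;
+	 * the panic_check label inside it is reached only via goto when
+	 * a log write fails.
+	 */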
+ if (0) {
+panic_check: /*
+ * Writing log records cannot fail if we're a replication
+ * master. The reason is that once we send the record to
+ * replication clients, the transaction can no longer
+ * abort, otherwise the master would be out of sync with
+ * the rest of the replication group. Panic the system.
+ */
+ if (ret != 0 && IS_REP_MASTER(env))
+ ret = __env_panic(env, ret);
+ }
+
+err: if (lock_held)
+ LOG_SYSTEM_UNLOCK(env);
+ if (need_free)
+ __os_free(env, dbt->data);
+
+ /*
+ * If auto-remove is set and we switched files, remove unnecessary
+ * log files.
+ */
+ if (ret == 0 && !IS_ZERO_LSN(old_lsn) && lp->db_log_autoremove)
+ __log_autoremove(env);
+
+ return (ret);
+}
+
+/*
+ * __log_current_lsn_int --
+ * internal operations of __log_current_lsn
+ *
+ * PUBLIC: int __log_current_lsn_int
+ * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+ */
+int
+__log_current_lsn_int(env, lsnp, mbytesp, bytesp)
+ ENV *env;
+ DB_LSN *lsnp;
+ u_int32_t *mbytesp, *bytesp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+
+	/*
+	 * We need the LSN of the last entry in the log.
+	 *
+	 * Typically, it's easy to get the last written LSN: look at the
+	 * current log pointer and back up the length of the last log
+	 * record.  However, if the last thing we did was write the log
+	 * header of a new log file, that doesn't work, so we return the
+	 * LSN of the first log record that will be written in this new
+	 * file.
+	 */
+ *lsnp = lp->lsn;
+ if (lp->lsn.offset > lp->len)
+ lsnp->offset -= lp->len;
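+	/*
+	 * A worked example (illustrative numbers only, not part of the
+	 * original code): if the last record written was 100 bytes long,
+	 * lp->len is 100; with lp->lsn at, say, [3][4900], the LSN
+	 * returned is [3][4800], the start of that last record.
+	 */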
+
+	/*
+	 * Since we're holding the log region lock, return the bytes put
+	 * into the log since the last checkpoint; the transaction
+	 * checkpoint code needs that information.
+	 *
+	 * We add the current buffer offset so as to count bytes that
+	 * have not yet been written, but are sitting in the log buffer.
+	 */
+ if (mbytesp != NULL) {
+ *mbytesp = lp->stat.st_wc_mbytes;
+ *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
+ }
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __log_current_lsn --
+ * Return the current LSN.
+ *
+ * PUBLIC: int __log_current_lsn
+ * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+ */
+int
+__log_current_lsn(env, lsnp, mbytesp, bytesp)
+ ENV *env;
+ DB_LSN *lsnp;
+ u_int32_t *mbytesp, *bytesp;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+	ENV_ENTER(env, ip);
+	ret = __log_current_lsn_int(env, lsnp, mbytesp, bytesp);
+	ENV_LEAVE(env, ip);
+
+	return (ret);
+}
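+
+/*
+ * A minimal caller sketch (assumed usage, not part of this change):
+ *
+ *	DB_LSN lsn;
+ *	u_int32_t mbytes, bytes;
+ *
+ *	if ((ret = __log_current_lsn(env, &lsn, &mbytes, &bytes)) == 0)
+ *		printf("end of log: [%lu][%lu], %lu MB + %lu bytes "
+ *		    "since last checkpoint\n", (u_long)lsn.file,
+ *		    (u_long)lsn.offset, (u_long)mbytes, (u_long)bytes);
+ */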
+
+/*
+ * __log_put_next --
+ * Put the given record as the next in the log, wherever that may
+ * turn out to be.
+ */
+static int
+__log_put_next(env, lsn, dbt, hdr, old_lsnp)
+ ENV *env;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ HDR *hdr;
+ DB_LSN *old_lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN old_lsn;
+ LOG *lp;
+ int adv_file, newfile, ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * Save a copy of lp->lsn before we might decide to switch log
+ * files and change it. If we do switch log files, and we're
+ * doing replication, we'll need to tell our clients about the
+ * switch, and they need to receive a NEWFILE message
+ * with this "would-be" LSN in order to know they're not
+ * missing any log records.
+ */
+ old_lsn = lp->lsn;
+ newfile = 0;
+ adv_file = 0;
+ /*
+ * If our current log is at an older version and we want to write
+ * a record then we need to advance the log.
+ */
+ if (lp->persist.version != DB_LOGVERSION) {
+ __log_set_version(env, DB_LOGVERSION);
+ adv_file = 1;
+ }
+
+	/*
+	 * If this record won't fit in the current file, if we had to
+	 * advance the log version, or if we're at the beginning of a
+	 * file, swap files.
+	 */
+ if (adv_file || lp->lsn.offset == 0 ||
+ lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
+ if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
+ __db_errx(env, DB_STR_A("2513",
+ "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)",
+ "%lu %lu"),
+ (u_long)hdr->size + sizeof(LOGP) + dbt->size,
+ (u_long)lp->log_size);
+ return (EINVAL);
+ }
+
+ if ((ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
+ return (ret);
+
+ /*
+ * Flag that we switched files, in case we're a master
+ * and need to send this information to our clients.
+ * We postpone doing the actual send until we can
+ * safely release the log region lock and are doing so
+ * anyway.
+ */
+ newfile = 1;
+ }
+
+ /* If we switched log files, let our caller know where. */
+ if (newfile)
+ *old_lsnp = old_lsn;
+
+ /* Actually put the record. */
+ return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
+}
+
+/*
+ * __log_flush_commit --
+ * Flush a record.
+ */
+static int
+__log_flush_commit(env, lsnp, flags)
+ ENV *env;
+ const DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN flush_lsn;
+ HDR hdr;
+ LOG *lp;
+ int ret, t_ret;
+ size_t nr, nw;
+ u_int8_t *buffer;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ flush_lsn = *lsnp;
+
+ ret = 0;
+
+ /*
+ * DB_FLUSH:
+ * Flush a record for which the DB_FLUSH flag to log_put was set.
+ *
+ * DB_LOG_WRNOSYNC:
+ * If there's anything in the current log buffer, write it out.
+ */
+ if (LF_ISSET(DB_FLUSH))
+ ret = __log_flush_int(dblp, &flush_lsn, 1);
+ else if (!lp->db_log_inmemory && lp->b_off != 0)
+ if ((ret = __log_write(dblp,
+ dblp->bufp, (u_int32_t)lp->b_off)) == 0)
+ lp->b_off = 0;
+
+	/*
+	 * If a flush supporting a transaction commit fails, we must
+	 * abort the transaction.  (If we aren't doing a commit, return
+	 * the failure; if the commit we care about made it to disk
+	 * successfully, we just ignore the failure, because there's no
+	 * way to undo the commit.)
+	 */
+ if (ret == 0 || !LF_ISSET(DB_LOG_COMMIT))
+ return (ret);
+
+ if (LF_ISSET(DB_FLUSH) ?
+ flush_lsn.file != lp->s_lsn.file ||
+ flush_lsn.offset < lp->s_lsn.offset :
+ flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
+ return (0);
+
+ if (IS_REP_MASTER(env)) {
+ __db_err(env, ret, DB_STR("2514",
+ "Write failed on MASTER commit."));
+ return (__env_panic(env, ret));
+ }
+
+	/*
+	 * Else, make sure that the commit record does not get out after
+	 * we abort the transaction.  Do this by overwriting the commit
+	 * record in the buffer.  (Note that other commits in this buffer
+	 * will wait until a successful write happens; we do not wake
+	 * them.)  We point at the right part of the buffer and write an
+	 * abort record over the commit.  We must then try to flush the
+	 * buffer again, since the interesting part of the buffer may
+	 * have actually made it out to disk before the failure; we
+	 * can't know for sure.
+	 */
+ if (flush_lsn.offset > lp->w_off) {
+ if ((t_ret = __txn_force_abort(env,
+ dblp->bufp + flush_lsn.offset - lp->w_off)) != 0)
+ return (__env_panic(env, t_ret));
+ } else {
+		/*
+		 * The buffer was written, but it's not on disk; we must
+		 * read it back and force things from a commit state to
+		 * an abort state.  Lots of things could fail here, and
+		 * if they do, we will be left with a commit record on
+		 * disk but return a panic.
+		 */
+		if ((t_ret = __os_seek(env,
+		    dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
+		    (t_ret = __os_read(env, dblp->lfhp, &hdr,
+		    HDR_NORMAL_SZ, &nr)) != 0 || nr != HDR_NORMAL_SZ)
+			return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(&hdr, CRYPTO_ON(env));
+ if ((t_ret = __os_malloc(env, hdr.len, &buffer)) != 0 ||
+ (t_ret = __os_seek(env,
+ dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
+ (t_ret = __os_read(env, dblp->lfhp, buffer,
+ hdr.len, &nr)) != 0 || nr != hdr.len ||
+ (t_ret = __txn_force_abort(env, buffer)) != 0 ||
+ (t_ret = __os_seek(env,
+ dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
+ (t_ret = __os_write(env, dblp->lfhp, buffer,
+ nr, &nw)) != 0 || nw != nr)
+ return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
+ __os_free(env, buffer);
+ }
+	/*
+	 * Try to flush the log again; if the disk just bounced, we want
+	 * to be sure it does not go away again before we write the
+	 * abort record.
+	 */
+ (void)__log_flush_int(dblp, &flush_lsn, 0);
+
+ return (ret);
+}
+
+/*
+ * __log_newfile --
+ * Initialize and switch to a new log file. (Note that this is
+ * called both when no log yet exists and when we fill a log file.)
+ *
+ * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
+ */
+int
+__log_newfile(dblp, lsnp, logfile, version)
+ DB_LOG *dblp;
+ DB_LSN *lsnp;
+ u_int32_t logfile;
+ u_int32_t version;
+{
+ DBT t;
+ DB_CIPHER *db_cipher;
+ DB_LSN lsn;
+ ENV *env;
+ HDR hdr;
+ LOG *lp;
+ LOGP *tpersist;
+ int need_free, ret;
+ u_int32_t lastoff;
+ size_t tsize;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we're not specifying a specific log file number and we're
+ * not at the beginning of a file already, start a new one.
+ */
+ if (logfile == 0 && lp->lsn.offset != 0) {
+ /*
+ * Flush the log so this file is out and can be closed. We
+ * cannot release the region lock here because we need to
+ * protect the end of the file while we switch. In
+ * particular, a thread with a smaller record than ours
+ * could detect that there is space in the log. Even
+ * blocking that event by declaring the file full would
+ * require all threads to wait here so that the lsn.file
+ * can be moved ahead after the flush completes. This
+ * probably can be changed if we had an lsn for the
+ * previous file and one for the current, but it does not
+ * seem like this would get much more throughput, if any.
+ */
+ if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
+ return (ret);
+
+ /*
+ * Save the last known offset from the previous file, we'll
+ * need it to initialize the persistent header information.
+ */
+ lastoff = lp->lsn.offset;
+
+ /* Point the current LSN to the new file. */
+ ++lp->lsn.file;
+ lp->lsn.offset = 0;
+
+ /* Reset the file write offset. */
+ lp->w_off = 0;
+ } else
+ lastoff = 0;
+
+ /*
+ * Replication may require we reset the log file name space entirely.
+ * In that case we also force a file switch so that replication can
+ * clean up old files.
+ */
+ if (logfile != 0) {
+ lp->lsn.file = logfile;
+ lp->lsn.offset = 0;
+ lp->w_off = 0;
+ if (lp->db_log_inmemory) {
+ lsn = lp->lsn;
+ (void)__log_zero(env, &lsn);
+ } else {
+ lp->s_lsn = lp->lsn;
+ if ((ret = __log_newfh(dblp, 1)) != 0)
+ return (ret);
+ }
+ }
+
+ DB_ASSERT(env, lp->db_log_inmemory || lp->b_off == 0);
+ if (lp->db_log_inmemory &&
+ (ret = __log_inmem_newfile(dblp, lp->lsn.file)) != 0)
+ return (ret);
+
+	/*
+	 * Insert persistent information as the first record in every
+	 * file.  Note that the previous length is wrong for the very
+	 * first record of the log, but that's okay; we check for it
+	 * during retrieval.
+	 */
+ memset(&t, 0, sizeof(t));
+ memset(&hdr, 0, sizeof(HDR));
+
+ need_free = 0;
+ tsize = sizeof(LOGP);
+ db_cipher = env->crypto_handle;
+ if (CRYPTO_ON(env))
+ tsize += db_cipher->adj_size(tsize);
+ if ((ret = __os_calloc(env, 1, tsize, &tpersist)) != 0)
+ return (ret);
+ need_free = 1;
+ /*
+ * If we're told what version to make this file, then we
+ * need to be at that version. Update here.
+ */
+ if (version != 0) {
+ __log_set_version(env, version);
+ if ((ret = __env_init_rec(env, version)) != 0)
+ goto err;
+ }
+ lp->persist.log_size = lp->log_size = lp->log_nsize;
+ memcpy(tpersist, &lp->persist, sizeof(LOGP));
+ DB_SET_DBT(t, tpersist, tsize);
+ if (LOG_SWAPPED(env))
+ __log_persistswap(tpersist);
+
+ if ((ret =
+ __log_encrypt_record(env, &t, &hdr, (u_int32_t)tsize)) != 0)
+ goto err;
+
+ if ((ret = __log_putr(dblp, &lsn,
+ &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
+ goto err;
+
+ /* Update the LSN information returned to the caller. */
+ if (lsnp != NULL)
+ *lsnp = lp->lsn;
+
+err: if (need_free)
+ __os_free(env, tpersist);
+ return (ret);
+}
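+
+/*
+ * A sketch of the resulting file layout (descriptive aid, not new
+ * code): every log file begins with the LOGP persist record written
+ * above, so the first "real" record in file N starts just past
+ * sizeof(LOGP) plus the record header, and the persist record's prev
+ * offset (lastoff - lp->len) points back at the start of the last
+ * record in file N - 1.
+ */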
+
+/*
+ * __log_putr --
+ * Actually put a record into the log.
+ */
+static int
+__log_putr(dblp, lsn, dbt, prev, h)
+ DB_LOG *dblp;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ u_int32_t prev;
+ HDR *h;
+{
+ DB_CIPHER *db_cipher;
+ DB_LSN f_lsn;
+ ENV *env;
+ HDR tmp, *hdr;
+ LOG *lp;
+ int ret, t_ret;
+ db_size_t b_off;
+ size_t nr;
+ u_int32_t w_off;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we weren't given a header, use a local one.
+ */
+ db_cipher = env->crypto_handle;
+ if (h == NULL) {
+ hdr = &tmp;
+ memset(hdr, 0, sizeof(HDR));
+ if (CRYPTO_ON(env))
+ hdr->size = HDR_CRYPTO_SZ;
+ else
+ hdr->size = HDR_NORMAL_SZ;
+ } else
+ hdr = h;
+
+ /* Save our position in case we fail. */
+ b_off = lp->b_off;
+ w_off = lp->w_off;
+ f_lsn = lp->f_lsn;
+
+ /*
+ * Initialize the header. If we just switched files, lsn.offset will
+ * be 0, and what we really want is the offset of the previous record
+ * in the previous file. Fortunately, prev holds the value we want.
+ */
+ hdr->prev = prev;
+ hdr->len = (u_int32_t)hdr->size + dbt->size;
+
+#ifdef HAVE_LOG_CHECKSUM
+ /*
+ * If we were passed in a nonzero checksum, our caller calculated
+ * the checksum before acquiring the log mutex, as an optimization.
+ *
+ * If our caller calculated a real checksum of 0, we'll needlessly
+ * recalculate it. C'est la vie; there's no out-of-bounds value
+ * here.
+ */
+ if (hdr->chksum[0] == 0) {
+ if (lp->persist.version < DB_LOGCHKSUM)
+ __db_chksum(NULL, dbt->data, dbt->size,
+ (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
+ hdr->chksum);
+ else
+ __db_chksum(hdr, dbt->data, dbt->size,
+ (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
+ hdr->chksum);
+ } else if (lp->persist.version >= DB_LOGCHKSUM)
+ /*
+ * We need to include hdr->prev and len here, since they were
+ * still zero at the time of the caller's __db_chksum() call.
+ */
+ LOG_HDR_SUM(CRYPTO_ON(env), hdr, hdr->chksum);
+#endif
+
+ if (lp->db_log_inmemory && (ret = __log_inmem_chkspace(dblp,
+ (u_int32_t)hdr->size + dbt->size)) != 0)
+ goto err;
+
+ /*
+ * The offset into the log file at this point is the LSN where
+ * we're about to put this record, and is the LSN the caller wants.
+ */
+ *lsn = lp->lsn;
+
+ nr = hdr->size;
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+	/* nr can't overflow a 32-bit value -- the header size is internal. */
+ ret = __log_fill(dblp, lsn, hdr, (u_int32_t)nr);
+
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
+ goto err;
+
+ lp->len = (u_int32_t)(hdr->size + dbt->size);
+ lp->lsn.offset += lp->len;
+ return (0);
+err:
+ /*
+ * If we wrote more than one buffer before failing, get the
+ * first one back. The extra buffers will fail the checksums
+ * and be ignored.
+ */
+ if (w_off + lp->buffer_size < lp->w_off) {
+ DB_ASSERT(env, !lp->db_log_inmemory);
+ if ((t_ret = __os_seek(env, dblp->lfhp, 0, 0, w_off)) != 0 ||
+ (t_ret = __os_read(env, dblp->lfhp, dblp->bufp,
+ b_off, &nr)) != 0)
+ return (__env_panic(env, t_ret));
+ if (nr != b_off) {
+ __db_errx(env, DB_STR("2515",
+ "Short read while restoring log"));
+ return (__env_panic(env, EIO));
+ }
+ }
+
+ /* Reset to where we started. */
+ lp->w_off = w_off;
+ lp->b_off = b_off;
+ lp->f_lsn = f_lsn;
+
+ return (ret);
+}
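+
+/*
+ * For reference, a sketch of the record layout __log_putr produces in
+ * the common (non-crypto) case; this is a reading aid, not new code:
+ *
+ *	+----------------------+------------------------+
+ *	| HDR (HDR_NORMAL_SZ)  | dbt->data (dbt->size)  |
+ *	+----------------------+------------------------+
+ *
+ * hdr->len covers both pieces, and hdr->prev holds the offset of the
+ * previous record, which is what makes backward traversal possible.
+ */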
+
+/*
+ * __log_flush_pp --
+ * ENV->log_flush pre/post processing.
+ *
+ * PUBLIC: int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
+ */
+int
+__log_flush_pp(dbenv, lsn)
+ DB_ENV *dbenv;
+ const DB_LSN *lsn;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_flush(env, lsn)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * See if we need to wait.  s_lsn is not locked, so some care is needed.
+ * The sync point can only move forward.  The lsnp->file cannot be
+ * greater than the s_lsn.file.  If the file we want is in the past,
+ * we are done.  If the file numbers are the same, check the offset.
+ * This all assumes we can read a 32-bit quantity in one state or
+ * the other, never in transition.
+ */
+#define ALREADY_FLUSHED(lp, lsnp) \
+ (((lp)->s_lsn.file > (lsnp)->file) || \
+ ((lp)->s_lsn.file == (lsnp)->file && \
+ (lp)->s_lsn.offset > (lsnp)->offset))
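+
+/*
+ * For example (illustrative values): with s_lsn == [5][2000], a request
+ * for [5][1999] or [4][9000] is already flushed, but [5][2000] itself is
+ * not -- s_lsn is the first byte not known to be on disk, hence the
+ * strict comparison on the offset.
+ */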
+
+/*
+ * __log_flush --
+ * ENV->log_flush
+ *
+ * PUBLIC: int __log_flush __P((ENV *, const DB_LSN *));
+ */
+int
+__log_flush(env, lsn)
+ ENV *env;
+ const DB_LSN *lsn;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (lsn != NULL && ALREADY_FLUSHED(lp, lsn))
+ return (0);
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_flush_int(dblp, lsn, 1);
+ LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+}
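+
+/*
+ * A minimal caller sketch (assumed usage, mirroring the public
+ * DB_ENV->log_flush contract): write a record without DB_FLUSH, then
+ * force it to disk later:
+ *
+ *	DB_LSN lsn;
+ *
+ *	if ((ret = __log_put(env, &lsn, &rec, 0)) == 0)
+ *		ret = __log_flush(env, &lsn);
+ */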
+
+/*
+ * __log_flush_int --
+ * Write all records less than or equal to the specified LSN; internal
+ * version.
+ *
+ * PUBLIC: int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
+ */
+int
+__log_flush_int(dblp, lsnp, release)
+ DB_LOG *dblp;
+ const DB_LSN *lsnp;
+ int release;
+{
+ struct __db_commit *commit;
+ ENV *env;
+ DB_LSN flush_lsn, f_lsn;
+ LOG *lp;
+ size_t b_off;
+ u_int32_t ncommit, w_off;
+ int do_flush, first, ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+ ncommit = 0;
+ ret = 0;
+
+ if (lp->db_log_inmemory) {
+ lp->s_lsn = lp->lsn;
+ STAT(++lp->stat.st_scount);
+ return (0);
+ }
+
+	/*
+	 * If no LSN is specified, flush the entire log by setting the
+	 * flush LSN to the last LSN written in the log.  Otherwise,
+	 * check that the LSN isn't a non-existent record for the log.
+	 */
+ if (lsnp == NULL) {
+ flush_lsn.file = lp->lsn.file;
+ flush_lsn.offset = lp->lsn.offset - lp->len;
+ } else if (lsnp->file > lp->lsn.file ||
+ (lsnp->file == lp->lsn.file &&
+ lsnp->offset > lp->lsn.offset - lp->len)) {
+ __db_errx(env, DB_STR_A("2516",
+ "DB_ENV->log_flush: LSN of %lu/%lu past current end-of-log of %lu/%lu",
+ "%lu %lu %lu %lu"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)lp->lsn.file,
+ (u_long)lp->lsn.offset);
+ __db_errx(env, DB_STR("2517",
+ "Database environment corrupt; the wrong log files may "
+ "have been removed or incompatible database files "
+ "imported from another environment"));
+ return (__env_panic(env, DB_RUNRECOVERY));
+ } else {
+ if (ALREADY_FLUSHED(lp, lsnp))
+ return (0);
+ flush_lsn = *lsnp;
+ }
+
+ /*
+ * If a flush is in progress and we're allowed to do so, drop
+ * the region lock and block waiting for the next flush.
+ */
+ if (release && lp->in_flush != 0) {
+ if ((commit = SH_TAILQ_FIRST(
+ &lp->free_commits, __db_commit)) == NULL) {
+ if ((ret = __env_alloc(&dblp->reginfo,
+ sizeof(struct __db_commit), &commit)) != 0)
+ goto flush;
+ memset(commit, 0, sizeof(*commit));
+ if ((ret = __mutex_alloc(env, MTX_TXN_COMMIT,
+ DB_MUTEX_SELF_BLOCK, &commit->mtx_txnwait)) != 0) {
+ __env_alloc_free(&dblp->reginfo, commit);
+ return (ret);
+ }
+ MUTEX_LOCK(env, commit->mtx_txnwait);
+ } else
+ SH_TAILQ_REMOVE(
+ &lp->free_commits, commit, links, __db_commit);
+
+ lp->ncommit++;
+
+ /*
+ * Flushes may be requested out of LSN order; be
+ * sure we only move lp->t_lsn forward.
+ */
+ if (LOG_COMPARE(&lp->t_lsn, &flush_lsn) < 0)
+ lp->t_lsn = flush_lsn;
+
+ commit->lsn = flush_lsn;
+ SH_TAILQ_INSERT_HEAD(
+ &lp->commits, commit, links, __db_commit);
+ LOG_SYSTEM_UNLOCK(env);
+ /* Wait here for the in-progress flush to finish. */
+ MUTEX_LOCK(env, commit->mtx_txnwait);
+ LOG_SYSTEM_LOCK(env);
+
+ lp->ncommit--;
+ /*
+ * Grab the flag before freeing the struct to see if
+ * we need to flush the log to commit. If so,
+ * use the maximal lsn for any committing thread.
+ */
+ do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
+ F_CLR(commit, DB_COMMIT_FLUSH);
+ SH_TAILQ_INSERT_HEAD(
+ &lp->free_commits, commit, links, __db_commit);
+ if (do_flush) {
+ lp->in_flush--;
+ flush_lsn = lp->t_lsn;
+ } else
+ return (0);
+ }
+
+ /*
+ * Protect flushing with its own mutex so we can release
+ * the region lock except during file switches.
+ */
+flush: MUTEX_LOCK(env, lp->mtx_flush);
+
+ /*
+ * If the LSN is less than or equal to the last-sync'd LSN, we're done.
+ * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte
+ * after the byte we absolutely know was written to disk, so the test
+ * is <, not <=.
+ */
+ if (flush_lsn.file < lp->s_lsn.file ||
+ (flush_lsn.file == lp->s_lsn.file &&
+ flush_lsn.offset < lp->s_lsn.offset)) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ goto done;
+ }
+
+ /*
+ * We may need to write the current buffer. We have to write the
+ * current buffer if the flush LSN is greater than or equal to the
+ * buffer's starting LSN.
+ *
+ * Otherwise, it's still possible that this thread may never have
+ * written to this log file. Acquire a file descriptor if we don't
+ * already have one.
+ */
+ if (lp->b_off != 0 && LOG_COMPARE(&flush_lsn, &lp->f_lsn) >= 0) {
+ if ((ret = __log_write(dblp,
+ dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ goto done;
+ }
+
+ lp->b_off = 0;
+ } else if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file)
+ if ((ret = __log_newfh(dblp, 0)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ goto done;
+ }
+
+	/*
+	 * We are going to flush; release the region.  First record the
+	 * current state of the buffer, since another write may come in
+	 * while we flush, but we must not flush that new data.
+	 */
+ b_off = lp->b_off;
+ w_off = lp->w_off;
+ f_lsn = lp->f_lsn;
+ lp->in_flush++;
+ if (release)
+ LOG_SYSTEM_UNLOCK(env);
+
+ /* Sync all writes to disk. */
+ if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ if (release)
+ LOG_SYSTEM_LOCK(env);
+ lp->in_flush--;
+ goto done;
+ }
+
+	/*
+	 * Set the last-synced LSN.
+	 * This value must be set to the LSN past the last complete
+	 * record that has been flushed.  This is at least the first
+	 * lsn, f_lsn.  If the buffer is empty (b_off == 0), then we
+	 * can move it up to the write point, since the first lsn is
+	 * not yet set for the new buffer.
+	 */
+ lp->s_lsn = f_lsn;
+ if (b_off == 0)
+ lp->s_lsn.offset = w_off;
+
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ if (release)
+ LOG_SYSTEM_LOCK(env);
+
+ lp->in_flush--;
+ STAT(++lp->stat.st_scount);
+
+ /*
+ * How many flush calls (usually commits) did this call actually sync?
+ * At least one, if it got here.
+ */
+ ncommit = 1;
+done:
+ if (lp->ncommit != 0) {
+ first = 1;
+ SH_TAILQ_FOREACH(commit, &lp->commits, links, __db_commit)
+ if (LOG_COMPARE(&lp->s_lsn, &commit->lsn) > 0) {
+ MUTEX_UNLOCK(env, commit->mtx_txnwait);
+ SH_TAILQ_REMOVE(
+ &lp->commits, commit, links, __db_commit);
+ ncommit++;
+ } else if (first == 1) {
+ F_SET(commit, DB_COMMIT_FLUSH);
+ MUTEX_UNLOCK(env, commit->mtx_txnwait);
+ SH_TAILQ_REMOVE(
+ &lp->commits, commit, links, __db_commit);
+				/*
+				 * This thread will wake and flush.
+				 * If another thread commits and flushes
+				 * first, we will waste a trip through
+				 * the mutex.
+				 */
+ lp->in_flush++;
+ first = 0;
+ }
+ }
+#ifdef HAVE_STATISTICS
+ if (lp->stat.st_maxcommitperflush < ncommit)
+ lp->stat.st_maxcommitperflush = ncommit;
+ if (lp->stat.st_mincommitperflush > ncommit ||
+ lp->stat.st_mincommitperflush == 0)
+ lp->stat.st_mincommitperflush = ncommit;
+#endif
+
+ return (ret);
+}
+
+/*
+ * __log_fill --
+ * Write information into the log.
+ */
+static int
+__log_fill(dblp, lsn, addr, len)
+ DB_LOG *dblp;
+ DB_LSN *lsn;
+ void *addr;
+ u_int32_t len;
+{
+ LOG *lp;
+ u_int32_t bsize, nrec;
+ size_t nw, remain;
+ int ret;
+
+ lp = dblp->reginfo.primary;
+ bsize = lp->buffer_size;
+
+ if (lp->db_log_inmemory) {
+ __log_inmem_copyin(dblp, lp->b_off, addr, len);
+ lp->b_off = (lp->b_off + len) % lp->buffer_size;
+ return (0);
+ }
+
+ while (len > 0) { /* Copy out the data. */
+ /*
+ * If we're beginning a new buffer, note the user LSN to which
+ * the first byte of the buffer belongs. We have to know this
+ * when flushing the buffer so that we know if the in-memory
+ * buffer needs to be flushed.
+ */
+ if (lp->b_off == 0)
+ lp->f_lsn = *lsn;
+
+ /*
+ * If we're on a buffer boundary and the data is big enough,
+ * copy as many records as we can directly from the data.
+ */
+ if (lp->b_off == 0 && len >= bsize) {
+ nrec = len / bsize;
+ if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0)
+ return (ret);
+ addr = (u_int8_t *)addr + nrec * bsize;
+ len -= nrec * bsize;
+ STAT(++lp->stat.st_wcount_fill);
+ continue;
+ }
+
+ /* Figure out how many bytes we can copy this time. */
+ remain = bsize - lp->b_off;
+ nw = remain > len ? len : remain;
+ memcpy(dblp->bufp + lp->b_off, addr, nw);
+ addr = (u_int8_t *)addr + nw;
+ len -= (u_int32_t)nw;
+ lp->b_off += (u_int32_t)nw;
+
+ /* If we fill the buffer, flush it. */
+ if (lp->b_off == bsize) {
+ if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0)
+ return (ret);
+ lp->b_off = 0;
+ STAT(++lp->stat.st_wcount_fill);
+ }
+ }
+ return (0);
+}
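+
+/*
+ * A worked example of the loop above (illustrative numbers): with a
+ * 32KB buffer (bsize == 32768), writing a 100-byte record at
+ * b_off == 32668 copies 100 bytes and leaves b_off == bsize, so the
+ * buffer is written out and b_off resets to 0; a subsequent 64KB
+ * record arriving at b_off == 0 bypasses the buffer entirely and is
+ * written directly with __log_write().
+ */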
+
+/*
+ * __log_write --
+ * Write the log buffer to disk.
+ */
+static int
+__log_write(dblp, addr, len)
+ DB_LOG *dblp;
+ void *addr;
+ u_int32_t len;
+{
+ ENV *env;
+ LOG *lp;
+ size_t nw;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ DB_ASSERT(env, !lp->db_log_inmemory);
+
+ /*
+ * If we haven't opened the log file yet or the current one has
+ * changed, acquire a new log file. We are creating the file if we're
+ * about to write to the start of it, in other words, if the write
+ * offset is zero.
+ */
+ if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file ||
+ dblp->lf_timestamp != lp->timestamp)
+ if ((ret = __log_newfh(dblp, lp->w_off == 0)) != 0)
+ return (ret);
+
+ /*
+ * If we're writing the first block in a log file on a filesystem that
+ * guarantees unwritten blocks are zero-filled, we set the size of the
+ * file in advance. This increases sync performance on some systems,
+ * because they don't need to update metadata on every sync.
+ *
+ * Ignore any error -- we may have run out of disk space, but that's no
+ * reason to quit.
+ */
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (lp->w_off == 0 && !__os_fs_notzero()) {
+#else
+ if (lp->w_off == 0) {
+#endif
+ (void)__db_file_extend(env, dblp->lfhp, lp->log_size);
+ if (F_ISSET(dblp, DBLOG_ZERO))
+ (void)__db_zero_extend(env, dblp->lfhp,
+ 0, lp->log_size/lp->buffer_size, lp->buffer_size);
+
+ }
+
+ /*
+ * Seek to the offset in the file (someone may have written it
+ * since we last did).
+ */
+ if ((ret = __os_io(env, DB_IO_WRITE,
+ dblp->lfhp, 0, 0, lp->w_off, len, addr, &nw)) != 0)
+ return (ret);
+
+ /* Reset the buffer offset and update the seek offset. */
+ lp->w_off += len;
+
+ /* Update written statistics. */
+ if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
+ lp->stat.st_wc_bytes -= MEGABYTE;
+ ++lp->stat.st_wc_mbytes;
+ }
+#ifdef HAVE_STATISTICS
+ if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
+ lp->stat.st_w_bytes -= MEGABYTE;
+ ++lp->stat.st_w_mbytes;
+ }
+ ++lp->stat.st_wcount;
+#endif
+
+ return (0);
+}
+
+/*
+ * __log_file_pp --
+ * ENV->log_file pre/post processing.
+ *
+ * PUBLIC: int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
+ */
+int
+__log_file_pp(dbenv, lsn, namep, len)
+ DB_ENV *dbenv;
+ const DB_LSN *lsn;
+ char *namep;
+ size_t len;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, set;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
+
+ if ((ret = __log_get_config(dbenv, DB_LOG_IN_MEMORY, &set)) != 0)
+ return (ret);
+ if (set) {
+ __db_errx(env, DB_STR("2518",
+ "DB_ENV->log_file is illegal with in-memory logs"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_file(env, lsn, namep, len)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_file --
+ * ENV->log_file.
+ */
+static int
+__log_file(env, lsn, namep, len)
+ ENV *env;
+ const DB_LSN *lsn;
+ char *namep;
+ size_t len;
+{
+ DB_LOG *dblp;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_name(dblp, lsn->file, &name, NULL, 0);
+ LOG_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ return (ret);
+
+ /* Check to make sure there's enough room and copy the name. */
+ if (len < strlen(name) + 1) {
+ *namep = '\0';
+ __db_errx(env, DB_STR("2519",
+ "DB_ENV->log_file: name buffer is too short"));
+ return (EINVAL);
+ }
+ (void)strcpy(namep, name);
+ __os_free(env, name);
+
+ return (0);
+}
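+
+/*
+ * A minimal caller sketch (assumed usage of the public method backed
+ * by this function; the buffer size is arbitrary):
+ *
+ *	char name[1024];
+ *
+ *	if ((ret = dbenv->log_file(dbenv, &lsn, name, sizeof(name))) == 0)
+ *		printf("LSN [%lu][%lu] is in %s\n",
+ *		    (u_long)lsn.file, (u_long)lsn.offset, name);
+ */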
+
+/*
+ * __log_newfh --
+ * Acquire a file handle for the current log file.
+ */
+static int
+__log_newfh(dblp, create)
+ DB_LOG *dblp;
+ int create;
+{
+ ENV *env;
+ LOG *lp;
+ u_int32_t flags;
+ int ret;
+ logfile_validity status;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ /* Close any previous file descriptor. */
+ if (dblp->lfhp != NULL) {
+ (void)__os_closehandle(env, dblp->lfhp);
+ dblp->lfhp = NULL;
+ }
+
+ flags = DB_OSO_SEQ |
+ (create ? DB_OSO_CREATE : 0) |
+ (F_ISSET(dblp, DBLOG_DIRECT) ? DB_OSO_DIRECT : 0) |
+ (F_ISSET(dblp, DBLOG_DSYNC) ? DB_OSO_DSYNC : 0);
+
+ /* Get the path of the new file and open it. */
+ dblp->lfname = lp->lsn.file;
+ if ((ret = __log_valid(dblp, dblp->lfname, 0, &dblp->lfhp,
+ flags, &status, NULL)) != 0)
+ __db_err(env, ret,
+ "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file);
+ else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE &&
+ status != DB_LV_OLD_READABLE)
+ ret = DB_NOTFOUND;
+
+ return (ret);
+}
+
+/*
+ * __log_name --
+ * Return the log name for a particular file, and optionally open it.
+ *
+ * PUBLIC: int __log_name __P((DB_LOG *,
+ * PUBLIC: u_int32_t, char **, DB_FH **, u_int32_t));
+ */
+int
+__log_name(dblp, filenumber, namep, fhpp, flags)
+ DB_LOG *dblp;
+ u_int32_t filenumber, flags;
+ char **namep;
+ DB_FH **fhpp;
+{
+ ENV *env;
+ LOG *lp;
+ int mode, ret;
+ char *oname;
+ char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ DB_ASSERT(env, !lp->db_log_inmemory);
+
+ /*
+ * !!!
+ * The semantics of this routine are bizarre.
+ *
+ * The reason for all of this is that we need a place where we can
+ * intercept requests for log files, and, if appropriate, check for
+ * both the old-style and new-style log file names. The trick is
+ * that all callers of this routine that are opening the log file
+ * read-only want to use an old-style file name if they can't find
+ * a match using a new-style name. The only down-side is that some
+ * callers may check for the old-style when they really don't need
+ * to, but that shouldn't mess up anything, and we only check for
+ * the old-style name when we've already failed to find a new-style
+ * one.
+ *
+ * Create a new-style file name, and if we're not going to open the
+ * file, return regardless.
+ */
+ (void)snprintf(new, sizeof(new), LFNAME, filenumber);
+ if ((ret = __db_appname(env,
+ DB_APP_LOG, new, NULL, namep)) != 0 || fhpp == NULL)
+ return (ret);
+
+ /* The application may have specified an absolute file mode. */
+ if (lp->filemode == 0)
+ mode = env->db_mode;
+ else {
+ LF_SET(DB_OSO_ABSMODE);
+ mode = lp->filemode;
+ }
+
+ /* Open the new-style file -- if we succeed, we're done. */
+ dblp->lf_timestamp = lp->timestamp;
+ if ((ret = __os_open(env, *namep, 0, flags, mode, fhpp)) == 0)
+ return (0);
+
+	/*
+	 * If the open failed for a reason other than the file not being
+	 * there, complain loudly; the wrong user probably started up
+	 * the application.
+	 */
+ if (ret != ENOENT) {
+ __db_err(env, ret, DB_STR_A("2520",
+ "%s: log file unreadable", "%s"), *namep);
+ return (__env_panic(env, ret));
+ }
+
+	/*
+	 * The open failed... if the DB_OSO_RDONLY flag isn't set, we're
+	 * done; the caller isn't interested in old-style files.
+	 */
+ if (!LF_ISSET(DB_OSO_RDONLY)) {
+ __db_err(env, ret, DB_STR_A("2521",
+ "%s: log file open failed", "%s"), *namep);
+ return (__env_panic(env, ret));
+ }
+
+ /* Create an old-style file name. */
+ (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
+ if ((ret = __db_appname(env,
+ DB_APP_LOG, old, NULL, &oname)) != 0)
+ goto err;
+
+ /*
+ * Open the old-style file -- if we succeed, we're done. Free the
+ * space allocated for the new-style name and return the old-style
+ * name to the caller.
+ */
+ if ((ret = __os_open(env, oname, 0, flags, mode, fhpp)) == 0) {
+ __os_free(env, *namep);
+ *namep = oname;
+ return (0);
+ }
+
+ /*
+ * Couldn't find either style of name -- return the new-style name
+ * for the caller's error message. If it's an old-style name that's
+ * actually missing we're going to confuse the user with the error
+ * message, but that implies that not only were we looking for an
+ * old-style name, but we expected it to exist and we weren't just
+ * looking for any log file. That's not a likely error.
+ */
+err: __os_free(env, oname);
+ return (ret);
+}
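+
+/*
+ * For example (file number 17, per the LFNAME/LFNAME_V1 formats): the
+ * new-style name is "log.0000000017" and the old-style fallback is
+ * "log.00017"; the old-style name is only tried for read-only opens.
+ */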
+
+/*
+ * __log_rep_put --
+ * Short-circuit way for replication clients to put records into the
+ * log. Replication clients' logs need to be laid out exactly as their masters'
+ * are, so we let replication take responsibility for when the log gets
+ * flushed, when the log switches files, etc.  This is just a thin PUBLIC
+ * for __log_putr with a slightly prettier interface.
+ *
+ * Note that the REP->mtx_clientdb should be held when this is called.
+ * Note that we acquire the log region mutex while holding mtx_clientdb.
+ *
+ * PUBLIC: int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__log_rep_put(env, lsnp, rec, flags)
+ ENV *env;
+ DB_LSN *lsnp;
+ const DBT *rec;
+ u_int32_t flags;
+{
+ DBT *dbt, t;
+ DB_CIPHER *db_cipher;
+ DB_LOG *dblp;
+ HDR hdr;
+ LOG *lp;
+ int need_free, ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+ memset(&hdr, 0, sizeof(HDR));
+ t = *rec;
+ dbt = &t;
+ need_free = 0;
+ db_cipher = env->crypto_handle;
+ if (CRYPTO_ON(env))
+ t.size += db_cipher->adj_size(rec->size);
+ if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
+ goto err;
+ need_free = 1;
+ memcpy(t.data, rec->data, rec->size);
+
+ if ((ret = __log_encrypt_record(env, dbt, &hdr, rec->size)) != 0)
+ goto err;
+
+ DB_ASSERT(env, LOG_COMPARE(lsnp, &lp->lsn) == 0);
+ ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
+err:
+ /*
+ * !!! Assume caller holds REP->mtx_clientdb to modify ready_lsn.
+ */
+ lp->ready_lsn = lp->lsn;
+
+ if (LF_ISSET(DB_LOG_CHKPNT))
+ lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
+
+ /* Increment count of records added to the log. */
+ STAT(++lp->stat.st_record);
+ LOG_SYSTEM_UNLOCK(env);
+ if (need_free)
+ __os_free(env, t.data);
+ return (ret);
+}
+
+static int
+__log_encrypt_record(env, dbt, hdr, orig)
+ ENV *env;
+ DBT *dbt;
+ HDR *hdr;
+ u_int32_t orig;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+
+ if (CRYPTO_ON(env)) {
+ db_cipher = env->crypto_handle;
+ hdr->size = HDR_CRYPTO_SZ;
+ hdr->orig_size = orig;
+ if ((ret = db_cipher->encrypt(env, db_cipher->data,
+ hdr->iv, dbt->data, dbt->size)) != 0)
+ return (ret);
+ } else {
+ hdr->size = HDR_NORMAL_SZ;
+ }
+ return (0);
+}
+
+/*
+ * __log_put_record_pp --
+ * DB_ENV->log_put_record pre/post processing.
+ *
+ * PUBLIC: int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ * PUBLIC: DB_LOG_RECSPEC *, ...));
+ */
+#ifdef STDC_HEADERS
+int
+__log_put_record_pp(DB_ENV *dbenv, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
+ u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
+ DB_LOG_RECSPEC *spec, ...)
+#else
+int
+__log_put_record_pp(dbenv, dbp, txnp, ret_lsnp,
+ flags, rectype, has_data, size,
+ spec, va_alist)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ u_int32_t rectype;
+ u_int32_t has_data;
+ u_int32_t size;
+ DB_LOG_RECSPEC *spec;
+ va_dcl
+#endif
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ va_list argp;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_put_record", DB_INIT_LOG);
+
+ /* Validate arguments: check for allowed flags. */
+ if ((ret = __db_fchk(env, "DB_ENV->log_put_record", flags,
+ DB_LOG_CHKPNT | DB_LOG_COMMIT |
+ DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
+ return (ret);
+
+ /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
+ if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
+ return (__db_ferr(env, "DB_ENV->log_put_record", 1));
+
+ /* Replication clients should never write log records. */
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("2522",
+ "DB_ENV->log_put is illegal on replication clients"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ va_start(argp, spec);
+ REPLICATION_WRAP(env, (__log_put_record_int(env, dbp,
+ txnp, ret_lsnp, flags, rectype, has_data, size, spec, argp)),
+ 0, ret);
+ va_end(argp);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ * PUBLIC: DB_LOG_RECSPEC *, ...));
+ */
+#ifdef STDC_HEADERS
+int
+__log_put_record(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
+ u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
+ DB_LOG_RECSPEC *spec, ...)
+#else
+int
+__log_put_record(env, dbp, txnp, ret_lsnp,
+    flags, rectype, has_data, size, spec, va_alist)
+ ENV *env;
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ u_int32_t rectype;
+ u_int32_t has_data;
+ u_int32_t size;
+ DB_LOG_RECSPEC *spec;
+ va_dcl
+#endif
+{
+ va_list argp;
+ int ret;
+
+ va_start(argp, spec);
+ ret = __log_put_record_int(env, dbp, txnp, ret_lsnp, flags,
+ rectype, has_data, size, spec, argp);
+ va_end(argp);
+ return (ret);
+}
+
+#ifdef STDC_HEADERS
+static int
+__log_put_record_int(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
+ u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
+ DB_LOG_RECSPEC *spec, va_list argp)
+#else
+static int
+__log_put_record_int(env, dbp, txnp, ret_lsnp,
+    flags, rectype, has_data, size, spec, argp)
+	ENV *env;
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t rectype;
+	u_int32_t has_data;
+	u_int32_t size;
+	DB_LOG_RECSPEC *spec;
+	va_list argp;
+#endif
+{
+ DBT *data, *dbt, *header, logrec;
+ DB_LOG_RECSPEC *sp;
+ DB_LSN *lsnp, lsn, null_lsn, *pagelsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ LOG *lp;
+ PAGE *pghdrstart;
+ u_int32_t hdrsize, op, zero, uinttmp, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+ void *hdrstart;
+
+ COMPQUIET(lr, NULL);
+ COMPQUIET(hdrsize, 0);
+ COMPQUIET(op, 0);
+ COMPQUIET(hdrstart, NULL);
+ COMPQUIET(pghdrstart, NULL);
+ COMPQUIET(header, NULL);
+
+	/*
+	 * rlsnp will be stored into while holding the log system lock.
+	 * If this is a commit record, then ret_lsnp will be the address
+	 * of the transaction detail's visible_lsn field.  If not, then
+	 * this may be the lsn of a page, and we do not want to set it
+	 * if the log_put fails after writing the record (due to an I/O
+	 * error).
+	 */
+ if (LF_ISSET(DB_LOG_COMMIT))
+ rlsnp = ret_lsnp;
+ else
+ rlsnp = &lsn;
+ npad = 0;
+ ret = 0;
+ data = NULL;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ (dbp != NULL && F_ISSET(dbp, DB_AM_NOT_DURABLE))) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ if (dbp != NULL) {
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+ }
+
+ logrec.size = size;
+
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ zero = 0;
+ lp = env->lg_handle->reginfo.primary;
+ for (sp = spec; sp->type != LOGREC_Done; sp++) {
+ switch (sp->type) {
+ case LOGREC_DB:
+ /* This is not in the varargs. */
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+ break;
+
+ case LOGREC_ARG:
+ case LOGREC_TIME:
+ case LOGREC_DBOP:
+ uinttmp = va_arg(argp, u_int32_t);
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_OP:
+ op = va_arg(argp, u_int32_t);
+ LOGCOPY_32(env, bp, &op);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_DBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ case LOGREC_HDR:
+ case LOGREC_DATA:
+ dbt = va_arg(argp, DBT *);
+ if (dbt == NULL) {
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &dbt->size);
+ bp += sizeof(dbt->size);
+ memcpy(bp, dbt->data, dbt->size);
+ }
+ /* Process fields that need to be byte swapped. */
+ if (dbp != NULL && F_ISSET(dbp, DB_AM_SWAP)) {
+ if (sp->type == LOGREC_HDR &&
+ dbt != NULL && has_data == 0)
+ __db_recordswap(op,
+ dbt->size, bp, NULL, 0);
+ else if (sp->type == LOGREC_HDR) {
+ hdrstart = bp;
+ hdrsize = dbt == NULL ? 0 : dbt->size;
+ } else if (sp->type == LOGREC_DATA) {
+ __db_recordswap(op,
+ hdrsize, hdrstart, bp, 0);
+ has_data = 0;
+ }
+ }
+ if (dbt != NULL)
+ bp += dbt->size;
+
+ break;
+ /*
+ * Page header and data -- we assume that the header
+ * is listed first and the data follows sometime later.
+ * There should be only one header/data pair per record.
+ */
+ case LOGREC_PGDBT:
+ header = va_arg(argp, DBT *);
+ if (header == NULL) {
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &header->size);
+ bp += sizeof(header->size);
+ pghdrstart = (PAGE *)bp;
+ memcpy(bp, header->data, header->size);
+ if (has_data == 0 &&
+ F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __db_pageswap(
+ env, dbp, pghdrstart, (size_t)header->size,
+ NULL, 0)) != 0)
+ return (ret);
+ bp += header->size;
+ }
+ break;
+
+ case LOGREC_PGDDBT:
+ data = va_arg(argp, DBT *);
+ if (data == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __db_pageswap(env, dbp, pghdrstart,
+ (size_t)header->size, (DBT *)data, 0)) != 0)
+ return (ret);
+ LOGCOPY_32(env, bp, &data->size);
+ bp += sizeof(data->size);
+ memcpy(bp, data->data, data->size);
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ F_ISSET(data, DB_DBT_APPMALLOC))
+ __os_free(env, data->data);
+ bp += data->size;
+ }
+ break;
+ case LOGREC_POINTER:
+ pagelsn = va_arg(argp, DB_LSN *);
+ if (pagelsn != NULL) {
+ if (txnp != NULL) {
+ if (LOG_COMPARE(pagelsn,
+ &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env,
+ dbp, pagelsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, pagelsn);
+ } else
+ memset(bp, 0, sizeof(*pagelsn));
+ bp += sizeof(*pagelsn);
+ break;
+
+ default:
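+			/* Unknown field type: force the assert to fail. */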
+ DB_ASSERT(env, sp->type != sp->type);
+ }
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0) {
+ if (txnp != NULL)
+ *lsnp = *rlsnp;
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env) && !lp->db_log_inmemory)
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__db_addrem_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
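+
+/*
+ * A usage sketch (hypothetical record type and spec; the real specs
+ * are generated from the *.src files): a spec such as
+ *
+ *	DB_LOG_RECSPEC my_spec[] = {
+ *		{ LOGREC_ARG, ... },
+ *		{ LOGREC_DBT, ... },
+ *		{ LOGREC_Done, ... }
+ *	};
+ *
+ * would be matched by a call like
+ *
+ *	ret = __log_put_record(env, dbp, txn, &lsn, 0, MY_RECTYPE,
+ *	    0, size, my_spec, my_uint32, &my_dbt);
+ *
+ * where the varargs line up one-to-one with the non-Done spec entries.
+ */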
diff --git a/src/log/log_stat.c b/src/log/log_stat.c
new file mode 100644
index 00000000..37b74c74
--- /dev/null
+++ b/src/log/log_stat.c
@@ -0,0 +1,336 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __log_print_all __P((ENV *, u_int32_t));
+static int __log_print_stats __P((ENV *, u_int32_t));
+static int __log_stat __P((ENV *, DB_LOG_STAT **, u_int32_t));
+
+/*
+ * __log_stat_pp --
+ * DB_ENV->log_stat pre/post processing.
+ *
+ * PUBLIC: int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
+ */
+int
+__log_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOG_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_stat", DB_INIT_LOG);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->log_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_stat --
+ * DB_ENV->log_stat.
+ */
+static int
+__log_stat(env, statp, flags)
+ ENV *env;
+ DB_LOG_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LOG_STAT *stats;
+ LOG *lp;
+ int ret;
+
+ *statp = NULL;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if ((ret = __os_umalloc(env, sizeof(DB_LOG_STAT), &stats)) != 0)
+ return (ret);
+
+ /* Copy out the global statistics. */
+ LOG_SYSTEM_LOCK(env);
+ *stats = lp->stat;
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&lp->stat, 0, sizeof(lp->stat));
+
+ stats->st_magic = lp->persist.magic;
+ stats->st_version = lp->persist.version;
+ stats->st_mode = lp->filemode;
+ stats->st_lg_bsize = lp->buffer_size;
+ stats->st_lg_size = lp->log_nsize;
+
+ __mutex_set_wait_info(env, lp->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ if (LF_ISSET(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR)
+ __mutex_clear(env, lp->mtx_region);
+ stats->st_regsize = dblp->reginfo.rp->size;
+
+ stats->st_cur_file = lp->lsn.file;
+ stats->st_cur_offset = lp->lsn.offset;
+ stats->st_disk_file = lp->s_lsn.file;
+ stats->st_disk_offset = lp->s_lsn.offset;
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __log_stat_print_pp --
+ * DB_ENV->log_stat_print pre/post processing.
+ *
+ * PUBLIC: int __log_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_stat_print", DB_INIT_LOG);
+
+ if ((ret = __db_fchk(env, "DB_ENV->log_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_stat_print --
+ * DB_ENV->log_stat_print method.
+ *
+ * PUBLIC: int __log_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__log_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __log_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __log_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __log_print_stats --
+ * Display default log region statistics.
+ */
+static int
+__log_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_LOG_STAT *sp;
+ int ret;
+
+ if ((ret = __log_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default logging region information:");
+ STAT_HEX("Log magic number", sp->st_magic);
+ STAT_ULONG("Log version number", sp->st_version);
+ __db_dlbytes(env, "Log record cache size",
+ (u_long)0, (u_long)0, (u_long)sp->st_lg_bsize);
+ __db_msg(env, "%#o\tLog file mode", sp->st_mode);
+ if (sp->st_lg_size % MEGABYTE == 0)
+ __db_msg(env, "%luMb\tCurrent log file size",
+ (u_long)sp->st_lg_size / MEGABYTE);
+ else if (sp->st_lg_size % 1024 == 0)
+ __db_msg(env, "%luKb\tCurrent log file size",
+ (u_long)sp->st_lg_size / 1024);
+ else
+ __db_msg(env, "%lu\tCurrent log file size",
+ (u_long)sp->st_lg_size);
+ __db_dl(env, "Initial fileid allocation", (u_long)sp->st_fileid_init);
+ __db_dl(env, "Current fileids in use", (u_long)sp->st_nfileid);
+ __db_dl(env, "Maximum fileids used", (u_long)sp->st_maxnfileid);
+ __db_dl(env, "Records entered into the log", (u_long)sp->st_record);
+ __db_dlbytes(env, "Log bytes written",
+ (u_long)0, (u_long)sp->st_w_mbytes, (u_long)sp->st_w_bytes);
+ __db_dlbytes(env, "Log bytes written since last checkpoint",
+ (u_long)0, (u_long)sp->st_wc_mbytes, (u_long)sp->st_wc_bytes);
+ __db_dl(env, "Total log file I/O writes", (u_long)sp->st_wcount);
+ __db_dl(env, "Total log file I/O writes due to overflow",
+ (u_long)sp->st_wcount_fill);
+ __db_dl(env, "Total log file flushes", (u_long)sp->st_scount);
+ __db_dl(env, "Total log file I/O reads", (u_long)sp->st_rcount);
+ STAT_ULONG("Current log file number", sp->st_cur_file);
+ STAT_ULONG("Current log file offset", sp->st_cur_offset);
+ STAT_ULONG("On-disk log file number", sp->st_disk_file);
+ STAT_ULONG("On-disk log file offset", sp->st_disk_offset);
+
+ __db_dl(env,
+ "Maximum commits in a log flush", (u_long)sp->st_maxcommitperflush);
+ __db_dl(env,
+ "Minimum commits in a log flush", (u_long)sp->st_mincommitperflush);
+
+ __db_dlbytes(env, "Region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __log_print_all --
+ * Display debugging log region statistics.
+ */
+static int
+__log_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+	static const FN fn[] = {
+		{ DBLOG_AUTOREMOVE,	"DBLOG_AUTOREMOVE" },
+		{ DBLOG_DIRECT,		"DBLOG_DIRECT" },
+		{ DBLOG_DSYNC,		"DBLOG_DSYNC" },
+		{ DBLOG_FORCE_OPEN,	"DBLOG_FORCE_OPEN" },
+		{ DBLOG_INMEMORY,	"DBLOG_INMEMORY" },
+		{ DBLOG_OPENFILES,	"DBLOG_OPENFILES" },
+		{ DBLOG_RECOVER,	"DBLOG_RECOVER" },
+		{ DBLOG_ZERO,		"DBLOG_ZERO" },
+		{ 0,			NULL }
+	};
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+
+ __db_print_reginfo(env, &dblp->reginfo, "Log", flags);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_LOG handle information:");
+ __mutex_print_debug_single(
+ env, "DB_LOG handle mutex", dblp->mtx_dbreg, flags);
+ STAT_ULONG("Log file name", dblp->lfname);
+ __db_print_fh(env, "Log file handle", dblp->lfhp, flags);
+ __db_prflags(env, NULL, dblp->flags, fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG handle information:");
+ __mutex_print_debug_single(
+ env, "LOG region mutex", lp->mtx_region, flags);
+ __mutex_print_debug_single(
+ env, "File name list mutex", lp->mtx_filelist, flags);
+
+ STAT_HEX("persist.magic", lp->persist.magic);
+ STAT_ULONG("persist.version", lp->persist.version);
+ __db_dlbytes(env,
+ "persist.log_size", (u_long)0, (u_long)0, lp->persist.log_size);
+ STAT_FMT("log file permissions mode", "%#lo", u_long, lp->filemode);
+ STAT_LSN("current file offset LSN", &lp->lsn);
+ STAT_LSN("first buffer byte LSN", &lp->lsn);
+ STAT_ULONG("current buffer offset", lp->b_off);
+ STAT_ULONG("current file write offset", lp->w_off);
+ STAT_ULONG("length of last record", lp->len);
+ STAT_LONG("log flush in progress", lp->in_flush);
+ __mutex_print_debug_single(
+ env, "Log flush mutex", lp->mtx_flush, flags);
+
+ STAT_LSN("last sync LSN", &lp->s_lsn);
+
+	/*
+	 * Don't display the replication fields here; they're displayed
+	 * as part of the replication statistics.
+	 */
+
+ STAT_LSN("cached checkpoint LSN", &lp->cached_ckp_lsn);
+
+ __db_dlbytes(env,
+ "log buffer size", (u_long)0, (u_long)0, lp->buffer_size);
+ __db_dlbytes(env,
+ "log file size", (u_long)0, (u_long)0, lp->log_size);
+ __db_dlbytes(env,
+ "next log file size", (u_long)0, (u_long)0, lp->log_nsize);
+
+ STAT_ULONG("transactions waiting to commit", lp->ncommit);
+ STAT_LSN("LSN of first commit", &lp->t_lsn);
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__log_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOG_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__log_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/log/log_verify.c b/src/log/log_verify.c
new file mode 100644
index 00000000..e7f8f688
--- /dev/null
+++ b/src/log/log_verify.c
@@ -0,0 +1,437 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#include "dbinc/log_verify.h"
+
+#define FIRST_OFFSET(env) \
+ (sizeof(LOGP) + (CRYPTO_ON(env) ? HDR_CRYPTO_SZ : HDR_NORMAL_SZ))
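+
+/*
+ * For example, when the scan below repositions to the start of log
+ * file N, the first record of interest is at [N][FIRST_OFFSET(env)],
+ * just past the persist header that __log_newfile() writes at the
+ * head of every file.
+ */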
+
+static int __env_init_verify __P((ENV *, u_int32_t, DB_DISTAB *));
+
+/*
+ * PUBLIC: int __log_verify_pp __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+ */
+int
+__log_verify_pp(dbenv, lvconfig)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+{
+ int lsnrg, ret, timerg;
+ DB_THREAD_INFO *ip;
+ const char *phome;
+
+ lsnrg = ret = timerg = 0;
+ phome = NULL;
+
+ if (!IS_ZERO_LSN(lvconfig->start_lsn) ||
+ !IS_ZERO_LSN(lvconfig->end_lsn))
+ lsnrg = 1;
+ if (lvconfig->start_time != 0 || lvconfig->end_time != 0)
+ timerg = 1;
+
+ if ((!IS_ZERO_LSN(lvconfig->start_lsn) && lvconfig->start_time != 0) ||
+ (!IS_ZERO_LSN(lvconfig->end_lsn) && lvconfig->end_time != 0) ||
+ (lsnrg && timerg)) {
+ __db_errx(dbenv->env, DB_STR("2501",
+ "Set either an lsn range or a time range to verify logs "
+ "in the range, don't mix time and lsn."));
+ ret = EINVAL;
+ goto err;
+ }
+ phome = dbenv->env->db_home;
+ if (phome != NULL && lvconfig->temp_envhome != NULL &&
+ strcmp(phome, lvconfig->temp_envhome) == 0) {
+ __db_errx(dbenv->env,
+ "Environment home for log verification internal use "
+ "overlaps with that of the environment to verify.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ ENV_ENTER(dbenv->env, ip);
+ ret = __log_verify(dbenv, lvconfig, ip);
+ ENV_LEAVE(dbenv->env, ip);
+err: return (ret);
+}
+
+/*
+ * PUBLIC: int __log_verify __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *,
+ * PUBLIC: DB_THREAD_INFO *));
+ */
+int
+__log_verify(dbenv, lvconfig, ip)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+ DB_THREAD_INFO *ip;
+{
+
+ u_int32_t logcflag, max_fileno;
+ DB_LOGC *logc;
+ ENV *env;
+ DBT data;
+ DB_DISTAB dtab;
+ DB_LSN key, start, start2, stop, stop2, verslsn;
+ u_int32_t newversion, version;
+ int cmp, fwdscroll, goprev, ret, tret;
+ time_t starttime, endtime;
+ const char *okmsg;
+ DB_LOG_VRFY_INFO *logvrfy_hdl;
+
+ okmsg = NULL;
+ fwdscroll = 1;
+ max_fileno = (u_int32_t)-1;
+ goprev = 0;
+ env = dbenv->env;
+ logc = NULL;
+ memset(&dtab, 0, sizeof(dtab));
+ memset(&data, 0, sizeof(data));
+ version = newversion = 0;
+ ZERO_LSN(verslsn);
+	memset(&start, 0, sizeof(DB_LSN));
+	memset(&start2, 0, sizeof(DB_LSN));
+	memset(&stop, 0, sizeof(DB_LSN));
+	memset(&stop2, 0, sizeof(DB_LSN));
+	memset(&key, 0, sizeof(DB_LSN));
+
+ start = lvconfig->start_lsn;
+ stop = lvconfig->end_lsn;
+ starttime = lvconfig->start_time;
+ endtime = lvconfig->end_time;
+
+ if ((ret = __create_log_vrfy_info(lvconfig, &logvrfy_hdl, ip)) != 0)
+ goto err;
+ logvrfy_hdl->lv_config = lvconfig;
+ if (lvconfig->continue_after_fail)
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_CAF);
+ if (lvconfig->verbose)
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_VERBOSE);
+
+ /* Allocate a log cursor. */
+ if ((ret = __log_cursor(dbenv->env, &logc)) != 0) {
+ __db_err(dbenv->env, ret, "DB_ENV->log_cursor");
+ goto err;
+ }
+	/* Ignore a failed checksum and go on to the next record. */
+ F_SET(logc->env->lg_handle, DBLOG_VERIFYING);
+
+ /* Only scan the range that we want to verify. */
+ if (fwdscroll) {
+ if (IS_ZERO_LSN(stop)) {
+ logcflag = DB_LAST;
+ key.file = key.offset = 0;
+ } else {
+ key = stop;
+ logcflag = DB_SET;
+ }
+ logvrfy_hdl->flags |= DB_LOG_VERIFY_FORWARD;
+ goto startscroll;
+ }
+
+vrfyscroll:
+
+ /*
+ * Initialize version to 0 so that we get the
+ * correct version right away.
+ */
+ version = 0;
+ ZERO_LSN(verslsn);
+
+ /*
+ * In the log verification config struct, start_lsn and end_lsn have
+ * higher priority than start_time and end_time, and you can specify
+ * either lsn or time to start/stop verification.
+ */
+ if (starttime != 0 || endtime != 0) {
+ if ((ret = __find_lsnrg_by_timerg(logvrfy_hdl,
+ starttime, endtime, &start2, &stop2)) != 0)
+ goto err;
+ ((DB_LOG_VERIFY_CONFIG *)lvconfig)->start_lsn = start = start2;
+ ((DB_LOG_VERIFY_CONFIG *)lvconfig)->end_lsn = stop = stop2;
+ }
+
+ if (IS_ZERO_LSN(start)) {
+ logcflag = DB_FIRST;
+ key.file = key.offset = 0;
+ } else {
+ key = start;
+ logcflag = DB_SET;
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_PARTIAL);
+ }
+ goprev = 0;
+
+ /*
+ * So far we only support verifying a specific db file. The config's
+ * dbfile must be prefixed with the data directory if it's not in
+ * the environment home directory.
+ */
+ if (lvconfig->dbfile != NULL) {
+ F_SET(logvrfy_hdl,
+ DB_LOG_VERIFY_DBFILE | DB_LOG_VERIFY_PARTIAL);
+ if ((ret = __set_logvrfy_dbfuid(logvrfy_hdl)) != 0)
+ goto err;
+ }
+
+startscroll:
+
+ memset(&data, 0, sizeof(data));
+
+ for (;;) {
+
+ /*
+ * We may have reached beyond the range we're verifying.
+ */
+ if (!fwdscroll && !IS_ZERO_LSN(stop)) {
+ cmp = LOG_COMPARE(&key, &stop);
+ if (cmp > 0)
+ break;
+ }
+ if (fwdscroll && !IS_ZERO_LSN(start)) {
+ cmp = LOG_COMPARE(&key, &start);
+ if (cmp < 0)
+ break;
+ }
+
+ ret = __logc_get(logc, &key, &data, logcflag);
+ if (ret != 0) {
+ if (ret == DB_NOTFOUND) {
+ /* We may not start from the first log file. */
+ if (logcflag == DB_PREV && key.file > 1)
+ F_SET(logvrfy_hdl,
+ DB_LOG_VERIFY_PARTIAL);
+ break;
+ }
+ __db_err(dbenv->env, ret, "DB_LOGC->get");
+ /*
+ * When we go beyond the valid lsn range, we may get
+ * error values other than DB_NOTFOUND.
+ */
+ goto out;
+ }
+
+ if (logcflag == DB_SET) {
+ if (goprev)
+ logcflag = DB_PREV;
+ else
+ logcflag = DB_NEXT;
+ } else if (logcflag == DB_LAST) {
+ logcflag = DB_PREV;
+ max_fileno = key.file;
+ } else if (logcflag == DB_FIRST)
+ logcflag = DB_NEXT;
+
+ if (key.file != verslsn.file) {
+ /*
+ * If our log file changed, we need to see if the
+ * version of the log file changed as well.
+ * If it changed, reset the print table.
+ */
+ if ((ret = __logc_version(logc, &newversion)) != 0) {
+ __db_err(dbenv->env, ret, "DB_LOGC->version");
+ goto err;
+ }
+ if (version != newversion) {
+ version = newversion;
+ if (!IS_LOG_VRFY_SUPPORTED(version)) {
+ __db_msg(dbenv->env, DB_STR_A("2502",
+ "[%lu][%lu] Unsupported version of log file, "
+ "log file number: %u, log file version: %u, "
+ "supported log version: %u.",
+ "%lu %lu %u %u %u"),
+ (u_long)key.file,
+ (u_long)key.offset,
+ key.file, version, DB_LOGVERSION);
+ if (logcflag == DB_NEXT) {
+ key.file += 1;
+ if (key.file > max_fileno)
+ break;
+ /*
+ * Txns don't span log versions, so there is
+ * no need to set DB_LOG_VERIFY_PARTIAL here.
+ */
+ } else {
+ goprev = 1;
+ key.file -= 1;
+ if (key.file == 0)
+ break;
+ }
+ key.offset = FIRST_OFFSET(env);
+ logcflag = DB_SET;
+ continue;
+ }
+ if ((ret = __env_init_verify(env, version,
+ &dtab)) != 0) {
+ __db_err(dbenv->env, ret,
+ DB_STR("2503",
+ "callback: initialization"));
+ goto err;
+ }
+ }
+ verslsn = key;
+ }
+
+ ret = __db_dispatch(dbenv->env, &dtab, &data, &key,
+ DB_TXN_LOG_VERIFY, logvrfy_hdl);
+
+ if (!fwdscroll && ret != 0) {
+ if (!F_ISSET(logvrfy_hdl, DB_LOG_VERIFY_CAF)) {
+ __db_err(dbenv->env, ret,
+ "[%lu][%lu] __db_dispatch",
+ (u_long)key.file, (u_long)key.offset);
+ goto err;
+ } else
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_ERR);
+ }
+ }
+
+ if (fwdscroll) {
+ fwdscroll = 0;
+ F_CLR(logvrfy_hdl, DB_LOG_VERIFY_FORWARD);
+ goto vrfyscroll;
+ }
+out:
+ /*
+ * When we arrive here, ret can be 0 or an error returned by
+ * DB_LOGC->get, all of which we have already handled, so we clear ret.
+ */
+ ret = 0;
+
+ /*
+ * When continuing after failures we scan the entire log, but we must
+ * still report any errors found.
+ */
+ if (F_ISSET(logvrfy_hdl, DB_LOG_VERIFY_ERR) ||
+ F_ISSET(logvrfy_hdl, DB_LOG_VERIFY_INTERR))
+ ret = DB_LOG_VERIFY_BAD;
+ /*
+ * This function can be called when the environment is alive, so
+ * there can be active transactions.
+ */
+ __db_log_verify_global_report(logvrfy_hdl);
+ if (ret == DB_LOG_VERIFY_BAD)
+ okmsg = DB_STR_P("FAILED");
+ else {
+ DB_ASSERT(dbenv->env, ret == 0);
+ okmsg = DB_STR_P("SUCCEEDED");
+ }
+
+ __db_msg(dbenv->env, DB_STR_A("2504",
+ "Log verification ended and %s.", "%s"), okmsg);
+
+err:
+ if (logc != NULL)
+ (void)__logc_close(logc);
+ if ((tret = __destroy_log_vrfy_info(logvrfy_hdl)) != 0 && ret == 0)
+ ret = tret;
+ if (dtab.int_dispatch)
+ __os_free(dbenv->env, dtab.int_dispatch);
+ if (dtab.ext_dispatch)
+ __os_free(dbenv->env, dtab.ext_dispatch);
+
+ return (ret);
+}
+
+/*
+ * __env_init_verify --
+ *	Initialize the log-verify dispatch table.
+ */
+static int
+__env_init_verify(env, version, dtabp)
+ ENV *env;
+ u_int32_t version;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ /*
+ * We need to prime the verify dispatch table with the current verify
+ * functions. Then we overwrite only specific entries based on
+ * each previous version we support.
+ */
+ if ((ret = __bam_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __crdel_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __db_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __dbreg_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __fop_init_verify(env, dtabp)) != 0)
+ goto err;
+#ifdef HAVE_HASH
+ if ((ret = __ham_init_verify(env, dtabp)) != 0)
+ goto err;
+#endif
+#ifdef HAVE_HEAP
+ if ((ret = __heap_init_verify(env, dtabp)) != 0)
+ goto err;
+#endif
+#ifdef HAVE_QUEUE
+ if ((ret = __qam_init_verify(env, dtabp)) != 0)
+ goto err;
+#endif
+ if ((ret = __txn_init_verify(env, dtabp)) != 0)
+ goto err;
+
+ switch (version) {
+ case DB_LOGVERSION:
+ ret = 0;
+ break;
+
+ default:
+ __db_errx(env, DB_STR_A("2505", "Not supported version %lu",
+ "%lu"), (u_long)version);
+ ret = EINVAL;
+ break;
+ }
+err: return (ret);
+}
+
+/*
+ * __log_verify_wrap --
+ *	Wrapper function for APIs in other languages, like Java/C# and
+ *	scripting languages. It's much easier to implement the SWIG layer
+ *	when we split up the C structure.
+ *
+ * PUBLIC: int __log_verify_wrap __P((ENV *, const char *, u_int32_t,
+ * PUBLIC: const char *, const char *, time_t, time_t, u_int32_t,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, int, int));
+ */
+int
+__log_verify_wrap(env, envhome, cachesize, dbfile, dbname,
+ stime, etime, stfile, stoffset, efile, eoffset, caf, verbose)
+ ENV *env;
+ const char *envhome, *dbfile, *dbname;
+ time_t stime, etime;
+ u_int32_t cachesize, stfile, stoffset, efile, eoffset;
+ int caf, verbose;
+{
+ DB_LOG_VERIFY_CONFIG cfg;
+
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.cachesize = cachesize;
+ cfg.temp_envhome = envhome;
+ cfg.dbfile = dbfile;
+ cfg.dbname = dbname;
+ cfg.start_time = stime;
+ cfg.end_time = etime;
+ cfg.start_lsn.file = stfile;
+ cfg.start_lsn.offset = stoffset;
+ cfg.end_lsn.file = efile;
+ cfg.end_lsn.offset = eoffset;
+ cfg.continue_after_fail = caf;
+ cfg.verbose = verbose;
+
+ return (__log_verify_pp(env->dbenv, &cfg));
+}
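+
+/*
+ * Usage sketch (illustrative): applications reach the verification code
+ * through the public DB_ENV->log_verify() method, which calls
+ * __log_verify_pp(). The environment home path and the range values
+ * below are hypothetical.
+ *
+ *	DB_ENV *dbenv;
+ *	DB_LOG_VERIFY_CONFIG cfg;
+ *	int ret;
+ *
+ *	(void)db_env_create(&dbenv, 0);
+ *	(void)dbenv->open(dbenv, "/path/to/env",
+ *	    DB_CREATE | DB_INIT_LOG | DB_INIT_MPOOL, 0);
+ *	memset(&cfg, 0, sizeof(cfg));
+ *	cfg.start_lsn.file = 1;		(verify from LSN [1][28] onward)
+ *	cfg.start_lsn.offset = 28;
+ *	cfg.continue_after_fail = 1;	(report errors but keep going)
+ *	ret = dbenv->log_verify(dbenv, &cfg);
+ */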
diff --git a/src/log/log_verify_auto.c b/src/log/log_verify_auto.c
new file mode 100644
index 00000000..08bc5d64
--- /dev/null
+++ b/src/log/log_verify_auto.c
@@ -0,0 +1,318 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/fop.h"
+
+/*
+ * PUBLIC: int __crdel_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_verify, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_verify, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_verify, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_verify, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_verify, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_verify, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_verify, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_verify, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_verify, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_verify, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_verify, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_verify, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_verify, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_verify, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_verify, DB___db_pg_trunc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_realloc_verify, DB___db_realloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_relink_verify, DB___db_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_merge_verify, DB___db_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pgno_verify, DB___db_pgno)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __dbreg_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__dbreg_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __dbreg_register_verify, DB___dbreg_register)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_verify, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_verify, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_verify, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_verify, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_verify, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_verify, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_verify, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_verify, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_verify, DB___bam_rcuradj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_irep_verify, DB___bam_irep)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __fop_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__fop_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_create_verify, DB___fop_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_remove_verify, DB___fop_remove)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_verify, DB___fop_write)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_verify, DB___fop_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_verify, DB___fop_rename_noundo)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_file_remove_verify, DB___fop_file_remove)) != 0)
+ return (ret);
+ return (0);
+}
+
+#ifdef HAVE_HASH
+/*
+ * PUBLIC: int __ham_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__ham_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_insdel_verify, DB___ham_insdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_newpage_verify, DB___ham_newpage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_splitdata_verify, DB___ham_splitdata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_replace_verify, DB___ham_replace)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_copypage_verify, DB___ham_copypage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_metagroup_verify, DB___ham_metagroup)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_groupalloc_verify, DB___ham_groupalloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_changeslot_verify, DB___ham_changeslot)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_contract_verify, DB___ham_contract)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_curadj_verify, DB___ham_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_chgpg_verify, DB___ham_chgpg)) != 0)
+ return (ret);
+ return (0);
+}
+
+#endif /* HAVE_HASH */
+#ifdef HAVE_HEAP
+/*
+ * PUBLIC: int __heap_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__heap_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_addrem_verify, DB___heap_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_pg_alloc_verify, DB___heap_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_meta_verify, DB___heap_trunc_meta)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_page_verify, DB___heap_trunc_page)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_HEAP */
+#ifdef HAVE_QUEUE
+/*
+ * PUBLIC: int __qam_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__qam_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_incfirst_verify, DB___qam_incfirst)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_mvptr_verify, DB___qam_mvptr)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_del_verify, DB___qam_del)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_add_verify, DB___qam_add)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_delext_verify, DB___qam_delext)) != 0)
+ return (ret);
+ return (0);
+}
+
+#endif /* HAVE_QUEUE */
+/*
+ * PUBLIC: int __txn_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__txn_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_regop_verify, DB___txn_regop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_ckp_verify, DB___txn_ckp)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_child_verify, DB___txn_child)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_prepare_verify, DB___txn_prepare)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_recycle_verify, DB___txn_recycle)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/log/log_verify_int.c b/src/log/log_verify_int.c
new file mode 100644
index 00000000..abe564c6
--- /dev/null
+++ b/src/log/log_verify_int.c
@@ -0,0 +1,4353 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+/*
+ * This file contains verification functions for all types of log records,
+ * one for each type. We can't automate this like the log_type_print/read
+ * functions because there is no consistent handling: each type of log record
+ * has unique ways to verify, and unique information to extract.
+ *
+ * In each verification function, we first call the log_type_read function
+ * to get the log_type_args structure, then extract information according to
+ * the type of log. The log types can be grouped into different categories,
+ * each of which has similar types of information.
+ *
+ * For example, the txn_regop and txn_ckp types both have timestamps, and we
+ * want to maintain a (timestamp, lsn) mapping, so we have an on_timestamp
+ * function and call it in the txn_regop_verify and txn_ckp_verify functions;
+ * in those two functions we may call other on_*** functions to extract and
+ * verify other information.
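+ *
+ * As a hedged illustration of this pattern (using a hypothetical record
+ * type "foo"; the real instances follow below), a verify function looks
+ * like:
+ *
+ *	int
+ *	__foo_verify(env, dbtp, lsnp, notused2, lvhp)
+ *	{
+ *		__foo_args *argp;
+ *		...
+ *		if ((ret = __foo_read(env, dbtp->data, &argp)) != 0)
+ *			return (ret);
+ *		LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ *		ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ *	out:
+ *	err:	__os_free(env, argp);
+ *		return (ret);
+ *	}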
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#include "dbinc/log_verify.h"
+
+static int __log_vrfy_proc __P((DB_LOG_VRFY_INFO *, DB_LSN, DB_LSN,
+ u_int32_t, DB_TXN *, int32_t, int *));
+static int __lv_ckp_vrfy_handler __P((DB_LOG_VRFY_INFO *,
+ VRFY_TXN_INFO *, void *));
+static const char *__lv_dbreg_str __P((u_int32_t));
+static int __lv_dbregid_to_dbtype __P((DB_LOG_VRFY_INFO *, int32_t, DBTYPE *));
+static int __lv_dbt_str __P((const DBT *, char **));
+static const char *__lv_dbtype_str __P((DBTYPE));
+static u_int32_t __lv_first_offset __P((ENV *));
+static int __lv_new_logfile_vrfy __P((DB_LOG_VRFY_INFO *, const DB_LSN *));
+static int __lv_log_fwdscr_oncmt __P((DB_LOG_VRFY_INFO *, DB_LSN,
+ u_int32_t, u_int32_t, int32_t));
+static int __lv_log_fwdscr_onrec __P((DB_LOG_VRFY_INFO *,
+ u_int32_t, u_int32_t, DB_LSN, DB_LSN));
+static int __lv_log_mismatch __P((DB_LOG_VRFY_INFO *, DB_LSN, DBTYPE, DBTYPE));
+static int __lv_on_bam_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_ham_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_heap_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_new_txn __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ const DB_TXN *, u_int32_t, int32_t, const DBT *));
+static int __lv_on_nontxn_update __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ u_int32_t, u_int32_t, int32_t));
+static int __lv_on_page_update __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t,
+ db_pgno_t, DB_TXN *, int *));
+static int __lv_on_qam_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_timestamp __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ int32_t, u_int32_t));
+static int __lv_on_txn_aborted __P((DB_LOG_VRFY_INFO *));
+static int __lv_on_txn_logrec __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ const DB_LSN *, const DB_TXN *, u_int32_t, int32_t));
+static int __lv_vrfy_for_dbfile __P((DB_LOG_VRFY_INFO *, int32_t, int *));
+
+/* General error handlers, called when a check fails. */
+#define ON_ERROR(lvh, errv) do { \
+ (lvh)->flags |= (errv); \
+ if (F_ISSET((lvh), DB_LOG_VERIFY_CAF)) \
+ ret = 0;/* Ignore the error and continue. */ \
+ goto err; \
+} while (0)
+
+/* Used by logs of unsupported types. */
+#define ON_NOT_SUPPORTED(env, lvh, lsn, ltype) do { \
+ __db_errx((env), DB_STR_A("2536", \
+ "[%lu][%lu] Not supported type of log record %u.", \
+ "%lu %lu %u"), (u_long)((lsn).file), (u_long)((lsn).offset),\
+ (ltype)); \
+ (lvh)->unknown_logrec_cnt++; \
+ goto err; \
+} while (0)
+
+#define SKIP_FORWARD_CHK(type) ((type) != DB___txn_regop && \
+ (type) != DB___txn_ckp && (type) != DB___fop_rename && \
+ (type) != DB___txn_child)
+
+#define NOTCOMMIT(type) ((type) != DB___txn_regop && \
+ (type) != DB___txn_child)
+
+#define LOG_VRFY_PROC(lvh, lsn, argp, fileid) do { \
+ int __lv_log_vrfy_proc_step = 0; \
+ if ((ret = __log_vrfy_proc((lvh), (lsn), (argp)->prev_lsn, \
+ (argp)->type, (argp)->txnp, (fileid), \
+ &__lv_log_vrfy_proc_step)) != 0) \
+ goto err; \
+ if (__lv_log_vrfy_proc_step == 1) \
+ goto out; \
+ else if (__lv_log_vrfy_proc_step == -1) \
+ goto err; \
+ else \
+ DB_ASSERT(lvh->dbenv->env, \
+ __lv_log_vrfy_proc_step == 0); \
+} while (0)
+
+/* Log record handlers used by log types involving page updates. */
+#define ON_PAGE_UPDATE(lvh, lsn, argp, pgno) do { \
+ int __lv_onpgupdate_res; \
+ if ((ret = __lv_on_page_update((lvh), (lsn), (argp)->fileid, \
+ (pgno), (argp)->txnp, &__lv_onpgupdate_res)) != 0) \
+ goto err; \
+ if (__lv_onpgupdate_res == 1) \
+ goto out; \
+ else if (__lv_onpgupdate_res == -1) \
+ goto err; \
+ else \
+ DB_ASSERT(lvh->dbenv->env, __lv_onpgupdate_res == 0); \
+} while (0)
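+
+/*
+ * Both LOG_VRFY_PROC and ON_PAGE_UPDATE expand to code that may
+ * "goto out" or "goto err", so every verify function using them must
+ * define both labels, even when there is nothing to do at them.
+ */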
+
+static int
+__lv_on_page_update(lvh, lsn, fileid, pgno, txnp, step)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_TXN *txnp;
+ int *step;
+{
+ u_int32_t otxn, txnid;
+ int res, ret;
+
+ txnid = txnp->txnid;
+ res = ret = 0;
+
+ if ((ret = __add_page_to_txn(lvh, fileid, pgno,
+ txnid, &otxn, &res)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ if (res != -1) {/* No access violation, we are done. */
+ *step = 0;
+ goto out;
+ }
+ /*
+ * It's OK for a child txn to update its parent's page, but not OK
+ * for a parent txn to update its active child's pages. We can't
+ * detect the child's abort, so we may falsely report that a parent
+ * txn is updating its child's pages.
+ */
+ if ((ret = __is_ancestor_txn(lvh, otxn, txnid, lsn, &res)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ if (res) {/* The txnid is updating its parent otxn's pages. */
+ *step = 0;
+ goto out;
+ }
+ if ((ret = __is_ancestor_txn(lvh, txnid, otxn, lsn, &res)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ if (res) {/* The txnid is updating its active child otxn's pages. */
+ __db_errx(lvh->dbenv->env, DB_STR_A("2537",
+ "[%lu][%lu] [WARNING] Parent txn %lx is updating its "
+ "active child txn %lx's pages, or %lx aborted.",
+ "%lu %lu %lx %lx %lx"), (u_long)lsn.file,
+ (u_long)lsn.offset, (u_long)txnid,
+ (u_long)otxn, (u_long)otxn);
+ *step = 0;
+ goto out;
+ }
+ /*
+ * It's likely that the two txns are parent-child and the child
+ * aborted, but we can't figure out this fact from the log.
+ */
+ __db_errx(lvh->dbenv->env, DB_STR_A("2538",
+ "[%lu][%lu] [WARNING] Txn %lx is updating txn %lx's pages.",
+ "%lu %lu %lx %lx"), (u_long)lsn.file, (u_long)lsn.offset,
+ (u_long)txnid, (u_long)otxn);
+ *step = 0;
+out:
+err:
+ return (ret);
+}
+
+/*
+ * This macro is put in all types of verify functions where a db file is
+ * updated, but no page number/lock is involved.
+ */
+#define ON_PAGE_UPDATE4
+
+/*
+ * General log record handler used by all log verify functions.
+ */
+static int
+__log_vrfy_proc(lvh, lsn, prev_lsn, type, txnp, fileid, step)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn, prev_lsn;
+ u_int32_t type; /* Log record type. */
+ DB_TXN *txnp;
+ int32_t fileid;
+ int *step;
+{
+ int dovrfy, ret;
+
+ dovrfy = 1;
+ ret = 0;
+ /*
+ * step tells the caller whether to go on with the rest of its body,
+ * or to goto its err/out label.
+ * 0: go on after this function; 1: goto out; -1: goto err.
+ */
+ *step = 0;
+
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ /* Commits are not abort/beginnings. */
+ if (NOTCOMMIT(type) && ((ret = __lv_log_fwdscr_onrec(
+ lvh, txnp->txnid, type, prev_lsn, lsn)) != 0))
+ goto err;
+ if (SKIP_FORWARD_CHK(type))
+ goto out;
+ } else {/* Verifying */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_VERBOSE))
+ __db_errx(lvh->dbenv->env, DB_STR_A("2539",
+ "[%lu][%lu] Verifying log record of type %s",
+ "%lu %lu %s"), (u_long)lsn.file,
+ (u_long)lsn.offset, LOGTYPE_NAME(lvh, type));
+ /*
+ * If verifying a log range and we've passed the initial part
+ * which may have partial txns, remove the PARTIAL bit.
+ */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL) &&
+ LOG_COMPARE(&lsn, &(lvh->valid_lsn)) >= 0) {
+ lvh->valid_lsn.offset = lvh->valid_lsn.file = 0;
+ F_CLR(lvh, DB_LOG_VERIFY_PARTIAL);
+ }
+
+ if ((ret = __lv_new_logfile_vrfy(lvh, &lsn)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ /* If only verify a db file, ignore logs about other dbs. */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_DBFILE) && fileid !=
+ INVAL_DBREGID && (ret = __lv_vrfy_for_dbfile(lvh,
+ fileid, &dovrfy)) != 0)
+ goto err;
+ if (!dovrfy)
+ goto out;
+ if (lvh->aborted_txnid != 0 &&
+ ((ret = __lv_on_txn_aborted(lvh)) != 0))
+ goto err;
+ if ((ret = __get_aborttxn(lvh, lsn)) != 0)
+ goto err;
+ if (txnp->txnid >= TXN_MINIMUM) {
+ if ((ret = __lv_on_txn_logrec(lvh, &lsn, &(prev_lsn),
+ txnp, type, fileid)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ } else {/* Non-txnal updates. */
+ if ((ret = __lv_on_nontxn_update(lvh, &lsn,
+ txnp->txnid, type, fileid)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ }
+ if (0) {
+out:
+ *step = 1;
+ }
+ if (0) {
+err:
+ *step = -1;
+ }
+ return (ret);
+}
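+
+/*
+ * The "if (0)" blocks above are entered only via goto: the main path falls
+ * past them, while "goto out" and "goto err" set the step value that the
+ * LOG_VRFY_PROC macro inspects in the calling verify function.
+ */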
+
+/* Log record handlers used by log types for each access method. */
+static int
+__lv_on_bam_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_BTREE && dbtype != DB_RECNO && dbtype != DB_HASH)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_BTREE);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+static int
+__lv_on_ham_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_HASH)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_HASH);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+static int
+__lv_on_heap_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_HEAP)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_HEAP);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+static int
+__lv_on_qam_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_QUEUE)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_QUEUE);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+/* Catch commits and store into lvinfo->txnrngs database. */
+static int
+__lv_log_fwdscr_oncmt(lvinfo, lsn, txnid, ptxnid, timestamp)
+ DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ u_int32_t txnid, ptxnid;
+ int32_t timestamp;
+{
+ int ret;
+ struct __lv_txnrange tr;
+ DBT key, data;
+
+ memset(&tr, 0, sizeof(tr));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ tr.txnid = txnid;
+ tr.end = lsn;
+ tr.when_commit = timestamp;
+ tr.ptxnid = ptxnid;
+ key.data = &(txnid);
+ key.size = sizeof(txnid);
+ data.data = &tr;
+ data.size = sizeof(tr);
+ if ((ret = __db_put(lvinfo->txnrngs, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0)
+ goto err;
+err:
+ return (ret);
+}
+
+/* Catch aborts and txn beginnings and store into lvinfo->txnrngs database. */
+static int
+__lv_log_fwdscr_onrec(lvinfo, txnid, lrtype, prevlsn, lsn)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid, lrtype;
+ DB_LSN prevlsn, lsn;
+{
+ int doput, ret, ret2, tret;
+ u_int32_t putflag;
+ struct __lv_txnrange tr, *ptr;
+ DBC *csr;
+ DBT key, key2, data, data2;
+
+ /* Ignore non-txnal log records. */
+ if (txnid < TXN_MINIMUM)
+ return (0);
+
+ /* Not used for now, but may be used later; pass lint checks. */
+ COMPQUIET(lrtype, 0);
+ putflag = 0;
+ doput = ret = ret2 = 0;
+ csr = NULL;
+ memset(&tr, 0, sizeof(tr));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+ tr.txnid = txnid;
+ tr.when_commit = 0;/* This is not a __txn_regop record. */
+
+ if ((ret = __db_cursor(lvinfo->txnrngs, lvinfo->ip,
+ NULL, &csr, 0)) != 0)
+ goto err;
+ /*
+ * Since we scan the log backwards during this pass, if the txnid is
+ * first seen here or is reused later, the txn was aborted after this
+ * log record; if this log record is the first one of a txn, we have
+ * the beginning of the txn; otherwise the log record is one of the
+ * actions taken within the txn, and we don't do anything.
+ */
+ if ((ret = __dbc_get(csr, &key, &data, DB_SET)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ ptr = (struct __lv_txnrange *)data.data;
+ if (ret == DB_NOTFOUND || !IS_ZERO_LSN(ptr->begin)) {
+ tr.end = lsn;
+ data.data = &tr;
+ data.size = sizeof(tr);
+ doput = 1;
+ key2.data = &lsn;
+ key2.size = sizeof(lsn);
+ data2.data = &(tr.txnid);
+ data2.size = sizeof(tr.txnid);
+ putflag = DB_KEYFIRST;
+ if ((ret2 = __db_put(lvinfo->txnaborts, lvinfo->ip, NULL,
+ &key2, &data2, 0)) != 0) {
+ ret = ret2;
+ goto err;
+ }
+ } else if (ret == 0 && IS_ZERO_LSN(prevlsn)) {/* The beginning of txn.*/
+ /* The begin field must be [0, 0]. */
+ DB_ASSERT(lvinfo->dbenv->env, IS_ZERO_LSN(ptr->begin));
+ ptr->begin = lsn;
+ putflag = DB_CURRENT;
+ doput = 1;
+ }
+
+ if (doput && (ret = __dbc_put(csr, &key, &data, putflag)) != 0)
+ goto err;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+
+ return (ret);
+}
+
+/*
+ * Set *dovrfy to 0 if we are verifying logs for a specified db file and
+ * fileid is not the one we want; otherwise set *dovrfy to 1. If a DB
+ * operation fails, its error is returned.
+ */
+static int
+__lv_vrfy_for_dbfile(lvh, fileid, dovrfy)
+ DB_LOG_VRFY_INFO *lvh;
+ int32_t fileid;
+ int *dovrfy;
+{
+ VRFY_FILEREG_INFO *fregp;
+ u_int32_t i;
+ int ret, tret;
+ DBT tgtkey;
+
+ ret = tret = 0;
+ *dovrfy = 0;
+ fregp = NULL;
+ memset(&tgtkey, 0, sizeof(tgtkey));
+ tgtkey.data = lvh->target_dbid;
+ tgtkey.size = DB_FILE_ID_LEN;
+ ret = __get_filereg_info(lvh, &tgtkey, &fregp);
+
+ /*
+ * If the target db file is not seen yet, we don't verify any file,
+ * and it does not mean anything wrong.
+ */
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto out;
+ }
+ if (ret != 0)
+ goto err;
+
+ for (i = 0; i < fregp->regcnt; i++)
+ if (fregp->dbregids[i] == fileid) {
+ *dovrfy = 1;
+ goto out;
+ }
+out:
+err:
+ if (fregp != NULL &&
+ (tret = __free_filereg_info(fregp)) != 0 && ret == 0)
+ ret = tret;
+
+ return (ret);
+}
+
+static int
+__lv_log_mismatch(lvh, lsn, dbtype, exp_dbtype)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ DBTYPE dbtype, exp_dbtype;
+{
+ int ret;
+
+ __db_errx(lvh->dbenv->env, DB_STR_A("2540",
+ "[%lu][%lu] Log record type does not match related database type, "
+ "current database type: %s, expected database type according to "
+ "the log record type: %s.", "%lu %lu %s %s"),
+ (u_long)lsn.file, (u_long)lsn.offset, __lv_dbtype_str(dbtype),
+ __lv_dbtype_str(exp_dbtype));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+err:
+ return (ret);
+}
+
+static int
+__lv_dbregid_to_dbtype(lvh, id, ptype)
+ DB_LOG_VRFY_INFO *lvh;
+ int32_t id;
+ DBTYPE *ptype;
+{
+ int ret;
+ VRFY_FILELIFE *pflife;
+
+ ret = 0;
+ pflife = NULL;
+
+ if ((ret = __get_filelife(lvh, id, &pflife)) != 0)
+ goto err;
+ *ptype = pflife->dbtype;
+err:
+ if (pflife != NULL)
+ __os_free(lvh->dbenv->env, pflife);
+
+ return (ret);
+}
+
+/*
+ * __db_log_verify_global_report --
+ * Report statistics data in DB_LOG_VRFY_INFO handle.
+ *	Report the statistics stored in the DB_LOG_VRFY_INFO handle.
+ * PUBLIC: void __db_log_verify_global_report __P((const DB_LOG_VRFY_INFO *));
+ */
+void
+__db_log_verify_global_report(lvinfo)
+ const DB_LOG_VRFY_INFO *lvinfo;
+{
+ u_int32_t i, nltype;
+
+ __db_msg(lvinfo->dbenv->env,
+ "Number of active transactions: %u;", lvinfo->ntxn_active);
+ __db_msg(lvinfo->dbenv->env,
+ "Number of committed transactions: %u;", lvinfo->ntxn_commit);
+ __db_msg(lvinfo->dbenv->env,
+ "Number of aborted transactions: %u;", lvinfo->ntxn_abort);
+ __db_msg(lvinfo->dbenv->env,
+ "Number of prepared transactions: %u;", lvinfo->ntxn_prep);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of checkpoint: %u;", lvinfo->nckp);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of non-transactional updates: %u;",
+ lvinfo->non_txnup_cnt);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of unknown log records: %u;",
+ lvinfo->unknown_logrec_cnt);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of app-specific log record: %u;",
+ lvinfo->external_logrec_cnt);
+ __db_msg(lvinfo->dbenv->env,
+ "The number of each type of log record:");
+
+ for (i = 0; i < 256; i++) {
+ nltype = lvinfo->lrtypes[i];
+ if (LOGTYPE_NAME(lvinfo, i) != NULL)
+ __db_msg(lvinfo->dbenv->env, "\n\t%s : %u;",
+ LOGTYPE_NAME(lvinfo, i), nltype);
+ }
+}
+
+/*
+ * PUBLIC: int __crdel_metasub_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__crdel_metasub_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_metasub_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __crdel_metasub_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_inmem_create_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_inmem_rename_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_inmem_remove_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_addrem_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_addrem_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_addrem_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_big_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_big_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_big_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_big_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_ovref_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_ovref_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_ovref_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_ovref_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_relink_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_relink_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_relink_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_debug_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_debug_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_debug_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_noop_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_noop_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_noop_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_noop_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_alloc_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_alloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_alloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_free_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_free_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_free_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_free_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_cksum_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_cksum_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_cksum_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_freedata_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_freedata_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_freedata_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_freedata_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_freedata_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_init_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_init_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_init_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_sort_44_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_sort_44_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_sort_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_trunc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_trunc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_trunc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+out:
+err:
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_realloc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_realloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_realloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_realloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_relink_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_relink_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_relink_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_merge_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_merge_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_merge_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_merge_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pgno_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pgno_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pgno_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pgno_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+static const char *
+__lv_dbreg_str(op)
+ u_int32_t op;
+{
+ const char *p;
+
+ switch (op) {
+ case DBREG_CHKPNT:
+ p = "DBREG_CHKPNT";
+ break;
+ case DBREG_RCLOSE:
+ p = "DBREG_RCLOSE";
+ break;
+ case DBREG_CLOSE:
+ p = "DBREG_CLOSE";
+ break;
+ case DBREG_OPEN:
+ p = "DBREG_OPEN";
+ break;
+ case DBREG_PREOPEN:
+ p = "DBREG_PREOPEN";
+ break;
+ case DBREG_REOPEN:
+ p = "DBREG_REOPEN";
+ break;
+ case DBREG_XCHKPNT:
+ p = "DBREG_XCHKPNT";
+ break;
+ case DBREG_XOPEN:
+ p = "DBREG_XOPEN";
+ break;
+ case DBREG_XREOPEN:
+ p = "DBREG_XREOPEN";
+ break;
+ default:
+ p = DB_STR_P("Unknown dbreg op code");
+ break;
+ }
+
+ return (p);
+}
+
+static int
+__lv_dbt_str(dbt, str)
+ const DBT *dbt;
+ char **str;
+{
+ char *p, *q;
+ u_int32_t buflen, bufsz, i;
+ int ret;
+
+ ret = 0;
+ p = q = NULL;
+ buflen = bufsz = i = 0;
+ bufsz = sizeof(char) * dbt->size * 2;
+
+ if ((ret = __os_malloc(NULL, bufsz, &p)) != 0)
+ goto err;
+ q = (char *)dbt->data;
+
+ memset(p, 0, bufsz);
+ /*
+ * Each unprintable character takes up several bytes, so beware of
+ * memory access violations.
+ */
+ for (i = 0; i < dbt->size && buflen < bufsz; i++) {
+ buflen = (u_int32_t)strlen(p);
+ snprintf(p + buflen, bufsz - (buflen + 1),
+ isprint(q[i]) || q[i] == 0x0a ? "%c" : "%x", q[i]);
+ }
+ *str = p;
+err:
+ return (ret);
+}
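+
+/*
+ * For example (illustrative): a DBT holding the three bytes {0x01, 'a', 'b'}
+ * is rendered by __lv_dbt_str as the string "1ab".
+ */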
+
+static const char *
+__lv_dbtype_str(dbtype)
+ DBTYPE dbtype;
+{
+ const char *p;
+
+ switch (dbtype) {
+ case DB_BTREE:
+ p = "DB_BTREE";
+ break;
+ case DB_HASH:
+ p = "DB_HASH";
+ break;
+ case DB_RECNO:
+ p = "DB_RECNO";
+ break;
+ case DB_QUEUE:
+ p = "DB_QUEUE";
+ break;
+ default:
+ p = DB_STR_P("Unknown db type");
+ break;
+ }
+
+ return (p);
+}
+
+/*
+ * PUBLIC: int __dbreg_register_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__dbreg_register_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __dbreg_register_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_FILEREG_INFO *fregp, freg;
+ VRFY_FILELIFE *pflife, flife;
+ int checklife, rmv_dblife, ret, ret2;
+ u_int32_t opcode;
+ char *puid;
+ const char *dbfname;
+
+ dbfname = NULL;
+ checklife = 1;
+ opcode = 0;
+ ret = ret2 = rmv_dblife = 0;
+ puid = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ fregp = NULL;
+ pflife = NULL;
+ memset(&flife, 0, sizeof(flife));
+ memset(&freg, 0, sizeof(freg));
+
+ if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
+ dbfname = argp->name.size == 0 ? "(null)" : (char *)(argp->name.data);
+ /*
+ * We don't call LOG_VRFY_PROC macro here, so we have to copy the code
+ * snippet in __log_vrfy_proc here.
+ */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ if ((ret = __lv_log_fwdscr_onrec(lvh, argp->txnp->txnid,
+ argp->type, argp->prev_lsn, *lsnp)) != 0)
+ goto err;
+ goto out;
+ }
+ if (lvh->aborted_txnid != 0 && (ret = __lv_on_txn_aborted(lvh)) != 0)
+ goto err;
+
+ if ((ret = __get_filereg_info(lvh, &(argp->uid), &fregp)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * On DBREG_CLOSE, we should remove the fileuid-filename mapping
+ * from filereg because the file can be opened again with a
+ * different fileuid after it is closed.
+ */
+ if (ret == 0 && IS_DBREG_CLOSE(opcode)) {
+ if ((ret = __db_del(lvh->fileregs, lvh->ip, NULL,
+ &(argp->uid), 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * If this db file is seen for the first time, store filereg and
+ * filelife info. Since we do an end-to-begin scan before the
+ * verification, we will be able to get the record, but its regcnt
+ * is 0 because we didn't know any dbregid yet.
+ */
+ if (ret == DB_NOTFOUND || fregp->regcnt == 0) {
+ /* Store filereg info unless it's a CLOSE. */
+ freg.fileid = argp->uid;
+ if (!IS_DBREG_CLOSE(opcode)) {
+ freg.regcnt = 1;
+ freg.dbregids = &(argp->fileid);
+ } else {
+ freg.regcnt = 0;
+ freg.dbregids = NULL;
+ }
+ if (ret == DB_NOTFOUND) {
+ /*
+ * If the db file is an in-memory db file, we can arrive
+ * here because there is no __fop_rename log for it;
+ * if the __fop_rename log record is out of the log range we
+ * verify, we will also arrive here.
+ */
+ if ((ret = __os_malloc(env, argp->name.size + 1,
+ &(freg.fname))) != 0)
+ goto err;
+ memset(freg.fname, 0,
+ sizeof(char) * (argp->name.size + 1));
+ (void)strncpy(freg.fname,
+ (const char *)(argp->name.data), argp->name.size);
+ } else /* We already have the name. */
+ if ((ret = __os_strdup(env,
+ fregp->fname, &(freg.fname))) != 0)
+ goto err;
+
+ if (!IS_DBREG_OPEN(opcode) &&
+ !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ /* It's likely that the DBREG_OPEN is not seen. */
+ __db_msg(env, DB_STR_A("2541",
+ "[%lu][%lu] Suspicious dbreg operation: %s, the "
+ "database file %s's register in log region does "
+ "not begin with an open operation.",
+ "%lu %lu %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ __lv_dbreg_str(opcode), dbfname);
+ }
+
+ /*
+ * PREOPEN is only generated when opening an in-memory db.
+ * Because we need to log the fileid we're allocating, but we
+ * don't have all the details yet, we are preopening the
+ * database and will actually complete the open later. So
+ * PREOPEN is not a real open, and the log should be ignored
+ * in log_verify.
+ * If fileuid is in a CLOSE operation there is no need to
+ * record it.
+ */
+ if ((opcode != DBREG_PREOPEN) && !IS_DBREG_CLOSE(opcode) &&
+ (ret = __put_filereg_info(lvh, &freg)) != 0)
+ goto err;
+
+ /* Store filelife info unless it's a CLOSE dbreg operation. */
+ if (!IS_DBREG_CLOSE(opcode)) {
+ flife.lifetime = opcode;
+ flife.dbregid = argp->fileid;
+ flife.lsn = *lsnp;
+ flife.dbtype = argp->ftype;
+ flife.meta_pgno = argp->meta_pgno;
+ memcpy(flife.fileid, argp->uid.data, argp->uid.size);
+ if ((ret = __put_filelife(lvh, &flife)) != 0)
+ goto err;
+ }
+ /* on_txn_logrec relies on the freg info in db first. */
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ goto out;
+ }
+
+ /*
+ * Add dbregid if it's new, and store the file register info; or
+ * remove dbregid from fregp if we are closing the file.
+ */
+ if ((ret = __add_dbregid(lvh, fregp, argp->fileid,
+ opcode, *lsnp, argp->ftype, argp->meta_pgno, &ret2)) != 0)
+ goto err;
+ ret = ret2;
+ if (ret != 0 && ret != 1 && ret != 2 && ret != -1)
+ goto err;/* DB operation error. */
+ if (ret != 0) {
+ /* Newly seen dbregid does not need to check life. */
+ if (ret == 1)
+ checklife = 0;
+ else if (ret == -1)
+ rmv_dblife = 1;/* The dbreg file id is closed. */
+ else if (ret == 2) {
+ __db_errx(env, DB_STR_A("2542",
+ "[%lu][%lu] Wrong dbreg operation "
+ "sequence, opening %s for id %d which is already "
+ "open.", "%lu %lu %s %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ dbfname, argp->fileid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ if (!rmv_dblife && (ret = __put_filereg_info(lvh, fregp)) != 0)
+ goto err;
+ }
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ if (!checklife)
+ goto out;
+
+ /*
+ * Verify that the database type does not change, and that the lifetime
+ * of a db file follows an open/chkpnt->[chkpnt]->close order.
+ * A VRFY_FILELIFE record is removed from db on DBREG_CLOSE,
+ * and inserted into db on DBREG_OPEN.
+ */
+ if (!IS_DBREG_OPEN(opcode) &&
+ (ret = __get_filelife(lvh, argp->fileid, &pflife)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (!F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ __db_errx(env, DB_STR_A("2543",
+ "[%lu][%lu] Wrong dbreg operation sequence,"
+ "file %s with id %d is first seen of "
+ "status: %s", "%lu %lu %s %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ dbfname, argp->fileid,
+ __lv_dbreg_str(opcode));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ } else
+ ret = 0;
+ }
+ goto err;
+ }
+
+ /* Can't go on verifying without pflife. */
+ if (pflife == NULL)
+ goto out;
+ if (argp->ftype != pflife->dbtype) {
+ if ((ret = __lv_dbt_str(&(argp->uid), &puid)) != 0)
+ goto err;
+ __db_errx(env, DB_STR_A("2544",
+ "[%lu][%lu] The dbtype of database file %s with uid %s "
+ " and id %d has changed from %s to %s.",
+ "%lu %lu %s %s %d %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, dbfname, puid,
+ pflife->dbregid, __lv_dbtype_str(pflife->dbtype),
+ __lv_dbtype_str(argp->ftype));
+
+ __os_free(env, puid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ if ((IS_DBREG_CLOSE(opcode) &&
+ (pflife->lifetime != DBREG_CHKPNT &&
+ pflife->lifetime != DBREG_XCHKPNT) &&
+ !IS_DBREG_OPEN(pflife->lifetime))) {
+ __db_errx(env, DB_STR_A("2545",
+ "[%lu][%lu] Wrong dbreg operation sequence for file %s "
+ "with id %d, current status: %s, new status: %s",
+ "%lu %lu %s %d %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, dbfname, pflife->dbregid,
+ __lv_dbreg_str(pflife->lifetime),
+ __lv_dbreg_str(opcode));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ pflife->lifetime = opcode;
+ pflife->lsn = *lsnp;
+ if ((!rmv_dblife && (ret = __put_filelife(lvh, pflife)) != 0) ||
+ ((rmv_dblife || IS_DBREG_CLOSE(opcode)) &&
+ ((ret = __del_filelife(lvh, argp->fileid)) != 0)))
+ goto err;
+
+out:
+ /* There may be something to do here in future. */
+err:
+ __os_free(env, argp);
+ if (fregp != NULL &&
+ (ret2 = __free_filereg_info(fregp)) != 0 && ret == 0)
+ ret = ret2;
+ if (freg.fname != NULL)
+ __os_free(env, freg.fname);
+ if (pflife != NULL)
+ __os_free(env, pflife);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_split_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_split_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_split_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->left);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->right);
+ /* Parent page lock is always released before __bam_page returns. */
+
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_split_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_split_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_split_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rsplit_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_rsplit_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_rsplit_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_adj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_adj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_adj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_adj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_irep_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_irep_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_irep_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_irep_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cadjust_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_cadjust_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_cadjust_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cdel_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_cdel_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_cdel_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_repl_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_repl_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_repl_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_repl_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_root_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_root_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_root_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_root_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_curadj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_curadj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_curadj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_curadj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rcuradj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_rcuradj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_rcuradj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_relink_43_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_relink_43_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_relink_43_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_relink_43_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_merge_44_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_merge_44_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_merge_44_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_merge_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_create_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_create_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_create_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_create_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_create_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_create_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_remove_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_remove_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_write_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_write_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_write_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_write_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_rename_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_rename_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_rename_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ char *buf;
+ int ret;
+ size_t buflen;
+ VRFY_FILEREG_INFO freg, *fregp;
+
+ memset(&freg, 0, sizeof(freg));
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ buf = NULL;
+
+ if ((ret = __fop_rename_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ /*
+ * Since we build the fname-fuid map while iterating from end to
+ * beginning, we only store the latest file name, which is the
+ * name currently in use. So if the fileid is already stored and
+ * we see it again here, the db file was renamed and we already
+ * have its latest name.
+ *
+ * Store the dbfile path (dir/fname) in case there are db files
+ * with the same name in different data directories.
+ */
+ if (__get_filereg_info(lvh, &(argp->fileid), &fregp) == 0) {
+ if (fregp != NULL &&
+ (ret = __free_filereg_info(fregp)) != 0)
+ goto err;
+ goto out;
+ }
+ freg.fileid = argp->fileid;
+ if ((ret = __os_malloc(env, buflen = argp->dirname.size +
+ argp->newname.size + 2, &buf)) != 0)
+ goto err;
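+ /*
+ * The two extra bytes hold the '/' separator and the
+ * terminating NUL; this assumes dirname.data and newname.data
+ * are NUL-terminated strings, as the %s formats below require.
+ */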
+ snprintf(buf, buflen, "%s/%s", (char *)argp->dirname.data,
+ (char *)argp->newname.data);
+ freg.fname = buf;
+ /* Store the dbfilename<-->dbfileid map. */
+ if ((ret = __put_filereg_info(lvh, &freg)) != 0)
+ goto err;
+ }
+out:
+
+err:
+ if (buf != NULL)
+ __os_free(lvh->dbenv->env, buf);
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_file_remove_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_file_remove_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_file_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+#ifdef HAVE_HASH
+/*
+ * PUBLIC: int __ham_insdel_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_insdel_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_insdel_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_insdel_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_newpage_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_newpage_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_newpage_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_newpage_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_splitdata_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_splitdata_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_splitdata_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_splitdata_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_replace_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_replace_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_replace_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_replace_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_copypage_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_copypage_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_copypage_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_copypage_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_metagroup_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_metagroup_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_metagroup_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_metagroup_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_42_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__ham_groupalloc_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_groupalloc_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_groupalloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_groupalloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_groupalloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_FILELIFE *pflife;
+ int ret;
+
+ ret = 0;
+ pflife = NULL;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_groupalloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+
+ /*
+ * The __ham_groupalloc record is only generated when creating
+ * the hash subdatabase, so it will always be on the master
+ * database's fileid.
+ */
+
+ if ((ret = __get_filelife(lvh, argp->fileid, &pflife)) != 0)
+ goto err;
+
+ if (pflife->meta_pgno != PGNO_BASE_MD) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2546",
+ "[%lu][%lu] __ham_groupalloc should apply only to the "
+ "master database with meta page number 0, current meta "
+ "page number is %d.", "%lu %lu %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ pflife->meta_pgno);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+out:
+
+err:
+ if (pflife != NULL)
+ __os_free(lvh->dbenv->env, pflife);
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_changeslot_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_changeslot_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_changeslot_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_changeslot_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_contract_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_contract_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_contract_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_contract_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_curadj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_curadj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_curadj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_curadj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_chgpg_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_chgpg_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_chgpg_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_chgpg_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+#endif
+
+#ifdef HAVE_HEAP
+/*
+ * PUBLIC: int __heap_addrem_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_addrem_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __heap_pg_alloc_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_pg_alloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __heap_trunc_meta_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_meta_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_trunc_meta_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_trunc_meta_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __heap_trunc_page_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_page_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_trunc_page_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_trunc_page_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+#endif
+
+#ifdef HAVE_QUEUE
+/*
+ * PUBLIC: int __qam_incfirst_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_incfirst_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_incfirst_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_incfirst_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_mvptr_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_mvptr_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_mvptr_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_mvptr_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_del_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_del_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_del_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_del_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_add_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_add_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_add_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_add_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_delext_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_delext_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_delext_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_delext_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+#endif
+
+/*
+ * PUBLIC: int __txn_regop_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_regop_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_regop_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_regop_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret, ret2, started;
+ VRFY_TXN_INFO *ptvi, *pptvi;
+ VRFY_TIMESTAMP_INFO tsinfo;
+
+ ptvi = pptvi = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ ret = ret2 = started = 0;
+
+ if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * The __lv_log_fwdscr_oncmt call must precede LOG_VRFY_PROC,
+ * otherwise this txn would be taken as an aborted txn.
+ */
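+ /*
+ * When scanning with DB_LOG_VERIFY_FORWARD set we only record
+ * this commit and its timestamp for later use; the detailed
+ * checks below run in the main verification pass.
+ */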
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ if ((ret = __lv_log_fwdscr_oncmt(lvh, *lsnp,
+ argp->txnp->txnid, 0, argp->timestamp)) != 0)
+ goto err;
+
+ tsinfo.lsn = *lsnp;
+ tsinfo.timestamp = argp->timestamp;
+ tsinfo.logtype = argp->type;
+ if ((ret = __put_timestamp_info(lvh, &tsinfo)) != 0)
+ goto err;
+ goto out; /* We are done. */
+ }
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if ((ret = __del_txn_pages(lvh, argp->txnp->txnid)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;/* Some txns may have updated no pages. */
+ if ((ret = __lv_on_timestamp(lvh, lsnp, argp->timestamp,
+ DB___txn_regop)) != 0)
+ goto err;
+ if ((ret = __get_txn_vrfy_info(lvh, argp->txnp->txnid, &ptvi)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ (ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->txnp->txnid, &started)) == 0 && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2547",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ DB_ASSERT(env, ptvi->ptxnid == 0);
+
+ /*
+ * This log record is only logged when committing an outermost
+ * txn; child txn commits are logged by __txn_child_log.
+ */
+ if (ptvi->ptxnid == 0) {
+ if (ptvi->status == TXN_STAT_PREPARE)
+ lvh->ntxn_prep--;
+ else if (ptvi->status == TXN_STAT_ACTIVE)
+ lvh->ntxn_active--;
+ lvh->ntxn_commit++;
+ }
+ ptvi->status = TXN_STAT_COMMIT;
+ DB_ASSERT(env, IS_ZERO_LSN(ptvi->last_lsn));
+ ptvi->last_lsn = *lsnp;
+ if ((ret = __put_txn_vrfy_info(lvh, ptvi)) != 0)
+ goto err;
+
+ /* Report txn stats. */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_VERBOSE))
+ __db_msg(env, DB_STR_A("2548",
+ "[%lu][%lu] The number of active, committed and aborted "
+ "child txns of txn %lx: %u, %u, %u.",
+ "%lu %lu %lx %u %u %u"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)ptvi->txnid,
+ ptvi->nchild_active, ptvi->nchild_commit,
+ ptvi->nchild_abort);
+out:
+err:
+
+ if (pptvi != NULL && (ret2 = __free_txninfo(pptvi)) != 0 && ret == 0)
+ ret = ret2;
+ if (ptvi != NULL && (ret2 = __free_txninfo(ptvi)) != 0 && ret == 0)
+ ret = ret2;
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_ckp_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_ckp_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_CKP_INFO *lastckp, ckpinfo;
+ int ret;
+ struct __ckp_verify_params cvp;
+ VRFY_TIMESTAMP_INFO tsinfo;
+ char timebuf[CTIME_BUFLEN];
+ time_t ckp_time, lastckp_time;
+
+ lastckp = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ memset(&ckpinfo, 0, sizeof(ckpinfo));
+ memset(&cvp, 0, sizeof(cvp));
+
+ if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ tsinfo.lsn = *lsnp;
+ tsinfo.timestamp = argp->timestamp;
+ tsinfo.logtype = argp->type;
+ /*
+ * Store the first ckp_lsn, or the smallest one greater than
+ * the starting point; skip the update if a start LSN was given
+ * and is greater than this ckp_lsn. There will be no partial
+ * txns after valid_lsn.
+ */
+ if (!(!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ LOG_COMPARE(&(lvh->lv_config->start_lsn),
+ &(argp->ckp_lsn)) > 0))
+ lvh->valid_lsn = argp->ckp_lsn;
+ if ((ret = __put_timestamp_info(lvh, &tsinfo)) != 0)
+ goto err;
+ goto out;/* We are done, exit. */
+ }
+ lvh->nckp++;
+ ckp_time = (time_t)argp->timestamp;
+ __db_msg(env, DB_STR_A("2549",
+ "[%lu][%lu] Checkpoint record, ckp_lsn: [%lu][%lu], "
+ "timestamp: %s. Total checkpoint: %u",
+ "%lu %lu %lu %lu %s %u"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->ckp_lsn.file,
+ (u_long)argp->ckp_lsn.offset,
+ __os_ctime(&ckp_time, timebuf), lvh->nckp);
+
+ if ((ret = __lv_on_timestamp(lvh, lsnp,
+ argp->timestamp, DB___txn_ckp)) != 0)
+ goto err;
+ if (((ret = __get_last_ckp_info(lvh, &lastckp)) != 0) &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND)
+ goto cont;
+
+ if (LOG_COMPARE(&(argp->last_ckp), &(lastckp->lsn)) != 0) {
+ __db_errx(env, DB_STR_A("2550",
+ "[%lu][%lu] Last known checkpoint [%lu][%lu] not equal "
+ "to last_ckp :[%lu][%lu]. Some checkpoint log records "
+ "may be missing.", "%lu %lu %lu %lu %lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)lastckp->lsn.file, (u_long)lastckp->lsn.offset,
+ (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ /*
+ * Checkpoints are generally not performed very often, so we
+ * treat this as an error, whereas for txn commits we treat it
+ * as a warning.
+ */
+ lastckp_time = (time_t)lastckp->timestamp;
+ if (argp->timestamp < lastckp->timestamp) {
+ __db_errx(env, DB_STR_A("2551",
+ "[%lu][%lu] Last known checkpoint [%lu, %lu] has a "
+ "timestamp %s smaller than this checkpoint timestamp %s.",
+ "%lu %lu %lu %lu %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)lastckp->lsn.file,
+ (u_long)lastckp->lsn.offset,
+ __os_ctime(&lastckp_time, timebuf),
+ __os_ctime(&ckp_time, timebuf));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+cont:
+ cvp.env = env;
+ cvp.lsn = *lsnp;
+ cvp.ckp_lsn = argp->ckp_lsn;
+
+ /*
+ * Verify that each active txn's first lsn is greater than
+ * argp->ckp_lsn.
+ */
+ if ((ret = __iterate_txninfo(lvh, 0, 0,
+ __lv_ckp_vrfy_handler, &cvp)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ ckpinfo.timestamp = argp->timestamp;
+ ckpinfo.lsn = *lsnp;
+ ckpinfo.ckplsn = argp->ckp_lsn;
+
+ if ((ret = __put_ckp_info(lvh, &ckpinfo)) != 0)
+ goto err;
+out:
+err:
+ if (argp)
+ __os_free(env, argp);
+ if (lastckp)
+ __os_free(env, lastckp);
+ return (ret);
+}
+
+static int
+__lv_ckp_vrfy_handler(lvinfo, txninfop, param)
+ DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_TXN_INFO *txninfop;
+ void *param;
+{
+ struct __ckp_verify_params *cvp;
+ int ret;
+
+ ret = 0;
+ cvp = (struct __ckp_verify_params *)param;
+ /* ckp_lsn should be less than any active txn's first lsn. */
+ if (txninfop->status == TXN_STAT_ACTIVE && LOG_COMPARE(&(cvp->ckp_lsn),
+ &(txninfop->first_lsn)) >= 0) {
+ __db_errx(cvp->env, DB_STR_A("2552",
+ "[%lu][%lu] ckp log's ckp_lsn [%lu][%lu] greater than "
+ "active txn %lx 's first lsn [%lu][%lu]",
+ "%lu %lu %lu %lu %lx %lu %lu"),
+ (u_long)cvp->lsn.file, (u_long)cvp->lsn.offset,
+ (u_long)cvp->ckp_lsn.file, (u_long)cvp->ckp_lsn.offset,
+ (u_long)txninfop->txnid,
+ (u_long)txninfop->first_lsn.file,
+ (u_long)txninfop->first_lsn.offset);
+ lvinfo->flags |= DB_LOG_VERIFY_ERR;
+ if (!F_ISSET(lvinfo, DB_LOG_VERIFY_CAF))
+ /* Stop the iteration. */
+ ret = DB_LOG_VERIFY_BAD;
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_child_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_child_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_child_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_TXN_INFO *ptvi, *ptvi2;
+ int ret, ret2, started;
+
+ /*
+ * This function is called when a txn T0's child txn T1 commits.
+ * Before this log record we don't know T0 and T1's relationship,
+ * which means we never see T0 with an active child txn T1; all
+ * child txns we know of are committed.
+ */
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ ptvi = ptvi2 = NULL;
+ ret = ret2 = started = 0;
+
+ if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * The __lv_log_fwdscr_oncmt call must precede LOG_VRFY_PROC,
+ * otherwise this txn would be taken as an aborted txn.
+ */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ if ((ret = __lv_log_fwdscr_oncmt(lvh, argp->c_lsn, argp->child,
+ argp->txnp->txnid, 0)) != 0)
+ goto err;
+ if ((ret = __lv_log_fwdscr_onrec(lvh, argp->txnp->txnid,
+ argp->type, argp->prev_lsn, *lsnp)) != 0)
+ goto err;
+ goto out;/* We are done. */
+ }
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if ((ret = __return_txn_pages(lvh, argp->child,
+ argp->txnp->txnid)) != 0 && ret != DB_NOTFOUND)
+ goto err;/* Some txns may have updated no pages. */
+
+ /* Update parent txn info. */
+ if ((ret = __get_txn_vrfy_info(lvh, argp->txnp->txnid, &ptvi)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ ((ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->txnp->txnid, &started)) == 0) && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2553",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ ptvi->nchild_commit++;
+ /*
+ * The start of this child txn caused lvh->ntxn_active to be
+ * incremented unnecessarily, so decrement it.
+ */
+ lvh->ntxn_active--;
+ if (ptvi->status != TXN_STAT_ACTIVE) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2554",
+ "[%lu][%lu] Parent txn %lx ended "
+ "before child txn %lx ends.", "%lu %lu %lx %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid, (u_long)argp->child);
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ if ((ret = __put_txn_vrfy_info(lvh, ptvi)) != 0)
+ goto err;
+
+ /* Update child txn info. */
+ if ((ret = __get_txn_vrfy_info(lvh, argp->child, &ptvi2)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ ((ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->child, &started)) == 0) && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2555",
+ "[%lu][%lu] Can not find an active "
+ "transaction's information, txnid: %lx.",
+ "%lu %lu %lx"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->child);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+ if (ptvi2 == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ if (ptvi2->status != TXN_STAT_ACTIVE) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2556",
+ "[%lu][%lu] Txn %lx ended before it commits.",
+ "%lu %lu %lx"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->child);
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ ptvi2->status = TXN_STAT_COMMIT;
+ if ((ret = __put_txn_vrfy_info(lvh, ptvi2)) != 0)
+ goto err;
+out:
+err:
+ __os_free(env, argp);
+ if (ptvi != NULL && (ret2 = __free_txninfo(ptvi)) != 0 && ret == 0)
+ ret = ret2;
+ if (ptvi2 != NULL && (ret2 = __free_txninfo(ptvi2)) != 0 && ret == 0)
+ ret = ret2;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_xa_regop_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_xa_regop_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_xa_regop_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_prepare_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_prepare_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_prepare_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_TXN_INFO *ptvi;
+ int ret, ret2, started;
+
+ ret = ret2 = started = 0;
+ ptvi = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+ if ((ret = __get_txn_vrfy_info(lvh, argp->txnp->txnid, &ptvi)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ ((ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->txnp->txnid, &started)) == 0) && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2557",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ DB_ASSERT(env,
+ (IS_ZERO_LSN(ptvi->prep_lsn) && ptvi->status != TXN_STAT_PREPARE) ||
+ (!IS_ZERO_LSN(ptvi->prep_lsn) && ptvi->status == TXN_STAT_PREPARE));
+
+ lvh->ntxn_prep++;
+ lvh->ntxn_active--;
+
+ if (!IS_ZERO_LSN(ptvi->prep_lsn)) {/* Prepared more than once. */
+
+ __db_errx(lvh->dbenv->env, DB_STR_A("2558",
+ "[%lu][%lu] Multiple txn_prepare log record for "
+ "transaction %lx, previous prepare lsn: [%lu, %lu].",
+ "%lu %lu %lx %lu %lu"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->txnp->txnid,
+ (u_long)ptvi->prep_lsn.file, (u_long)ptvi->prep_lsn.offset);
+ } else {
+ ptvi->prep_lsn = *lsnp;
+ ptvi->status = TXN_STAT_PREPARE;
+ }
+ ret = __put_txn_vrfy_info(lvh, ptvi);
+out:
+err:
+ __os_free(env, argp);
+ if (ptvi != NULL && (ret2 = __free_txninfo(ptvi)) != 0 && ret == 0)
+ ret = ret2;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_recycle_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_recycle_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_recycle_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ ret = 0;
+
+ if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+ /* Add recycle info for all txns whose ID is in the [min, max] range. */
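+ /*
+ * A txn id in [min, max] may legally reappear later; recording
+ * this recycle LSN lets __lv_on_new_txn treat such a reuse as
+ * valid rather than as an illegal duplicate txn id.
+ */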
+ ret = __add_recycle_lsn_range(lvh, lsnp, argp->min, argp->max);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/* Handle log types having timestamps; so far only __txn_ckp and __txn_regop. */
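+/*
+ * Timestamps are expected to be non-decreasing in LSN order; a
+ * violation is only reported as a warning, and only in verbose mode,
+ * presumably because wall-clock time can legitimately move backwards.
+ */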
+static int
+__lv_on_timestamp(lvh, lsn, timestamp, logtype)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsn;
+ int32_t timestamp;
+ u_int32_t logtype;
+{
+ VRFY_TIMESTAMP_INFO *ltsinfo;
+ int ret;
+
+ ltsinfo = NULL;
+ ret = 0;
+ if ((ret = __get_latest_timestamp_info(lvh, *lsn, &ltsinfo)) == 0) {
+ DB_ASSERT(lvh->dbenv->env, ltsinfo != NULL);
+ if (ltsinfo->timestamp >= timestamp &&
+ F_ISSET(lvh, DB_LOG_VERIFY_VERBOSE)) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2559",
+ "[%lu][%lu] [WARNING] This log record of type %s "
+ "does not have a greater time stamp than "
+ "[%lu, %lu] of type %s", "%lu %lu %s %lu %lu %s"),
+ (u_long)lsn->file, (u_long)lsn->offset,
+ LOGTYPE_NAME(lvh, logtype),
+ (u_long)ltsinfo->lsn.file,
+ (u_long)ltsinfo->lsn.offset,
+ LOGTYPE_NAME(lvh, ltsinfo->logtype));
+ lvh->flags |= DB_LOG_VERIFY_WARNING;
+ }
+ }
+ if (ltsinfo != NULL)
+ __os_free(lvh->dbenv->env, ltsinfo);
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
+
+/*
+ * Called whenever the log record belongs to a transaction.
+ */
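+/*
+ * It bumps the per-type record count, begins tracking a txn on its
+ * first record (zero prev_lsn), verifies the prev_lsn back-chain,
+ * and records which db files the txn has updated.
+ */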
+static int
+__lv_on_txn_logrec(lvh, lsnp, prev_lsnp, txnp, type, dbregid)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+ const DB_LSN *prev_lsnp;
+ const DB_TXN *txnp;
+ u_int32_t type;
+ int32_t dbregid;
+{
+ DBT fid;
+ VRFY_TXN_INFO *pvti;
+ u_int32_t txnid;
+ VRFY_FILEREG_INFO *fregp;
+ int ret, ret2, started;
+
+ ret = ret2 = started = 0;
+ pvti = NULL;
+ fregp = NULL;
+ lvh->lrtypes[type]++;/* Increment per-type log record count. */
+ txnid = txnp->txnid;
+ memset(&fid, 0, sizeof(fid));
+
+ if (dbregid == INVAL_DBREGID)
+ goto cont;
+ if ((ret = __get_filereg_by_dbregid(lvh, dbregid, &fregp)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ /*
+ * It's likely that we are verifying a subset of logs
+ * and the DBREG_OPEN is outside the range.
+ */
+ if (!F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ __db_msg(lvh->dbenv->env, DB_STR_A("2560",
+ "[%lu][%lu] Transaction %lx is updating a "
+ "db file %d not registered.",
+ "%lu %lu %lx %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)txnp->txnid, dbregid);
+ goto cont;
+ } else
+ goto err;
+ }
+
+ fid = fregp->fileid;
+cont:
+ if (IS_ZERO_LSN(*prev_lsnp) &&
+ (ret = __lv_on_new_txn(lvh, lsnp, txnp, type, dbregid, &fid)) != 0)
+ goto err;
+
+ if ((ret = __get_txn_vrfy_info(lvh, txnid, &pvti)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ /* If we can't find the txn, there is an internal error. */
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ /*
+ * If verifying from the middle, it's expected that txns begun
+ * before the start LSN are not found.
+ */
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) && ((ret2 =
+ __txn_started(lvh, lvh->lv_config->start_lsn, txnid,
+ &started)) == 0) && started != 0) {
+ ret = 0;
+ goto out;/* We are done. */
+ }
+ if (ret2 != 0)
+ ret = ret2;
+
+ __db_errx(lvh->dbenv->env, DB_STR_A("2561",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset, (u_long)txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ }
+
+ /* Can't proceed without the txn info. */
+ if (pvti == NULL) {
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+ }
+
+ /* If the prev lsn is wrong, some log records may be missing. */
+ if (!IS_ZERO_LSN(*prev_lsnp) &&
+ LOG_COMPARE(prev_lsnp, &(pvti->cur_lsn)) != 0) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2562",
+ "[%lu][%lu] Previous record for transaction %lx is "
+ "[%lu][%lu] and prev_lsn is [%lu][%lu].",
+ "%lu %lu %lx %lu %lu %lu %lu"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)pvti->txnid,
+ (u_long)pvti->cur_lsn.file, (u_long)pvti->cur_lsn.offset,
+ (u_long)prev_lsnp->file, (u_long)prev_lsnp->offset);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ /*
+ * After the txn is prepared, the only valid log record for this txn
+ * is the commit record.
+ */
+ if (pvti->status == TXN_STAT_PREPARE && type != DB___txn_regop) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2563",
+ "[%lu][%lu] Update action is performed in a "
+ "prepared transaction %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset, (u_long)txnid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ pvti->cur_lsn = *lsnp;
+ pvti->flags = txnp->flags;
+ if (dbregid != INVAL_DBREGID && fid.size > 0 &&
+ (ret = __add_file_updated(pvti, &fid, dbregid)) != 0)
+ goto err;
+ if ((ret = __put_txn_vrfy_info(lvh, pvti)) != 0)
+ goto err;
+out:
+err:
+ if (pvti != NULL && (ret2 = __free_txninfo(pvti)) != 0 && ret == 0)
+ ret = ret2;
+ if (fregp != NULL &&
+ (ret2 = __free_filereg_info(fregp)) != 0 && ret == 0)
+ ret = ret2;
+ return (ret);
+}
+
+/*
+ * Called whenever a new transaction is started, including child transactions.
+ */
+static int
+__lv_on_new_txn(lvh, lsnp, txnp, type, dbregid, fid)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+ const DB_TXN *txnp;
+ u_int32_t type;
+ int32_t dbregid;
+ const DBT *fid;
+{
+ VRFY_TXN_INFO vti, *pvti, *vtip;
+ int ret, tret;
+ u_int32_t txnid;
+ ENV *env;
+
+ ret = tret = 0;
+ txnid = txnp->txnid;
+ pvti = NULL;
+ memset(&vti, 0, sizeof(vti));
+ vti.txnid = txnid;
+ env = lvh->dbenv->env;
+ /* Log record type, may be used later. Pass lint checks. */
+ COMPQUIET(type, 0);
+
+ /*
+ * It's possible that the new txn is a child txn, we will decrement
+ * this value in __txn_child_verify when we realize this, because
+ * this value only records the number of outermost active txns.
+ */
+ lvh->ntxn_active++;
+
+ if ((ret = __get_txn_vrfy_info(lvh, txnid, &pvti)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND)
+ vtip = &vti;
+ else {/* The txnid is reused, which may be illegal. */
+ vtip = pvti;
+ /*
+ * If this txn id was recycled, this use is legal. A
+ * recycled txnid immediately stops being recyclable once
+ * it's reused here. And it's impossible for vtip->status
+ * to be TXN_STAT_ACTIVE, since we set it to TXN_STAT_ABORT
+ * when we detected this txn id recycle just now.
+ */
+ if (vtip->num_recycle > 0 && LOG_COMPARE(&(vtip->recycle_lsns
+ [vtip->num_recycle - 1]), lsnp) < 0) {
+ DB_ASSERT(env, vtip->status != TXN_STAT_ACTIVE);
+ if ((ret = __rem_last_recycle_lsn(vtip)) != 0)
+ goto err;
+ if ((ret = __clear_fileups(vtip)) != 0)
+ goto err;
+
+ vtip->status = 0;
+ ZERO_LSN(vtip->prep_lsn);
+ ZERO_LSN(vtip->last_lsn);
+
+ vtip->nchild_active = 0;
+ vtip->nchild_commit = 0;
+ vtip->nchild_abort = 0;
+ /*
+ * We may reach the else branch if this txn has child txns
+ * before any updates are done on its behalf, so we must
+ * exclude this possibility before concluding that the
+ * verification failed.
+ */
+ } else if (vtip->nchild_active + vtip->nchild_commit +
+ vtip->nchild_abort == 0) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2564",
+ "[%lu][%lu] Transaction id %lx reused without "
+ "being recycled with a __txn_recycle.",
+ "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)txnid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ }
+
+ vtip->first_lsn = *lsnp;
+ vtip->cur_lsn = *lsnp;
+ vtip->flags = txnp->flags;
+
+ /*
+ * It's possible that the first log rec does not update any file,
+ * like the __txn_child type of record.
+ */
+ if (fid->size > 0 && (ret =
+ __add_file_updated(vtip, fid, dbregid)) != 0)
+ goto err;
+ if ((ret = __put_txn_vrfy_info(lvh, vtip)) != 0)
+ goto err;
+
+err:
+ if (pvti != NULL && (tret = __free_txninfo(pvti)) != 0 && ret == 0)
+ ret = tret;
+ if ((tret = __free_txninfo_stack(&vti)) != 0 && ret == 0)
+ ret = tret;
+
+ return (ret);
+}
+
+/* Called when we detect that a new log file is used. */
+static int
+__lv_new_logfile_vrfy(lvh, lsnp)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+{
+ int ret;
+
+ ret = 0;
+ if (IS_ZERO_LSN(lvh->last_lsn) || lvh->last_lsn.file == lsnp->file) {
+ lvh->last_lsn = *lsnp;
+ return (0);
+ }
+
+ /*
+ * If the file number changed, it must have been incremented by
+ * one, and the offset must be the first offset of a log file.
+ */
+ if (lsnp->file - lvh->last_lsn.file != 1 || lsnp->offset !=
+ __lv_first_offset(lvh->dbenv->env)) {
+ __db_errx(lvh->dbenv->env,
+ "[%lu][%lu] Last log record verified ([%lu][%lu]) is not "
+ "immediately before the current log record.",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)lvh->last_lsn.file, (u_long)lvh->last_lsn.offset);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ lvh->last_lsn = *lsnp;
+err:
+ return (ret);
+}
+
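+/*
+ * Return the offset of the first log record in a log file: the log
+ * record header size (crypto or normal) plus the size of the LOGP
+ * persist metadata that begins every log file.
+ */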
+static u_int32_t
+__lv_first_offset(env)
+ ENV *env;
+{
+ u_int32_t sz;
+
+ if (CRYPTO_ON(env))
+ sz = HDR_CRYPTO_SZ;
+ else
+ sz = HDR_NORMAL_SZ;
+
+ sz += sizeof(LOGP);
+
+ return (sz);
+}
+
+/* Called when we see a non-transactional update log record. */
+static int
+__lv_on_nontxn_update(lvh, lsnp, txnid, logtype, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+ u_int32_t txnid, logtype;
+ int32_t fileid;
+{
+ lvh->lrtypes[logtype]++;
+ COMPQUIET(txnid, 0);
+ if (fileid != INVAL_DBREGID) {
+ lvh->non_txnup_cnt++;
+ __db_msg(lvh->dbenv->env, DB_STR_A("2565",
+ "[%lu][%lu] Non-transactional update, "
+ "log type: %u, fileid: %d.", "%lu %lu %u %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset, logtype, fileid);
+ }
+
+ return (0);
+}
+
+static int
+__lv_on_txn_aborted(lvinfo)
+ DB_LOG_VRFY_INFO *lvinfo;
+{
+ int ret, ret2, sres;
+ VRFY_TXN_INFO *ptvi;
+ u_int32_t abtid;
+ DB_LSN lsn, slsn;
+
+ ret = ret2 = sres = 0;
+ abtid = lvinfo->aborted_txnid;
+ lsn = lvinfo->aborted_txnlsn;
+ slsn = lvinfo->lv_config->start_lsn;
+ ptvi = NULL;
+
+	if ((ret = __del_txn_pages(lvinfo, lvinfo->aborted_txnid)) != 0 &&
+	    ret != DB_NOTFOUND)
+		goto err;/* Tolerate DB_NOTFOUND: some txns update no pages. */
+ ret = __get_txn_vrfy_info(lvinfo, lvinfo->aborted_txnid, &ptvi);
+ if (ret == DB_NOTFOUND && !F_ISSET(lvinfo, DB_LOG_VERIFY_PARTIAL)) {
+ /*
+ * If verifying from slsn and the txn abtid started before
+ * slsn, it's expected that we can't find the txn.
+ */
+ if (!IS_ZERO_LSN(slsn) && (ret2 = __txn_started(lvinfo, slsn,
+ abtid, &sres)) == 0 && sres != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;/* Use the same error msg below. */
+ __db_errx(lvinfo->dbenv->env, DB_STR_A("2566",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsn.file, (u_long)lsn.offset,
+ (u_long)lvinfo->aborted_txnid);
+ ON_ERROR(lvinfo, DB_LOG_VERIFY_INTERR);
+ }
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvinfo, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+ }
+ ptvi->status = TXN_STAT_ABORT;
+ lvinfo->ntxn_abort++;
+ lvinfo->ntxn_active--;
+ /* Report txn stats. */
+ if (F_ISSET(lvinfo, DB_LOG_VERIFY_VERBOSE)) {
+ __db_msg(lvinfo->dbenv->env, DB_STR_A("2567",
+ "[%lu][%lu] Txn %lx aborted after this log record.",
+ "%lu %lu %lx"), (u_long)lvinfo->aborted_txnlsn.file,
+ (u_long)lvinfo->aborted_txnlsn.offset, (u_long)ptvi->txnid);
+ __db_msg(lvinfo->dbenv->env, DB_STR_A("2568",
+ "\tThe number of active, committed and aborted child txns "
+ "of txn %lx: %u, %u, %u.", "%lx %u %u %u"),
+ (u_long)ptvi->txnid, ptvi->nchild_active,
+ ptvi->nchild_commit, ptvi->nchild_abort);
+ }
+ lvinfo->aborted_txnid = 0;
+ lvinfo->aborted_txnlsn.file = lvinfo->aborted_txnlsn.offset = 0;
+ if ((ret = __put_txn_vrfy_info(lvinfo, ptvi)) != 0)
+ goto err;
+ if ((ret = __free_txninfo(ptvi)) != 0)
+ goto err;
+out:
+err:
+ return (ret);
+}
diff --git a/src/log/log_verify_stub.c b/src/log/log_verify_stub.c
new file mode 100644
index 00000000..e6589a50
--- /dev/null
+++ b/src/log/log_verify_stub.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_VERIFY
+
+#include "db_config.h"
+#include "db_int.h"
+
+static int __db_log_novrfy __P((ENV *));
+int __log_verify_pp __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+int __log_verify __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+int __log_verify_wrap __P((ENV *env, const char *, u_int32_t, const char *,
+ const char *, time_t, time_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ int, int));
+
+/*
+ * __db_log_novrfy --
+ *	Error when a Berkeley DB build doesn't include log verification.
+ */
+static int
+__db_log_novrfy(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("2523",
+ "library build did not include support for log verification"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__log_verify_pp(dbenv, lvconfig)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+{
+ COMPQUIET(lvconfig, NULL);
+
+	/* The dbenv handle remains intact; callers take care of it. */
+ return (__db_log_novrfy(dbenv->env));
+}
+
+int
+__log_verify(dbenv, lvconfig)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+{
+ COMPQUIET(lvconfig, NULL);
+
+ return (__db_log_novrfy(dbenv->env));
+}
+
+int
+__log_verify_wrap(env, envhome, cachesize, dbfile, dbname,
+ stime, etime, stfile, stoffset, efile, eoffset, caf, verbose)
+ ENV *env;
+ const char *envhome, *dbfile, *dbname;
+ time_t stime, etime;
+ u_int32_t cachesize, stfile, stoffset, efile, eoffset;
+ int caf, verbose;
+{
+ COMPQUIET(envhome, NULL);
+ COMPQUIET(dbfile, NULL);
+ COMPQUIET(dbname, NULL);
+ COMPQUIET(stime, 0);
+ COMPQUIET(etime, 0);
+ COMPQUIET(cachesize, 0);
+ COMPQUIET(stfile, 0);
+ COMPQUIET(stoffset, 0);
+ COMPQUIET(efile, 0);
+ COMPQUIET(eoffset, 0);
+ COMPQUIET(caf, 0);
+ COMPQUIET(verbose, 0);
+ return (__db_log_novrfy(env));
+}
+
+#endif /* !HAVE_VERIFY */
diff --git a/src/log/log_verify_util.c b/src/log/log_verify_util.c
new file mode 100644
index 00000000..88682921
--- /dev/null
+++ b/src/log/log_verify_util.c
@@ -0,0 +1,2234 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+/*
+ * This file contains helper functions for the data structures and
+ * in-memory databases used to store log verification information.
+ */
+#include "db_config.h"
+#include "db_int.h"
+
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+
+#include "dbinc/log_verify.h"
+
+#define BDBOP(op) do { \
+ ret = (op); \
+ if (ret != 0) { \
+ __lv_on_bdbop_err(ret); \
+ goto err; \
+ } \
+} while (0)
+
+#define BDBOP2(dbenv, op, funct) do { \
+ ret = (op); \
+ if (ret != 0) { \
+ __lv_on_bdbop_err(ret); \
+ __db_err(dbenv->env, ret, "\n%s", funct); \
+ return (ret); \
+ } \
+} while (0)
+
+#define BDBOP3(dbenv, op, excpt, funct) do { \
+ ret = (op); \
+ if (ret != 0) { \
+ __lv_on_bdbop_err(ret); \
+ if (ret != excpt) { \
+ __db_err(dbenv->env, ret, "\n%s", funct); \
+ return (ret); \
+ } \
+ } \
+} while (0)
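+
+/*
+ * BDBOP wraps an internal Berkeley DB call whose failure should jump to
+ * the enclosing function's err label, e.g.
+ *	BDBOP(__db_put(dbp, ip, NULL, &key, &data, 0));
+ * BDBOP2 also reports the failing function and returns the error, and
+ * BDBOP3 does the same but tolerates one expected error code (excpt),
+ * typically DB_NOTFOUND.
+ */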
+
+typedef int (*btcmp_funct)(DB *, const DBT *, const DBT *);
+typedef int (*dupcmp_funct)(DB *, const DBT *, const DBT *);
+
+static int __lv_add_recycle_handler __P((
+ DB_LOG_VRFY_INFO *, VRFY_TXN_INFO *, void *));
+static int __lv_add_recycle_lsn __P((VRFY_TXN_INFO *, const DB_LSN *));
+static size_t __lv_dbt_arrsz __P((const DBT *, u_int32_t));
+static int __lv_fidpgno_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_i32_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_lsn_cmp __P((DB *, const DBT *, const DBT *));
+static void __lv_on_bdbop_err __P((int));
+static int __lv_open_db __P((DB_ENV *, DB **, DB_THREAD_INFO *,
+ const char *, int, btcmp_funct, u_int32_t, dupcmp_funct));
+static int __lv_pack_filereg __P((const VRFY_FILEREG_INFO *, DBT *));
+static int __lv_pack_txn_vrfy_info __P((
+ const VRFY_TXN_INFO *, DBT *, DBT *data));
+static int __lv_seccbk_fname __P((DB *, const DBT *, const DBT *, DBT *));
+static int __lv_seccbk_lsn __P((DB *, const DBT *, const DBT *, DBT *));
+static int __lv_seccbk_txnpg __P((DB *, const DBT *, const DBT *, DBT *));
+static void __lv_setup_logtype_names __P((DB_LOG_VRFY_INFO *lvinfo));
+static int __lv_txnrgns_lsn_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_ui32_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_unpack_txn_vrfy_info __P((VRFY_TXN_INFO **, const DBT *));
+static int __lv_unpack_filereg __P((const DBT *, VRFY_FILEREG_INFO **));
+
+static void
+__lv_on_bdbop_err(ret)
+	int ret;
+{
+	/* Keep lint quiet; ret and this function exist as a debugging hook. */
+ COMPQUIET(ret, 0);
+}
+
+/*
+ * __create_log_vrfy_info --
+ * Initialize and return a log verification handle to be used throughout
+ * a verification process.
+ *
+ * PUBLIC: int __create_log_vrfy_info __P((const DB_LOG_VERIFY_CONFIG *,
+ * PUBLIC: DB_LOG_VRFY_INFO **, DB_THREAD_INFO *));
+ */
+int
+__create_log_vrfy_info(cfg, lvinfopp, ip)
+ const DB_LOG_VERIFY_CONFIG *cfg;
+ DB_LOG_VRFY_INFO **lvinfopp;
+ DB_THREAD_INFO *ip;
+{
+ const char *envhome;
+ int inmem, ret;
+ u_int32_t cachesz, envflags;
+ const char *dbf1, *dbf2, *dbf3, *dbf4, *dbf5, *dbf6, *dbf7, *dbf8,
+ *dbf9, *dbf10, *dbf11;
+ DB_LOG_VRFY_INFO *lvinfop;
+
+ dbf1 = "__db_log_vrfy_txninfo.db";
+ dbf2 = "__db_log_vrfy_fileregs.db";
+ dbf3 = "__db_log_vrfy_pgtxn.db";
+ dbf4 = "__db_log_vrfy_lsntime.db";
+ dbf5 = "__db_log_vrfy_timelsn.db";
+ dbf6 = "__db_log_vrfy_ckps.db";
+ dbf7 = "__db_log_vrfy_dbregids.db";
+ dbf8 = "__db_log_vrfy_fnameuid.db";
+ dbf9 = "__db_log_vrfy_timerange.db";
+ dbf10 = "__db_log_vrfy_txnaborts.db";
+ dbf11 = "__db_log_vrfy_txnpg.db";
+
+ envhome = cfg->temp_envhome;
+ lvinfop = NULL;
+ cachesz = cfg->cachesize;
+	if (cachesz == 0)
+ cachesz = 1024 * 1024 * 256;
+
+ BDBOP(__os_malloc(NULL, sizeof(DB_LOG_VRFY_INFO), &lvinfop));
+ memset(lvinfop, 0, sizeof(DB_LOG_VRFY_INFO));
+ lvinfop->ip = ip;
+ __lv_setup_logtype_names(lvinfop);
+ /* Avoid the VERIFY_PARTIAL bit being cleared if no ckp_lsn exists. */
+ lvinfop->valid_lsn.file = lvinfop->valid_lsn.offset = (u_int32_t)-1;
+
+ /*
+ * The envhome parameter determines if we will use an in-memory
+ * environment and databases.
+ */
+ if (envhome == NULL) {
+ envflags = DB_PRIVATE;
+ inmem = 1;
+ } else {
+ envflags = 0;
+ inmem = 0;
+ }
+
+ /* Create log verify internal database environment. */
+ BDBOP(db_env_create(&lvinfop->dbenv, 0));
+ BDBOP(__memp_set_cachesize(lvinfop->dbenv, 0, cachesz, 1));
+ /*
+ * Log verification internal db environment should be accessed
+ * single-threaded. No transaction semantics needed.
+ */
+ BDBOP(__env_open(lvinfop->dbenv, envhome,
+ envflags | DB_CREATE | DB_INIT_MPOOL, 0666));
+
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txninfo, ip, dbf1,
+ inmem, __lv_ui32_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->fileregs, ip, dbf2,
+ inmem, NULL, 0, NULL));
+
+	/* No dups allowed; data with the same key is always overwritten. */
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->dbregids, ip, dbf7,
+ inmem, __lv_i32_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->pgtxn, ip, dbf3,
+ inmem, __lv_fidpgno_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txnpg, ip, dbf11,
+ inmem, __lv_ui32_cmp, DB_DUP | DB_DUPSORT, __lv_fidpgno_cmp));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->lsntime, ip, dbf4,
+ inmem, __lv_lsn_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->timelsn, ip, dbf5,
+ inmem, __lv_i32_cmp, DB_DUP | DB_DUPSORT, __lv_lsn_cmp));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txnaborts, ip, dbf10,
+ inmem, __lv_lsn_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->ckps, ip, dbf6,
+ inmem, __lv_lsn_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->fnameuid, ip, dbf8,
+ inmem, NULL, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txnrngs, ip, dbf9,
+ inmem, __lv_ui32_cmp, DB_DUP | DB_DUPSORT, __lv_txnrgns_lsn_cmp));
+
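+	/*
+	 * timelsn, fnameuid and txnpg are secondary indices of lsntime,
+	 * fileregs and pgtxn respectively; DB maintains them through the
+	 * callbacks passed to __db_associate below.
+	 */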
+ BDBOP(__db_associate(lvinfop->lsntime, ip, NULL,
+ lvinfop->timelsn, __lv_seccbk_lsn, DB_CREATE));
+ BDBOP(__db_associate(lvinfop->fileregs, ip, NULL,
+ lvinfop->fnameuid, __lv_seccbk_fname, DB_CREATE));
+ BDBOP(__db_associate(lvinfop->pgtxn, ip, NULL,
+ lvinfop->txnpg, __lv_seccbk_txnpg, DB_CREATE));
+
+ *lvinfopp = lvinfop;
+
+ return (0);
+err:
+ if (lvinfop->dbenv && ret != 0)
+ __db_err(lvinfop->dbenv->env, ret, "__create_log_vrfy_info");
+ (void)__destroy_log_vrfy_info(lvinfop);
+
+ return (ret);
+}
+
+/*
+ * __destroy_log_vrfy_info --
+ * Destroy and free a log verification handle.
+ *
+ * PUBLIC: int __destroy_log_vrfy_info __P((DB_LOG_VRFY_INFO *));
+ */
+int
+__destroy_log_vrfy_info(lvinfop)
+ DB_LOG_VRFY_INFO *lvinfop;
+{
+ int ret;
+
+ ret = 0;
+ if (lvinfop == NULL)
+ return (0);
+
+ if (lvinfop->txnaborts != NULL &&
+ (ret = __db_close(lvinfop->txnaborts, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->txninfo != NULL &&
+ (ret = __db_close(lvinfop->txninfo, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->dbregids != NULL &&
+ (ret = __db_close(lvinfop->dbregids, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->fileregs != NULL &&
+ (ret = __db_close(lvinfop->fileregs, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->pgtxn != NULL &&
+ (ret = __db_close(lvinfop->pgtxn, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->lsntime != NULL &&
+ (ret = __db_close(lvinfop->lsntime, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->ckps != NULL &&
+ (ret = __db_close(lvinfop->ckps, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->txnrngs != NULL &&
+ (ret = __db_close(lvinfop->txnrngs, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->fnameuid != NULL &&
+ (ret = __db_close(lvinfop->fnameuid, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->timelsn != NULL &&
+ (ret = __db_close(lvinfop->timelsn, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->txnpg != NULL &&
+ (ret = __db_close(lvinfop->txnpg, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->dbenv != NULL &&
+ (ret = __env_close(lvinfop->dbenv, 0)) != 0)
+ goto err;
+err:
+ __os_free(NULL, lvinfop);
+
+ return (ret);
+}
+
+/* Secondary index callback function for DB_LOG_VRFY_INFO->fnameuid. */
+static int
+__lv_seccbk_fname(secdb, key, data, result)
+ DB *secdb;
+ const DBT *key;
+ const DBT *data;
+ DBT *result;
+{
+ int ret, tret;
+ VRFY_FILEREG_INFO *freg;
+ char *buf;
+ size_t buflen, slen;
+
+ ret = tret = 0;
+ COMPQUIET(key, NULL);
+ if ((ret = __lv_unpack_filereg(data, &freg)) != 0)
+ goto out;
+ if (freg->fname == NULL || (slen = strlen(freg->fname)) == 0) {
+ ret = DB_DONOTINDEX;
+ goto out;
+ }
+
+ buflen = (slen + 1) * sizeof(char);
+ if ((ret = __os_umalloc(secdb->dbenv->env, buflen, &buf)) != 0)
+ goto out;
+ (void)strcpy(buf, freg->fname);
+ result->size = (u_int32_t)buflen;
+ result->flags |= DB_DBT_APPMALLOC;
+ result->data = buf;
+out:
+ if (freg != NULL && (tret = __free_filereg_info(freg)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/* Secondary index callback function for DB_LOG_VRFY_INFO->txnpg. */
+static int
+__lv_seccbk_txnpg(secdb, key, data, result)
+ DB *secdb;
+ const DBT *key;
+ const DBT *data;
+ DBT *result;
+{
+ COMPQUIET(key, NULL);
+ COMPQUIET(secdb, NULL);
+ /* Txnid is the secondary key, and it's all the data dbt has. */
+ result->data = data->data;
+ result->size = data->size;
+
+ return (0);
+}
+
+/* Secondary index callback function for DB_LOG_VRFY_INFO->timelsn. */
+static int
+__lv_seccbk_lsn(secdb, key, data, result)
+ DB *secdb;
+ const DBT *key;
+ const DBT *data;
+ DBT *result;
+{
+ VRFY_TIMESTAMP_INFO *lvti;
+
+ COMPQUIET(key, NULL);
+ COMPQUIET(secdb, NULL);
+
+ lvti = (VRFY_TIMESTAMP_INFO *)data->data;
+ result->data = &(lvti->timestamp);
+ result->size = sizeof(lvti->timestamp);
+
+ return (0);
+}
+
+/*
+ * Open a BTREE database handle, optionally set the btree compare function
+ * and flags if any.
+ */
+static int
+__lv_open_db(dbenv, dbpp, ip, name, inmem, cmpf, sflags, dupcmpf)
+	DB_ENV *dbenv;
+	DB **dbpp;
+	DB_THREAD_INFO *ip;
+	const char *name;
+	int inmem;
+	btcmp_funct cmpf;
+	u_int32_t sflags;
+	dupcmp_funct dupcmpf;
+{
+ int ret;
+ const char *dbfname, *dbname;
+ DB *dbp;
+
+ dbp = NULL;
+ ret = 0;
+ if (inmem) {
+ dbfname = NULL;
+ dbname = name;
+ } else {
+ dbfname = name;
+ dbname = NULL;
+ }
+
+ BDBOP(db_create(&dbp, dbenv, 0));
+
+ if (cmpf != NULL)
+ BDBOP(__bam_set_bt_compare(dbp, cmpf));
+ if (dupcmpf != NULL)
+ dbp->dup_compare = dupcmpf;
+ if (sflags != 0)
+ BDBOP(__db_set_flags(dbp, sflags));
+	/* No concurrency is needed; a big page size reduces overflow pages. */
+ BDBOP(__db_set_pagesize(dbp, 16 * 1024));
+
+ BDBOP(__db_open(dbp, ip, NULL, dbfname, dbname, DB_BTREE, DB_CREATE,
+ 0666, PGNO_BASE_MD));
+
+ *dbpp = dbp;
+
+ return (0);
+err:
+ if (dbenv != NULL && ret != 0)
+ __db_err(dbenv->env, ret, "__lv_open_db");
+ if (dbp != NULL)
+ (void)__db_close(dbp, NULL, 0);
+
+ return (ret);
+}
+
+/* Btree compare function for a [fileid, pgno] key. */
+static int
+__lv_fidpgno_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ db_pgno_t pgno1, pgno2;
+ int ret;
+ size_t len;
+
+ COMPQUIET(db, NULL);
+ len = DB_FILE_ID_LEN;
+ ret = memcmp(dbt1->data, dbt2->data, len);
+ if (ret == 0) {
+ memcpy(&pgno1, (u_int8_t *)dbt1->data + len,
+ sizeof(pgno1));
+ memcpy(&pgno2, (u_int8_t *)dbt2->data + len,
+ sizeof(pgno2));
+ ret = NUMCMP(pgno1, pgno2);
+ }
+
+ return (ret);
+}
+
+/* Btree compare function for an int32_t type of key. */
+static int
+__lv_i32_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ int32_t k1, k2;
+
+ COMPQUIET(db, NULL);
+ memcpy(&k1, dbt1->data, sizeof(k1));
+ memcpy(&k2, dbt2->data, sizeof(k2));
+
+ return (NUMCMP(k1, k2));
+}
+
+/* Btree compare function for a u_int32_t type of key. */
+static int
+__lv_ui32_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ u_int32_t k1, k2;
+
+ COMPQUIET(db, NULL);
+ memcpy(&k1, dbt1->data, sizeof(k1));
+ memcpy(&k2, dbt2->data, sizeof(k2));
+
+ return (NUMCMP(k1, k2));
+}
+
+/* Btree compare function for a DB_LSN type of key. */
+static int
+__lv_lsn_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ DB_LSN lsn1, lsn2;
+
+ DB_ASSERT(db->env, dbt1->size == sizeof(DB_LSN));
+ DB_ASSERT(db->env, dbt2->size == sizeof(DB_LSN));
+ memcpy(&lsn1, dbt1->data, sizeof(DB_LSN));
+ memcpy(&lsn2, dbt2->data, sizeof(DB_LSN));
+
+ return (LOG_COMPARE(&lsn1, &lsn2));
+}
+
+/*
+ * Structure management routines. We keep each structure on a
+ * consecutive memory chunk.
+ *
+ * The get functions will allocate memory via __os_malloc, and callers
+ * should free the memory after use. The update functions for VRFY_TXN_INFO
+ * and VRFY_FILEREG_INFO may realloc the structure.
+ */
+
+/*
+ * PUBLIC: int __put_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_TXN_INFO *));
+ */
+int
+__put_txn_vrfy_info(lvinfo, txninfop)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const VRFY_TXN_INFO *txninfop;
+{
+ int ret;
+ DBT key, data;
+
+ ret = __lv_pack_txn_vrfy_info(txninfop, &key, &data);
+ DB_ASSERT(lvinfo->dbenv->env, ret == 0);
+
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->txninfo, lvinfo->ip, NULL,
+ &key, &data, 0), "__put_txn_vrfy_info");
+ __os_free(lvinfo->dbenv->env, data.data);
+
+ return (0);
+}
+
+/* Construct a key and data DBT from the structure. */
+static int
+__lv_pack_txn_vrfy_info(txninfop, key, data)
+ const VRFY_TXN_INFO *txninfop;
+ DBT *key, *data;
+{
+ int ret;
+ char *buf, *p;
+ size_t bufsz, len;
+ u_int32_t i;
+ DBT *pdbt;
+
+ memset(key, 0, sizeof(DBT));
+ memset(data, 0, sizeof(DBT));
+ ret = 0;
+ bufsz = TXN_VERIFY_INFO_TOTSIZE(*txninfop);
+
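+	/*
+	 * Serialized layout: the fixed-size prefix of VRFY_TXN_INFO, then
+	 * num_recycle DB_LSNs, then filenum (size, bytes) pairs, one pair
+	 * per updated file's uid DBT.
+	 */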
+ if ((ret = __os_malloc(NULL, bufsz, &buf)) != 0)
+ goto err;
+ memset(buf, 0, bufsz);
+ memcpy(buf, txninfop, TXN_VERIFY_INFO_FIXSIZE);
+ p = buf + TXN_VERIFY_INFO_FIXSIZE;
+ memcpy(p, txninfop->recycle_lsns, len = sizeof(DB_LSN) *
+ txninfop->num_recycle);
+ p += len;
+
+	for (i = 0; i < txninfop->filenum; i++) {
+ pdbt = &(txninfop->fileups[i]);
+ memcpy(p, &(pdbt->size), sizeof(pdbt->size));
+ p += sizeof(pdbt->size);
+ memcpy(p, pdbt->data, pdbt->size);
+ p += pdbt->size;
+ }
+
+ key->data = (void *)&txninfop->txnid;
+ key->size = sizeof(txninfop->txnid);
+ data->data = buf;
+ data->size = (u_int32_t)bufsz;
+ data->flags |= DB_DBT_MALLOC;
+err:
+ return (ret);
+}
+
+/* Calculate a DBT array's total number of bytes to store. */
+static size_t
+__lv_dbt_arrsz(arr, arrlen)
+ const DBT *arr;
+ u_int32_t arrlen;
+{
+ u_int32_t i;
+ size_t sz;
+
+ sz = 0;
+
+ /* For each DBT object, store its size and its data bytes. */
+ for (i = 0; i < arrlen; i++)
+ sz += arr[i].size + sizeof(arr[i].size);
+
+	return (sz);
+}
+
+/*
+ * __get_txn_vrfy_info --
+ * Get a VRFY_TXN_INFO object from db by txnid. Callers should free the
+ * object by calling __free_txninfo.
+ *
+ * PUBLIC: int __get_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *, u_int32_t,
+ * PUBLIC: VRFY_TXN_INFO **));
+ */
+int
+__get_txn_vrfy_info(lvinfo, txnid, txninfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid;
+ VRFY_TXN_INFO **txninfopp;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->txninfo, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_txn_vrfy_info");
+
+ if (ret != DB_NOTFOUND)
+ ret = __lv_unpack_txn_vrfy_info(txninfopp, &data);
+
+ return (ret);
+}
+
+/* Construct a structure from a DBT. */
+static int
+__lv_unpack_txn_vrfy_info(txninfopp, data)
+ VRFY_TXN_INFO **txninfopp;
+ const DBT *data;
+{
+ size_t bufsz;
+ VRFY_TXN_INFO *buf, *txninfop;
+ DB_LSN *lsns, *p;
+ u_int32_t i, sz;
+ char *pb, *q;
+ int ret;
+
+ ret = 0;
+ i = sz = 0;
+ lsns = p = NULL;
+ pb = q = NULL;
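+	/*
+	 * The buffer mirrors the layout written by __lv_pack_txn_vrfy_info:
+	 * fixed-size prefix, recycle-lsn array, then (size, bytes) pairs.
+	 */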
+ txninfop = (VRFY_TXN_INFO *)data->data;
+ lsns = (DB_LSN *)((char *)data->data + TXN_VERIFY_INFO_FIXSIZE);
+ pb = (char *)lsns + txninfop->num_recycle * sizeof(DB_LSN);
+
+ if ((ret = __os_malloc(NULL, bufsz = sizeof(VRFY_TXN_INFO), &buf)) != 0)
+ goto err;
+ memset(buf, 0, bufsz);
+ memcpy(buf, data->data, TXN_VERIFY_INFO_FIXSIZE);
+
+ if (txninfop->num_recycle != 0) {
+ if ((ret = __os_malloc(NULL,
+ txninfop->num_recycle * sizeof(DB_LSN), &p)) != 0)
+ goto err;
+ memcpy(p, lsns, txninfop->num_recycle * sizeof(DB_LSN));
+ buf->recycle_lsns = p;
+ }
+
+ if (txninfop->filenum != 0) {
+ if ((ret = __os_malloc(NULL,
+ txninfop->filenum * sizeof(DBT), &q)) != 0)
+ goto err;
+ memset(q, 0, txninfop->filenum * sizeof(DBT));
+ buf->fileups = (DBT *)q;
+ for (i = 0; i < txninfop->filenum; i++) {
+ memcpy(&sz, pb, sizeof(sz));
+ pb += sizeof(sz);
+ if ((ret = __os_malloc(NULL, sz, &q)) != 0)
+ goto err;
+ memcpy(q, pb, sz);
+ pb += sz;
+
+ buf->fileups[i].data = q;
+ buf->fileups[i].size = sz;
+ }
+ }
+
+ *txninfopp = buf;
+err:
+ return (ret);
+}
+
+static int
+__lv_add_recycle_lsn(txninfop, lsn)
+ VRFY_TXN_INFO *txninfop;
+ const DB_LSN *lsn;
+{
+ int ret;
+
+ ret = 0;
+ txninfop->num_recycle++;
+ if ((ret = __os_realloc(NULL, txninfop->num_recycle * sizeof(DB_LSN),
+ &(txninfop->recycle_lsns))) != 0)
+ goto err;
+ txninfop->recycle_lsns[txninfop->num_recycle - 1] = *lsn;
+err:
+ return (ret);
+}
+
+/*
+ * __add_recycle_lsn_range --
+ * Add recycle info for each txn within the recycled txnid range.
+ *
+ * PUBLIC: int __add_recycle_lsn_range __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: const DB_LSN *, u_int32_t, u_int32_t));
+ */
+int
+__add_recycle_lsn_range(lvinfo, lsn, min, max)
+ DB_LOG_VRFY_INFO *lvinfo;
+ const DB_LSN *lsn;
+ u_int32_t min, max;
+{
+ DBC *csr;
+ int ret, tret;
+ u_int32_t i;
+ DBT key2, data2;
+ struct __add_recycle_params param;
+
+ csr = NULL;
+ ret = tret = 0;
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ memset(&param, 0, sizeof(param));
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env, sizeof(VRFY_TXN_INFO *) *
+ (param.ti2ul = 1024), &(param.ti2u))) != 0)
+ goto err;
+ param.ti2ui = 0;
+ param.recycle_lsn = *lsn;
+ param.min = min;
+ param.max = max;
+
+ /* Iterate the specified range and process each transaction. */
+ if ((ret = __iterate_txninfo(lvinfo, min, max, __lv_add_recycle_handler,
+ &param)) != 0)
+ goto err;
+
+ /*
+ * Save updated txninfo structures. We can't do so in the above
+ * iteration, so we have to save them here.
+ */
+ BDBOP(__db_cursor(lvinfo->txninfo, lvinfo->ip, NULL, &csr, DBC_BULK));
+
+ for (i = 0; i < param.ti2ui; i++) {
+ ret = __lv_pack_txn_vrfy_info(param.ti2u[i], &key2, &data2);
+ DB_ASSERT(lvinfo->dbenv->env, ret == 0);
+ BDBOP(__dbc_put(csr, &key2, &data2, DB_KEYLAST));
+ /*
+ * key2.data refers to param.ti2u[i]'s memory, data2.data is
+ * freed by DB since we set DB_DBT_MALLOC.
+ */
+ if ((ret = __free_txninfo(param.ti2u[i])) != 0)
+ goto err;
+ }
+
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ __os_free(lvinfo->dbenv->env, param.ti2u);
+ if (ret != 0)
+ __db_err(lvinfo->dbenv->env, ret,
+ "__add_recycle_lsn_range");
+
+ return (ret);
+}
+
+/*
+ * __iterate_txninfo --
+ *	Iterate through the transaction info database as fast as possible,
+ *	processing each key/data pair with a callback handler. The
+ *	iteration stops if the handler returns a non-zero value.
+ *
+ * PUBLIC: int __iterate_txninfo __P((DB_LOG_VRFY_INFO *, u_int32_t,
+ * PUBLIC: u_int32_t, TXNINFO_HANDLER, void *));
+ */
+int
+__iterate_txninfo(lvinfo, min, max, handler, param)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t min, max;
+ TXNINFO_HANDLER handler;
+ void *param;
+{
+ ENV *env;
+ VRFY_TXN_INFO *txninfop;
+ int ret, tret;
+ u_int32_t bufsz, pgsz, txnid;
+ size_t retkl, retdl;
+ char *btbuf;
+ u_int8_t *retk, *retd;
+ DBT key, data, data2;
+ DBC *csr;
+ void *p;
+
+ csr = NULL;
+ env = lvinfo->dbenv->env;
+ txninfop = NULL;
+ ret = tret = 0;
+ txnid = 0;
+ retkl = retdl = 0;
+ bufsz = 64 * 1024;
+ btbuf = NULL;
+ retk = retd = NULL;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ pgsz = lvinfo->txninfo->pgsize;
+ DB_ASSERT(env, ret == 0);
+
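+	/*
+	 * Round the buffer size down to a multiple of the page size;
+	 * bulk retrieval buffers must hold at least one page.
+	 */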
+ if (bufsz % pgsz != 0)
+ bufsz = pgsz * (bufsz / pgsz);
+
+ if ((ret = __os_malloc(env, bufsz, &btbuf)) != 0)
+ goto err;
+
+ BDBOP(__db_cursor(lvinfo->txninfo, lvinfo->ip, NULL, &csr, DBC_BULK));
+
+ /*
+ * Use bulk retrieval to scan the database as fast as possible.
+ */
+ data.data = btbuf;
+ data.ulen = bufsz;
+ data.flags |= DB_DBT_USERMEM;
+
+	for (ret = __dbc_get(csr, &key, &data, DB_FIRST | DB_MULTIPLE_KEY);;
+ ret = __dbc_get(csr, &key, &data, DB_NEXT | DB_MULTIPLE_KEY)) {
+ switch (ret) {
+ case 0:
+ break;
+ case DB_NOTFOUND:
+ goto out;
+ /* No break statement allowed by lint here. */
+ case DB_BUFFER_SMALL:
+ if ((ret = __os_realloc(lvinfo->dbenv->env,
+ bufsz *= 2, &btbuf)) != 0)
+ goto out;
+ data.ulen = bufsz;
+ data.data = btbuf;
+ continue;/* Continue the for-loop. */
+ /* No break statement allowed by lint here. */
+ default:
+ goto err;
+ }
+
+		/*
+		 * Do the bulk get. The handler may update some txninfo
+		 * objects, but we can't write them back inside this loop:
+		 * doing so through this same cursor would break the bulk
+		 * get, and using another cursor could self-block. So the
+		 * handler notes the updated objects and we store them into
+		 * the db after we leave this loop.
+		 */
+ DB_MULTIPLE_INIT(p, &data);
+ while (1) {
+ DB_MULTIPLE_KEY_NEXT(p, &data,
+ retk, retkl, retd, retdl);
+ if (p == NULL)
+ break;
+ DB_ASSERT(env, retkl == sizeof(txnid) && retk != NULL);
+ memcpy(&txnid, retk, retkl);
+ /*
+ * Process it if txnid in range or no range specified.
+ * The range must be a closed one.
+ */
+ if ((min != 0 && txnid >= min && max != 0 &&
+ txnid <= max) || (min == 0 && max == 0)) {
+ data2.data = retd;
+ data2.size = (u_int32_t)retdl;
+
+ if ((ret = __lv_unpack_txn_vrfy_info(
+ &txninfop, &data2)) != 0)
+ goto out;
+ if ((ret = handler(lvinfo, txninfop,
+ param)) != 0)
+ /* Stop the iteration on error. */
+ goto out;
+ }
+ }
+
+ }
+out:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ __os_free(lvinfo->dbenv->env, btbuf);
+ return (ret);
+}
+
+/* Txninfo iteration handler to add recycle info for affected txns. */
+static int
+__lv_add_recycle_handler(lvinfo, txninfop, params)
+ DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_TXN_INFO *txninfop;
+ void *params;
+{
+ int ret;
+ struct __add_recycle_params *param;
+
+ ret = 0;
+ param = (struct __add_recycle_params *)params;
+
+ /*
+ * If the txnid is reused, update its recycle info and note it for
+ * later update, otherwise free the txninfop structure.
+ */
+	if (txninfop->txnid < param->min || txninfop->txnid > param->max) {
+ ret = __free_txninfo(txninfop);
+ return (ret);
+ }
+
+ ret = __lv_add_recycle_lsn(txninfop, &(param->recycle_lsn));
+
+ if (ret != 0)
+ goto err;
+ /*
+ * Below is one way to tell if a txn is aborted without doing another
+ * backward pass of the log. However if the txn id is not in the
+ * chosen recycled txn id range, we can't tell, until all the log
+ * records are passed --- the remaining active txns are the aborted
+ * txns.
+ * No longer needed since we did another backward pass of the log
+ * and have all the txn lifetimes.
+ if (txninfop->status == TXN_STAT_ACTIVE)
+ __on_txn_abort(lvinfo, txninfop);
+ */
+ if (txninfop->status == TXN_STAT_PREPARE) {
+ __db_errx(lvinfo->dbenv->env,
+ "[ERROR] Transaction with ID %u is prepared and not "
+ "committed, but its ID is recycled by log record [%u, %u].",
+ txninfop->txnid, param->recycle_lsn.file,
+ param->recycle_lsn.offset);
+ }
+ /* Note down to store later. */
+ param->ti2u[(param->ti2ui)++] = txninfop;
+ if (param->ti2ui == param->ti2ul)
+ BDBOP(__os_realloc(lvinfo->dbenv->env,
+ sizeof(VRFY_TXN_INFO *) * (param->ti2ul *= 2),
+ &(param->ti2u)));
+err:
+	return (ret);
+}
+
+/*
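+ * __rem_last_recycle_lsn --
+ *	Remove the most recently added recycle lsn from the txn's
+ *	recycle_lsns array, shrinking or freeing the array.
+ *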
+ * PUBLIC: int __rem_last_recycle_lsn __P((VRFY_TXN_INFO *));
+ */
+int
+__rem_last_recycle_lsn(txninfop)
+ VRFY_TXN_INFO *txninfop;
+{
+ int ret;
+
+ ret = 0;
+ if (txninfop->num_recycle == 0)
+ return (0);
+ txninfop->num_recycle--;
+ if (txninfop->num_recycle > 0)
+ BDBOP(__os_realloc(NULL, txninfop->num_recycle * sizeof(DB_LSN),
+ &(txninfop->recycle_lsns)));
+ else {
+ __os_free(NULL, txninfop->recycle_lsns);
+ txninfop->recycle_lsns = NULL;
+ }
+err:
+	return (ret);
+}
+
+/*
+ * __add_file_updated --
+ * Add a file's dbregid and uid to the updating txn if it's not yet
+ * recorded.
+ *
+ * PUBLIC: int __add_file_updated __P((VRFY_TXN_INFO *, const DBT *, int32_t));
+ */
+int
+__add_file_updated(txninfop, fileid, dbregid)
+ VRFY_TXN_INFO *txninfop;
+ const DBT *fileid;
+ int32_t dbregid;
+{
+ int ret;
+ DBT *pdbt, *p;
+ u_int32_t found, i;
+
+ ret = 0;
+ p = pdbt = NULL;
+
+ for (found = 0, i = 0; i < txninfop->filenum; i++) {
+ p = &(txninfop->fileups[i]);
+ if (p->size == fileid->size &&
+ memcmp(p->data, fileid->data, p->size) == 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found)
+ return (0);
+
+ /* Add file's uid into the array, deep copy from fileid. */
+ txninfop->filenum++;
+ if ((ret = __os_realloc(NULL, txninfop->filenum *
+ sizeof(DBT), &(txninfop->fileups))) != 0)
+ goto err;
+
+ pdbt = &(txninfop->fileups[txninfop->filenum - 1]);
+ memset(pdbt, 0, sizeof(DBT));
+ if ((ret = __os_malloc(NULL,
+ pdbt->size = fileid->size, &(pdbt->data))) != 0)
+ goto err;
+ memcpy(pdbt->data, fileid->data, fileid->size);
+
+ /* Add file dbregid into the array. */
+ BDBOP(__os_realloc(NULL, txninfop->filenum *
+ sizeof(int32_t), &(txninfop->dbregid)));
+ txninfop->dbregid[txninfop->filenum - 1] = dbregid;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __del_file_updated __P((VRFY_TXN_INFO *, const DBT *));
+ */
+int
+__del_file_updated(txninfop, fileid)
+ VRFY_TXN_INFO *txninfop;
+ const DBT *fileid;
+{
+ u_int32_t found, i;
+ int ret;
+ DBT *p;
+ void *pdbtdata;
+
+ ret = 0;
+
+ if (txninfop->filenum == 0)
+ return (0);
+
+ /*
+ * If the array has an element identical to fileid, remove it. fileid
+ * itself is intact after this function call.
+ */
+ for (found = 0, i = 0, pdbtdata = NULL; i < txninfop->filenum; i++) {
+ p = &(txninfop->fileups[i]);
+ if (p->size == fileid->size &&
+ memcmp(p->data, fileid->data, p->size) == 0) {
+ pdbtdata = p->data;
+ if (txninfop->filenum > 1) {
+ memmove(txninfop->fileups + i, txninfop->
+ fileups + i + 1, sizeof(DBT) * (txninfop->
+ filenum - (i + 1)));
+ memmove(txninfop->dbregid + i, txninfop->
+ dbregid + i + 1, sizeof(int32_t) *
+ (txninfop->filenum - (i + 1)));
+ } else {
+ __os_free(NULL, txninfop->fileups);
+ __os_free(NULL, txninfop->dbregid);
+ txninfop->fileups = NULL;
+ txninfop->dbregid = NULL;
+ }
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ txninfop->filenum--;
+ if (txninfop->filenum) {
+ BDBOP(__os_realloc(NULL, sizeof(DBT) *
+ txninfop->filenum, &(txninfop->fileups)));
+ BDBOP(__os_realloc(NULL, sizeof(int32_t) *
+ txninfop->filenum, &(txninfop->dbregid)));
+ }
+ __os_free(NULL, pdbtdata);
+ }
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __clear_fileups __P((VRFY_TXN_INFO *));
+ */
+int
+__clear_fileups(txninfop)
+ VRFY_TXN_INFO *txninfop;
+{
+ u_int32_t i;
+
+ for (i = 0; i < txninfop->filenum; i++)
+ __os_free(NULL, txninfop->fileups[i].data);
+
+ __os_free(NULL, txninfop->fileups);
+ __os_free(NULL, txninfop->dbregid);
+ txninfop->fileups = NULL;
+ txninfop->dbregid = NULL;
+ txninfop->filenum = 0;
+
+ return (0);
+}
+
+/*
+ * __free_txninfo_stack --
+ * The object is on stack, only free its internal memory, not itself.
+ * PUBLIC: int __free_txninfo_stack __P((VRFY_TXN_INFO *));
+ */
+int
+__free_txninfo_stack(p)
+ VRFY_TXN_INFO *p;
+{
+ u_int32_t i;
+
+ if (p == NULL)
+ return (0);
+
+ if (p->fileups != NULL) {
+ for (i = 0; i < p->filenum; i++)
+ __os_free(NULL, p->fileups[i].data);
+ __os_free(NULL, p->fileups);
+ }
+
+ if (p->dbregid != NULL)
+ __os_free(NULL, p->dbregid);
+
+ if (p->recycle_lsns != NULL)
+ __os_free(NULL, p->recycle_lsns);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __free_txninfo __P((VRFY_TXN_INFO *));
+ */
+int
+__free_txninfo(p)
+ VRFY_TXN_INFO *p;
+{
+ (void)__free_txninfo_stack(p);
+ __os_free(NULL, p);
+
+ return (0);
+}
+
+/* Construct a key and data DBT from the structure. */
+static int
+__lv_pack_filereg(freginfo, data)
+ const VRFY_FILEREG_INFO *freginfo;
+ DBT *data;
+{
+ char *buf, *p;
+ size_t bufsz, offset;
+ int ret;
+
+ ret = 0;
+ if ((ret = __os_malloc(NULL,
+ bufsz = FILE_REG_INFO_TOTSIZE(*freginfo), &buf)) != 0)
+ goto err;
+ memset(buf, 0, bufsz);
+
+ memcpy(buf, freginfo, FILE_REG_INFO_FIXSIZE);
+ p = buf + FILE_REG_INFO_FIXSIZE;
+
+ offset = sizeof(int32_t) * freginfo->regcnt;
+ memcpy(p, freginfo->dbregids, offset);
+ p += offset;
+
+ memcpy(p, &(freginfo->fileid.size), sizeof(freginfo->fileid.size));
+ p += sizeof(freginfo->fileid.size);
+ memcpy(p, freginfo->fileid.data, freginfo->fileid.size);
+ p += freginfo->fileid.size;
+ (void)strcpy(p, freginfo->fname);
+
+ data->data = buf;
+ data->size = (u_int32_t)bufsz;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_filereg_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_FILEREG_INFO *));
+ */
+int
+__put_filereg_info(lvinfo, freginfo)
+	const DB_LOG_VRFY_INFO *lvinfo;
+	const VRFY_FILEREG_INFO *freginfo;
+{
+ int ret;
+ DBT data;
+
+ memset(&data, 0, sizeof(DBT));
+
+ if ((ret = __lv_pack_filereg(freginfo, &data)) != 0)
+ goto err;
+
+	/*
+	 * We store the dbregid-filereg map in dbregids.db, but we can't
+	 * make dbregids.db a secondary db of fileregs.db: a dbregid is
+	 * only valid while a db file is open, so we want to delete data
+	 * with the same key from dbregids.db, yet keep all filereg_info
+	 * data in fileregs.db to track each db file's lifetime and status.
+	 *
+	 * Consequently we store dbregid-file_uid pairs in dbregids.db, so
+	 * that we can delete a dbregid when the db handle is closed, and
+	 * use the dbregid to look up the currently open db file's uid.
+	 */
+
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->fileregs, lvinfo->ip, NULL,
+ (DBT *)&(freginfo->fileid), &data, 0), "__put_filereg_info");
+
+err:
+ if (data.data != NULL)
+ __os_free(lvinfo->dbenv->env, data.data);
+
+ return (ret);
+}
+
+/*
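+ * __del_filelife --
+ *	Delete a dbregid's filelife info, e.g. when the db handle is
+ *	closed and the dbregid is no longer valid.
+ *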
+ * PUBLIC: int __del_filelife __P((const DB_LOG_VRFY_INFO *, int32_t));
+ */
+int
+__del_filelife(lvinfo, dbregid)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+{
+ int ret;
+ DBT key;
+
+ memset(&key, 0, sizeof(DBT));
+ key.data = &(dbregid);
+ key.size = sizeof(dbregid);
+
+ if ((ret = __db_del(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, 0)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_filelife __P((const DB_LOG_VRFY_INFO *, VRFY_FILELIFE *));
+ */
+int
+__put_filelife(lvinfo, pflife)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_FILELIFE *pflife;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &(pflife->dbregid);
+ key.size = sizeof(pflife->dbregid);
+ data.data = pflife;
+ data.size = sizeof(VRFY_FILELIFE);
+
+ if ((ret = __db_put(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __get_filelife __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: int32_t, VRFY_FILELIFE **));
+ */
+int
+__get_filelife(lvinfo, dbregid, flifepp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+ VRFY_FILELIFE **flifepp;
+{
+ int ret;
+ DBT key, data;
+ VRFY_FILELIFE *flifep;
+
+ ret = 0;
+ flifep = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &dbregid;
+ key.size = sizeof(dbregid);
+ if ((ret = __db_get(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0)
+ goto err;
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_FILELIFE), &flifep)) != 0)
+ goto err;
+ DB_ASSERT(lvinfo->dbenv->env, flifep != NULL);
+ memcpy(flifep, data.data, sizeof(VRFY_FILELIFE));
+ *flifepp = flifep;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __get_filereg_by_dbregid __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: int32_t, VRFY_FILEREG_INFO **));
+ */
+int
+__get_filereg_by_dbregid(lvinfo, dbregid, freginfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+ VRFY_FILEREG_INFO **freginfopp;
+{
+ int ret;
+ DBT key, data;
+ char uid[DB_FILE_ID_LEN];
+ VRFY_FILELIFE *pflife;
+
+ memset(&data, 0, sizeof(DBT));
+ memset(&key, 0, sizeof(DBT));
+ key.data = &dbregid;
+ key.size = sizeof(dbregid);
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_filereg_by_dbregid");
+ if (ret == DB_NOTFOUND)
+ goto err;
+
+ /* Use the file-uid as key to retrieve from fileregs.db. */
+ pflife = (VRFY_FILELIFE *)data.data;
+ memcpy((void *)uid, (void *)pflife->fileid, key.size = DB_FILE_ID_LEN);
+
+ key.data = (void *)uid;
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->fileregs, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_filereg_by_dbregid");
+ if (ret == DB_NOTFOUND)
+ goto err;
+ if ((ret = __lv_unpack_filereg(&data, freginfopp)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __add_dbregid __P((DB_LOG_VRFY_INFO *, VRFY_FILEREG_INFO *,
+ * PUBLIC: int32_t, u_int32_t, DB_LSN, DBTYPE, db_pgno_t, int *));
+ */
+int
+__add_dbregid(lvh, freg, dbregid, opcode, lsn, dbtype, meta_pgno, addp)
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_FILEREG_INFO *freg;
+ int32_t dbregid;
+ u_int32_t opcode;
+ DB_LSN lsn;
+ DBTYPE dbtype;
+ db_pgno_t meta_pgno;
+ int *addp;
+{
+ int inarray, ret, tret;
+ u_int32_t i, j;
+ VRFY_FILELIFE flife;
+
+ inarray = ret = tret = 0;
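+	/*
+	 * tret encodes the action taken and is returned via addp: 1 means
+	 * the dbregid is added, -1 means it is removed, 0 means no change,
+	 * and 2 flags an attempt to open an already-open dbregid.
+	 */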
+ for (i = 0; i < freg->regcnt; i++) {
+ if (freg->dbregids[i] == dbregid) {
+ if (!IS_DBREG_CLOSE(opcode)) {
+ /* Opening an open dbreg id. */
+ if (IS_DBREG_OPEN(opcode) &&
+ (opcode != DBREG_CHKPNT &&
+ opcode != DBREG_XCHKPNT)) {
+ tret = 2;
+ goto err;
+ }
+ tret = 0;
+ inarray = 1;
+ } else
+				/* Found the dbregid; remove it below. */
+ tret = -1;
+ break;
+ }
+ }
+
+	if (IS_DBREG_OPEN(opcode))
+		tret = 1;/* An open operation: add the dbregid below. */
+
+ /*
+ * Remove closed dbregid. dbregid can be recycled, not unique to a db
+ * file, it's dynamically allocated for each db handle.
+ */
+ if (tret == -1) {
+ for (j = i; j < freg->regcnt - 1; j++)
+ freg->dbregids[j] = freg->dbregids[j + 1];
+ freg->regcnt--;
+ BDBOP(__os_realloc(lvh->dbenv->env,
+ sizeof(int32_t) * freg->regcnt, &(freg->dbregids)));
+ /* Don't remove dbregid life info from dbregids db. */
+ } else if (tret == 1) {
+ if (!inarray) {
+ freg->regcnt++;
+ BDBOP(__os_realloc(lvh->dbenv->env,
+ sizeof(int32_t) * freg->regcnt, &(freg->dbregids)));
+ freg->dbregids[freg->regcnt - 1] = dbregid;
+ }
+ flife.dbregid = dbregid;
+ memcpy(flife.fileid, freg->fileid.data, freg->fileid.size);
+ flife.lifetime = opcode;
+ flife.dbtype = dbtype;
+ flife.lsn = lsn;
+ flife.meta_pgno = meta_pgno;
+ if ((ret = __put_filelife(lvh, &flife)) != 0)
+ goto err;
+ }
+
+err:
+ *addp = tret;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __get_filereg_info __P((const DB_LOG_VRFY_INFO *, const DBT *,
+ * PUBLIC: VRFY_FILEREG_INFO **));
+ */
+int
+__get_filereg_info(lvinfo, fuid, freginfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const DBT *fuid;
+ VRFY_FILEREG_INFO **freginfopp;
+{
+ int ret;
+ DBT data;
+
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->fileregs, lvinfo->ip, NULL,
+ (DBT *)fuid, &data, 0), DB_NOTFOUND, "__get_filereg_info");
+ if (ret == DB_NOTFOUND)
+ goto err;
+ if ((ret = __lv_unpack_filereg(&data, freginfopp)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+static int
+__lv_unpack_filereg(data, freginfopp)
+ const DBT *data;
+ VRFY_FILEREG_INFO **freginfopp;
+{
+ char *p, *q;
+ u_int32_t fidsz, arrsz;
+ VRFY_FILEREG_INFO *buf;
+ int ret;
+
+ ret = 0;
+ p = q = NULL;
+ fidsz = arrsz = 0;
+ buf = NULL;
+
+ if ((ret = __os_malloc(NULL, sizeof(VRFY_FILEREG_INFO), &buf)) != 0)
+ goto err;
+ memset(buf, 0, sizeof(VRFY_FILEREG_INFO));
+
+ memcpy(buf, data->data, FILE_REG_INFO_FIXSIZE);
+ *freginfopp = (VRFY_FILEREG_INFO *)buf;
+ p = ((char *)(data->data)) + FILE_REG_INFO_FIXSIZE;
+
+ if ((ret = __os_malloc(NULL, arrsz = (*freginfopp)->regcnt *
+ sizeof(int32_t), &((*freginfopp)->dbregids))) != 0)
+ goto err;
+ memcpy((*freginfopp)->dbregids, p, arrsz);
+ p += arrsz;
+
+ memcpy(&fidsz, p, sizeof(fidsz));
+ p += sizeof(fidsz);
+ if ((ret = __os_malloc(NULL, fidsz, &q)) != 0)
+ goto err;
+ memcpy(q, p, fidsz);
+ (*freginfopp)->fileid.data = q;
+ (*freginfopp)->fileid.size = fidsz;
+ p += fidsz;
+
+ if ((ret = __os_malloc(NULL, sizeof(char) * (strlen(p) + 1), &q)) != 0)
+ goto err;
+ (void)strcpy(q, p);
+
+ (*freginfopp)->fname = q;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __free_filereg_info __P((VRFY_FILEREG_INFO *));
+ */
+int
+__free_filereg_info(p)
+ VRFY_FILEREG_INFO *p;
+{
+ if (p == NULL)
+ return (0);
+ if (p ->fname != NULL)
+ __os_free(NULL, (void *)(p->fname));
+ if (p->fileid.data != NULL)
+ __os_free(NULL, p->fileid.data);
+ if (p->dbregids != NULL)
+ __os_free(NULL, p->dbregids);
+ __os_free(NULL, p);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __get_ckp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN,
+ * PUBLIC: VRFY_CKP_INFO **));
+ */
+int
+__get_ckp_info(lvinfo, lsn, ckpinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ VRFY_CKP_INFO **ckpinfopp;
+{
+ int ret;
+ DBT key, data;
+ VRFY_CKP_INFO *ckpinfo;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &lsn;
+ key.size = sizeof(DB_LSN);
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->ckps, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_ckp_info");
+
+ if (ret == DB_NOTFOUND)
+ goto err;
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_CKP_INFO), &ckpinfo)) != 0)
+ goto err;
+ memcpy(ckpinfo, data.data, sizeof(VRFY_CKP_INFO));
+ *ckpinfopp = ckpinfo;
+err:
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __get_last_ckp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: VRFY_CKP_INFO **));
+ */
+int
+__get_last_ckp_info(lvinfo, ckpinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_CKP_INFO **ckpinfopp;
+{
+ int ret, tret;
+ DBT key, data;
+ VRFY_CKP_INFO *ckpinfo;
+ DBC *csr;
+
+ csr = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP(__db_cursor(lvinfo->ckps, lvinfo->ip, NULL, &csr, 0));
+ if ((ret = __dbc_get(csr, &key, &data, DB_LAST)) != 0)
+ goto err;
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_CKP_INFO), &ckpinfo)) != 0)
+ goto err;
+ DB_ASSERT(lvinfo->dbenv->env, sizeof(VRFY_CKP_INFO) == data.size);
+ memcpy(ckpinfo, data.data, sizeof(VRFY_CKP_INFO));
+ *ckpinfopp = ckpinfo;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ if (ret != 0 && ret != DB_NOTFOUND)
+ __db_err(lvinfo->dbenv->env, ret, "__get_last_ckp_info");
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_ckp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_CKP_INFO *));
+ */
+int
+__put_ckp_info(lvinfo, ckpinfo)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const VRFY_CKP_INFO *ckpinfo;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = (void *)&ckpinfo->lsn;
+ key.size = sizeof(DB_LSN);
+ data.data = (void *)ckpinfo;
+ data.size = sizeof(VRFY_CKP_INFO);
+
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->ckps, lvinfo->ip,
+ NULL, &key, &data, 0), "__put_ckp_info");
+ return (0);
+}
+
+/*
+ * PUBLIC: int __get_timestamp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: DB_LSN, VRFY_TIMESTAMP_INFO **));
+ */
+int
+__get_timestamp_info(lvinfo, lsn, tsinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ VRFY_TIMESTAMP_INFO **tsinfopp;
+{
+ int ret;
+ DBT key, data;
+ VRFY_TIMESTAMP_INFO *tsinfo;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &lsn;
+ key.size = sizeof(DB_LSN);
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->lsntime, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_timestamp_info");
+
+ if (ret == DB_NOTFOUND)
+ goto err;
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_TIMESTAMP_INFO), &tsinfo)) != 0)
+ goto err;
+
+ memcpy(tsinfo, data.data, sizeof(VRFY_TIMESTAMP_INFO));
+ *tsinfopp = tsinfo;
+err:
+ return (ret);
+}
+
+/*
+ * __get_latest_timestamp_info --
+ * Get latest timestamp info before lsn.
+ * PUBLIC: int __get_latest_timestamp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: DB_LSN, VRFY_TIMESTAMP_INFO **));
+ */
+int
+__get_latest_timestamp_info(lvinfo, lsn, tsinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ VRFY_TIMESTAMP_INFO **tsinfopp;
+{
+ int ret, tret;
+ DBT key, data;
+ VRFY_TIMESTAMP_INFO *tsinfo;
+ DBC *csr;
+
+ csr = NULL;
+ ret = tret = 0;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &lsn;
+ key.size = sizeof(lsn);
+ BDBOP(__db_cursor(lvinfo->lsntime, lvinfo->ip, NULL, &csr, 0));
+
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ BDBOP(__dbc_get(csr, &key, &data, DB_PREV));
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env, sizeof(VRFY_TIMESTAMP_INFO),
+ &tsinfo)) != 0)
+ goto err;
+
+ memcpy(tsinfo, data.data, sizeof(VRFY_TIMESTAMP_INFO));
+ *tsinfopp = tsinfo;
+
+err:
+ if (ret != 0 && ret != DB_NOTFOUND)
+ __db_err(lvinfo->dbenv->env,
+ ret, "__get_latest_timestamp_info");
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_timestamp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_TIMESTAMP_INFO *));
+ */
+int
+__put_timestamp_info(lvinfo, tsinfo)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const VRFY_TIMESTAMP_INFO *tsinfo;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = (void *)&(tsinfo->lsn);
+ key.size = sizeof(DB_LSN);
+ data.data = (void *)tsinfo;
+ data.size = sizeof(VRFY_TIMESTAMP_INFO);
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->lsntime, lvinfo->ip, NULL,
+ &key, &data, 0), "__put_timestamp_info");
+
+ return (0);
+}
+
+static int
+__lv_txnrgns_lsn_cmp(db, d1, d2)
+ DB *db;
+ const DBT *d1, *d2;
+{
+ struct __lv_txnrange r1, r2;
+
+ DB_ASSERT(db->env, d1->size == sizeof(r1));
+ DB_ASSERT(db->env, d2->size == sizeof(r2));
+ memcpy(&r1, d1->data, d1->size);
+ memcpy(&r2, d2->data, d2->size);
+
+ return (LOG_COMPARE(&(r1.end), &(r2.end)));
+}
+
+/*
+ * __find_lsnrg_by_timerg --
+ *	Find the closed lsn interval [startlsn, endlsn] whose
+ *	corresponding timestamp interval fully contains [begin, end].
+ * PUBLIC: int __find_lsnrg_by_timerg __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: time_t, time_t, DB_LSN *, DB_LSN *));
+ */
+int
+__find_lsnrg_by_timerg(lvinfo, begin, end, startlsn, endlsn)
+ DB_LOG_VRFY_INFO *lvinfo;
+ time_t begin, end;
+ DB_LSN *startlsn, *endlsn;
+{
+ int ret, tret;
+ DBC *csr;
+ struct __lv_timestamp_info *t1, *t2;
+ DBT key, data;
+
+ ret = tret = 0;
+ csr = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP(__db_cursor(lvinfo->timelsn, lvinfo->ip, NULL, &csr, 0));
+
+	/*
+	 * We want an lsn range that completely contains [begin, end], so
+	 * we move one record back when getting the startlsn.
+	 */
+ key.data = &begin;
+ key.size = sizeof(begin);
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET_RANGE));
+ if ((ret = __dbc_get(csr, &key, &data, DB_PREV)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND)/* begin is smaller than the smallest key. */
+ startlsn->file = startlsn->offset = 0;/* beginning. */
+ else {
+ t1 = (struct __lv_timestamp_info *)data.data;
+ *startlsn = t1->lsn;
+ }
+
+ /*
+ * Move to the last key/data pair of the duplicate set to get the
+ * biggest lsn having end as timestamp.
+ */
+ key.data = &end;
+ key.size = sizeof(end);
+ if ((ret = __dbc_get(csr, &key, &data, DB_SET_RANGE)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND) {
+ endlsn->file = endlsn->offset = (u_int32_t)-1;/* Biggest lsn. */
+ ret = 0;
+ goto err; /* We are done. */
+ }
+
+	/*
+	 * Go to the biggest lsn of the duplicate set: step to the next
+	 * non-duplicate key and back, or to the last record if this key
+	 * is the last one in the database.
+	 */
+ if ((ret = __dbc_get(csr, &key, &data, DB_NEXT_NODUP)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ if (ret == DB_NOTFOUND)
+ BDBOP(__dbc_get(csr, &key, &data, DB_LAST));
+ else
+ BDBOP(__dbc_get(csr, &key, &data, DB_PREV));
+
+ t2 = (struct __lv_timestamp_info *)data.data;
+ *endlsn = t2->lsn;
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __add_txnrange __P((DB_LOG_VRFY_INFO *, u_int32_t,
+ * PUBLIC: DB_LSN, int32_t, int));
+ */
+int
+__add_txnrange(lvinfo, txnid, lsn, when, ishead)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid;
+ DB_LSN lsn;
+ int32_t when;
+	int ishead; /* Whether it's the first log record of the txn. */
+{
+ int ret, tret;
+ DBC *csr;
+ struct __lv_txnrange tr, *ptr;
+ DBT key, data;
+
+ csr = NULL;
+ ret = 0;
+ ptr = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&tr, 0, sizeof(tr));
+
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+ tr.txnid = txnid;
+ BDBOP(__db_cursor(lvinfo->txnrngs, lvinfo->ip, NULL, &csr, 0));
+	/*
+	 * Note that we play the logs backward to gather such information.
+	 */
+ if (!ishead) {
+ tr.end = lsn;
+ tr.when_commit = when;
+ data.data = &tr;
+ data.size = sizeof(tr);
+ BDBOP(__dbc_put(csr, &key, &data, DB_KEYFIRST));
+ } else {
+		/*
+		 * Duplicate data items are sorted by lsn, and we are playing
+		 * the logs backward, so the first record is the one we want.
+		 */
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ ptr = (struct __lv_txnrange *)data.data;
+ DB_ASSERT(lvinfo->dbenv->env, IS_ZERO_LSN(ptr->begin));
+ ptr->begin = lsn;
+ BDBOP(__dbc_put(csr, &key, &data, DB_CURRENT));
+ }
+
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * __get_aborttxn --
+ * If lsn is the last log of an aborted txn T, T's txnid is
+ * returned via the log verify handle.
+ *
+ * PUBLIC: int __get_aborttxn __P((DB_LOG_VRFY_INFO *, DB_LSN));
+ */
+int
+__get_aborttxn(lvinfo, lsn)
+ DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+{
+ int ret, tret;
+ u_int32_t txnid;
+ DBC *csr;
+ DBT key, data;
+
+ csr = NULL;
+ txnid = 0;
+ ret = tret = 0;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &lsn;
+ key.size = sizeof(lsn);
+ BDBOP(__db_cursor(lvinfo->txnaborts, lvinfo->ip, NULL, &csr, 0));
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ memcpy(&txnid, data.data, data.size);
+ /*
+ * The lsn is the last op of an aborted txn, call __on_txnabort
+ * before processing next log record.
+ */
+ lvinfo->aborted_txnid = txnid;
+ lvinfo->aborted_txnlsn = lsn;
+
+err:
+	/* It's OK if we can't find it. */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * __txn_started --
+ * Whether txnid is started before lsn and ended after lsn.
+ *
+ * PUBLIC: int __txn_started __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: DB_LSN, u_int32_t, int *));
+ */
+int
+__txn_started(lvinfo, lsn, txnid, res)
+ DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ u_int32_t txnid;
+ int *res;
+{
+ int ret, tret;
+ DBC *csr;
+ DBT key, data;
+ struct __lv_txnrange *ptr, tr;
+
+ ret = *res = 0;
+ csr = NULL;
+ memset(&tr, 0, sizeof(tr));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+
+ BDBOP(__db_cursor(lvinfo->txnrngs, lvinfo->ip, NULL, &csr, 0));
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ for (;ret == 0; ret = __dbc_get(csr, &key, &data, DB_NEXT_DUP)) {
+ ptr = (struct __lv_txnrange *)data.data;
+ if (LOG_COMPARE(&lsn, &(ptr->begin)) > 0 &&
+ LOG_COMPARE(&lsn, &(ptr->end)) <= 0) {
+ *res = 1;
+ break;
+ }
+ }
+err:
+ if (ret == DB_NOTFOUND)
+		ret = 0;/* It's OK if we can't find it. */
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __set_logvrfy_dbfuid __P((DB_LOG_VRFY_INFO *));
+ */
+int
+__set_logvrfy_dbfuid(lvinfo)
+ DB_LOG_VRFY_INFO *lvinfo;
+{
+ int ret;
+ const char *p;
+ DBT key, data;
+ size_t buflen;
+
+ p = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* So far we only support verifying a specific db file. */
+ p = lvinfo->lv_config->dbfile;
+ buflen = sizeof(char) * (strlen(p) + 1);
+ key.data = (char *)p;
+ key.size = (u_int32_t)buflen;
+
+ BDBOP2(lvinfo->dbenv, __db_get(lvinfo->fnameuid, lvinfo->ip, NULL,
+ &key, &data, 0), "__set_logvrfy_dbfuid");
+
+ memcpy(lvinfo->target_dbid, data.data, DB_FILE_ID_LEN);
+
+ return (ret);
+}
+
+/*
+ * __add_page_to_txn --
+ *	Try to add a page to a txn; via result return 1 if it was added,
+ *	0 if the txn already had it, or -1 on an access violation.
+ * PUBLIC: int __add_page_to_txn __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: int32_t, db_pgno_t, u_int32_t, u_int32_t *, int *));
+ */
+int
+__add_page_to_txn(lvinfo, dbregid, pgno, txnid, otxn, result)
+ DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+ db_pgno_t pgno;
+ u_int32_t txnid, *otxn;
+ int *result;
+{
+ int ret;
+ u_int8_t *buf;
+ DBT key, data;
+ size_t buflen;
+ u_int32_t txnid2;
+ VRFY_FILELIFE *pff;
+
+ if (txnid < TXN_MINIMUM) {
+ *result = 0;
+ return (0);
+ }
+ buf = NULL;
+ ret = 0;
+ txnid2 = 0;
+ pff = NULL;
+ buflen = sizeof(u_int8_t) * DB_FILE_ID_LEN + sizeof(db_pgno_t);
+ BDBOP(__os_malloc(lvinfo->dbenv->env, buflen, &buf));
+ memset(buf, 0, buflen);
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+	/*
+	 * We use the file uid as the key because a single db file can have
+	 * multiple dbregids at the same time; keying by dbregid could miss
+	 * the fact that the same db file is being updated by multiple txns.
+	 */
+ key.data = &dbregid;
+ key.size = sizeof(dbregid);
+ if ((ret = __db_get(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (F_ISSET(lvinfo, DB_LOG_VERIFY_PARTIAL)) {
+ ret = 0;
+ goto out;
+ } else
+ F_SET(lvinfo, DB_LOG_VERIFY_INTERR);
+ }
+ goto err;
+ }
+ pff = (VRFY_FILELIFE *)data.data;
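+	/* The pgtxn key is the file uid followed by the page number. */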
+ memcpy(buf, pff->fileid, DB_FILE_ID_LEN);
+ memcpy(buf + DB_FILE_ID_LEN, (u_int8_t *)&pgno, sizeof(pgno));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = buf;
+ key.size = (u_int32_t)buflen;
+ if ((ret = __db_get(lvinfo->pgtxn, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ data.data = &txnid;
+ data.size = sizeof(txnid);
+ BDBOP(__db_put(lvinfo->pgtxn, lvinfo->ip, NULL, &key,
+ &data, 0));
+ *result = 1;
+ ret = 0;/* This is not an error. */
+ }
+ goto err;
+ }
+ DB_ASSERT(lvinfo->dbenv->env, data.size == sizeof(txnid2));
+ memcpy(&txnid2, data.data, data.size);
+ if (txnid == txnid2)/* The same txn already has the page. */
+ *result = 0;
+ else {/* Txn txnid is updating pages still held by txnid2. */
+ *result = -1;
+ *otxn = txnid2;
+ }
+out:
+ /* result is set to -1 on violation, 0 if already has it, 1 if added. */
+err:
+ if (buf != NULL)
+ __os_free(lvinfo->dbenv->env, buf);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __del_txn_pages __P((DB_LOG_VRFY_INFO *, u_int32_t));
+ */
+int
+__del_txn_pages(lvinfo, txnid)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid;
+{
+ int ret;
+ DBT key;
+
+ ret = 0;
+ memset(&key, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+
+ BDBOP(__db_del(lvinfo->txnpg, lvinfo->ip, NULL, &key, 0));
+
+err:
+ return (ret);
+}
+
+/*
+ * __is_ancestor_txn --
+ *	Tells via res whether ptxnid is an ancestor of txnid at the
+ *	moment of lsn.
+ *
+ * PUBLIC: int __is_ancestor_txn __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN, int *));
+ */
+int
+__is_ancestor_txn(lvinfo, ptxnid, txnid, lsn, res)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t ptxnid, txnid;
+ DB_LSN lsn;
+ int *res;
+{
+ u_int32_t ptid;
+ int ret, tret;
+ DBC *csr;
+ DB *pdb;
+ DBT key, data;
+ struct __lv_txnrange tr;
+
+ ret = 0;
+ ptid = txnid;
+ csr = NULL;
+ pdb = lvinfo->txnrngs;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ *res = 0;
+ BDBOP(__db_cursor(pdb, lvinfo->ip, NULL, &csr, 0));
+
+ /* See if ptxnid is an ancestor of txnid. */
+ do {
+ key.data = &ptid;
+ key.size = sizeof(ptid);
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+		/* A txnid may be reused; we want the range containing lsn. */
+		for (; ret == 0;
+ ret = __dbc_get(csr, &key, &data, DB_NEXT_DUP)) {
+ DB_ASSERT(pdb->env, sizeof(tr) == data.size);
+ memcpy(&tr, data.data, data.size);
+ if (tr.ptxnid > 0 &&
+ LOG_COMPARE(&lsn, &(tr.begin)) >= 0 &&
+ LOG_COMPARE(&lsn, &(tr.end)) <= 0)
+ break;
+ }
+
+ if (tr.ptxnid == ptxnid) {
+ *res = 1;
+ goto out;
+ } else
+ ptid = tr.ptxnid;
+
+ } while (ptid != 0);
+out:
+
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __return_txn_pages __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t));
+ */
+int
+__return_txn_pages(lvh, ctxn, ptxn)
+ DB_LOG_VRFY_INFO *lvh;
+ u_int32_t ctxn, ptxn;
+{
+ int ret, tret;
+ DBC *csr;
+ DB *pdb, *sdb;
+ DBT key, key2, data, data2;
+ char buf[DB_FILE_ID_LEN + sizeof(db_pgno_t)];
+
+ ret = tret = 0;
+ csr = NULL;
+ sdb = lvh->txnpg;
+ pdb = lvh->pgtxn;
+ memset(&key, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ BDBOP(__db_cursor(sdb, lvh->ip, NULL, &csr, 0));
+ key.data = &ctxn;
+ key.size = sizeof(ctxn);
+ key2.data = &ptxn;
+ key2.size = sizeof(ptxn);
+ data2.data = buf;
+ data2.ulen = DB_FILE_ID_LEN + sizeof(db_pgno_t);
+ data2.flags = DB_DBT_USERMEM;
+
+ for (ret = __dbc_pget(csr, &key, &data2, &data, DB_SET); ret == 0;
+ ret = __dbc_pget(csr, &key, &data2, &data, DB_NEXT_DUP))
+ BDBOP(__db_put(pdb, lvh->ip, NULL, &data2, &key2, 0));
+ if ((ret = __del_txn_pages(lvh, ctxn)) != 0 && ret != DB_NOTFOUND)
+ goto err;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+#define ADD_ITEM(lvh, logtype) ((lvh)->logtype_names[(logtype)] = (#logtype))
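+/*
+ * For example, ADD_ITEM(lvinfo, DB___bam_split) expands to
+ * lvinfo->logtype_names[DB___bam_split] = "DB___bam_split", so the
+ * table below maps each log record type to its printable name.
+ */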
+static void
+__lv_setup_logtype_names(lvinfo)
+ DB_LOG_VRFY_INFO *lvinfo;
+{
+ ADD_ITEM(lvinfo, DB___bam_irep);
+ ADD_ITEM(lvinfo, DB___bam_split_42);
+ ADD_ITEM(lvinfo, DB___bam_split);
+ ADD_ITEM(lvinfo, DB___bam_rsplit);
+ ADD_ITEM(lvinfo, DB___bam_adj);
+ ADD_ITEM(lvinfo, DB___bam_cadjust);
+ ADD_ITEM(lvinfo, DB___bam_cdel);
+ ADD_ITEM(lvinfo, DB___bam_repl);
+ ADD_ITEM(lvinfo, DB___bam_root);
+ ADD_ITEM(lvinfo, DB___bam_curadj);
+ ADD_ITEM(lvinfo, DB___bam_rcuradj);
+ ADD_ITEM(lvinfo, DB___bam_relink_43);
+ ADD_ITEM(lvinfo, DB___bam_merge_44);
+ ADD_ITEM(lvinfo, DB___crdel_metasub);
+ ADD_ITEM(lvinfo, DB___crdel_inmem_create);
+ ADD_ITEM(lvinfo, DB___crdel_inmem_rename);
+ ADD_ITEM(lvinfo, DB___crdel_inmem_remove);
+ ADD_ITEM(lvinfo, DB___dbreg_register);
+ ADD_ITEM(lvinfo, DB___db_addrem);
+ ADD_ITEM(lvinfo, DB___db_big);
+ ADD_ITEM(lvinfo, DB___db_ovref);
+ ADD_ITEM(lvinfo, DB___db_relink_42);
+ ADD_ITEM(lvinfo, DB___db_debug);
+ ADD_ITEM(lvinfo, DB___db_noop);
+ ADD_ITEM(lvinfo, DB___db_pg_alloc_42);
+ ADD_ITEM(lvinfo, DB___db_pg_alloc);
+ ADD_ITEM(lvinfo, DB___db_pg_free_42);
+ ADD_ITEM(lvinfo, DB___db_pg_free);
+ ADD_ITEM(lvinfo, DB___db_cksum);
+ ADD_ITEM(lvinfo, DB___db_pg_freedata_42);
+ ADD_ITEM(lvinfo, DB___db_pg_freedata);
+ ADD_ITEM(lvinfo, DB___db_pg_init);
+ ADD_ITEM(lvinfo, DB___db_pg_sort_44);
+ ADD_ITEM(lvinfo, DB___db_pg_trunc);
+ ADD_ITEM(lvinfo, DB___db_realloc);
+ ADD_ITEM(lvinfo, DB___db_relink);
+ ADD_ITEM(lvinfo, DB___db_merge);
+ ADD_ITEM(lvinfo, DB___db_pgno);
+#ifdef HAVE_HASH
+ ADD_ITEM(lvinfo, DB___ham_insdel);
+ ADD_ITEM(lvinfo, DB___ham_newpage);
+ ADD_ITEM(lvinfo, DB___ham_splitdata);
+ ADD_ITEM(lvinfo, DB___ham_replace);
+ ADD_ITEM(lvinfo, DB___ham_copypage);
+ ADD_ITEM(lvinfo, DB___ham_metagroup_42);
+ ADD_ITEM(lvinfo, DB___ham_metagroup);
+ ADD_ITEM(lvinfo, DB___ham_groupalloc_42);
+ ADD_ITEM(lvinfo, DB___ham_groupalloc);
+ ADD_ITEM(lvinfo, DB___ham_changeslot);
+ ADD_ITEM(lvinfo, DB___ham_contract);
+ ADD_ITEM(lvinfo, DB___ham_curadj);
+ ADD_ITEM(lvinfo, DB___ham_chgpg);
+#endif
+#ifdef HAVE_QUEUE
+ ADD_ITEM(lvinfo, DB___qam_incfirst);
+ ADD_ITEM(lvinfo, DB___qam_mvptr);
+ ADD_ITEM(lvinfo, DB___qam_del);
+ ADD_ITEM(lvinfo, DB___qam_add);
+ ADD_ITEM(lvinfo, DB___qam_delext);
+#endif
+ ADD_ITEM(lvinfo, DB___txn_regop_42);
+ ADD_ITEM(lvinfo, DB___txn_regop);
+ ADD_ITEM(lvinfo, DB___txn_ckp_42);
+ ADD_ITEM(lvinfo, DB___txn_ckp);
+ ADD_ITEM(lvinfo, DB___txn_child);
+ ADD_ITEM(lvinfo, DB___txn_xa_regop_42);
+ ADD_ITEM(lvinfo, DB___txn_prepare);
+ ADD_ITEM(lvinfo, DB___txn_recycle);
+ ADD_ITEM(lvinfo, DB___fop_create_42);
+ ADD_ITEM(lvinfo, DB___fop_create);
+ ADD_ITEM(lvinfo, DB___fop_remove);
+ ADD_ITEM(lvinfo, DB___fop_write_42);
+ ADD_ITEM(lvinfo, DB___fop_write);
+ ADD_ITEM(lvinfo, DB___fop_rename_42);
+ ADD_ITEM(lvinfo, DB___fop_rename_noundo_46);
+ ADD_ITEM(lvinfo, DB___fop_rename);
+ ADD_ITEM(lvinfo, DB___fop_rename_noundo);
+ ADD_ITEM(lvinfo, DB___fop_file_remove);
+}
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c
new file mode 100644
index 00000000..dc331215
--- /dev/null
+++ b/src/mp/mp_alloc.c
@@ -0,0 +1,724 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * This configuration parameter limits the number of hash buckets which
+ * __memp_alloc() searches through while excluding buffers with a 'high'
+ * priority.
+ */
+#if !defined(MPOOL_ALLOC_SEARCH_LIMIT)
+#define MPOOL_ALLOC_SEARCH_LIMIT 500
+#endif
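+/*
+ * The limit can be overridden at build time, for example (an
+ * illustrative compiler invocation, not part of this tree's build):
+ *
+ *	cc -DMPOOL_ALLOC_SEARCH_LIMIT=1000 ...
+ */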
+
+/*
+ * __memp_alloc --
+ * Allocate some space from a cache region.
+ *
+ * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
+ * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+ */
+int
+__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ MPOOLFILE *mfp;
+ size_t len;
+ roff_t *offsetp;
+ void *retp;
+{
+ BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
+ BH_FROZEN_PAGE *frozen_bhp;
+ DB_LSN oldest_reader, vlsn;
+ DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
+ ENV *env;
+ MPOOL *c_mp;
+ MPOOLFILE *bh_mfp;
+ size_t freed_space;
+ u_int32_t buckets, bucket_priority, buffers, cache_reduction;
+ u_int32_t dirty_eviction, high_priority, priority, versions;
+ u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
+ int aggressive, alloc_freeze, b_lock, giveup;
+ int h_locked, need_free, obsolete, ret, write_error;
+ u_int8_t *endp;
+ void *p;
+
+ env = dbmp->env;
+ c_mp = infop->primary;
+ dbht = R_ADDR(infop, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+ hp_saved = NULL;
+ priority_saved = 0;
+ write_error = 0;
+
+ buckets = buffers = put_counter = total_buckets = versions = 0;
+ aggressive = alloc_freeze = giveup = h_locked = 0;
+
+ /*
+ * If we're allocating a buffer, and the one we're discarding is the
+ * same size, we don't want to waste the time to re-integrate it into
+ * the shared memory free list. If the DB_MPOOLFILE argument isn't
+ * NULL, we'll compare the underlying page sizes of the two buffers
+ * before free-ing and re-allocating buffers.
+ */
+ if (mfp != NULL) {
+ len = SSZA(BH, buf) + mfp->pagesize;
+ /* Add space for alignment padding for MVCC diagnostics. */
+ MVCC_BHSIZE(mfp, len);
+ }
+
+ STAT_INC(env, mpool, nallocs, c_mp->stat.st_alloc, len);
+
+ MPOOL_REGION_LOCK(env, infop);
+
+ /*
+ * First we try to allocate from free memory. If that fails, scan the
+ * buffer pool to find buffers with low priorities. We consider small
+ * sets of hash buckets each time to limit the amount of work needing
+ * to be done. This approximates LRU, but not very well. We either
+ * find a buffer of the same size to use, or we will free 3 times what
+ * we need in the hopes it will coalesce into a contiguous chunk of the
+ * right size. In the latter case we branch back here and try again.
+ */
+alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) {
+ if (mfp != NULL) {
+ /*
+ * For MVCC diagnostics, align the pointer so that the
+ * buffer starts on a page boundary.
+ */
+ MVCC_BHALIGN(p);
+ bhp = (BH *)p;
+
+ if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
+ DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) {
+ MVCC_BHUNALIGN(bhp);
+ __env_alloc_free(infop, bhp);
+ goto search;
+ }
+ c_mp->pages++;
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+found: if (offsetp != NULL)
+ *offsetp = R_OFFSET(infop, p);
+ *(void **)retp = p;
+
+ /*
+ * Update the search statistics.
+ *
+		 * We're not holding the region locked here, so these
+		 * statistics can't be trusted.
+ */
+#ifdef HAVE_STATISTICS
+ total_buckets += buckets;
+ if (total_buckets != 0) {
+ if (total_buckets > c_mp->stat.st_alloc_max_buckets)
+ STAT_SET(env, mpool, alloc_max_buckets,
+ c_mp->stat.st_alloc_max_buckets,
+ total_buckets, infop->id);
+ STAT_ADJUST(env, mpool, alloc_buckets,
+ c_mp->stat.st_alloc_buckets,
+ total_buckets, infop->id);
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+ STAT_SET(env, mpool, alloc_max_pages,
+ c_mp->stat.st_alloc_max_pages,
+ buffers, infop->id);
+ STAT_ADJUST(env, mpool, alloc_pages,
+ c_mp->stat.st_alloc_pages, buffers, infop->id);
+ }
+#endif
+ return (0);
+ } else if (giveup || c_mp->pages == 0) {
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ __db_errx(env, DB_STR("3017",
+ "unable to allocate space from the buffer cache"));
+ return ((ret == ENOMEM && write_error != 0) ? EIO : ret);
+ }
+
+search:
+ /*
+	 * Anything newer than 1/10th of the buffer pool is ignored during the
+	 * first MPOOL_ALLOC_SEARCH_LIMIT buckets worth of allocation.
+ */
+ cache_reduction = c_mp->pages / 10;
+ high_priority = aggressive ? MPOOL_LRU_MAX :
+ c_mp->lru_priority - cache_reduction;
+ lru_generation = c_mp->lru_generation;
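+	/*
+	 * Worked example (illustrative numbers): with c_mp->pages == 1000,
+	 * cache_reduction is 100, so in the non-aggressive case a buffer
+	 * is skipped when its priority is within 100 of the current
+	 * lru_priority counter, i.e., it was referenced recently.
+	 */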
+
+ ret = 0;
+ MAX_LSN(oldest_reader);
+
+ /*
+ * We re-attempt the allocation every time we've freed 3 times what
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
+ total_buckets += buckets;
+ buckets = 0;
+
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+ for (;;) {
+ /* All pages have been freed, make one last try */
+ if (c_mp->pages == 0)
+ goto alloc;
+
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+ hp = &dbht[c_mp->last_checked++];
+ }
+
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system to support
+ * the number of pinned buffers.
+ *
+ * Get aggressive if we've reviewed the entire cache without
+ * freeing the needed space. (The code resets "aggressive"
+ * when we free any space.) Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
+ * c: start to think about giving up.
+ *
+ * If we get here three or more times, sync the mpool to force
+ * out queue extent pages. While we might not have enough
+ * space for what we want and flushing is expensive, why not?
+ * Then sleep for a second, hopefully someone else will run and
+ * free up some memory.
+ *
+ * Always try to allocate memory too, in case some other thread
+ * returns its memory to the region.
+ *
+		 * We have no way to know whether an allocation can ever
+		 * succeed.  Fail if no pages are returned to the cache
+		 * after we've been trying for a relatively long time.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- we check for that early on, so it isn't possible.
+ */
+ if (buckets++ == c_mp->htab_buckets) {
+ if (freed_space > 0)
+ goto alloc;
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ aggressive++;
+ /*
+ * Once aggressive, we consider all buffers. By setting
+ * this to MPOOL_LRU_MAX, we'll still select a victim
+ * even if all buffers have the highest normal priority.
+ */
+ high_priority = MPOOL_LRU_MAX;
+ PERFMON4(env, mpool, alloc_wrap,
+ len, infop->id, aggressive, c_mp->put_counter);
+ switch (aggressive) {
+ case 1:
+ break;
+ case 2:
+ put_counter = c_mp->put_counter;
+ break;
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ (void)__memp_sync_int(
+ env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);
+
+ __os_yield(env, 1, 0);
+ break;
+ default:
+ aggressive = 1;
+ if (put_counter == c_mp->put_counter)
+ giveup = 1;
+ break;
+ }
+
+ MPOOL_REGION_LOCK(env, infop);
+ goto alloc;
+ }
+
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking the hash
+ * bucket as we only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ /* Set aggressive if we have already searched for too long. */
+ if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) {
+ aggressive = 1;
+ /* Once aggressive, we consider all buffers. */
+ high_priority = MPOOL_LRU_MAX;
+ }
+
+ /* Unlock the region and lock the hash bucket. */
+ MPOOL_REGION_UNLOCK(env, infop);
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ b_lock = 0;
+
+ /*
+ * Find a buffer we can use.
+ *
+ * We use the lowest-LRU singleton buffer if we find one and
+ * it's better than the result of another hash bucket we've
+ * reviewed. We do not use a buffer which has a priority
+ * greater than high_priority unless we are being aggressive.
+ *
+ * With MVCC buffers, the situation is more complicated: we
+ * don't want to free a buffer out of the middle of an MVCC
+ * chain, since that requires I/O. So, walk the buffers,
+ * looking for an obsolete buffer at the end of an MVCC chain.
+ * Once a buffer becomes obsolete, its LRU priority is
+ * irrelevant because that version can never be accessed again.
+ *
+ * If we don't find any obsolete MVCC buffers, we will get
+ * aggressive, and in that case consider the lowest priority
+ * buffer within a chain.
+ *
+ * Ignore referenced buffers, we can't get rid of them.
+ */
+retry_search: bhp = NULL;
+ bucket_priority = high_priority;
+ obsolete = 0;
+ SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
+ /*
+ * First, do the standard LRU check for singletons.
+ * We can use the buffer if it is unreferenced, has a
+ * priority that isn't too high (unless we are
+ * aggressive), and is better than the best candidate
+ * we have found so far in this bucket.
+ */
+#ifdef MPOOL_ALLOC_SEARCH_DYN
+ if (aggressive == 0 &&
+ ++high_priority >= c_mp->lru_priority)
+ aggressive = 1;
+#endif
+
+ if (SH_CHAIN_SINGLETON(current_bhp, vc)) {
+ if (BH_REFCOUNT(current_bhp) != 0)
+ continue;
+ buffers++;
+ if (bucket_priority > current_bhp->priority) {
+ bucket_priority = current_bhp->priority;
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = current_bhp;
+ atomic_inc(env, &bhp->ref);
+ }
+ continue;
+ }
+
+ /*
+ * For MVCC buffers, walk through the chain. If we are
+ * aggressive, choose the best candidate from within
+ * the chain for freezing.
+ */
+ for (mvcc_bhp = oldest_bhp = current_bhp;
+ mvcc_bhp != NULL;
+ oldest_bhp = mvcc_bhp,
+ mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
+#ifdef MPOOL_ALLOC_SEARCH_DYN
+ if (aggressive == 0 &&
+ ++high_priority >= c_mp->lru_priority)
+ aggressive = 1;
+#endif
+ DB_ASSERT(env, mvcc_bhp !=
+ SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
+ if ((aggressive < 2 &&
+ ++versions < (buffers >> 2)) ||
+ BH_REFCOUNT(mvcc_bhp) != 0)
+ continue;
+ buffers++;
+ if (!F_ISSET(mvcc_bhp, BH_FROZEN) &&
+ (bhp == NULL ||
+ bhp->priority > mvcc_bhp->priority)) {
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = mvcc_bhp;
+ atomic_inc(env, &bhp->ref);
+ }
+ }
+
+ /*
+ * oldest_bhp is the last buffer on the MVCC chain, and
+ * an obsolete buffer at the end of the MVCC chain gets
+ * used without further search. Before checking for
+ * obsolescence, update the cached oldest reader LSN in
+			 * the bucket if it is older than this call's
+			 * oldest_reader.
+ */
+ if (BH_REFCOUNT(oldest_bhp) != 0)
+ continue;
+
+ if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) {
+ if (IS_MAX_LSN(oldest_reader) &&
+ (ret = __txn_oldest_reader(
+ env, &oldest_reader)) != 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ return (ret);
+ }
+ if (LOG_COMPARE(&oldest_reader,
+ &hp->old_reader) > 0)
+ hp->old_reader = oldest_reader;
+ }
+
+ if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
+ if (aggressive < 2)
+ buffers++;
+ obsolete = 1;
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = oldest_bhp;
+ atomic_inc(env, &bhp->ref);
+ goto this_buffer;
+ }
+ }
+
+ /*
+ * bhp is either NULL or the best candidate buffer.
+ * We'll use the chosen buffer only if we have compared its
+ * priority against one chosen from another hash bucket.
+ */
+ if (bhp == NULL)
+ goto next_hb;
+
+ priority = bhp->priority;
+
+ /*
+ * Compare two hash buckets and select the one with the lower
+		 * priority. Performance testing showed that looking at two
+		 * buckets improves the LRU approximation, while looking at
+		 * more helps only marginally.
+ */
+ if (hp_saved == NULL) {
+ hp_saved = hp;
+ priority_saved = priority;
+ goto next_hb;
+ }
+
+ /*
+ * If the buffer we just found is a better choice than our
+ * previous choice, use it.
+ *
+ * If the previous choice was better, pretend we're moving
+ * from this hash bucket to the previous one and re-do the
+ * search.
+ *
+ * We don't worry about simply swapping between two buckets
+ * because that could only happen if a buffer was removed
+ * from the chain, or its priority updated. If a buffer
+ * is removed from the chain, some other thread has managed
+ * to discard a buffer, so we're moving forward. Updating
+ * a buffer's priority will make it a high-priority buffer,
+ * so we'll ignore it when we search again, and so we will
+ * eventually zero in on a buffer to use, or we'll decide
+ * there are no buffers we can use.
+ *
+ * If there's only a single hash bucket with buffers, we'll
+ * search the bucket once, choose a buffer, walk the entire
+ * list of buckets and search it again. In the case of a
+ * system that's busy, it's possible to imagine a case where
+ * we'd loop for a long while. For that reason, and because
+ * the test is easy, we special case and test for it.
+ */
+ if (priority > priority_saved && hp != hp_saved) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ hp_tmp = hp_saved;
+ hp_saved = hp;
+ hp = hp_tmp;
+ priority_saved = priority;
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ goto retry_search;
+ }
+
+ /*
+ * If another thread has called __memp_reset_lru() while we were
+ * looking for this buffer, it is possible that we've picked a
+ * poor choice for a victim. If so toss it and start over.
+ */
+ if (lru_generation != c_mp->lru_generation) {
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ MPOOL_REGION_LOCK(env, infop);
+ hp_saved = NULL;
+ goto search;
+ }
+
+this_buffer: /*
+ * Discard any previously remembered hash bucket, we've got
+ * a winner.
+ */
+ hp_saved = NULL;
+
+ /* Drop the hash mutex and lock the buffer exclusively. */
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /* Don't bother trying to latch a busy buffer. */
+ if (BH_REFCOUNT(bhp) > 1)
+ goto next_hb;
+
+ /* We cannot block as the caller is probably holding locks. */
+ if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED)
+ return (ret);
+ goto next_hb;
+ }
+ F_SET(bhp, BH_EXCLUSIVE);
+ b_lock = 1;
+
+ /* Someone may have grabbed it while we got the lock. */
+ if (BH_REFCOUNT(bhp) != 1)
+ goto next_hb;
+
+ /* Find the associated MPOOLFILE. */
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /* If the page is dirty, write it. */
+ ret = 0;
+ dirty_eviction = 0;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
+ ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
+ DB_ASSERT(env, atomic_read(&bhp->ref) > 0);
+
+ /*
+ * If a write fails for any reason, we can't proceed.
+ *
+ * If there's a write error and we're having problems
+ * finding something to allocate, avoid selecting this
+ * buffer again by maximizing its priority.
+ */
+ if (ret != 0) {
+ if (ret != EPERM && ret != EAGAIN) {
+ write_error++;
+ __db_errx(env, DB_STR_A("3018",
+ "%s: unwritable page %d remaining in the cache after error %d",
+ "%s %d %d"),
+ __memp_fns(dbmp, bh_mfp),
+ bhp->pgno, ret);
+ }
+ bhp->priority = MPOOL_LRU_REDZONE;
+
+ goto next_hb;
+ }
+
+ dirty_eviction = 1;
+ }
+
+ /*
+ * Freeze this buffer, if necessary. That is, if the buffer is
+ * part of an MVCC chain and could be required by a reader.
+ */
+ if (SH_CHAIN_HASPREV(bhp, vc) ||
+ (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)) {
+ if (!aggressive ||
+ F_ISSET(bhp, BH_DIRTY | BH_FROZEN))
+ goto next_hb;
+ ret = __memp_bh_freeze(
+ dbmp, infop, hp, bhp, &alloc_freeze);
+ if (ret == EIO)
+ write_error++;
+ if (ret == EBUSY || ret == EIO ||
+ ret == ENOMEM || ret == ENOSPC) {
+ ret = 0;
+ goto next_hb;
+ } else if (ret != 0) {
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ DB_ASSERT(env, b_lock);
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ DB_ASSERT(env, !h_locked);
+ return (ret);
+ }
+ }
+
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+
+ /*
+ * We released the hash bucket lock while doing I/O, so another
+ * thread may have acquired this buffer and incremented the ref
+ * count or dirtied the buffer or installed a new version after
+ * we wrote it, in which case we can't have it.
+ */
+ if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
+ (SH_CHAIN_HASNEXT(bhp, vc) &&
+ SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
+ !BH_OBSOLETE(bhp, hp->old_reader, vlsn)))
+ goto next_hb;
+
+ /*
+ * If the buffer is frozen, thaw it and look for another one
+ * we can use. (Calling __memp_bh_freeze above will not
+ * mark bhp BH_FROZEN.)
+ */
+ if (F_ISSET(bhp, BH_FROZEN)) {
+ DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc));
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ if (!F_ISSET(bhp, BH_THAWED)) {
+ /*
+ * This call releases the hash bucket mutex.
+ * We're going to retry the search, so we need
+ * to re-lock it.
+ */
+ if ((ret = __memp_bh_thaw(dbmp,
+ infop, hp, bhp, NULL)) != 0)
+ return (ret);
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ } else {
+ need_free = (atomic_dec(env, &bhp->ref) == 0);
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (need_free) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+ }
+ bhp = NULL;
+ b_lock = alloc_freeze = 0;
+ goto retry_search;
+ }
+
+		/* We are certainly freeing this buf; update the statistics. */
+ if (dirty_eviction)
+ STAT_INC(env, mpool,
+ dirty_eviction, c_mp->stat.st_rw_evict, infop->id);
+ else
+ STAT_INC(env, mpool,
+ clean_eviction, c_mp->stat.st_ro_evict, infop->id);
+ /*
+ * If we need some empty buffer headers for freezing, turn the
+ * buffer we've found into frozen headers and put them on the
+ * free list. Only reset alloc_freeze if we've actually
+ * allocated some frozen buffer headers.
+ */
+ if (alloc_freeze) {
+ if ((ret = __memp_bhfree(dbmp,
+ infop, bh_mfp, hp, bhp, 0)) != 0)
+ return (ret);
+ b_lock = 0;
+ h_locked = 0;
+
+ MVCC_MPROTECT(bhp->buf, bh_mfp->pagesize,
+ PROT_READ | PROT_WRITE | PROT_EXEC);
+
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
+ (BH_FROZEN_ALLOC *)bhp, links);
+ frozen_bhp = (BH_FROZEN_PAGE *)
+ ((BH_FROZEN_ALLOC *)bhp + 1);
+ endp = (u_int8_t *)bhp->buf + bh_mfp->pagesize;
+ while ((u_int8_t *)(frozen_bhp + 1) < endp) {
+ frozen_bhp->header.mtx_buf = MUTEX_INVALID;
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ (BH *)frozen_bhp, hq);
+ frozen_bhp++;
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ alloc_freeze = 0;
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ goto retry_search;
+ }
+
+ /*
+ * Check to see if the buffer is the size we're looking for.
+ * If so, we can simply reuse it. Otherwise, free the buffer
+ * and its space and keep looking.
+ */
+ if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
+ if ((ret = __memp_bhfree(dbmp,
+ infop, bh_mfp, hp, bhp, 0)) != 0)
+ return (ret);
+ p = bhp;
+ goto found;
+ }
+
+ freed_space += sizeof(*bhp) + bh_mfp->pagesize;
+ if ((ret =
+ __memp_bhfree(dbmp, infop,
+ bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
+ return (ret);
+
+ /* Reset "aggressive" and "write_error" if we free any space. */
+ if (aggressive > 1)
+ aggressive = 1;
+ write_error = 0;
+
+ /*
+ * Unlock this buffer and re-acquire the region lock. If
+ * we're reaching here as a result of calling memp_bhfree, the
+ * buffer lock has already been discarded.
+ */
+ if (0) {
+next_hb: if (bhp != NULL) {
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ if (b_lock) {
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ }
+ }
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ }
+ MPOOL_REGION_LOCK(env, infop);
+
+ /*
+ * Retry the allocation as soon as we've freed up sufficient
+		 * space. We're likely to have to coalesce memory to satisfy
+		 * the request, so don't retry until it's at least possible
+		 * we'll succeed.
+ * we'll succeed.
+ */
+ if (freed_space >= 3 * len)
+ goto alloc;
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __memp_free --
+ * Free some space from a cache region.
+ *
+ * PUBLIC: void __memp_free __P((REGINFO *, void *));
+ */
+void
+__memp_free(infop, buf)
+ REGINFO *infop;
+ void *buf;
+{
+ __env_alloc_free(infop, buf);
+}
diff --git a/src/mp/mp_backup.c b/src/mp/mp_backup.c
new file mode 100644
index 00000000..f376cda7
--- /dev/null
+++ b/src/mp/mp_backup.c
@@ -0,0 +1,333 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#ifndef HAVE_ATOMICFILEREAD
+#include "dbinc/db_page.h"
+#endif
+
+#ifndef HAVE_ATOMICFILEREAD
+static int __memp_check_backup __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+#endif
+
+/*
+ * __memp_backup_open --
+ * Setup to backup a database file.
+ *
+ * PUBLIC: int __memp_backup_open __P((ENV *, DB_MPOOLFILE *,
+ * PUBLIC: const char *, const char *, u_int32_t, DB_FH **, void**));
+ */
+int
+__memp_backup_open(env, mpf, dbfile, target, flags, fpp, handlep)
+ ENV *env;
+ DB_MPOOLFILE *mpf;
+ const char *dbfile;
+ const char *target;
+ u_int32_t flags;
+ DB_FH **fpp;
+ void **handlep;
+{
+ DB_BACKUP *backup;
+#ifndef HAVE_ATOMICFILEREAD
+ MPOOLFILE *mfp;
+#endif
+ u_int32_t oflags;
+ size_t len;
+ int ret;
+ char *path;
+
+ path = NULL;
+ *fpp = NULL;
+ backup = env->backup_handle;
+ *handlep = NULL;
+
+ if (backup != NULL && backup->open != NULL)
+ ret = backup->open(env->dbenv, dbfile, target, handlep);
+ else {
+ len = strlen(target) + strlen(dbfile) + 2;
+ if ((ret = __os_malloc(env, len, &path)) != 0) {
+ __db_err(env, ret, DB_STR_A("0703",
+ "Cannot allocate space for path: %s", "%s"),
+ target);
+ goto err;
+ }
+
+ if ((ret = __os_concat_path(path, len, target, dbfile)) != 0)
+ goto err;
+
+ oflags = DB_OSO_CREATE | DB_OSO_TRUNC;
+ if (LF_ISSET(DB_EXCL))
+ FLD_SET(oflags, DB_OSO_EXCL);
+ if (backup != NULL && F_ISSET(backup, BACKUP_WRITE_DIRECT))
+ FLD_SET(oflags, DB_OSO_DIRECT);
+ ret = __os_open(env, path, 0, oflags, DB_MODE_600, fpp);
+ }
+ if (ret != 0) {
+ __db_err(env, ret, DB_STR_A("0704",
+ "Cannot open target file: %s", "%s"), path);
+ goto err;
+ }
+
+#ifndef HAVE_ATOMICFILEREAD
+ mfp = mpf->mfp;
+
+ /*
+ * Need to register thread with fail check.
+ */
+ MUTEX_LOCK(env, mfp->mtx_write);
+ if (mfp->backup_in_progress) {
+		__db_errx(env, DB_STR_A("0712",
+		    "%s is already in a backup", "%s"), dbfile);
+		ret = EINVAL;
+		MUTEX_UNLOCK(env, mfp->mtx_write);
+ goto err;
+ }
+ mfp->backup_in_progress = 1;
+ env->dbenv->thread_id(env->dbenv, &mfp->pid, &mfp->tid);
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+#else
+ COMPQUIET(mpf, NULL);
+#endif
+err: if (path != NULL)
+ __os_free(env, path);
+ if (ret != 0) {
+ if (*fpp != NULL)
+ (void)__os_closehandle(env, *fpp);
+ if (backup != NULL && backup->close != NULL)
+ (void)backup->close(env->dbenv, dbfile, *handlep);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_backup_mpf --
+ * Copy a database file while maintaining synchronization with
+ * mpool write activity.
+ *
+ * PUBLIC: int __memp_backup_mpf __P((ENV *, DB_MPOOLFILE *, DB_THREAD_INFO *,
+ * PUBLIC: db_pgno_t, db_pgno_t, DB_FH *, void *, u_int32_t));
+ */
+int
+__memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags)
+ ENV *env;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ db_pgno_t first_pgno, last_pgno;
+ DB_FH *fp;
+ void *handle;
+ u_int32_t flags;
+{
+ DB_BACKUP *backup;
+ MPOOLFILE *mfp;
+ db_pgno_t high_pgno, pgno;
+ off_t t_off;
+ u_int32_t read_count, write_size;
+ u_int32_t gigs, off;
+ size_t len, nr, nw;
+ u_int8_t *buf;
+ int ret;
+
+ COMPQUIET(flags, 0);
+ backup = env->backup_handle;
+ read_count = 0;
+ buf = NULL;
+ mfp = mpf->mfp;
+ gigs = 0;
+ off = 0;
+
+ if (backup == NULL || (len = backup->size) == 0)
+ len = MEGABYTE;
+ if ((ret = __os_malloc(env, len, &buf)) != 0)
+ return (ret);
+ write_size = (u_int32_t)(len / mfp->pagesize);
+
+ if (first_pgno > 0) {
+ t_off = (off_t)first_pgno * mfp->pagesize;
+ gigs = (u_int32_t)(t_off / GIGABYTE);
+ off = (u_int32_t)(t_off - (off_t)gigs * GIGABYTE);
+ }
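+	/*
+	 * Worked example (illustrative numbers): with a 4096-byte pagesize
+	 * and first_pgno == 1310720, t_off is exactly 5GB, so gigs == 5 and
+	 * off == 0; first_pgno == 1310721 would give gigs == 5, off == 4096.
+	 */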
+
+ for (pgno = first_pgno; pgno <= last_pgno; pgno = high_pgno + 1) {
+ high_pgno = pgno + write_size - 1;
+ if (high_pgno > last_pgno)
+ high_pgno = last_pgno;
+ len = ((high_pgno - pgno) + 1) * mfp->pagesize;
+#ifndef HAVE_ATOMICFILEREAD
+ if (ip != NULL)
+ ip->dbth_state = THREAD_ACTIVE;
+ MUTEX_LOCK(env, mfp->mtx_write);
+
+ /* Eventually the writers will drain and block on the mutex. */
+ while (atomic_read(&mfp->writers) != 0) {
+ STAT_INC_VERB(env, mpool, backup_spins,
+ mfp->stat.st_backup_spins, __memp_fn(mpf), pgno);
+ __os_yield(env, 0, 1000);
+ }
+
+ mfp->low_pgno = pgno;
+ mfp->high_pgno = high_pgno;
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+ if (ip != NULL)
+ ip->dbth_state = THREAD_OUT;
+#endif
+
+ if ((ret = __os_io(env, DB_IO_READ, mpf->fhp, pgno,
+ mfp->pagesize, 0, (u_int32_t)len, buf, &nr)) != 0)
+ break;
+
+ if (nr == 0)
+ break;
+
+ if (backup != NULL && backup->write != NULL) {
+ if ((ret = backup->write(
+ env->dbenv, gigs, off, (u_int32_t)nr,
+ buf, handle)) != 0)
+ break;
+ } else {
+ if ((ret = __os_io(env, DB_IO_WRITE, fp, pgno,
+ mfp->pagesize, 0, (u_int32_t)nr, buf, &nw)) != 0)
+ break;
+ if (nr != nw) {
+ ret = EIO;
+ break;
+ }
+ }
+
+ off += (u_int32_t)nr;
+ if (off >= GIGABYTE) {
+ gigs++;
+ off -= GIGABYTE;
+ }
+
+ if (backup != NULL && backup->read_count != 0) {
+ if ((read_count += write_size) >= backup->read_count)
+ __os_yield(env, 0, backup->read_sleep);
+ }
+
+ /*
+ * There may be pages not written to the file yet. The
+ * next read will probably see the end of file.
+ */
+ if (nr != len)
+ high_pgno = pgno + (db_pgno_t)(nr / mfp->pagesize);
+ }
+ __os_free(env, buf);
+
+#ifndef HAVE_ATOMICFILEREAD
+ if (ip != NULL)
+ ip->dbth_state = THREAD_ACTIVE;
+ MUTEX_LOCK(env, mfp->mtx_write);
+ mfp->low_pgno = PGNO_INVALID;
+ mfp->high_pgno = PGNO_INVALID;
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+#else
+ COMPQUIET(ip, NULL);
+#endif
+
+ return (ret);
+}
+
+/*
+ * __memp_backup_close --
+ * Close backup file.
+ *
+ * PUBLIC: int __memp_backup_close __P((ENV *, DB_MPOOLFILE *,
+ * PUBLIC: const char *, DB_FH *, void *));
+ */
+int
+__memp_backup_close(env, mpf, dbfile, fp, handle)
+ ENV *env;
+ DB_MPOOLFILE *mpf;
+ const char *dbfile;
+ DB_FH *fp;
+ void *handle;
+{
+ DB_BACKUP *backup;
+#ifndef HAVE_ATOMICFILEREAD
+ MPOOLFILE *mfp;
+#endif
+ int ret, t_ret;
+
+ backup = env->backup_handle;
+ ret = t_ret = 0;
+
+#ifndef HAVE_ATOMICFILEREAD
+ mfp = mpf->mfp;
+ MUTEX_LOCK(env, mfp->mtx_write);
+ mfp->backup_in_progress = 0;
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+#else
+ COMPQUIET(mpf, NULL);
+#endif
+ if (fp != NULL)
+ ret = __os_closehandle(env, fp);
+ if (backup != NULL && backup->close != NULL)
+ t_ret = backup->close(env->dbenv, dbfile, handle);
+ return (ret == 0 ? t_ret : ret);
+}
+
+#ifndef HAVE_ATOMICFILEREAD
+/*
+ * __memp_check_backup --
+ * check for a dead thread backing up a mp file.
+ *	Check for a dead thread backing up an mpool file.
+static int
+__memp_check_backup(env, mfp, arg, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *arg;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ char buf[DB_THREADID_STRLEN];
+
+ COMPQUIET(arg, NULL);
+ COMPQUIET(countp, NULL);
+ COMPQUIET(flags, 0);
+
+ dbenv = env->dbenv;
+
+ if (mfp->backup_in_progress == 0 ||
+ dbenv->is_alive(dbenv, mfp->pid, mfp->tid, 0))
+ return (0);
+
+ __db_msg(env, DB_STR_A("3042", "Releasing backup of %s for %s.",
+ "%s %s"), (char *)R_ADDR(env->mp_handle->reginfo, mfp->path_off),
+ dbenv->thread_id_string(dbenv, mfp->pid, mfp->tid, buf));
+ mfp->backup_in_progress = 0;
+ return (0);
+}
+#endif
+
+/*
+ * __memp_failchk --
+ *	Remove in-process database backups.
+ * PUBLIC: int __memp_failchk __P((ENV *));
+ */
+int
+__memp_failchk(env)
+ ENV *env;
+{
+#ifdef HAVE_ATOMICFILEREAD
+ COMPQUIET(env, NULL);
+ return (0);
+#else
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ return (__memp_walk_files(env, mp, __memp_check_backup, NULL, NULL, 0));
+#endif
+}
diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c
new file mode 100644
index 00000000..1df8e206
--- /dev/null
+++ b/src/mp/mp_bh.c
@@ -0,0 +1,690 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h" /* Required for diagnostic code. */
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __memp_pgwrite
+ __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
+
+/*
+ * __memp_bhwrite --
+ * Write the page associated with a given buffer header.
+ *
+ * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
+ * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
+ */
+int
+__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOLFILE *mfp;
+ BH *bhp;
+ int open_extents;
+{
+ DB_MPOOLFILE *dbmfp;
+ DB_MPREG *mpreg;
+ ENV *env;
+ int opened, ret;
+
+ env = dbmp->env;
+ opened = 0;
+
+ /*
+ * If the file has been removed or is a closed temporary file, we're
+ * done -- the page-write function knows how to handle the fact that
+ * we don't have (or need!) any real file descriptor information.
+ */
+ if (mfp->deadfile)
+ return (__memp_pgwrite(env, NULL, hp, bhp));
+
+ /*
+ * Walk the process' DB_MPOOLFILE list and find a file descriptor for
+ * the file. We also check that the descriptor is open for writing.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
+ if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) {
+ ++dbmfp->ref;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ if (dbmfp != NULL) {
+ /*
+ * Temporary files may not have been created. We only handle
+ * temporary files in this path, because only the process that
+ * created a temporary file will ever flush buffers to it.
+ */
+ if (dbmfp->fhp == NULL) {
+ /* We may not be allowed to create backing files. */
+ if (mfp->no_backing_file) {
+ --dbmfp->ref;
+ return (EPERM);
+ }
+
+ MUTEX_LOCK(env, dbmp->mutex);
+ if (dbmfp->fhp == NULL) {
+ ret = __db_tmp_open(env,
+ F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ?
+ DB_OSO_DIRECT : 0, &dbmfp->fhp);
+ } else
+ ret = 0;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (ret != 0) {
+ __db_errx(env, DB_STR("3014",
+ "unable to create temporary backing file"));
+ --dbmfp->ref;
+ return (ret);
+ }
+ }
+
+ goto pgwrite;
+ }
+
+ /*
+ * There's no file handle for this file in our process.
+ *
+ * !!!
+ * It's the caller's choice if we're going to open extent files.
+ */
+ if (!open_extents && F_ISSET(mfp, MP_EXTENT))
+ return (EPERM);
+
+ /*
+ * !!!
+ * Don't try to attach to temporary files. There are two problems in
+ * trying to do that. First, if we have different privileges than the
+ * process that "owns" the temporary file, we might create the backing
+ * disk file such that the owning process couldn't read/write its own
+ * buffers, e.g., memp_trickle running as root creating a file owned
+ * as root, mode 600. Second, if the temporary file has already been
+ * created, we don't have any way of finding out what its real name is,
+ * and, even if we did, it was already unlinked (so that it won't be
+ * left if the process dies horribly). This decision causes a problem,
+ * however: if the temporary file consumes the entire buffer cache,
+ * and the owner doesn't flush the buffers to disk, we could end up
+ * with resource starvation, and the memp_trickle thread couldn't do
+ * anything about it. That's a pretty unlikely scenario, though.
+ *
+ * Note we should never get here when the temporary file in question
+ * has already been closed in another process, in which case it should
+ * be marked dead.
+ */
+ if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file)
+ return (EPERM);
+
+ /*
+ * It's not a page from a file we've opened. If the file requires
+ * application-specific input/output processing, see if this process
+ * has ever registered information as to how to write this type of
+ * file. If not, there's nothing we can do.
+ */
+ if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) {
+ MUTEX_LOCK(env, dbmp->mutex);
+ LIST_FOREACH(mpreg, &dbmp->dbregq, q)
+ if (mpreg->ftype == mfp->ftype)
+ break;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (mpreg == NULL)
+ return (EPERM);
+ }
+
+ /*
+ * Try and open the file, specifying the known underlying shared area.
+ *
+ * !!!
+ * There's no negative cache, so we may repeatedly try and open files
+ * that we have previously tried (and failed) to open.
+ */
+ if ((ret = __memp_fcreate(env, &dbmfp)) != 0)
+ return (ret);
+ /*
+ * The open will set MP_FLUSH and so we need to keep
+ * a checkpoint from closing this before we finish with it.
+ */
+ dbmfp->ref++;
+ opened = 1;
+ if ((ret = __memp_fopen(dbmfp, mfp, NULL,
+ NULL, DB_FLUSH | DB_DURABLE_UNKNOWN, 0, mfp->pagesize)) != 0) {
+ dbmfp->ref--;
+ (void)__memp_fclose(dbmfp, 0);
+
+ /*
+ * Ignore any error if the file is marked dead, assume the file
+ * was removed from under us.
+ */
+ if (!mfp->deadfile)
+ return (ret);
+
+ dbmfp = NULL;
+ }
+
+pgwrite:
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE | PROT_EXEC);
+ ret = __memp_pgwrite(env, dbmfp, hp, bhp);
+ if (dbmfp == NULL)
+ return (ret);
+
+ /*
+ * Discard our reference, and, if we're the last reference, make sure
+ * the file eventually gets closed.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ if (!opened && dbmfp->ref == 1) {
+ /*
+		 * If we are the last reference, then we need to mark this
+		 * handle as having been used to flush. If this dbmfp has
+		 * not been counted as a neutral reference, do that now.
+		 *
+		 * Getting the mfp mutex while holding the dbmp mutex is
+		 * ok; we never acquire them in the reverse order.
+ */
+ if (!F_ISSET(dbmfp, MP_FLUSH)) {
+ F_SET(dbmfp, MP_FLUSH);
+			MUTEX_LOCK(env, dbmfp->mfp->mutex);
+ if (!F_ISSET(dbmfp, MP_FOR_FLUSH)) {
+ mfp->neutral_cnt++;
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ }
+ MUTEX_UNLOCK(env, dbmfp->mfp->mutex);
+ }
+ } else
+ --dbmfp->ref;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (ret);
+}
+
+/*
+ * __memp_pgread --
+ * Read a page from a file.
+ *
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pgread(dbmfp, bhp, can_create)
+ DB_MPOOLFILE *dbmfp;
+ BH *bhp;
+ int can_create;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ size_t len, nr;
+ u_int32_t pagesize;
+ int ret;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+ pagesize = mfp->pagesize;
+
+ /* We should never be called with a dirty or unlocked buffer. */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_FROZEN));
+ DB_ASSERT(env, can_create ||
+ F_ISSET(bhp, BH_TRASH) || !F_ISSET(bhp, BH_DIRTY));
+ DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE));
+
+ /* Mark the buffer as in transition. */
+ F_SET(bhp, BH_TRASH);
+
+ /*
+ * Temporary files may not yet have been created. We don't create
+ * them now, we create them when the pages have to be flushed.
+ */
+ nr = 0;
+ if (dbmfp->fhp != NULL) {
+ PERFMON3(env, mpool, read, __memp_fn(dbmfp), bhp->pgno, bhp);
+ if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp,
+ bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0)
+ goto err;
+ }
+
+ /*
+ * The page may not exist; if it doesn't, nr may well be 0, but we
+ * expect the underlying OS calls not to return an error code in
+ * this case.
+ */
+ if (nr < pagesize) {
+ /*
+ * Don't output error messages for short reads. In particular,
+ * DB recovery processing may request pages never written to
+ * disk or for which only some part have been written to disk,
+		 * disk or for which only part has been written to disk,
+ * how to handle the error.
+ */
+ if (!can_create) {
+ ret = DB_PAGE_NOTFOUND;
+ goto err;
+ }
+
+ /* Clear any bytes that need to be cleared. */
+ len = mfp->clear_len == DB_CLEARLEN_NOTSET ?
+ pagesize : mfp->clear_len;
+ memset(bhp->buf, 0, len);
+
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ /*
+ * If we're running in diagnostic mode, corrupt any bytes on
+ * the page that are unknown quantities for the caller.
+ */
+ if (len < pagesize)
+ memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
+#endif
+ STAT_INC_VERB(env, mpool, page_create,
+ mfp->stat.st_page_create, __memp_fn(dbmfp), bhp->pgno);
+ } else
+ STAT_INC_VERB(env, mpool, page_in,
+ mfp->stat.st_page_in, __memp_fn(dbmfp), bhp->pgno);
+
+ /* Call any pgin function. */
+ ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
+
+ /*
+ * If no errors occurred, the data is now valid, clear the BH_TRASH
+ * flag.
+ */
+ if (ret == 0)
+ F_CLR(bhp, BH_TRASH);
+err: return (ret);
+}
+
+/*
+ * __memp_pgwrite --
+ * Write a page to a file.
+ */
+static int
+__memp_pgwrite(env, dbmfp, hp, bhp)
+ ENV *env;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ BH *bhp;
+{
+ DB_LSN lsn;
+ MPOOLFILE *mfp;
+ size_t nw;
+ int ret;
+	void *buf;
+
+ /*
+ * Since writing does not require exclusive access, another thread
+ * could have already written this buffer.
+ */
+ if (!F_ISSET(bhp, BH_DIRTY))
+ return (0);
+
+ mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
+ ret = 0;
+ buf = NULL;
+
+ /* We should never be called with a frozen or trashed buffer. */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
+
+ /*
+ * It's possible that the underlying file doesn't exist, either
+ * because of an outright removal or because it was a temporary
+ * file that's been closed.
+ *
+ * !!!
+ * Once we pass this point, we know that dbmfp and mfp aren't NULL,
+ * and that we have a valid file reference.
+ */
+ if (mfp == NULL || mfp->deadfile)
+ goto file_dead;
+
+ /*
+ * If the page is in a file for which we have LSN information, we have
+ * to ensure the appropriate log records are on disk.
+ */
+ if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET &&
+ !IS_CLIENT_PGRECOVER(env)) {
+ memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
+ if (!IS_NOT_LOGGED_LSN(lsn) &&
+ (ret = __log_flush(env, &lsn)) != 0)
+ goto err;
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * Verify write-ahead logging semantics.
+ *
+ * !!!
+ * Two special cases. There is a single field on the meta-data page,
+ * the last-page-number-in-the-file field, for which we do not log
+ * changes. If the page was originally created in a database that
+ * didn't have logging turned on, we can see a page marked dirty but
+ * for which no corresponding log record has been written. However,
+ * the only way that a page can be created for which there isn't a
+ * previous log record and valid LSN is when the page was created
+ * without logging turned on, and so we check for that special-case
+ * LSN value.
+ *
+ * Second, when a client is reading database pages from a master
+ * during an internal backup, we may get pages modified after
+ * the current end-of-log.
+ */
+ if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
+ !IS_CLIENT_PGRECOVER(env)) {
+ /*
+ * There is a potential race here. If we are in the midst of
+ * switching log files, it's possible we could test against the
+ * old file and the new offset in the log region's LSN. If we
+ * fail the first test, acquire the log mutex and check again.
+ */
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (!lp->db_log_inmemory &&
+ LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
+ MUTEX_LOCK(env, lp->mtx_flush);
+ DB_ASSERT(env, F_ISSET(env->dbenv, DB_ENV_NOLOCKING) ||
+ LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0);
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ }
+ }
+#endif
+
+#ifndef HAVE_ATOMICFILEREAD
+ if (mfp->backup_in_progress != 0) {
+ MUTEX_READLOCK(env, mfp->mtx_write);
+ if (bhp->pgno >= mfp->low_pgno && bhp->pgno <= mfp->high_pgno) {
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+ ret = EAGAIN;
+ goto err;
+ }
+ atomic_inc(env, &mfp->writers);
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+ } else
+ atomic_inc(env, &mfp->writers);
+#endif
+
+ /*
+	 * Call any pgout function. If we have the page exclusive, we are
+	 * going to reuse it; otherwise, make a copy of the page so that
+	 * others can continue looking at the page while we write it.
+ */
+ buf = bhp->buf;
+ if (mfp->ftype != 0) {
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_SET(bhp, BH_TRASH);
+ else {
+ if ((ret = __os_malloc(env, mfp->pagesize, &buf)) != 0)
+ goto err;
+ memcpy(buf, bhp->buf, mfp->pagesize);
+ }
+ if ((ret = __memp_pg(dbmfp, bhp->pgno, buf, 0)) != 0)
+ goto err;
+ }
+
+ PERFMON3(env, mpool, write, __memp_fn(dbmfp), bhp->pgno, bhp);
+ /* Write the page. */
+ if ((ret = __os_io(env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno,
+ mfp->pagesize, 0, mfp->pagesize, buf, &nw)) != 0) {
+#ifndef HAVE_ATOMICFILEREAD
+ atomic_dec(env, &mfp->writers);
+#endif
+ __db_errx(env, DB_STR_A("3015",
+ "%s: write failed for page %lu", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ goto err;
+ }
+#ifndef HAVE_ATOMICFILEREAD
+ atomic_dec(env, &mfp->writers);
+#endif
+ STAT_INC_VERB(env, mpool, page_out,
+ mfp->stat.st_page_out, __memp_fn(dbmfp), bhp->pgno);
+ if (bhp->pgno > mfp->last_flushed_pgno) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (bhp->pgno > mfp->last_flushed_pgno)
+ mfp->last_flushed_pgno = bhp->pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+err:
+file_dead:
+ if (buf != NULL && buf != bhp->buf)
+ __os_free(env, buf);
+ /*
+ * !!!
+	 * Once we pass this point, dbmfp and mfp may be NULL; we may not
+	 * have a valid file reference.
+ */
+
+ /*
+	 * Update the hash bucket statistics and reset the flags. If we
+	 * were successful, the page is no longer dirty. Someone else may
+	 * also have written the page, so we need to latch the hash bucket
+	 * here to get the accounting correct. Since we hold the buffer
+	 * shared, it cannot be marked dirty again until we release it.
+	 * This is the only place we update the flags field while holding
+	 * only a shared latch.
+ */
+ if (F_ISSET(bhp, BH_DIRTY | BH_TRASH)) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ if (ret == 0 && F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
+ atomic_dec(env, &hp->hash_page_dirty);
+ }
+
+		/* Convert the page back with pgin if necessary. */
+ if ((ret != 0 || BH_REFCOUNT(bhp) > 1) &&
+ F_ISSET(bhp, BH_TRASH)) {
+ ret = __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
+ F_CLR(bhp, BH_TRASH);
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+
+ return (ret);
+}
+
+/*
+ * __memp_pg --
+ * Call the pgin/pgout routine.
+ *
+ * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
+ */
+int
+__memp_pg(dbmfp, pgno, buf, is_pgin)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ void *buf;
+ int is_pgin;
+{
+ DBT dbt, *dbtp;
+ DB_MPOOL *dbmp;
+ DB_MPREG *mpreg;
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ftype, ret;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ if ((ftype = mfp->ftype) == DB_FTYPE_SET)
+ mpreg = dbmp->pg_inout;
+ else {
+ MUTEX_LOCK(env, dbmp->mutex);
+ LIST_FOREACH(mpreg, &dbmp->dbregq, q)
+ if (ftype == mpreg->ftype)
+ break;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ }
+ if (mpreg == NULL)
+ return (0);
+
+ if (mfp->pgcookie_len == 0)
+ dbtp = NULL;
+ else {
+ DB_SET_DBT(dbt, R_ADDR(
+ dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len);
+ dbtp = &dbt;
+ }
+
+ if (is_pgin) {
+ if (mpreg->pgin != NULL && (ret =
+ mpreg->pgin(env->dbenv, pgno, buf, dbtp)) != 0)
+ goto err;
+ } else
+ if (mpreg->pgout != NULL && (ret =
+ mpreg->pgout(env->dbenv, pgno, buf, dbtp)) != 0)
+ goto err;
+
+ return (0);
+
+err: __db_errx(env, DB_STR_A("3016",
+ "%s: %s failed for page %lu", "%s %s %lu"), __memp_fn(dbmfp),
+ is_pgin ? DB_STR_P("pgin") : DB_STR_P("pgout"), (u_long)pgno);
+ return (ret);
+}
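+
+/*
+ * A minimal sketch of how an application registers the pgin/pgout
+ * callbacks dispatched above (the ftype value 1 and the callback names
+ * are illustrative; DB_ENV->memp_register is the public entry point):
+ *
+ *	static int my_pgin(DB_ENV *dbenv,
+ *	    db_pgno_t pgno, void *buf, DBT *cookie);
+ *	static int my_pgout(DB_ENV *dbenv,
+ *	    db_pgno_t pgno, void *buf, DBT *cookie);
+ *
+ *	if ((ret = dbenv->memp_register(dbenv, 1, my_pgin, my_pgout)) != 0)
+ *		goto err;
+ */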
+
+/*
+ * __memp_bhfree --
+ * Free a bucket header and its referenced data.
+ *
+ * PUBLIC: int __memp_bhfree __P((DB_MPOOL *,
+ * PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
+ */
+int
+__memp_bhfree(dbmp, infop, mfp, hp, bhp, flags)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ MPOOLFILE *mfp;
+ DB_MPOOL_HASH *hp;
+ BH *bhp;
+ u_int32_t flags;
+{
+ ENV *env;
+#ifdef DIAGNOSTIC
+ DB_LSN vlsn;
+#endif
+ BH *prev_bhp;
+ MPOOL *c_mp;
+ int ret, t_ret;
+#ifdef DIAG_MVCC
+ size_t pagesize;
+#endif
+
+ ret = 0;
+
+ /*
+ * Assumes the hash bucket is locked and the MPOOL is not.
+ */
+ env = dbmp->env;
+#ifdef DIAG_MVCC
+ if (mfp != NULL)
+ pagesize = mfp->pagesize;
+#endif
+
+ DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
+ (hp != NULL && MUTEX_IS_OWNED(env, hp->mtx_hash)));
+ DB_ASSERT(env, BH_REFCOUNT(bhp) == 1 &&
+ !F_ISSET(bhp, BH_DIRTY | BH_FROZEN));
+ DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
+ SH_CHAIN_SINGLETON(bhp, vc) || (SH_CHAIN_HASNEXT(bhp, vc) &&
+ (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
+ bhp->td_off == INVALID_ROFF ||
+ IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
+ BH_OBSOLETE(bhp, hp->old_reader, vlsn))));
+
+ PERFMON3(env, mpool, evict, __memp_fns(dbmp, mfp), bhp->pgno, bhp);
+
+ /*
+ * Delete the buffer header from the hash bucket queue or the
+ * version chain.
+ */
+ if (hp == NULL)
+ goto no_hp;
+ prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh);
+ if (!SH_CHAIN_HASNEXT(bhp, vc)) {
+ if (prev_bhp != NULL)
+ SH_TAILQ_INSERT_AFTER(&hp->hash_bucket,
+ bhp, prev_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ }
+ SH_CHAIN_REMOVE(bhp, vc, __bh);
+
+ /*
+ * Remove the reference to this buffer from the transaction that
+ * created it, if any. When the BH_FREE_UNLOCKED flag is set, we're
+ * discarding the environment, so the transaction region is already
+ * gone.
+ */
+ if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) {
+ ret = __txn_remove_buffer(
+ env, BH_OWNER(env, bhp), hp->mtx_hash);
+ bhp->td_off = INVALID_ROFF;
+ }
+
+ /*
+ * We're going to use the memory for something else -- it had better be
+ * accessible.
+ */
+no_hp: if (mfp != NULL)
+ MVCC_MPROTECT(bhp->buf,
+ pagesize, PROT_READ | PROT_WRITE | PROT_EXEC);
+
+ /*
+ * Discard the hash bucket's mutex, it's no longer needed, and
+ * we don't want to be holding it when acquiring other locks.
+ */
+ if (!LF_ISSET(BH_FREE_UNLOCKED))
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /*
+ * If we're only removing this header from the chain for reuse, we're
+ * done.
+ */
+ if (LF_ISSET(BH_FREE_REUSE))
+ return (ret);
+
+ /*
+ * If we're not reusing the buffer immediately, free the buffer for
+ * real.
+ */
+ if (!LF_ISSET(BH_FREE_UNLOCKED))
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (LF_ISSET(BH_FREE_FREEMEM)) {
+ if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
+ return (ret);
+ MPOOL_REGION_LOCK(env, infop);
+
+ MVCC_BHUNALIGN(bhp);
+ __memp_free(infop, bhp);
+ c_mp = infop->primary;
+ c_mp->pages--;
+
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+
+ if (mfp == NULL)
+ return (ret);
+
+ /*
+ * Decrement the reference count of the underlying MPOOLFILE.
+ * If this is its last reference, remove it.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
+ if ((t_ret = __memp_mf_discard(dbmp, mfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ } else
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ return (ret);
+}
diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c
new file mode 100644
index 00000000..5f9a4bf9
--- /dev/null
+++ b/src/mp/mp_fget.c
@@ -0,0 +1,1230 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#ifdef DIAGNOSTIC
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#endif
+
+/*
+ * __memp_fget_pp --
+ * DB_MPOOLFILE->get pre/post processing.
+ *
+ * PUBLIC: int __memp_fget_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
+ */
+int
+__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+ DB_TXN *txnp;
+ u_int32_t flags;
+ void *addrp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_blocked, ret;
+
+ env = dbmfp->env;
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");
+
+ /*
+ * Validate arguments.
+ *
+ * !!!
+ * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
+ * files here, and create non-existent pages in readonly files if the
+ * flags are set, later. The reason is that the hash access method
+ * wants to get empty pages that don't really exist in readonly files.
+ * The only alternative is for hash to write the last "bucket" all the
+ * time, which we don't want to do because one of our big goals in life
+ * is to keep database files small. It's sleazy as hell, but we catch
+ * any attempt to actually write the file in memp_fput().
+ */
+#undef OKFLAGS
+#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
+ DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
+ if (flags != 0) {
+ if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ switch (FLD_CLR(flags, DB_MPOOL_DIRTY | DB_MPOOL_EDIT)) {
+ case DB_MPOOL_CREATE:
+ case DB_MPOOL_LAST:
+ case DB_MPOOL_NEW:
+ case 0:
+ break;
+ default:
+ return (__db_ferr(env, "memp_fget", 1));
+ }
+ }
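+	/*
+	 * For example (derived from the checks above): DB_MPOOL_CREATE |
+	 * DB_MPOOL_DIRTY is accepted, since after clearing the DIRTY/EDIT
+	 * bits a single base flag remains, while DB_MPOOL_NEW |
+	 * DB_MPOOL_LAST falls into the default case and returns an error.
+	 */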
+
+ ENV_ENTER(env, ip);
+
+ rep_blocked = 0;
+ if (txnp == NULL && IS_ENV_REPLICATED(env)) {
+ if ((ret = __op_rep_enter(env, 0, 1)) != 0)
+ goto err;
+ rep_blocked = 1;
+ }
+ ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp);
+ /*
+ * We only decrement the count in op_rep_exit if the operation fails.
+ * Otherwise the count will be decremented when the page is no longer
+ * pinned in memp_fput.
+ */
+ if (ret != 0 && rep_blocked)
+ (void)__op_rep_exit(env);
+
+ /* Similarly if an app has a page pinned it is ACTIVE. */
+err: if (ret != 0)
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __memp_fget --
+ * Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *,
+ * PUBLIC: db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
+ */
+int
+__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ void *addrp;
+{
+ enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+ BH *alloc_bhp, *bhp, *oldest_bhp;
+ ENV *env;
+ DB_LSN *read_lsnp, vlsn;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ PIN_LIST *list, *lp;
+ REGENV *renv;
+ REGINFO *infop, *t_infop, *reginfo;
+ TXN_DETAIL *td;
+ roff_t list_off, mf_offset;
+ u_int32_t bucket, pinmax, st_hsearch;
+ int b_incr, b_lock, h_locked, dirty, extending;
+ int makecopy, mvcc, need_free, ret;
+#ifdef DIAGNOSTIC
+ DB_LOCKTAB *lt;
+ DB_LOCKER *locker;
+#endif
+
+ *(void **)addrp = NULL;
+ COMPQUIET(c_mp, NULL);
+ COMPQUIET(infop, NULL);
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+
+ mfp = dbmfp->mfp;
+ mvcc = atomic_read(&mfp->multiversion) && (txn != NULL);
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ alloc_bhp = bhp = oldest_bhp = NULL;
+ read_lsnp = NULL;
+ td = NULL;
+ hp = NULL;
+ b_incr = b_lock = h_locked = extending = makecopy = ret = 0;
+
+ if (LF_ISSET(DB_MPOOL_DIRTY)) {
+ if (F_ISSET(dbmfp, MP_READONLY)) {
+ __db_errx(env, DB_STR_A("3021",
+ "%s: dirty flag set for readonly file page",
+ "%s"), __memp_fn(dbmfp));
+ return (EINVAL);
+ }
+ if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
+ flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
+ return (ret);
+ }
+
+ dirty = LF_ISSET(DB_MPOOL_DIRTY | DB_MPOOL_EDIT | DB_MPOOL_FREE);
+ LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);
+
+ /*
+ * If the transaction is being used to update a multiversion database
+ * for the first time, set the read LSN. In addition, if this is an
+ * update, allocate a mutex. If no transaction has been supplied, that
+ * will be caught later, when we know whether one is required.
+ */
+ if (mvcc && txn != NULL && txn->td != NULL) {
+ /* We're only interested in the ultimate parent transaction. */
+ while (txn->parent != NULL)
+ txn = txn->parent;
+ td = (TXN_DETAIL *)txn->td;
+ if (F_ISSET(txn, TXN_SNAPSHOT)) {
+ read_lsnp = &td->read_lsn;
+ if (IS_MAX_LSN(*read_lsnp) &&
+ (ret = __log_current_lsn_int(env, read_lsnp,
+ NULL, NULL)) != 0)
+ return (ret);
+ }
+ if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
+ td->mvcc_mtx == MUTEX_INVALID && (ret =
+ __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
+ return (ret);
+ }
+
+ switch (flags) {
+ case DB_MPOOL_LAST:
+ /* Get the last page number in the file. */
+ MUTEX_LOCK(env, mfp->mutex);
+ *pgnoaddr = mfp->last_pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ break;
+ case DB_MPOOL_NEW:
+ /*
+ * If always creating a page, skip the first search
+ * of the hash bucket.
+ */
+ goto newpg;
+ case DB_MPOOL_CREATE:
+ default:
+ break;
+ }
+
+ /*
+ * If mmap'ing the file and the page is not past the end of the file,
+ * just return a pointer. We can't use R_ADDR here: this is an offset
+ * into an mmap'd file, not a shared region, and doesn't change for
+ * private environments.
+ *
+ * The page may be past the end of the file, so check the page number
+ * argument against the original length of the file. If we previously
+ * returned pages past the original end of the file, last_pgno will
+ * have been updated to match the "new" end of the file, and checking
+ * against it would return pointers past the end of the mmap'd region.
+ *
+ * If another process has opened the file for writing since we mmap'd
+ * it, we will start playing the game by their rules, i.e. everything
+ * goes through the cache. All pages previously returned will be safe,
+ * as long as the correct locking protocol was observed.
+ *
+ * We don't discard the map because we don't know when all of the
+ * pages will have been discarded from the process' address space.
+ * It would be possible to do so by reference counting the open
+ * pages from the mmap, but it's unclear to me that it's worth it.
+ */
+ if (dbmfp->addr != NULL &&
+ F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+ *(void **)addrp = (u_int8_t *)dbmfp->addr +
+ (*pgnoaddr * mfp->pagesize);
+ STAT_INC_VERB(env,
+ mpool, map, mfp->stat.st_map, __memp_fn(dbmfp), *pgnoaddr);
+ return (0);
+ }
+
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them. These are reset on each pass through this
+ * code because the page number can change.
+ */
+ MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, bucket, ret);
+ if (ret != 0)
+ return (ret);
+ c_mp = infop->primary;
+
+ if (0) {
+ /* If we search again, get an exclusive lock. */
+retry: MUTEX_LOCK(env, hp->mtx_hash);
+ }
+
+ /* Search the hash chain for the page. */
+ st_hsearch = 0;
+ h_locked = 1;
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ ++st_hsearch;
+ if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
+ continue;
+
+ /* Snapshot reads -- get the version visible at read_lsn. */
+ if (read_lsnp != NULL) {
+ while (bhp != NULL &&
+ !BH_OWNED_BY(env, bhp, txn) &&
+ !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
+ bhp = SH_CHAIN_PREV(bhp, vc, __bh);
+
+ /*
+ * We can get a null bhp if we are looking for a
+ * page that was created after the transaction
+ * started, so it's not visible (i.e., a page added
+ * to the BTREE in a subsequent txn).
+ */
+ if (bhp == NULL) {
+ ret = DB_PAGE_NOTFOUND;
+ goto err;
+ }
+ }
+
+ makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);
+
+ /*
+ * Increment the reference count. This signals that the
+ * buffer may not be discarded. We must drop the hash
+ * mutex before we lock the buffer mutex.
+ */
+ if (BH_REFCOUNT(bhp) == UINT16_MAX) {
+ __db_errx(env, DB_STR_A("3022",
+ "%s: page %lu: reference count overflow",
+ "%s %lu"), __memp_fn(dbmfp), (u_long)bhp->pgno);
+ ret = __env_panic(env, EINVAL);
+ goto err;
+ }
+ atomic_inc(env, &bhp->ref);
+ b_incr = 1;
+
+ /*
+ * Lock the buffer. If the page is being read in or modified it
+ * will be exclusively locked and we will block.
+ */
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) {
+xlatch: if (LF_ISSET(DB_MPOOL_TRY)) {
+ if ((ret =
+ MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0)
+ goto err;
+ } else
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ F_SET(bhp, BH_EXCLUSIVE);
+ } else if (LF_ISSET(DB_MPOOL_TRY)) {
+ if ((ret = MUTEX_TRY_READLOCK(env, bhp->mtx_buf)) != 0)
+ goto err;
+ } else
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+
+#ifdef HAVE_SHARED_LATCHES
+ /*
+ * If buffer is still in transit once we have a shared latch,
+ * upgrade to an exclusive latch.
+ */
+ if (F_ISSET(bhp, BH_FREED | BH_TRASH) &&
+ !F_ISSET(bhp, BH_EXCLUSIVE)) {
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ goto xlatch;
+ }
+#else
+ F_SET(bhp, BH_EXCLUSIVE);
+#endif
+ b_lock = 1;
+
+ /*
+ * If the buffer was frozen before we waited for any I/O to
+ * complete and is still frozen, we will need to thaw it.
+ * Otherwise, it was thawed while we waited, and we need to
+ * search again.
+ */
+ if (F_ISSET(bhp, BH_THAWED)) {
+thawed: need_free = (atomic_dec(env, &bhp->ref) == 0);
+ b_incr = 0;
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ if (need_free) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+ bhp = NULL;
+ goto retry;
+ }
+
+ /*
+ * If the buffer we wanted was frozen or thawed while we
+ * waited, we need to start again. That is indicated by
+ * a new buffer header in the version chain owned by the same
+ * transaction as the one we pinned.
+ *
+ * Also, if we're doing an unversioned read on a multiversion
+ * file, another thread may have dirtied this buffer while we
+ * swapped from the hash bucket lock to the buffer lock.
+ */
+ if (SH_CHAIN_HASNEXT(bhp, vc) &&
+ (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
+ (!dirty && read_lsnp == NULL))) {
+ DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
+ atomic_dec(env, &bhp->ref);
+ b_incr = 0;
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ bhp = NULL;
+ goto retry;
+ } else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
+ ret = DB_LOCK_DEADLOCK;
+ goto err;
+ } else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE &&
+ flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) {
+ ret = DB_PAGE_NOTFOUND;
+ goto err;
+ }
+
+ /* Is it worthwhile to publish oh-so-frequent cache hits? */
+ STAT_INC_VERB(env, mpool, hit,
+ mfp->stat.st_cache_hit, __memp_fn(dbmfp), *pgnoaddr);
+ break;
+ }
+
+#ifdef HAVE_STATISTICS
+ /*
+ * Update the hash bucket search statistics -- do now because our next
+ * search may be for a different bucket. Are these too frequent also?
+ */
+ STAT_INC_VERB(env, mpool, hash_search,
+ c_mp->stat.st_hash_searches, __memp_fn(dbmfp), *pgnoaddr);
+ if (st_hsearch > c_mp->stat.st_hash_longest)
+ STAT_SET_VERB(env, mpool, hash_longest,
+ c_mp->stat.st_hash_longest,
+ st_hsearch, __memp_fn(dbmfp), *pgnoaddr);
+ STAT_ADJUST_VERB(env, mpool, hash_examined,
+ c_mp->stat.st_hash_examined,
+ st_hsearch, __memp_fn(dbmfp), *pgnoaddr);
+#endif
+
+ /*
+ * There are 4 possible paths to this location:
+ *
+ * FIRST_MISS:
+ * Didn't find the page in the hash bucket on our first pass:
+ * bhp == NULL, alloc_bhp == NULL
+ *
+ * FIRST_FOUND:
+ * Found the page in the hash bucket on our first pass:
+ * bhp != NULL, alloc_bhp == NULL
+ *
+ * SECOND_FOUND:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and found the page in the hash bucket on
+ * our second pass:
+ * bhp != NULL, alloc_bhp != NULL
+ *
+ * SECOND_MISS:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and didn't find the page in the hash bucket
+ * on our second pass:
+ * bhp == NULL, alloc_bhp != NULL
+ */
+ state = bhp == NULL ?
+ (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+ (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+
+ switch (state) {
+ case FIRST_FOUND:
+ /*
+ * If we are to free the buffer, then this had better be the
+ * only reference. If so, just free the buffer. If not,
+ * complain and get out.
+ */
+ if (flags == DB_MPOOL_FREE) {
+freebuf: MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ DB_ASSERT(env,
+ atomic_read(&hp->hash_page_dirty) > 0);
+ atomic_dec(env, &hp->hash_page_dirty);
+ }
+
+ /*
+ * If the buffer we found is already freed, we're done.
+ * If the ref count is not 1 then someone may be
+ * peeking at the buffer. We cannot free it until they
+ * determine that it is not what they want. Clear the
+ * buffer so that waiting threads get an empty page.
+ */
+ if (F_ISSET(bhp, BH_FREED))
+ goto done;
+ else if (BH_REFCOUNT(bhp) != 1 ||
+ !SH_CHAIN_SINGLETON(bhp, vc)) {
+ /*
+ * Create an empty page in the chain for
+ * subsequent gets. Otherwise, a thread that
+ * re-creates this page while it is still in
+ * cache will see stale data.
+ */
+ F_SET(bhp, BH_FREED);
+ F_CLR(bhp, BH_TRASH);
+ } else if (F_ISSET(bhp, BH_FROZEN)) {
+ /*
+ * Freeing a singleton frozen buffer: just free
+ * it. This call will release the hash bucket
+ * mutex.
+ */
+ ret =
+ __memp_bh_thaw(dbmp, infop, hp, bhp, NULL);
+ bhp = NULL;
+ b_incr = b_lock = h_locked = 0;
+ } else {
+ ret = __memp_bhfree(dbmp, infop, mfp,
+ hp, bhp, BH_FREE_FREEMEM);
+ bhp = NULL;
+ b_incr = b_lock = h_locked = 0;
+ }
+ goto done;
+ } else if (F_ISSET(bhp, BH_FREED | BH_TRASH)) {
+revive: if (F_ISSET(bhp, BH_FREED))
+ makecopy = makecopy ||
+ (mvcc && !BH_OWNED_BY(env, bhp, txn)) ||
+ F_ISSET(bhp, BH_FROZEN);
+ if (flags == DB_MPOOL_CREATE) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (*pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ /* We can race with a thread trying to free this. */
+ if (F_ISSET(bhp, BH_TRASH) &&
+ *pgnoaddr <= mfp->last_pgno)
+ break;
+
+ /* Otherwise this page does not currently exist. */
+ if (flags != DB_MPOOL_CREATE && flags != DB_MPOOL_NEW) {
+ ret = DB_PAGE_NOTFOUND;
+ goto done;
+ }
+ }
+ if (mvcc) {
+ /*
+ * With multiversion databases, we might need to
+ * allocate a new buffer into which we can copy the one
+ * that we found. In that case, check the last buffer
+ * in the chain to see whether we can reuse an obsolete
+ * buffer.
+ *
+ * To provide snapshot isolation, we need to make sure
+ * that we've seen a buffer older than the oldest
+ * snapshot read LSN.
+ */
+reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
+ !h_locked) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ }
+ if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
+ SH_CHAIN_HASPREV(bhp, vc)) {
+ oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
+ while (SH_CHAIN_HASPREV(oldest_bhp, vc))
+ oldest_bhp = SH_CHAIN_PREVP(
+ oldest_bhp, vc, __bh);
+
+ if (BH_REFCOUNT(oldest_bhp) == 0 &&
+ !BH_OBSOLETE(
+ oldest_bhp, hp->old_reader, vlsn) &&
+ (ret = __txn_oldest_reader(env,
+ &hp->old_reader)) != 0)
+ goto err;
+
+ if (BH_OBSOLETE(
+ oldest_bhp, hp->old_reader, vlsn) &&
+ BH_REFCOUNT(oldest_bhp) == 0) {
+ DB_ASSERT(env,
+ !F_ISSET(oldest_bhp, BH_DIRTY));
+ atomic_inc(env, &oldest_bhp->ref);
+ if (F_ISSET(oldest_bhp, BH_FROZEN)) {
+ /*
+ * This call will release the
+ * hash bucket mutex.
+ */
+ ret = __memp_bh_thaw(dbmp,
+ infop, hp, oldest_bhp,
+ NULL);
+ h_locked = 0;
+ if (ret != 0)
+ goto err;
+ goto reuse;
+ }
+ if ((ret = __memp_bhfree(dbmp,
+ infop, mfp, hp, oldest_bhp,
+ BH_FREE_REUSE)) != 0)
+ goto err;
+ alloc_bhp = oldest_bhp;
+ h_locked = 0;
+ }
+
+ DB_ASSERT(env, alloc_bhp == NULL ||
+ !F_ISSET(alloc_bhp, BH_FROZEN));
+ }
+ }
+
+ /* We found the buffer or we're ready to copy -- we're done. */
+ if (!(makecopy || F_ISSET(bhp, BH_FROZEN)) || alloc_bhp != NULL)
+ break;
+
+ /* FALLTHROUGH */
+ case FIRST_MISS:
+ /*
+ * We didn't find the buffer in our first check. Figure out
+ * if the page exists, and allocate structures so we can add
+ * the page to the buffer pool.
+ */
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /*
+ * The buffer is not in the pool, so we don't need to free it.
+ */
+ if (LF_ISSET(DB_MPOOL_FREE) &&
+ (bhp == NULL || F_ISSET(bhp, BH_FREED) || !makecopy))
+ goto done;
+
+ if (bhp != NULL)
+ goto alloc;
+
+newpg: /*
+ * If DB_MPOOL_NEW is set, we have to allocate a page number.
+ * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+ * it's an error to try to get a page past the end of file.
+ */
+ DB_ASSERT(env, !h_locked);
+ MUTEX_LOCK(env, mfp->mutex);
+ switch (flags) {
+ case DB_MPOOL_NEW:
+ extending = 1;
+ if (mfp->maxpgno != 0 &&
+ mfp->last_pgno >= mfp->maxpgno) {
+ __db_errx(env, DB_STR_A("3023",
+ "%s: file limited to %lu pages", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ ret = ENOSPC;
+ } else
+ *pgnoaddr = mfp->last_pgno + 1;
+ break;
+ case DB_MPOOL_CREATE:
+ if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
+ __db_errx(env, DB_STR_A("3024",
+ "%s: file limited to %lu pages", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ ret = ENOSPC;
+ } else if (!extending)
+ extending = *pgnoaddr > mfp->last_pgno;
+ break;
+ default:
+ ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+ break;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * !!!
+ * In the DB_MPOOL_NEW code path, hp, infop and c_mp have
+ * not yet been initialized.
+ */
+ if (hp == NULL) {
+ MP_GET_BUCKET(env,
+ mfp, *pgnoaddr, &infop, hp, bucket, ret);
+ if (ret != 0)
+ goto err;
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ c_mp = infop->primary;
+ }
+
+alloc: /* Allocate a new buffer header and data space. */
+ if (alloc_bhp == NULL && (ret =
+ __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+
+ /* Initialize enough so we can call __memp_bhfree. */
+ alloc_bhp->flags = 0;
+ atomic_init(&alloc_bhp->ref, 1);
+#ifdef DIAGNOSTIC
+ if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+ __db_errx(env, DB_STR("3025",
+ "DB_MPOOLFILE->get: buffer data is NOT size_t aligned"));
+ ret = __env_panic(env, EINVAL);
+ goto err;
+ }
+#endif
+
+ /*
+ * If we're doing copy-on-write, we will already have the
+ * buffer header. In that case, we don't need to search again.
+ */
+ if (bhp != NULL)
+ break;
+
+ /*
+ * If we are extending the file, we'll need the mfp lock
+ * again.
+ */
+ if (extending)
+ MUTEX_LOCK(env, mfp->mutex);
+
+ /*
+ * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control. (That guarantee is interesting
+ * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+ * did not specify the page number, and so may reasonably not
+ * have any way to lock the page outside of mpool.) Regardless,
+ * if we allocate the page, and some other thread of control
+ * requests the page by number, we will not detect that and the
+ * thread of control that allocated using DB_MPOOL_NEW may not
+ * have a chance to initialize the page. (Note: we *could*
+ * detect this case if we set a flag in the buffer header which
+ * guaranteed that no gets of the page would succeed until the
+ * reference count went to 0, that is, until the creating thread
+ * put the page.) What we do guarantee is that if two threads
+ * of control are both doing DB_MPOOL_NEW calls, they won't
+ * collide, that is, they won't both get the same page.
+ *
+ * There's a possibility that another thread allocated the page
+ * we were planning to allocate while we were off doing buffer
+ * allocation. We can detect that by making sure the page number
+ * we were going to use is still available. If it's not, then
+ * we check to see if the next available page number hashes to
+ * the same mpool region as the old one -- if it does, we can
+ * continue, otherwise, we have to start over.
+ */
+ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+ *pgnoaddr = mfp->last_pgno + 1;
+ MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
+ if (ret != 0)
+ goto err;
+ if (t_infop != infop) {
+ /*
+ * flags == DB_MPOOL_NEW, so extending is set
+ * and we're holding the mfp locked.
+ */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ hp = NULL;
+ goto newpg;
+ }
+ }
+
+ /*
+ * We released the mfp lock, so another thread might have
+ * extended the file. Update the last_pgno and initialize
+ * the file, as necessary, if we extended the file.
+ */
+ if (extending) {
+ if (*pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+ else
+ extending = 0;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if (ret != 0)
+ goto err;
+ }
+ goto retry;
+ case SECOND_FOUND:
+ /*
+ * We allocated buffer space for the requested page, but then
+ * found the page in the buffer cache on our second check.
+ * That's OK -- we can use the page we found in the pool,
+ * unless DB_MPOOL_NEW is set. If we're about to copy-on-write,
+ * this is exactly the situation we want.
+ *
+ * For multiversion files, we may have left some pages in cache
+ * beyond the end of a file after truncating. In that case, we
+ * would get to here with extending set. If so, we need to
+ * insert the new page in the version chain similar to when
+ * we copy on write.
+ */
+ if (F_ISSET(bhp, BH_FREED) &&
+ (flags == DB_MPOOL_NEW || flags == DB_MPOOL_CREATE))
+ goto revive;
+ else if (flags == DB_MPOOL_FREE)
+ goto freebuf;
+ else if (makecopy || F_ISSET(bhp, BH_FROZEN))
+ break;
+
+ /*
+ * We can't use the page we found in the pool if DB_MPOOL_NEW
+ * was set. (For details, see the above comment beginning
+ * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control".) If DB_MPOOL_NEW is set, we
+ * release our pin on this particular buffer, and try to get
+ * another one.
+ */
+ if (flags == DB_MPOOL_NEW) {
+ DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
+ atomic_dec(env, &bhp->ref);
+ b_incr = 0;
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ bhp = NULL;
+ hp = NULL;
+ goto newpg;
+ }
+
+ break;
+ case SECOND_MISS:
+ /*
+ * We allocated buffer space for the requested page, and found
+ * the page still missing on our second pass through the buffer
+ * cache. Instantiate the page.
+ */
+ DB_ASSERT(env, alloc_bhp != NULL);
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
+
+ /*
+ * Initialize all the BH and hash bucket fields so we can call
+ * __memp_bhfree if an error occurs.
+ *
+ * Append the buffer to the tail of the bucket list.
+ */
+ bhp->priority = MPOOL_LRU_REDZONE;
+ bhp->pgno = *pgnoaddr;
+ bhp->mf_offset = mf_offset;
+ bhp->bucket = bucket;
+ bhp->region = (int)(infop - dbmp->reginfo);
+ bhp->td_off = INVALID_ROFF;
+ SH_CHAIN_INIT(bhp, vc);
+ bhp->flags = 0;
+
+ /*
+ * Reference the buffer and lock exclusive. We either
+ * need to read the buffer or create it from scratch
+ * and don't want anyone looking at it till we do.
+ */
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ b_lock = 1;
+ F_SET(bhp, BH_EXCLUSIVE);
+ b_incr = 1;
+
+ /* We created a new page, it starts dirty. */
+ if (extending) {
+ atomic_inc(env, &hp->hash_page_dirty);
+ F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
+
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /*
+ * If we created the page, zero it out. If we didn't create
+ * the page, read from the backing file.
+ *
+ * !!!
+ * DB_MPOOL_NEW doesn't call the pgin function.
+ *
+ * If DB_MPOOL_CREATE is used, then the application's pgin
+ * function has to be able to handle pages of 0's -- if it
+ * uses DB_MPOOL_NEW, it can detect all of its page creates,
+ * and not bother.
+ *
+ * If we're running in diagnostic mode, smash any bytes on the
+ * page that are unknown quantities for the caller.
+ *
+ * Otherwise, read the page into memory, optionally creating it
+ * if DB_MPOOL_CREATE is set.
+ */
+ if (extending) {
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE);
+ memset(bhp->buf, 0,
+ (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
+ mfp->pagesize : mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ if (mfp->clear_len != DB_CLEARLEN_NOTSET)
+ memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+ mfp->pagesize - mfp->clear_len);
+#endif
+
+ if (flags == DB_MPOOL_CREATE && mfp->ftype != 0 &&
+ (ret = __memp_pg(dbmfp,
+ bhp->pgno, bhp->buf, 1)) != 0)
+ goto err;
+
+ STAT_INC_VERB(env, mpool, page_create,
+ mfp->stat.st_page_create,
+ __memp_fn(dbmfp), *pgnoaddr);
+ } else {
+ F_SET(bhp, BH_TRASH);
+ STAT_INC_VERB(env, mpool, miss, mfp->stat.st_cache_miss,
+ __memp_fn(dbmfp), *pgnoaddr);
+ }
+
+ makecopy = mvcc && dirty && !extending;
+
+ /* Increment buffer count referenced by MPOOLFILE. */
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+ DB_ASSERT(env, bhp != NULL && BH_REFCOUNT(bhp) != 0 && b_lock);
+ DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN) || !F_ISSET(bhp, BH_FREED) ||
+ makecopy);
+
+ /* We've got a buffer header we're re-instantiating. */
+ if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
+ if (alloc_bhp == NULL)
+ goto reuse;
+
+ /*
+ * To thaw the buffer, we must hold the hash bucket mutex,
+ * and the call to __memp_bh_thaw will release it.
+ */
+ if (h_locked == 0)
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+
+ /*
+ * If the empty buffer has been filled in the meantime, don't
+ * overwrite it.
+ */
+ if (F_ISSET(bhp, BH_THAWED)) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ goto thawed;
+ }
+
+ ret = __memp_bh_thaw(dbmp, infop, hp, bhp, alloc_bhp);
+ bhp = NULL;
+ b_lock = h_locked = 0;
+ if (ret != 0)
+ goto err;
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+ b_incr = b_lock = 1;
+ }
+
+ /*
+ * BH_TRASH --
+ * The buffer we found may need to be filled from the disk.
+ *
+ * It's possible for the read function to fail, which means we fail
+ * as well. Discard the buffer on failure unless another thread
+ * is waiting on our I/O to complete. It's OK to leave the buffer
+ * around, as the waiting thread will see the BH_TRASH flag set,
+ * and will also attempt to discard it. If there's a waiter,
+ * we need to decrement our reference count.
+ */
+ if (F_ISSET(bhp, BH_TRASH) &&
+ flags != DB_MPOOL_FREE && !F_ISSET(bhp, BH_FREED)) {
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE);
+ if ((ret = __memp_pgread(dbmfp,
+ bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
+ goto err;
+ DB_ASSERT(env, read_lsnp != NULL || !SH_CHAIN_HASNEXT(bhp, vc));
+ }
+
+ /* Copy-on-write. */
+ if (makecopy) {
+ /*
+ * If we read a page from disk that we want to modify, we now
+ * need to make a copy, so we need to allocate another buffer
+ * to hold the new copy.
+ */
+ if (alloc_bhp == NULL)
+ goto reuse;
+
+ DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp);
+ DB_ASSERT(env, bhp->td_off == INVALID_ROFF ||
+ !IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
+ (F_ISSET(bhp, BH_FREED) && F_ISSET(bhp, BH_FROZEN)));
+ DB_ASSERT(env, txn != NULL ||
+ (F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED)));
+ DB_ASSERT(env, (extending || flags == DB_MPOOL_FREE ||
+ F_ISSET(bhp, BH_FREED)) ||
+ !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+
+ if (BH_REFCOUNT(bhp) == 1)
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ);
+
+ atomic_init(&alloc_bhp->ref, 1);
+ MUTEX_LOCK(env, alloc_bhp->mtx_buf);
+ alloc_bhp->priority = bhp->priority;
+ alloc_bhp->pgno = bhp->pgno;
+ alloc_bhp->bucket = bhp->bucket;
+ alloc_bhp->region = bhp->region;
+ alloc_bhp->mf_offset = bhp->mf_offset;
+ alloc_bhp->td_off = INVALID_ROFF;
+ if (txn == NULL) {
+ DB_ASSERT(env,
+ F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED));
+ if (bhp->td_off != INVALID_ROFF && (ret =
+ __memp_bh_settxn(dbmp, mfp, alloc_bhp,
+ BH_OWNER(env, bhp))) != 0)
+ goto err;
+ } else if ((ret =
+ __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
+ goto err;
+ MVCC_MPROTECT(alloc_bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE);
+ if (extending ||
+ F_ISSET(bhp, BH_FREED) || flags == DB_MPOOL_FREE) {
+ memset(alloc_bhp->buf, 0,
+ (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
+ mfp->pagesize : mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ if (mfp->clear_len != DB_CLEARLEN_NOTSET)
+ memset(alloc_bhp->buf + mfp->clear_len,
+ CLEAR_BYTE,
+ mfp->pagesize - mfp->clear_len);
+#endif
+ if (mfp->ftype != 0 && (ret = __memp_pg(dbmfp,
+ alloc_bhp->pgno, alloc_bhp->buf, 1)) != 0)
+ goto err;
+ } else
+ memcpy(alloc_bhp->buf, bhp->buf, mfp->pagesize);
+ MVCC_MPROTECT(alloc_bhp->buf, mfp->pagesize, 0);
+
+ if (h_locked == 0)
+ MUTEX_LOCK(env, hp->mtx_hash);
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ h_locked = 1;
+
+ alloc_bhp->flags = BH_EXCLUSIVE |
+ ((flags == DB_MPOOL_FREE) ? BH_FREED :
+ F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE));
+ DB_ASSERT(env, flags != DB_MPOOL_FREE ||
+ !F_ISSET(bhp, BH_DIRTY));
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
+ bhp, alloc_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0);
+ if (atomic_dec(env, &bhp->ref) == 0) {
+ bhp->priority = c_mp->lru_priority;
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0);
+ }
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+
+ bhp = alloc_bhp;
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ b_incr = 1;
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+ b_lock = 1;
+
+ if (alloc_bhp != oldest_bhp) {
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+ alloc_bhp = NULL;
+ } else if (mvcc && extending &&
+ (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
+ goto err;
+
+ if (flags == DB_MPOOL_FREE) {
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ /* If we have created an empty buffer, it is not returned. */
+ if (!F_ISSET(bhp, BH_FREED))
+ goto freebuf;
+ goto done;
+ }
+
+ /*
+ * Free the allocated memory; we no longer need it.
+ */
+ if (alloc_bhp != NULL) {
+ if ((ret = __memp_bhfree(dbmp, infop, NULL,
+ NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0)
+ goto err;
+ alloc_bhp = NULL;
+ }
+
+ if (dirty || extending ||
+ (F_ISSET(bhp, BH_FREED) &&
+ (flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW))) {
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+ if (F_ISSET(bhp, BH_FREED)) {
+ DB_ASSERT(env, bhp->pgno <= mfp->last_pgno);
+ memset(bhp->buf, 0,
+ (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
+ mfp->pagesize : mfp->clear_len);
+ F_CLR(bhp, BH_FREED);
+ if (mfp->ftype != 0 && (ret =
+ __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1)) != 0)
+ goto err;
+ }
+ if (!F_ISSET(bhp, BH_DIRTY)) {
+#ifdef DIAGNOSTIC
+ MUTEX_LOCK(env, hp->mtx_hash);
+#endif
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ atomic_inc(env, &hp->hash_page_dirty);
+ F_SET(bhp, BH_DIRTY);
+#ifdef DIAGNOSTIC
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+ }
+ } else if (F_ISSET(bhp, BH_EXCLUSIVE)) {
+ F_CLR(bhp, BH_EXCLUSIVE);
+#ifdef HAVE_SHARED_LATCHES
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+ /*
+ * If another thread has dirtied the page while we
+ * switched locks, we have to go through it all again.
+ */
+ if (SH_CHAIN_HASNEXT(bhp, vc) && read_lsnp == NULL) {
+ atomic_dec(env, &bhp->ref);
+ b_incr = 0;
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ bhp = NULL;
+ goto retry;
+ }
+#endif
+ }
+
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, PROT_READ |
+ (dirty || extending || F_ISSET(bhp, BH_DIRTY) ?
+ PROT_WRITE : 0));
+
+#ifdef DIAGNOSTIC
+ MUTEX_LOCK(env, hp->mtx_hash);
+ {
+ BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);
+
+ DB_ASSERT(env, !atomic_read(&mfp->multiversion) || read_lsnp != NULL ||
+ next_bhp == NULL);
+ DB_ASSERT(env, !mvcc || read_lsnp == NULL ||
+ bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) ||
+ (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
+ (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
+ (next_bhp->td_off != INVALID_ROFF &&
+ (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
+ IS_ZERO_LSN(BH_OWNER(env, next_bhp)->last_lsn) ||
+ !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+
+ /*
+ * Record this pin for this thread. Holding the page pinned
+ * without recording the pin is ok since we do not recover from
+ * a death from within the library itself.
+ */
+ if (ip != NULL) {
+ reginfo = env->reginfo;
+ if (ip->dbth_pincount == ip->dbth_pinmax) {
+ pinmax = ip->dbth_pinmax;
+ renv = reginfo->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if ((ret = __env_alloc(reginfo,
+ 2 * pinmax * sizeof(PIN_LIST), &list)) != 0) {
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ goto err;
+ }
+
+ memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
+ pinmax * sizeof(PIN_LIST));
+ memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
+ list_off = R_OFFSET(reginfo, list);
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ ip->dbth_pinmax = 2 * pinmax;
+ ip->dbth_pinlist = list_off;
+ if (list != ip->dbth_pinarray)
+ __env_alloc_free(reginfo, list);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ }
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
+ if (lp->b_ref == INVALID_ROFF)
+ break;
+
+ ip->dbth_pincount++;
+ lp->b_ref = R_OFFSET(infop, bhp);
+ lp->region = (int)(infop - dbmp->reginfo);
+#ifdef DIAGNOSTIC
+ if (dirty && ip->dbth_locker != INVALID_ROFF &&
+ ip->dbth_check_off == 0) {
+ lt = env->lk_handle;
+ locker = (DB_LOCKER *)
+ (R_ADDR(&lt->reginfo, ip->dbth_locker));
+ DB_ASSERT(env, __db_has_pagelock(env, locker, dbmfp,
+ (PAGE*)bhp->buf, DB_LOCK_WRITE) == 0);
+ }
+#endif
+
+ }
+ /*
+ * During recovery we can read past the end of the file. Also
+ * last_pgno is not versioned, so if this is an older version
+ * that is ok as well.
+ */
+ DB_ASSERT(env, IS_RECOVERING(env) ||
+ bhp->pgno <= mfp->last_pgno || !SH_CHAIN_SINGLETON(bhp, vc));
+
+#ifdef DIAGNOSTIC
+ /* Update the file's pinned reference count. */
+ MPOOL_SYSTEM_LOCK(env);
+ ++dbmfp->pinref;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ /*
+ * We want to switch threads as often as possible, and at awkward
+ * times. Yield every time we get a new page to ensure contention.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ DB_ASSERT(env, alloc_bhp == NULL);
+ DB_ASSERT(env, !(dirty || extending) ||
+ atomic_read(&hp->hash_page_dirty) > 0);
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0 &&
+ !F_ISSET(bhp, BH_FREED | BH_FROZEN | BH_TRASH));
+
+ *(void **)addrp = bhp->buf;
+ return (0);
+
+done:
+err: /*
+ * We should only get to here with ret == 0 if freeing a buffer.
+ * In that case, check that it has in fact been freed.
+ */
+ DB_ASSERT(env, ret != 0 || flags != DB_MPOOL_FREE || bhp == NULL ||
+ (F_ISSET(bhp, BH_FREED) && !SH_CHAIN_HASNEXT(bhp, vc)));
+
+ if (bhp != NULL) {
+ if (b_incr)
+ atomic_dec(env, &bhp->ref);
+ if (b_lock) {
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ }
+ }
+
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /* If alloc_bhp is set, free the memory. */
+ if (alloc_bhp != NULL)
+ (void)__memp_bhfree(dbmp, infop, NULL,
+ NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED);
+
+ return (ret);
+}
diff --git a/src/mp/mp_fmethod.c b/src/mp/mp_fmethod.c
new file mode 100644
index 00000000..41bd638c
--- /dev/null
+++ b/src/mp/mp_fmethod.c
@@ -0,0 +1,589 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_get_clear_len __P((DB_MPOOLFILE *, u_int32_t *));
+static int __memp_get_lsn_offset __P((DB_MPOOLFILE *, int32_t *));
+static int __memp_get_maxsize __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *));
+static int __memp_set_maxsize __P((DB_MPOOLFILE *, u_int32_t, u_int32_t));
+static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY));
+static int __memp_get_last_pgno_pp __P((DB_MPOOLFILE *, db_pgno_t *));
+
+/*
+ * __memp_fcreate_pp --
+ * ENV->memp_fcreate pre/post processing.
+ *
+ * PUBLIC: int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
+ */
+int
+__memp_fcreate_pp(dbenv, retp, flags)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE **retp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ /* Validate arguments. */
+ if ((ret =
+ __db_fchk(env, "DB_ENV->memp_fcreate", flags, DB_VERIFY)) != 0)
+ return (ret);
+
+ /* We look the other way on mpool operations if we're verifying. */
+ if (REP_ON(env) && !LF_ISSET(DB_VERIFY)) {
+ __db_errx(env, DB_STR("3029",
+"DB_ENV->memp_fcreate: method not permitted when replication is configured"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = __memp_fcreate(env, retp);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fcreate --
+ * ENV->memp_fcreate.
+ *
+ * PUBLIC: int __memp_fcreate __P((ENV *, DB_MPOOLFILE **));
+ */
+int
+__memp_fcreate(env, retp)
+ ENV *env;
+ DB_MPOOLFILE **retp;
+{
+ DB_MPOOLFILE *dbmfp;
+ int ret;
+
+ /* Allocate and initialize the per-process structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
+ return (ret);
+
+ dbmfp->ref = 1;
+ dbmfp->lsn_offset = DB_LSN_OFF_NOTSET;
+ dbmfp->env = env;
+ dbmfp->mfp = INVALID_ROFF;
+
+ dbmfp->close = __memp_fclose_pp;
+ dbmfp->get = __memp_fget_pp;
+ dbmfp->get_clear_len = __memp_get_clear_len;
+ dbmfp->get_fileid = __memp_get_fileid;
+ dbmfp->get_flags = __memp_get_flags;
+ dbmfp->get_ftype = __memp_get_ftype;
+ dbmfp->get_last_pgno = __memp_get_last_pgno_pp;
+ dbmfp->get_lsn_offset = __memp_get_lsn_offset;
+ dbmfp->get_maxsize = __memp_get_maxsize;
+ dbmfp->get_pgcookie = __memp_get_pgcookie;
+ dbmfp->get_priority = __memp_get_priority;
+ dbmfp->open = __memp_fopen_pp;
+ dbmfp->put = __memp_fput_pp;
+ dbmfp->set_clear_len = __memp_set_clear_len;
+ dbmfp->set_fileid = __memp_set_fileid;
+ dbmfp->set_flags = __memp_set_flags;
+ dbmfp->set_ftype = __memp_set_ftype;
+ dbmfp->set_lsn_offset = __memp_set_lsn_offset;
+ dbmfp->set_maxsize = __memp_set_maxsize;
+ dbmfp->set_pgcookie = __memp_set_pgcookie;
+ dbmfp->set_priority = __memp_set_priority;
+ dbmfp->sync = __memp_fsync_pp;
+
+ *retp = dbmfp;
+ return (0);
+}
+
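+/*
+ * A minimal usage sketch (hypothetical application code, not part of this
+ * file; "dbenv" is an open DB_ENV and "ex.db" an illustrative path):
+ * create a handle, then open a file with a 64KB pagesize.
+ *
+ *    DB_MPOOLFILE *mpf;
+ *    int ret;
+ *
+ *    if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
+ *        return (ret);
+ *    if ((ret = mpf->open(mpf, "ex.db", DB_CREATE, 0600, 64 * 1024)) != 0)
+ *        return (ret);
+ */
+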
+/*
+ * __memp_get_clear_len --
+ * Get the clear length.
+ */
+static int
+__memp_get_clear_len(dbmfp, clear_lenp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *clear_lenp;
+{
+ *clear_lenp = dbmfp->clear_len;
+ return (0);
+}
+
+/*
+ * __memp_set_clear_len --
+ * DB_MPOOLFILE->set_clear_len.
+ *
+ * PUBLIC: int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
+ */
+int
+__memp_set_clear_len(dbmfp, clear_len)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t clear_len;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len");
+
+ dbmfp->clear_len = clear_len;
+ return (0);
+}
+
+/*
+ * __memp_get_fileid --
+ * DB_MPOOLFILE->get_fileid.
+ *
+ * PUBLIC: int __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+ */
+int
+__memp_get_fileid(dbmfp, fileid)
+ DB_MPOOLFILE *dbmfp;
+ u_int8_t *fileid;
+{
+ if (!F_ISSET(dbmfp, MP_FILEID_SET)) {
+ __db_errx(dbmfp->env, DB_STR("3030",
+ "get_fileid: file ID not set"));
+ return (EINVAL);
+ }
+
+ memcpy(fileid, dbmfp->fileid, DB_FILE_ID_LEN);
+ return (0);
+}
+
+/*
+ * __memp_set_fileid --
+ * DB_MPOOLFILE->set_fileid.
+ *
+ * PUBLIC: int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+ */
+int
+__memp_set_fileid(dbmfp, fileid)
+ DB_MPOOLFILE *dbmfp;
+ u_int8_t *fileid;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_fileid");
+
+ memcpy(dbmfp->fileid, fileid, DB_FILE_ID_LEN);
+ F_SET(dbmfp, MP_FILEID_SET);
+
+ return (0);
+}
+
+/*
+ * __memp_get_flags --
+ * Get the DB_MPOOLFILE flags.
+ *
+ * PUBLIC: int __memp_get_flags __P((DB_MPOOLFILE *, u_int32_t *));
+ */
+int
+__memp_get_flags(dbmfp, flagsp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *flagsp;
+{
+ MPOOLFILE *mfp;
+
+ mfp = dbmfp->mfp;
+
+ *flagsp = 0;
+
+ if (mfp == NULL)
+ *flagsp = FLD_ISSET(dbmfp->config_flags,
+ DB_MPOOL_NOFILE | DB_MPOOL_UNLINK);
+ else {
+ if (mfp->no_backing_file)
+ FLD_SET(*flagsp, DB_MPOOL_NOFILE);
+ if (mfp->unlink_on_close)
+ FLD_SET(*flagsp, DB_MPOOL_UNLINK);
+ }
+ return (0);
+}
+
+/*
+ * __memp_set_flags --
+ * Set the DB_MPOOLFILE flags.
+ *
+ * PUBLIC: int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int));
+ */
+int
+__memp_set_flags(dbmfp, flags, onoff)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+ int onoff;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ret;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+
+ switch (flags) {
+ case DB_MPOOL_NOFILE:
+ if (mfp == NULL)
+ if (onoff)
+ FLD_SET(dbmfp->config_flags, DB_MPOOL_NOFILE);
+ else
+ FLD_CLR(dbmfp->config_flags, DB_MPOOL_NOFILE);
+ else
+ mfp->no_backing_file = onoff;
+ break;
+ case DB_MPOOL_UNLINK:
+ if (mfp == NULL)
+ if (onoff)
+ FLD_SET(dbmfp->config_flags, DB_MPOOL_UNLINK);
+ else
+ FLD_CLR(dbmfp->config_flags, DB_MPOOL_UNLINK);
+ else
+ mfp->unlink_on_close = onoff;
+ break;
+ default:
+ if ((ret = __db_fchk(env, "DB_MPOOLFILE->set_flags",
+ flags, DB_MPOOL_NOFILE | DB_MPOOL_UNLINK)) != 0)
+ return (ret);
+ break;
+ }
+ return (0);
+}
+
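+/*
+ * A usage sketch (hypothetical application code): configure a named
+ * in-memory file before opening it, so no backing file is created.
+ *
+ *    if ((ret = mpf->set_flags(mpf, DB_MPOOL_NOFILE, 1)) != 0)
+ *        return (ret);
+ */
+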
+/*
+ * __memp_get_ftype --
+ * Get the file type (as registered).
+ *
+ * PUBLIC: int __memp_get_ftype __P((DB_MPOOLFILE *, int *));
+ */
+int
+__memp_get_ftype(dbmfp, ftypep)
+ DB_MPOOLFILE *dbmfp;
+ int *ftypep;
+{
+ *ftypep = dbmfp->ftype;
+ return (0);
+}
+
+/*
+ * __memp_set_ftype --
+ * DB_MPOOLFILE->set_ftype.
+ *
+ * PUBLIC: int __memp_set_ftype __P((DB_MPOOLFILE *, int));
+ */
+int
+__memp_set_ftype(dbmfp, ftype)
+ DB_MPOOLFILE *dbmfp;
+ int ftype;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_ftype");
+
+ dbmfp->ftype = ftype;
+ return (0);
+}
+
+/*
+ * __memp_get_lsn_offset --
+ * Get the page's LSN offset.
+ */
+static int
+__memp_get_lsn_offset(dbmfp, lsn_offsetp)
+ DB_MPOOLFILE *dbmfp;
+ int32_t *lsn_offsetp;
+{
+ *lsn_offsetp = dbmfp->lsn_offset;
+ return (0);
+}
+
+/*
+ * __memp_set_lsn_offset --
+ * Set the page's LSN offset.
+ *
+ * PUBLIC: int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
+ */
+int
+__memp_set_lsn_offset(dbmfp, lsn_offset)
+ DB_MPOOLFILE *dbmfp;
+ int32_t lsn_offset;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_lsn_offset");
+
+ dbmfp->lsn_offset = lsn_offset;
+ return (0);
+}
+
+/*
+ * __memp_get_maxsize --
+ * Get the file's maximum size.
+ */
+static int
+__memp_get_maxsize(dbmfp, gbytesp, bytesp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *gbytesp, *bytesp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ if ((mfp = dbmfp->mfp) == NULL) {
+ *gbytesp = dbmfp->gbytes;
+ *bytesp = dbmfp->bytes;
+ } else {
+ env = dbmfp->env;
+ ENV_ENTER(env, ip);
+
+ MUTEX_LOCK(env, mfp->mutex);
+ *gbytesp = (u_int32_t)
+ (mfp->maxpgno / (GIGABYTE / mfp->pagesize));
+ *bytesp = (u_int32_t)
+ ((mfp->maxpgno % (GIGABYTE / mfp->pagesize)) *
+ mfp->pagesize);
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ ENV_LEAVE(env, ip);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_maxsize --
+ * Set the file's maximum size.
+ */
+static int
+__memp_set_maxsize(dbmfp, gbytes, bytes)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t gbytes, bytes;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ if ((mfp = dbmfp->mfp) == NULL) {
+ dbmfp->gbytes = gbytes;
+ dbmfp->bytes = bytes;
+ } else {
+ env = dbmfp->env;
+ ENV_ENTER(env, ip);
+
+ MUTEX_LOCK(env, mfp->mutex);
+ mfp->maxpgno = (db_pgno_t)
+ (gbytes * (GIGABYTE / mfp->pagesize));
+ mfp->maxpgno += (db_pgno_t)
+ ((bytes + mfp->pagesize - 1) / mfp->pagesize);
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ ENV_LEAVE(env, ip);
+ }
+
+ return (0);
+}
+
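+/*
+ * A worked example of the conversion above, with illustrative numbers:
+ * for a 4096-byte pagesize, GIGABYTE / pagesize is 262144 pages per GB,
+ * so gbytes == 1 and bytes == 8192 yield
+ * maxpgno = 262144 + (8192 + 4095) / 4096 = 262146 pages; a trailing
+ * partial page rounds up to a whole page.
+ */
+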
+/*
+ * __memp_get_pgcookie --
+ * Get the pgin/pgout cookie.
+ *
+ * PUBLIC: int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *));
+ */
+int
+__memp_get_pgcookie(dbmfp, pgcookie)
+ DB_MPOOLFILE *dbmfp;
+ DBT *pgcookie;
+{
+ if (dbmfp->pgcookie == NULL) {
+ pgcookie->size = 0;
+ pgcookie->data = "";
+ } else
+ memcpy(pgcookie, dbmfp->pgcookie, sizeof(DBT));
+ return (0);
+}
+
+/*
+ * __memp_set_pgcookie --
+ * Set the pgin/pgout cookie.
+ *
+ * PUBLIC: int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
+ */
+int
+__memp_set_pgcookie(dbmfp, pgcookie)
+ DB_MPOOLFILE *dbmfp;
+ DBT *pgcookie;
+{
+ DBT *cookie;
+ ENV *env;
+ int ret;
+
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_pgcookie");
+ env = dbmfp->env;
+
+ if ((ret = __os_calloc(env, 1, sizeof(*cookie), &cookie)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env, pgcookie->size, &cookie->data)) != 0) {
+ __os_free(env, cookie);
+ return (ret);
+ }
+
+ memcpy(cookie->data, pgcookie->data, pgcookie->size);
+ cookie->size = pgcookie->size;
+
+ dbmfp->pgcookie = cookie;
+ return (0);
+}
+
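+/*
+ * A usage sketch (hypothetical application code; app_info and app_info_len
+ * are placeholders): the cookie is opaque to mpool and is simply handed
+ * back to the registered pgin/pgout functions.
+ *
+ *    DBT cookie;
+ *
+ *    memset(&cookie, 0, sizeof(cookie));
+ *    cookie.data = app_info;
+ *    cookie.size = app_info_len;
+ *    if ((ret = mpf->set_pgcookie(mpf, &cookie)) != 0)
+ *        return (ret);
+ */
+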
+/*
+ * __memp_get_priority --
+ * Get the cache priority for pages from this file.
+ *
+ * PUBLIC: int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
+ */
+int
+__memp_get_priority(dbmfp, priorityp)
+ DB_MPOOLFILE *dbmfp;
+ DB_CACHE_PRIORITY *priorityp;
+{
+ switch (dbmfp->priority) {
+ case MPOOL_PRI_VERY_LOW:
+ *priorityp = DB_PRIORITY_VERY_LOW;
+ break;
+ case MPOOL_PRI_LOW:
+ *priorityp = DB_PRIORITY_LOW;
+ break;
+ case MPOOL_PRI_DEFAULT:
+ *priorityp = DB_PRIORITY_DEFAULT;
+ break;
+ case MPOOL_PRI_HIGH:
+ *priorityp = DB_PRIORITY_HIGH;
+ break;
+ case MPOOL_PRI_VERY_HIGH:
+ *priorityp = DB_PRIORITY_VERY_HIGH;
+ break;
+ default:
+ __db_errx(dbmfp->env, DB_STR_A("3031",
+ "DB_MPOOLFILE->get_priority: unknown priority value: %d",
+ "%d"), dbmfp->priority);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_priority --
+ * Set the cache priority for pages from this file.
+ */
+static int
+__memp_set_priority(dbmfp, priority)
+ DB_MPOOLFILE *dbmfp;
+ DB_CACHE_PRIORITY priority;
+{
+ switch (priority) {
+ case DB_PRIORITY_VERY_LOW:
+ dbmfp->priority = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ dbmfp->priority = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ dbmfp->priority = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ dbmfp->priority = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ dbmfp->priority = MPOOL_PRI_VERY_HIGH;
+ break;
+ default:
+ __db_errx(dbmfp->env, DB_STR_A("3032",
+ "DB_MPOOLFILE->set_priority: unknown priority value: %d",
+ "%d"), priority);
+ return (EINVAL);
+ }
+
+ /* Update the underlying file if we've already opened it. */
+ if (dbmfp->mfp != NULL)
+ dbmfp->mfp->priority = dbmfp->priority;
+
+ return (0);
+}
+
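+/*
+ * A usage sketch (hypothetical application code): bias the cache toward
+ * keeping this file's pages. Works before or after open; once the file
+ * is open, the shared MPOOLFILE priority is updated as well.
+ *
+ *    if ((ret = mpf->set_priority(mpf, DB_PRIORITY_HIGH)) != 0)
+ *        return (ret);
+ */
+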
+/*
+ * __memp_get_last_pgno --
+ * Return the page number of the last page in the file.
+ *
+ * !!!
+ * The method is undocumented, but the handle is exported, and users
+ * occasionally ask for it.
+ *
+ * PUBLIC: int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+ */
+int
+__memp_get_last_pgno(dbmfp, pgnoaddr)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+
+ MUTEX_LOCK(env, mfp->mutex);
+ *pgnoaddr = mfp->last_pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ return (0);
+}
+
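+/*
+ * A usage sketch (hypothetical application code): read the current last
+ * page number; note it can change as soon as the mutex is dropped.
+ *
+ *    db_pgno_t last;
+ *
+ *    if ((ret = mpf->get_last_pgno(mpf, &last)) != 0)
+ *        return (ret);
+ */
+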
+/*
+ * __memp_get_last_pgno_pp --
+ *	Pre/post processing for __memp_get_last_pgno.
+ */
+static int
+__memp_get_last_pgno_pp(dbmfp, pgnoaddr)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ret = 0;
+ ENV_ENTER(dbmfp->env, ip);
+
+ ret = __memp_get_last_pgno(dbmfp, pgnoaddr);
+
+ ENV_LEAVE(dbmfp->env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fn --
+ * On errors we print whatever is available as the file name.
+ *
+ * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *));
+ */
+char *
+__memp_fn(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ return (__memp_fns(dbmfp->env->mp_handle, dbmfp->mfp));
+}
+
+/*
+ * __memp_fns --
+ * On errors we print whatever is available as the file name.
+ *
+ * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *));
+ */
+char *
+__memp_fns(dbmp, mfp)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+{
+ if (mfp == NULL || mfp->path_off == 0)
+ return ((char *)"unknown");
+
+ return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off));
+}
diff --git a/src/mp/mp_fopen.c b/src/mp/mp_fopen.c
new file mode 100644
index 00000000..ef7f886a
--- /dev/null
+++ b/src/mp/mp_fopen.c
@@ -0,0 +1,1220 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+static int __memp_mpf_alloc __P((DB_MPOOL *,
+ DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
+static int __memp_mpf_find __P((ENV *,
+ DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
+
+/*
+ * __memp_fopen_pp --
+ * DB_MPOOLFILE->open pre/post processing.
+ *
+ * PUBLIC: int __memp_fopen_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
+ */
+int
+__memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
+ DB_MPOOLFILE *dbmfp;
+ const char *path;
+ u_int32_t flags;
+ int mode;
+ size_t pagesize;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbmfp->env;
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags,
+ DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION |
+ DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
+ return (ret);
+
+ /*
+ * Require a power-of-two pagesize no smaller than the clear length.
+ * A zero page size is only allowed when opening an existing, in-memory
+ * db.
+ */
+ if (!POWER_OF_TWO(pagesize) ||
+ (pagesize == 0 && (LF_ISSET(DB_CREATE) ||
+ !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)))) {
+ __db_errx(env, DB_STR("3033",
+ "DB_MPOOLFILE->open: page sizes must be a power-of-2"));
+ return (EINVAL);
+ }
+ if (pagesize != 0 && dbmfp->clear_len > pagesize) {
+ __db_errx(env, DB_STR("3034",
+ "DB_MPOOLFILE->open: clear length larger than page size"));
+ return (EINVAL);
+ }
+
+ /* Read-only checks, and local flag. */
+ if (LF_ISSET(DB_RDONLY) && path == NULL) {
+ __db_errx(env, DB_STR("3035",
+ "DB_MPOOLFILE->open: temporary files can't be readonly"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) {
+ __db_errx(env, DB_STR("3036",
+ "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__memp_fopen(dbmfp, NULL,
+ path, NULL, flags, mode, pagesize)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Generate the number of user opens. If there is no backing file
+ * there is an extra open count to keep the in-memory db around.
+ */
+#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \
+ (u_int32_t)(mfp)->no_backing_file))
+/*
+ * __memp_fopen --
+ * DB_MPOOLFILE->open.
+ *
+ * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *,
+ * PUBLIC: const char *, const char **, u_int32_t, int, size_t));
+ */
+int
+__memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ const char *path;
+ const char **dirp;
+ u_int32_t flags;
+ int mode;
+ size_t pgsize;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *tmp_dbmfp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *mp;
+ MPOOLFILE *alloc_mfp;
+ size_t maxmap;
+ db_pgno_t last_pgno;
+ u_int32_t bucket, mbytes, bytes, oflags, pagesize;
+ int refinc, ret, isdir;
+ char *rpath;
+
+ /* If this handle is already open, return. */
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED))
+ return (0);
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ dbenv = env->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ alloc_mfp = NULL;
+ mbytes = bytes = 0;
+ refinc = ret = isdir = 0;
+ rpath = NULL;
+
+ /*
+ * We're keeping the page size as a size_t in the public API, but
+ * it's a u_int32_t everywhere internally.
+ */
+ pagesize = (u_int32_t)pgsize;
+
+ /*
+ * We're called internally with a specified mfp, in which case the
+ * path is NULL, but we'll get the path from the underlying region
+ * information. Otherwise, if the path is NULL, it's a temporary
+ * file -- we know we can't join any existing files, and we'll delay
+ * the open until we actually need to write the file. All temporary
+ * files will go into the first hash bucket.
+ */
+ DB_ASSERT(env, mfp == NULL || path == NULL);
+
+ bucket = 0;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ if (mfp == NULL) {
+ if (path == NULL)
+ goto alloc;
+
+ /*
+ * If fileid is not set but the file exists on the disk,
+ * we try to use __os_fileid to set it. We do this
+ * because we want to use the fileid to check if we have
+ * opened the mpoolfile as early as possible.
+ *
+ * Note: DB layer always calls __memp_fopen with fileid set,
+ * so this is only for using mpool api to open a file.
+ */
+
+ if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
+ !F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, path, dirp, &rpath)) != 0)
+ goto err;
+ ret = __os_exists(env, rpath, &isdir);
+ if (ret == 0 && isdir) {
+ ret = EINVAL;
+ goto err;
+ } else if (ret == 0) {
+ if ((ret = __os_fileid(env,
+ rpath, 0, dbmfp->fileid)) != 0)
+ goto err;
+ F_SET(dbmfp, MP_FILEID_SET);
+ }
+ }
+
+ /*
+ * Hash to the proper file table entry and walk it.
+ *
+ * The fileID is a filesystem unique number (e.g., a
+ * UNIX dev/inode pair) plus a timestamp. If files are
+ * removed and created in less than a second, the fileID
+ * can be repeated. The problem with repetition happens
+ * when the file that previously had the fileID value still
+ * has pages in the pool, since we don't want to use them
+ * to satisfy requests for the new file. Because the
+ * DB_TRUNCATE flag reuses the dev/inode pair, repeated
+ * opens with that flag set guarantees matching fileIDs
+ * when the machine can open a file and then re-open
+ * with truncate within a second. For this reason, we
+ * pass that flag down, and, if we find a matching entry,
+ * we ensure that it's never found again, and we create
+ * a new entry for the current request.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) ||
+ F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ bucket = FNBUCKET(path, strlen(path));
+ else
+ bucket = FNBUCKET(dbmfp->fileid,
+ DB_FILE_ID_LEN);
+
+ hp += bucket;
+ /*
+ * If we find the MPOOLFILE, increment its ref count
+ * so that it cannot go away while we open it.
+ */
+ MUTEX_LOCK(env, hp->mtx_hash);
+ ret =
+ __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp);
+ if (ret == 0 && mfp != NULL) {
+ refinc = 1;
+
+ if (LF_ISSET(DB_MULTIVERSION)) {
+ if (MFP_OPEN_CNT(mfp) > (u_int32_t)
+ (LF_ISSET(DB_RDONLY) ? 0 : 1) &&
+ atomic_read(
+ &mfp->multiversion) == 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ goto mvcc_err;
+ }
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret != 0)
+ goto err;
+ }
+ } else {
+ /*
+ * Deadfile can only be set if mpf_cnt goes to zero (or if we
+ * failed creating a file marked DB_AM_DISCARD). Increment the ref
+ * count so the file cannot become dead and be unlinked.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->deadfile) {
+ if (LF_ISSET(DB_MULTIVERSION)) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if (MFP_OPEN_CNT(mfp) > 0 &&
+ atomic_read(&mfp->multiversion) == 0) {
+mvcc_err: __db_errx(env, DB_STR("3041",
+"DB_MULTIVERSION cannot be specified on a database file which is already open"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+ /*
+ * Increment the reference count. We also track
+ * those references that don't affect the ability
+ * to convert the handle to either NOT_DURABLE or
+ * MVCC. These are readonly opens or threads that
+ * are using the handle just to flush a buffer.
+ */
+ ++mfp->mpf_cnt;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY))
+ ++mfp->neutral_cnt;
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ refinc = 1;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /*
+ * Test one last time to see if the file is dead -- it may have
+ * been removed. This happens when a checkpoint trying to open
+ * the file to flush a buffer races with the Db::remove method.
+ * The error will be ignored, so don't output an error message.
+ */
+ if (mfp->deadfile) {
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ if (LF_ISSET(DB_RDONLY))
+ F_SET(dbmfp, MP_READONLY);
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FLUSH);
+ /*
+ * Share the underlying file descriptor if that's possible.
+ */
+ if (mfp != NULL && !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q)
+ if (mfp == tmp_dbmfp->mfp &&
+ (F_ISSET(dbmfp, MP_READONLY) ||
+ !F_ISSET(tmp_dbmfp, MP_READONLY))) {
+ ++tmp_dbmfp->fhp->ref;
+ dbmfp->fhp = tmp_dbmfp->fhp;
+ dbmfp->addr = tmp_dbmfp->addr;
+ dbmfp->len = tmp_dbmfp->len;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (dbmfp->fhp != NULL)
+ goto have_mfp;
+ }
+
+ /*
+ * If there's no backing file, we can join existing files in the cache,
+ * but there's nothing to read from disk.
+ */
+ if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ /* Convert MP open flags to DB OS-layer open flags. */
+ oflags = 0;
+ if (LF_ISSET(DB_CREATE))
+ oflags |= DB_OSO_CREATE;
+ if (LF_ISSET(DB_DIRECT))
+ oflags |= DB_OSO_DIRECT;
+ if (LF_ISSET(DB_RDONLY))
+ oflags |= DB_OSO_RDONLY;
+
+ /*
+ * XXX
+ * A grievous layering violation, the DB_DSYNC_DB flag
+ * was left in the ENV structure and not driven through
+ * the cache API. This needs to be fixed when the general
+ * API configuration is fixed.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB))
+ oflags |= DB_OSO_DSYNC;
+
+ /*
+ * Get the real name for this file and open it.
+ *
+ * Supply a page size so os_open can decide whether to
+ * turn buffering off if the DB_DIRECT_DB flag is set.
+ *
+ * Acquire the region lock if we're using a path from
+ * an underlying MPOOLFILE -- there's a race in accessing
+ * the path name stored in the region, __memp_nameop may
+ * be simultaneously renaming the file.
+ */
+
+ ret = 0;
+ if (mfp != NULL) {
+ MPOOL_SYSTEM_LOCK(env);
+ path = R_ADDR(dbmp->reginfo, mfp->path_off);
+ if (rpath != NULL) {
+ __os_free(env, rpath);
+ rpath = NULL;
+ }
+ }
+ if (rpath == NULL)
+ ret = __db_appname(env,
+ DB_APP_DATA, path, dirp, &rpath);
+ if (ret == 0)
+ ret = __os_open(env, rpath,
+ (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
+ if (mfp != NULL)
+ MPOOL_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Cache file handles are shared, and have mutexes to
+ * protect the underlying file handle across seek and
+ * read/write calls.
+ */
+ dbmfp->fhp->ref = 1;
+ if ((ret = __mutex_alloc(env, MTX_MPOOL_FH,
+ DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0)
+ goto err;
+
+ /* Figure out the file's size. */
+ if ((ret = __os_ioinfo(
+ env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
+ __db_err(env, ret, "%s", rpath);
+ goto err;
+ }
+
+ /*
+ * Don't permit files that aren't a multiple of the pagesize,
+ * and find the number of the last page in the file, all the
+ * time being careful not to overflow 32 bits.
+ *
+ * During verify or recovery, we might have to cope with a
+ * truncated file; if the file size is not a multiple of the
+ * page size, round down to a page boundary; we'll take care
+ * of the partial page outside the mpool system.
+ *
+ * Pagesize of 0 is only allowed for in-mem dbs.
+ */
+ DB_ASSERT(env, pagesize != 0);
+ if (bytes % pagesize != 0) {
+ if (LF_ISSET(DB_ODDFILESIZE))
+ bytes -= (u_int32_t)(bytes % pagesize);
+ else {
+ __db_errx(env, DB_STR_A("3037",
+ "%s: file size not a multiple of the pagesize", "%s"),
+ rpath);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /*
+ * Get the file id if we weren't given one. Generated file
+ * ids don't use timestamps, otherwise there'd be no chance of
+ * any other process joining the party. Don't bother looking
+ * for this id in the hash table, it's new.
+ */
+ if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if ((ret =
+ __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
+ goto err;
+ F_SET(dbmfp, MP_FILEID_SET);
+ bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
+ hp += bucket;
+ goto alloc;
+ }
+ }
+
+ if (mfp != NULL)
+ goto have_mfp;
+
+ /*
+ * We can race with another process opening the same file when
+ * we allocate the mpoolfile structure. We will come back
+ * here and check the hash table again to see if it has appeared.
+ * For most files this is not a problem, since the name is locked
+ * at a higher layer, but QUEUE extent files are not locked.
+ */
+check: MUTEX_LOCK(env, hp->mtx_hash);
+ if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp)) != 0)
+ goto err;
+
+ if (alloc_mfp != NULL && mfp == NULL) {
+ mfp = alloc_mfp;
+ alloc_mfp = NULL;
+ SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
+ } else if (mfp != NULL) {
+ refinc = 1;
+ /*
+ * Some things about a file cannot be changed: the clear length,
+ * page size, or LSN location. However, if this is an attempt
+ * to open a named in-memory file, we may not yet have that
+ * information, so accept uninitialized entries.
+ *
+ * The file type can change if the application's pre- and post-
+ * processing needs change. For example, an application may add
+ * a hash subdatabase to a database that previously contained
+ * only btree subdatabases.
+ *
+ * !!!
+ * We do not check to see if the pgcookie information changed,
+ * nor update it if it has.
+ */
+ if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET &&
+ mfp->clear_len != DB_CLEARLEN_NOTSET &&
+ dbmfp->clear_len != mfp->clear_len) ||
+ (pagesize != 0 && pagesize != mfp->pagesize) ||
+ (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
+ mfp->lsn_off != DB_LSN_OFF_NOTSET &&
+ dbmfp->lsn_offset != mfp->lsn_off)) {
+ __db_errx(env, DB_STR_A("3038",
+ "%s: clear length, page size or LSN location changed",
+ "%s"), path);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ if (mfp != NULL && LF_ISSET(DB_MULTIVERSION)) {
+ if (MFP_OPEN_CNT(mfp) > 1 &&
+ atomic_read(&mfp->multiversion) == 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ goto mvcc_err;
+ }
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (alloc_mfp != NULL) {
+ MUTEX_LOCK(env, alloc_mfp->mutex);
+ if ((ret = __memp_mf_discard(dbmp, alloc_mfp, 0)) != 0)
+ goto err;
+ }
+
+ if (mfp == NULL) {
+ /*
+ * If we didn't find the file and this is an in-memory file,
+ * then the create flag should be set.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
+ !LF_ISSET(DB_CREATE)) {
+ ret = ENOENT;
+ goto err;
+ }
+
+alloc: if ((ret = __memp_mpf_alloc(dbmp,
+ dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
+ goto err;
+
+ /*
+ * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
+ * page get, we have to increment the last page in the file.
+ * Figure it out and save it away.
+ *
+ * Note correction: page numbers are zero-based, not 1-based.
+ */
+ DB_ASSERT(env, pagesize != 0);
+ last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
+ last_pgno += (db_pgno_t)(bytes / pagesize);
+ if (last_pgno != 0)
+ --last_pgno;
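+ /*
+ * For example (illustrative numbers only): a file of 3MB plus 8KB
+ * with a 4KB pagesize has mbytes == 3 and bytes == 8192, giving
+ * 3 * (MEGABYTE / 4096) + 8192 / 4096 == 770 pages, so last_pgno
+ * ends up as 769.
+ */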
+
+ alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno =
+ alloc_mfp->last_pgno = last_pgno;
+
+ alloc_mfp->bucket = bucket;
+
+ /* Go back and see if someone else has opened the file. */
+ if (path != NULL)
+ goto check;
+
+ mfp = alloc_mfp;
+
+ if (LF_ISSET(DB_MULTIVERSION)) {
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+
+ /* This is a temp; no one else can see it, so put it at the end. */
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+have_mfp:
+ /*
+ * We need to verify that all handles open a file either durable or not
+ * durable. This needs to be cross process and cross sub-databases, so
+ * mpool is the place to do it.
+ */
+ if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) {
+ if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) {
+ if (LF_ISSET(DB_TXN_NOT_DURABLE))
+ F_SET(mfp, MP_NOT_DURABLE);
+ F_CLR(mfp, MP_DURABLE_UNKNOWN);
+ } else if (!LF_ISSET(DB_TXN_NOT_DURABLE) !=
+ !F_ISSET(mfp, MP_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("3039",
+ "Cannot open DURABLE and NOT DURABLE handles in the same file"));
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /*
+ * All paths to here have initialized the mfp variable to reference
+ * the selected (or allocated) MPOOLFILE.
+ */
+ dbmfp->mfp = mfp;
+
+ /*
+ * Check to see if we can mmap the file. If a file:
+ * + isn't temporary
+ * + is read-only
+ * + doesn't require any pgin/pgout support
+ * + the DB_NOMMAP flag wasn't set (in either the file open or
+ * the environment in which it was opened)
+ * + and is less than mp_mmapsize bytes in size
+ *
+ * we can mmap it instead of reading/writing buffers. Don't do error
+ * checking based on the mmap call failure. We want to do normal I/O
+ * on the file if the reason we failed was because the file was on an
+ * NFS mounted partition, and we can fail in buffer I/O just as easily
+ * as here.
+ *
+ * We'd like to test to see if the file is too big to mmap. Since we
+ * don't know what size or type off_t and size_t are, what the largest
+ * unsigned integral type is, or what random insanity the local C
+ * compiler will perpetrate, doing the comparison in a portable way is
+ * flatly impossible. Hope that mmap fails if the file is too large.
+ */
+#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */
+ if (F_ISSET(mfp, MP_CAN_MMAP) && dbmfp->addr == NULL) {
+ maxmap = dbenv->mp_mmapsize == 0 ?
+ DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
+ if (path == NULL ||
+ FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ F_CLR(mfp, MP_CAN_MMAP);
+ else if (!F_ISSET(dbmfp, MP_READONLY))
+ F_CLR(mfp, MP_CAN_MMAP);
+ else if (dbmfp->ftype != 0)
+ F_CLR(mfp, MP_CAN_MMAP);
+ else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
+ F_CLR(mfp, MP_CAN_MMAP);
+ else {
+ MPOOL_SYSTEM_LOCK(env);
+ maxmap = mp->mp_mmapsize == 0 ?
+ DB_MAXMMAPSIZE : mp->mp_mmapsize;
+ MPOOL_SYSTEM_UNLOCK(env);
+ if (mbytes > maxmap / MEGABYTE ||
+ (mbytes == maxmap / MEGABYTE &&
+ bytes >= maxmap % MEGABYTE))
+ F_CLR(mfp, MP_CAN_MMAP);
+ }
+
+ dbmfp->addr = NULL;
+ if (F_ISSET(mfp, MP_CAN_MMAP)) {
+ dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
+ if (__os_mapfile(env, rpath,
+ dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
+ dbmfp->addr = NULL;
+ F_CLR(mfp, MP_CAN_MMAP);
+ }
+ }
+ }
+
+ F_SET(dbmfp, MP_OPEN_CALLED);
+
+ /*
+ * Add the file to the process' list of DB_MPOOLFILEs.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ if (0) {
+err: if (refinc) {
+ /*
+ * If mpf_cnt goes to zero here and unlink_on_close is
+ * set, then we missed the last close, but there was an
+ * error trying to open the file, so we probably cannot
+ * unlink it anyway.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ --mfp->mpf_cnt;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY)) {
+ DB_ASSERT(env, mfp->neutral_cnt != 0);
+ --mfp->neutral_cnt;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+ }
+ if (rpath != NULL)
+ __os_free(env, rpath);
+ return (ret);
+}
+
+/*
+ * __memp_mpf_find --
+ * Search a hash bucket for a MPOOLFILE.
+ */
+static int
+__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp)
+ ENV *env;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ const char *path;
+ u_int32_t flags;
+ MPOOLFILE **mfpp;
+{
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+
+ dbmp = env->mp_handle;
+
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Skip dead files and temporary files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * Any remaining DB_MPOOL_NOFILE databases are in-memory
+ * named databases and need only match other in-memory
+ * databases with the same name.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ if (!mfp->no_backing_file)
+ continue;
+
+ if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
+ continue;
+
+ /*
+ * We matched an in-memory file; grab the fileid if
+ * it is set in the region, but not in the dbmfp.
+ */
+ if (!F_ISSET(dbmfp, MP_FILEID_SET))
+ (void)__memp_set_fileid(dbmfp,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ } else
+ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
+ mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /*
+ * If the file is being truncated, remove it from the system
+ * and create a new entry.
+ *
+ * !!!
+ * We should be able to set mfp to NULL and break out of the
+ * loop, but I like the idea of checking all the entries.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ MUTEX_LOCK(env, mfp->mutex);
+ mfp->deadfile = 1;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ continue;
+ }
+
+ /*
+ * Check to see if this file has died while we waited.
+ *
+ * We normally don't lock the deadfile field when we read it, as
+ * we only care whether the field is zero or non-zero. We do lock
+ * on read when searching for a matching MPOOLFILE, so that two
+ * threads of control don't race between setting the deadfile
+ * bit and incrementing the reference count; that is, so a thread
+ * of control that decrements the reference count to 0 and then
+ * sets deadfile can't do so between our finding the file and
+ * taking our own reference on it.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (mfp->deadfile) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ continue;
+ }
+ ++mfp->mpf_cnt;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY))
+ ++mfp->neutral_cnt;
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /* Initialize any fields that are not yet set. */
+ if (dbmfp->ftype != 0)
+ mfp->ftype = dbmfp->ftype;
+ if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
+ mfp->clear_len = dbmfp->clear_len;
+ if (dbmfp->lsn_offset != -1)
+ mfp->lsn_off = dbmfp->lsn_offset;
+
+ break;
+ }
+
+ *mfpp = mfp;
+ return (0);
+}
+
+static int
+__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ const char *path;
+ u_int32_t pagesize;
+ u_int32_t flags;
+ MPOOLFILE **retmfp;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ret;
+ void *p;
+
+ env = dbmp->env;
+ ret = 0;
+ /* Allocate and initialize a new MPOOLFILE. */
+ if ((ret = __memp_alloc(dbmp,
+ dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+ goto err;
+ memset(mfp, 0, sizeof(MPOOLFILE));
+ mfp->mpf_cnt = 1;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY))
+ mfp->neutral_cnt = 1;
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ mfp->ftype = dbmfp->ftype;
+ mfp->pagesize = pagesize;
+ mfp->lsn_off = dbmfp->lsn_offset;
+ mfp->clear_len = dbmfp->clear_len;
+ mfp->priority = dbmfp->priority;
+ if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) {
+ mfp->maxpgno = (db_pgno_t)
+ (dbmfp->gbytes * (GIGABYTE / mfp->pagesize));
+ mfp->maxpgno += (db_pgno_t)
+ ((dbmfp->bytes + mfp->pagesize - 1) /
+ mfp->pagesize);
+ }
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ mfp->no_backing_file = 1;
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
+ mfp->unlink_on_close = 1;
+
+ F_SET(mfp, MP_CAN_MMAP);
+ if (F_ISSET(env->dbenv, DB_ENV_DATABASE_LOCKING))
+ F_SET(mfp, MP_DATABASE_LOCKING);
+ if (LF_ISSET(DB_DIRECT))
+ F_SET(mfp, MP_DIRECT);
+ if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY))
+ F_SET(mfp, MP_DURABLE_UNKNOWN);
+ if (LF_ISSET(DB_EXTENT))
+ F_SET(mfp, MP_EXTENT);
+ if (LF_ISSET(DB_TXN_NOT_DURABLE))
+ F_SET(mfp, MP_NOT_DURABLE);
+
+ /*
+ * An in-memory database with no name is a temp file. Named
+ * in-memory databases get an artificially bumped reference
+ * count so they don't disappear on close; they need a remove
+ * to make them disappear.
+ */
+ if (path == NULL)
+ F_SET(mfp, MP_TEMP);
+ else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ mfp->mpf_cnt++;
+
+ /* Copy the file identification string into shared memory. */
+ if (F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+ goto err;
+ memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
+ }
+
+ /* Copy the file path into shared memory. */
+ if (path != NULL) {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
+ goto err;
+ memcpy(p, path, strlen(path) + 1);
+ }
+
+ /* Copy the page cookie into shared memory. */
+ if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
+ mfp->pgcookie_len = 0;
+ mfp->pgcookie_off = 0;
+ } else {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, dbmfp->pgcookie->size,
+ &mfp->pgcookie_off, &p)) != 0)
+ goto err;
+ memcpy(p,
+ dbmfp->pgcookie->data, dbmfp->pgcookie->size);
+ mfp->pgcookie_len = dbmfp->pgcookie->size;
+ }
+
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0)
+ goto err;
+#ifndef HAVE_ATOMICFILEREAD
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOLFILE_HANDLE, DB_MUTEX_SHARED, &mfp->mtx_write)) != 0)
+ goto err;
+#endif
+ *retmfp = mfp;
+
+err: return (ret);
+}
+
+/*
+ * memp_fclose_pp --
+ * DB_MPOOLFILE->close pre/post processing.
+ *
+ * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
+ */
+int
+__memp_fclose_pp(dbmfp, flags)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbmfp->env;
+
+ /*
+ * Validate arguments, but as a handle destructor, we can't fail.
+ */
+ if (flags != 0)
+ (void)__db_ferr(env, "DB_MPOOLFILE->close", 0);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
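+
+/*
+ * A minimal usage sketch (not part of this change): the public
+ * DB_MPOOLFILE lifecycle that reaches __memp_fopen and __memp_fclose_pp
+ * above. The file name and page size are illustrative assumptions.
+ */
+#if 0
+#include <db.h>
+
+static int
+mpf_lifecycle(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_MPOOLFILE *mpf;
+ int ret;
+
+ /* Create the handle, open a backing file, then close it. */
+ if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
+ return (ret);
+ if ((ret = mpf->open(mpf, "a.db", DB_CREATE, 0, 8 * 1024)) != 0) {
+ (void)mpf->close(mpf, 0);
+ return (ret);
+ }
+ return (mpf->close(mpf, 0));
+}
+#endif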
+
+/*
+ * __memp_fclose --
+ * DB_MPOOLFILE->close.
+ *
+ * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
+ */
+int
+__memp_fclose(dbmfp, flags)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+ char *rpath;
+ u_int32_t ref;
+ int deleted, ret, t_ret;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ ret = 0;
+
+ /*
+ * Remove the DB_MPOOLFILE from the process' list.
+ *
+ * It's possible the underlying mpool cache may never have been created.
+ * In that case, all we have is a structure; discard it.
+ *
+ * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
+ * file list, check the MP_OPEN_CALLED flag to be sure.
+ */
+ if (dbmp == NULL)
+ goto done;
+
+ MUTEX_LOCK(env, dbmp->mutex);
+
+ DB_ASSERT(env, dbmfp->ref >= 1);
+ if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED))
+ TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+
+ /*
+ * Decrement the file descriptor's ref count -- if we're the last ref,
+ * we'll discard the file descriptor.
+ */
+ if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0)
+ dbmfp->fhp = NULL;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (ref != 0)
+ return (0);
+
+ /* Complain if pinned blocks never returned. */
+ if (dbmfp->pinref != 0) {
+ __db_errx(env, DB_STR_A("3040",
+ "%s: close: %lu blocks left pinned", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)dbmfp->pinref);
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ }
+
+ /* Discard any mmap information. */
+ if (dbmfp->addr != NULL && dbmfp->fhp != NULL &&
+ (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0)
+ __db_err(env, ret, "%s", __memp_fn(dbmfp));
+
+ /*
+ * Close the file and discard the descriptor structure; temporary
+ * files may not yet have been created.
+ */
+ if (dbmfp->fhp != NULL) {
+ if ((t_ret =
+ __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) {
+ __db_err(env, t_ret, "%s", __memp_fn(dbmfp));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ dbmfp->fhp = NULL;
+ }
+
+ /*
+ * Discard our reference on the underlying MPOOLFILE, and close it
+ * if it's no longer useful to anyone. It's possible the open of the
+ * file never happened or wasn't successful, in which case mfp will
+ * be NULL and MP_OPEN_CALLED will not be set.
+ */
+ mfp = dbmfp->mfp;
+ DB_ASSERT(env,
+ (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) ||
+ (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL));
+ if (!F_ISSET(dbmfp, MP_OPEN_CALLED))
+ goto done;
+
+ /*
+ * If it's a temp file, all outstanding references belong to unflushed
+ * buffers. (A temp file can only be referenced by one DB_MPOOLFILE).
+ * We don't care about preserving any of those buffers, so mark the
+ * MPOOLFILE as dead so that even the dirty ones just get discarded
+ * when we try to flush them.
+ */
+ deleted = 0;
+ if (!LF_ISSET(DB_MPOOL_NOLOCK))
+ MUTEX_LOCK(env, mfp->mutex);
+ if (F_ISSET(dbmfp, MP_MULTIVERSION))
+ atomic_dec(env, &mfp->multiversion);
+ if (F_ISSET(dbmfp, MP_READONLY) ||
+ (LF_ISSET(DB_FLUSH) && F_ISSET(dbmfp, MP_FOR_FLUSH))) {
+ DB_ASSERT(env, mfp->neutral_cnt != 0);
+ --mfp->neutral_cnt;
+ }
+ DB_ASSERT(env, mfp->neutral_cnt < mfp->mpf_cnt);
+ if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
+ if (LF_ISSET(DB_MPOOL_DISCARD) ||
+ F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) {
+ mfp->deadfile = 1;
+ }
+ if (mfp->unlink_on_close) {
+ if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
+ &rpath)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0) {
+ if ((t_ret = __os_unlink(
+ dbmp->env, rpath, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, rpath);
+ }
+ }
+ if (MFP_OPEN_CNT(mfp) == 0) {
+ F_CLR(mfp, MP_NOT_DURABLE);
+ F_SET(mfp, MP_DURABLE_UNKNOWN);
+ }
+ if (mfp->block_cnt == 0) {
+ /*
+ * We should never discard this mp file if our caller
+ * is holding the lock on it. See comment in
+ * __memp_sync_file.
+ */
+ DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK));
+ if ((t_ret =
+ __memp_mf_discard(dbmp, mfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ deleted = 1;
+ }
+ }
+ if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK))
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+done: /* Discard the DB_MPOOLFILE structure. */
+ if (dbmfp->pgcookie != NULL) {
+ __os_free(env, dbmfp->pgcookie->data);
+ __os_free(env, dbmfp->pgcookie);
+ }
+ __os_free(env, dbmfp);
+
+ return (ret);
+}
+
+/*
+ * __memp_mf_discard --
+ * Discard an MPOOLFILE.
+ *
+ * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int));
+ */
+int
+__memp_mf_discard(dbmp, mfp, hp_locked)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ int hp_locked;
+{
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+#ifdef HAVE_STATISTICS
+ DB_MPOOL_STAT *sp;
+#endif
+ MPOOL *mp;
+ int need_sync, ret, t_ret;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ hp += mfp->bucket;
+ ret = 0;
+
+ /*
+ * Expects caller to be holding the MPOOLFILE mutex.
+ *
+ * When discarding a file, we have to flush writes from it to disk.
+ * The scenario is that dirty buffers from this file need to be
+ * flushed to satisfy a future checkpoint, but when the checkpoint
+ * calls mpool sync, the sync code won't know anything about them.
+ * Ignore files not written, discarded, or only temporary.
+ */
+ need_sync = mfp->file_written && !mfp->deadfile &&
+ !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file;
+
+ /*
+ * We have to release the MPOOLFILE mutex before acquiring the region
+ * mutex so we don't deadlock. Make sure nobody ever looks at this
+ * structure again.
+ */
+ mfp->deadfile = 1;
+
+ /* Discard the mutex we're holding and return it to the pool. */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+#ifndef HAVE_ATOMICFILEREAD
+ if ((t_ret = __mutex_free(env, &mfp->mtx_write)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /*
+ * Lock the bucket and delete from the list of MPOOLFILEs.
+ * If this function is called by __memp_discard_all_mpfs,
+ * the MPOOLFILE hash bucket is already locked.
+ */
+ if (!hp_locked)
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
+ if (!hp_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /* Lock the region and collect stats and free the space. */
+ MPOOL_SYSTEM_LOCK(env);
+ if (need_sync &&
+ (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_STATISTICS
+ /* Copy the statistics into the region. */
+ sp = &mp->stat;
+ sp->st_cache_hit += mfp->stat.st_cache_hit;
+ sp->st_cache_miss += mfp->stat.st_cache_miss;
+ sp->st_map += mfp->stat.st_map;
+ sp->st_page_create += mfp->stat.st_page_create;
+ sp->st_page_in += mfp->stat.st_page_in;
+ sp->st_page_out += mfp->stat.st_page_out;
+#endif
+
+ /* Free the space. */
+ if (mfp->path_off != 0)
+ __memp_free(&dbmp->reginfo[0],
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ if (mfp->fileid_off != 0)
+ __memp_free(&dbmp->reginfo[0],
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ if (mfp->pgcookie_off != 0)
+ __memp_free(&dbmp->reginfo[0],
+ R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
+ __memp_free(&dbmp->reginfo[0], mfp);
+
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __memp_inmemlist --
+ * Return a list of the named in-memory databases.
+ *
+ * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *));
+ */
+int
+__memp_inmemlist(env, namesp, cntp)
+ ENV *env;
+ char ***namesp;
+ int *cntp;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ int arraysz, cnt, i, ret;
+ char **names;
+
+ names = NULL;
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+
+ arraysz = cnt = 0;
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Skip dead files and temporary files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /* Skip entries that have a backing file. */
+ if (!mfp->no_backing_file)
+ continue;
+
+ /* We found one. */
+ if (cnt >= arraysz) {
+ arraysz += 100;
+ if ((ret = __os_realloc(env,
+ (u_int)arraysz * sizeof(names[0]),
+ &names)) != 0)
+ goto nomem;
+ }
+ if ((ret = __os_strdup(env,
+ R_ADDR(dbmp->reginfo, mfp->path_off),
+ &names[cnt])) != 0)
+ goto nomem;
+
+ cnt++;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+ *namesp = names;
+ *cntp = cnt;
+ return (0);
+
+nomem: MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (names != NULL) {
+ while (--cnt >= 0)
+ __os_free(env, names[cnt]);
+ __os_free(env, names);
+ }
+
+ /* Make sure we don't return any garbage. */
+ *cntp = 0;
+ *namesp = NULL;
+ return (ret);
+}
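+
+/*
+ * Note for callers (a sketch, not a new contract): the array and each
+ * name in it are allocated above, so the caller is expected to release
+ * them, roughly:
+ *
+ * for (i = 0; i < cnt; i++)
+ * __os_free(env, names[i]);
+ * __os_free(env, names);
+ */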
diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c
new file mode 100644
index 00000000..7a900fd0
--- /dev/null
+++ b/src/mp/mp_fput.c
@@ -0,0 +1,374 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_reset_lru __P((ENV *, REGINFO *));
+
+/*
+ * __memp_fput_pp --
+ * DB_MPOOLFILE->put pre/post processing.
+ *
+ * PUBLIC: int __memp_fput_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
+ */
+int
+__memp_fput_pp(dbmfp, pgaddr, priority, flags)
+ DB_MPOOLFILE *dbmfp;
+ void *pgaddr;
+ DB_CACHE_PRIORITY priority;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbmfp->env;
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB_MPOOLFILE->put", 0));
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");
+
+ ENV_ENTER(env, ip);
+
+ ret = __memp_fput(dbmfp, ip, pgaddr, priority);
+ if (IS_ENV_REPLICATED(env) &&
+ (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fput --
+ * DB_MPOOLFILE->put.
+ *
+ * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
+ * PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
+ */
+int
+__memp_fput(dbmfp, ip, pgaddr, priority)
+ DB_MPOOLFILE *dbmfp;
+ DB_THREAD_INFO *ip;
+ void *pgaddr;
+ DB_CACHE_PRIORITY priority;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ PIN_LIST *list, *lp;
+ REGINFO *infop, *reginfo;
+ roff_t b_ref;
+ int region;
+ int adjust, pfactor, ret, t_ret;
+ char buf[DB_THREADID_STRLEN];
+
+ env = dbmfp->env;
+ dbenv = env->dbenv;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ ret = 0;
+
+ /*
+ * If this is marked dummy, we are using it to unpin a buffer for
+ * another thread.
+ */
+ if (F_ISSET(dbmfp, MP_DUMMY))
+ goto unpin;
+
+ /*
+ * If we're mapping the file, there's nothing to do. Because we can
+ * stop mapping the file at any time, we have to check on each buffer
+ * to see if the address we gave the application was part of the map
+ * region.
+ */
+ if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
+ (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
+ return (0);
+
+ DB_ASSERT(env, IS_RECOVERING(env) || bhp->pgno <= mfp->last_pgno ||
+ F_ISSET(bhp, BH_FREED) || !SH_CHAIN_SINGLETON(bhp, vc));
+#ifdef DIAGNOSTIC
+ /*
+ * Decrement the per-file pinned buffer count (mapped pages aren't
+ * counted).
+ */
+ MPOOL_SYSTEM_LOCK(env);
+ if (dbmfp->pinref == 0) {
+ MPOOL_SYSTEM_UNLOCK(env);
+ __db_errx(env, DB_STR_A("3011",
+ "%s: more pages returned than retrieved", "%s"),
+ __memp_fn(dbmfp));
+ return (__env_panic(env, EACCES));
+ }
+ --dbmfp->pinref;
+ MPOOL_SYSTEM_UNLOCK(env);
+#endif
+
+unpin:
+ infop = &dbmp->reginfo[bhp->region];
+ c_mp = infop->primary;
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bhp->bucket];
+
+ /*
+ * Check for a reference count going to zero. This can happen if the
+ * application returns a page twice.
+ */
+ if (atomic_read(&bhp->ref) == 0) {
+ __db_errx(env, DB_STR_A("3012",
+ "%s: page %lu: unpinned page returned", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
+ return (__env_panic(env, EACCES));
+ }
+
+ /* Note the activity so allocation won't decide to quit. */
+ ++c_mp->put_counter;
+
+ if (ip != NULL) {
+ reginfo = env->reginfo;
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ region = (int)(infop - dbmp->reginfo);
+ b_ref = R_OFFSET(infop, bhp);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
+ if (lp->b_ref == b_ref && lp->region == region)
+ break;
+
+ if (lp == &list[ip->dbth_pinmax]) {
+ __db_errx(env, DB_STR_A("3013",
+ "__memp_fput: pinned buffer not found for thread %s",
+ "%s"), dbenv->thread_id_string(dbenv,
+ ip->dbth_pid, ip->dbth_tid, buf));
+ return (__env_panic(env, EINVAL));
+ }
+
+ lp->b_ref = INVALID_ROFF;
+ ip->dbth_pincount--;
+ }
+
+ /*
+ * Mark the file dirty.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE) && F_ISSET(bhp, BH_DIRTY)) {
+ DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
+ mfp->file_written = 1;
+ }
+
+ /*
+ * If there is more than one reference to the page, we're done. Ignore
+ * the discard flags (for now) and leave the buffer's priority alone.
+ * We are doing this a little early, as the remaining reference may or
+ * may not belong to a write-behind thread. If it does, we set the
+ * priority here; if not, it will get set again later. We might race
+ * and miss setting the priority, which would leave it wrong
+ * for a while.
+ */
+ DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
+ if (atomic_dec(env, &bhp->ref) > 1 || (atomic_read(&bhp->ref) == 1 &&
+ !F_ISSET(bhp, BH_DIRTY))) {
+ /*
+ * __memp_pgwrite only has a shared lock while it clears
+ * the BH_DIRTY bit. If we only have a shared latch then
+ * we can't touch the flags bits.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ return (0);
+ }
+
+ /* The buffer should not be accessed again. */
+#ifdef DIAG_MVCC
+ MUTEX_LOCK(env, hp->mtx_hash);
+ if (BH_REFCOUNT(bhp) == 0)
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+
+ /* Update priority values. */
+ if (priority == DB_PRIORITY_VERY_LOW ||
+ mfp->priority == MPOOL_PRI_VERY_LOW)
+ bhp->priority = 0;
+ else {
+ /*
+ * We don't lock the LRU priority or the pages field; if
+ * we get garbage (which won't happen on a 32-bit machine), it
+ * only means a buffer has the wrong priority.
+ */
+ bhp->priority = c_mp->lru_priority;
+
+ switch (priority) {
+ default:
+ case DB_PRIORITY_UNCHANGED:
+ pfactor = mfp->priority;
+ break;
+ case DB_PRIORITY_VERY_LOW:
+ pfactor = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ pfactor = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ pfactor = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ pfactor = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ pfactor = MPOOL_PRI_VERY_HIGH;
+ break;
+ }
+
+ adjust = 0;
+ if (pfactor != 0)
+ adjust = (int)c_mp->pages / pfactor;
+
+ if (F_ISSET(bhp, BH_DIRTY))
+ adjust += (int)c_mp->pages / MPOOL_PRI_DIRTY;
+
+ if (adjust > 0) {
+ if (MPOOL_LRU_REDZONE - bhp->priority >=
+ (u_int32_t)adjust)
+ bhp->priority += adjust;
+ } else if (adjust < 0)
+ if (bhp->priority > (u_int32_t)-adjust)
+ bhp->priority += adjust;
+ }
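+
+ /*
+ * In effect (an illustrative summary, not new behavior): a positive
+ * pfactor raises the buffer's priority above the current LRU clock in
+ * proportion to the cache size, so the buffer survives longer; a
+ * negative pfactor lowers it, making the buffer a better eviction
+ * candidate; and dirty buffers get an extra MPOOL_PRI_DIRTY boost, so
+ * clean buffers are preferred victims.
+ */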
+
+ /*
+ * __memp_pgwrite only has a shared lock while it clears the
+ * BH_DIRTY bit. If we only have a shared latch then we can't
+ * touch the flags bits.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+
+ /*
+ * On every buffer put we update the cache lru priority and check
+ * for wraparound. The increment doesn't need to be atomic: occasional
+ * lost increments are okay; __memp_reset_lru handles race conditions.
+ */
+ if (++c_mp->lru_priority >= MPOOL_LRU_REDZONE &&
+ (t_ret = __memp_reset_lru(env, infop)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
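+
+/*
+ * A minimal sketch (not part of this change) of the public get/put
+ * pairing that ends in __memp_fput above: every page returned by
+ * DB_MPOOLFILE->get must be handed back with DB_MPOOLFILE->put. The
+ * page number is an illustrative assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+touch_page(mpf)
+ DB_MPOOLFILE *mpf;
+{
+ db_pgno_t pgno;
+ void *addr;
+ int ret;
+
+ pgno = 1;
+ if ((ret = mpf->get(mpf, &pgno, NULL, 0, &addr)) != 0)
+ return (ret);
+ /* ... read the page through addr ... */
+ return (mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0));
+}
+#endif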
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU priority when it reaches the upper limit.
+ */
+static int
+__memp_reset_lru(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ BH *bhp, *tbhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ u_int32_t bucket;
+ int reset;
+
+ /*
+ * Update the priority so all future allocations will start at the
+ * bottom. Lock this cache region to ensure that exactly one thread
+ * will reset this cache's buffers.
+ */
+ c_mp = infop->primary;
+ MPOOL_REGION_LOCK(env, infop);
+ reset = c_mp->lru_priority >= MPOOL_LRU_DECREMENT;
+ if (reset) {
+ c_mp->lru_priority -= MPOOL_LRU_DECREMENT;
+ c_mp->lru_generation++;
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ if (!reset)
+ return (0);
+
+ /* Reduce the priority of every buffer in this cache region. */
+ for (hp = R_ADDR(infop, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ for (tbhp = bhp; tbhp != NULL;
+ tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
+ if (tbhp->priority > MPOOL_LRU_DECREMENT)
+ tbhp->priority -= MPOOL_LRU_DECREMENT;
+ else
+ tbhp->priority = 0;
+ }
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+/*
+ * __memp_unpin_buffers --
+ * Unpin buffers pinned by a thread.
+ *
+ * PUBLIC: int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
+ */
+int
+__memp_unpin_buffers(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ BH *bhp;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE dbmf;
+ PIN_LIST *list, *lp;
+ REGINFO *rinfop, *reginfo;
+ int ret;
+
+ memset(&dbmf, 0, sizeof(dbmf));
+ dbmf.env = env;
+ dbmf.flags = MP_DUMMY;
+ dbmp = env->mp_handle;
+ reginfo = env->reginfo;
+
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
+ if (lp->b_ref == INVALID_ROFF)
+ continue;
+ rinfop = &dbmp->reginfo[lp->region];
+ bhp = R_ADDR(rinfop, lp->b_ref);
+ dbmf.mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_fput(&dbmf, ip,
+ (u_int8_t *)bhp + SSZA(BH, buf),
+ DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ }
+ return (0);
+}
diff --git a/src/mp/mp_fset.c b/src/mp/mp_fset.c
new file mode 100644
index 00000000..1129853f
--- /dev/null
+++ b/src/mp/mp_fset.c
@@ -0,0 +1,170 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __memp_dirty --
+ * Upgrade a page from a read-only to a writable pointer.
+ *
+ * PUBLIC: int __memp_dirty __P((DB_MPOOLFILE *, void *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
+ */
+int
+__memp_dirty(dbmfp, addrp, ip, txn, priority, flags)
+ DB_MPOOLFILE *dbmfp;
+ void *addrp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_CACHE_PRIORITY priority;
+ u_int32_t flags;
+{
+ BH *bhp;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ DB_TXN *ancestor;
+ ENV *env;
+ MPOOL *c_mp;
+#ifdef DIAG_MVCC
+ MPOOLFILE *mfp;
+#endif
+ REGINFO *infop;
+ int mvcc, ret;
+ db_pgno_t pgno;
+ void *pgaddr;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mvcc = atomic_read(&dbmfp->mfp->multiversion);
+
+ /* Convert the page address to a buffer header. */
+ pgaddr = *(void **)addrp;
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ pgno = bhp->pgno;
+
+ /* If we have it exclusively, then it's already dirty. */
+ if (F_ISSET(bhp, BH_EXCLUSIVE)) {
+ DB_ASSERT(env, F_ISSET(bhp, BH_DIRTY));
+ return (0);
+ }
+
+ if (flags == 0)
+ flags = DB_MPOOL_DIRTY;
+ DB_ASSERT(env, flags == DB_MPOOL_DIRTY || flags == DB_MPOOL_EDIT);
+
+ if (F_ISSET(dbmfp, MP_READONLY)) {
+ __db_errx(env, DB_STR_A("3008",
+ "%s: dirty flag set for readonly file page", "%s"),
+ __memp_fn(dbmfp));
+ return (EACCES);
+ }
+
+ for (ancestor = txn;
+ ancestor != NULL && ancestor->parent != NULL;
+ ancestor = ancestor->parent)
+ ;
+
+ if (mvcc && txn != NULL && flags == DB_MPOOL_DIRTY &&
+ (!BH_OWNED_BY(env, bhp, ancestor) || SH_CHAIN_HASNEXT(bhp, vc))) {
+ atomic_inc(env, &bhp->ref);
+ *(void **)addrp = NULL;
+ if ((ret = __memp_fput(dbmfp, ip, pgaddr, priority)) != 0) {
+ __db_errx(env, DB_STR_A("3009",
+ "%s: error releasing a read-only page", "%s"),
+ __memp_fn(dbmfp));
+ atomic_dec(env, &bhp->ref);
+ return (ret);
+ }
+ if ((ret = __memp_fget(dbmfp,
+ &pgno, ip, txn, flags, addrp)) != 0) {
+ if (ret != DB_LOCK_DEADLOCK)
+ __db_errx(env, DB_STR_A("3010",
+ "%s: error getting a page for writing",
+ "%s"), __memp_fn(dbmfp));
+ atomic_dec(env, &bhp->ref);
+ return (ret);
+ }
+ atomic_dec(env, &bhp->ref);
+
+ /*
+ * If the MVCC handle count hasn't changed, we should get a
+ * different version of the page.
+ */
+ DB_ASSERT(env, *(void **)addrp != pgaddr ||
+ mvcc != atomic_read(&dbmfp->mfp->multiversion));
+
+ pgaddr = *(void **)addrp;
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ DB_ASSERT(env, pgno == bhp->pgno);
+ return (0);
+ }
+
+ infop = &dbmp->reginfo[bhp->region];
+ c_mp = infop->primary;
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bhp->bucket];
+
+ /* Drop the shared latch and get an exclusive. We have the buf ref'ed.*/
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE));
+ F_SET(bhp, BH_EXCLUSIVE);
+
+ /* Set/clear the page bits. */
+ if (!F_ISSET(bhp, BH_DIRTY)) {
+#ifdef DIAGNOSTIC
+ MUTEX_LOCK(env, hp->mtx_hash);
+#endif
+ atomic_inc(env, &hp->hash_page_dirty);
+ F_SET(bhp, BH_DIRTY);
+#ifdef DIAGNOSTIC
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+ }
+
+#ifdef DIAG_MVCC
+ mfp = R_ADDR(env->mp_handle->reginfo, bhp->mf_offset);
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, PROT_READ | PROT_WRITE);
+#endif
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) ||
+ atomic_read(&hp->hash_page_dirty) != 0);
+ return (0);
+}
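+
+/*
+ * A minimal sketch (not part of this change): asking for a writable
+ * version of a page up front with DB_MPOOL_DIRTY, the flag
+ * __memp_dirty defaults to above. The page number is an illustrative
+ * assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+write_page(mpf, txn)
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+{
+ db_pgno_t pgno;
+ void *addr;
+ int ret;
+
+ pgno = 1;
+ /* DB_MPOOL_DIRTY returns a version of the page safe to modify. */
+ if ((ret = mpf->get(mpf, &pgno, txn, DB_MPOOL_DIRTY, &addr)) != 0)
+ return (ret);
+ /* ... modify the page through addr ... */
+ return (mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0));
+}
+#endif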
+
+/*
+ * __memp_shared --
+ * Downgrade a page from exclusively held to shared.
+ *
+ * PUBLIC: int __memp_shared __P((DB_MPOOLFILE *, void *));
+ */
+int
+__memp_shared(dbmfp, pgaddr)
+ DB_MPOOLFILE *dbmfp;
+ void *pgaddr;
+{
+ BH *bhp;
+ ENV *env;
+
+ env = dbmfp->env;
+ /* Convert the page address to a buffer header. */
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+ if (F_ISSET(bhp, BH_DIRTY))
+ dbmfp->mfp->file_written = 1;
+ DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE));
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+
+ return (0);
+}
diff --git a/src/mp/mp_method.c b/src/mp/mp_method.c
new file mode 100644
index 00000000..7afae248
--- /dev/null
+++ b/src/mp/mp_method.c
@@ -0,0 +1,1091 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * __memp_env_create --
+ * Mpool specific creation of the DB_ENV structure.
+ *
+ * PUBLIC: int __memp_env_create __P((DB_ENV *));
+ */
+int
+__memp_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state nor acquire a mutex in the DB_ENV create path.
+ *
+ * We default to 32 8K pages. We don't default to a flat 256K, because
+ * we want to include the size of the buffer header which can vary
+ * from system to system.
+ */
+ dbenv->mp_bytes =
+ 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
+ dbenv->mp_ncache = 1;
+
+ return (0);
+}
+
+/*
+ * __memp_env_destroy --
+ * Mpool specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __memp_env_destroy __P((DB_ENV *));
+ */
+void
+__memp_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, NULL);
+}
+
+/*
+ * __memp_get_cachesize --
+ * {DB_ENV,DB}->get_cachesize.
+ *
+ * PUBLIC: int __memp_get_cachesize
+ * PUBLIC: __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
+ */
+int
+__memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+ int *ncachep;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_cachesize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ if (gbytesp != NULL)
+ *gbytesp = mp->gbytes;
+ if (bytesp != NULL)
+ *bytesp = mp->bytes;
+ if (ncachep != NULL)
+ *ncachep = (int)mp->nreg;
+ } else {
+ if (gbytesp != NULL)
+ *gbytesp = dbenv->mp_gbytes;
+ if (bytesp != NULL)
+ *bytesp = dbenv->mp_bytes;
+ if (ncachep != NULL)
+ *ncachep = (int)dbenv->mp_ncache;
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_cachesize --
+ * {DB_ENV,DB}->set_cachesize.
+ *
+ * PUBLIC: int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+ */
+int
+__memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+ int arg_ncache;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int ncache;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->set_cachesize", DB_INIT_MPOOL);
+
+ /* Normalize the cache count. */
+ ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache;
+
+ /*
+ * You can only store 4GB-1 in an unsigned 32-bit value, so correct for
+ * applications that specify 4GB cache sizes -- we know what they meant.
+ */
+ if (sizeof(roff_t) == 4 && gbytes / ncache == 4 && bytes == 0) {
+ --gbytes;
+ bytes = GIGABYTE - 1;
+ } else {
+ gbytes += bytes / GIGABYTE;
+ bytes %= GIGABYTE;
+ }
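+
+ /*
+ * For example (illustrative): gbytes == 4, bytes == 0, ncache == 1 on
+ * a build with 4-byte region offsets becomes gbytes == 3,
+ * bytes == GIGABYTE - 1, while gbytes == 1, bytes == 3 * GIGABYTE
+ * normalizes to gbytes == 4, bytes == 0.
+ */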
+
+ /*
+ * !!!
+ * With 32-bit region offsets, individual cache regions must be smaller
+ * than 4GB. Also, cache sizes larger than 10TB would cause 32-bit
+ * wrapping in the calculation of the number of hash buckets. See
+ * __memp_open for details.
+ */
+ if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+ if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) {
+ __db_errx(env, DB_STR("3003",
+ "individual cache size too large: maximum is 4GB"));
+ return (EINVAL);
+ }
+ if (gbytes / ncache > 10000) {
+ __db_errx(env, DB_STR("3004",
+ "individual cache size too large: maximum is 10TB"));
+ return (EINVAL);
+ }
+ }
+
+ /*
+ * If the application requested less than 500MB, increase the cache size
+ * by 25% and factor in the size of the hash buckets to account for our
+ * overhead. (I'm guessing caches over 500MB are specifically sized,
+ * that is, it's a large server and the application actually knows how
+ * much memory is available. We only document the 25% overhead number,
+ * not the hash buckets, but I don't see a reason to confuse the issue,
+ * it shouldn't matter to an application.)
+ *
+ * There is a minimum cache size, regardless.
+ */
+ if (gbytes == 0) {
+ if (bytes < 500 * MEGABYTE)
+ bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH);
+ if (bytes / ncache < DB_CACHESIZE_MIN)
+ bytes = ncache * DB_CACHESIZE_MIN;
+ }
+
+ if (F_ISSET(env, ENV_OPEN_CALLED)) {
+ ENV_ENTER(env, ip);
+ ret = __memp_resize(env->mp_handle, gbytes, bytes);
+ ENV_LEAVE(env, ip);
+ return (ret);
+ }
+
+ dbenv->mp_gbytes = gbytes;
+ dbenv->mp_bytes = bytes;
+ dbenv->mp_ncache = ncache;
+
+ return (0);
+}
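+
+/*
+ * A minimal sketch (not part of this change): configuring a single
+ * 64MB cache region before DB_ENV->open, the path that stores into the
+ * dbenv->mp_* fields above. The size is an illustrative assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+size_cache(dbenv)
+ DB_ENV *dbenv;
+{
+ /* 0GB + 64MB in one cache region. */
+ return (dbenv->set_cachesize(dbenv, 0, 64 * 1024 * 1024, 1));
+}
+#endif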
+
+/*
+ * __memp_set_config --
+ * Set the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__memp_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ if (on)
+ FLD_SET(mp->config_flags, which);
+ else
+ FLD_CLR(mp->config_flags, which);
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __memp_get_config --
+ * Return the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__memp_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0;
+ } else
+ *onp = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
+ */
+int
+__memp_get_mp_max_openfd(dbenv, maxopenfdp)
+ DB_ENV *dbenv;
+ int *maxopenfdp;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_openfd", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ *maxopenfdp = mp->mp_maxopenfd;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *maxopenfdp = dbenv->mp_maxopenfd;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_max_openfd --
+ * Set the maximum number of open fd's when flushing the cache.
+ * PUBLIC: int __memp_set_mp_max_openfd __P((DB_ENV *, int));
+ */
+int
+__memp_set_mp_max_openfd(dbenv, maxopenfd)
+ DB_ENV *dbenv;
+ int maxopenfd;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->set_mp_max_openfd", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ mp->mp_maxopenfd = maxopenfd;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->mp_maxopenfd = maxopenfd;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
+ */
+int
+__memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
+ DB_ENV *dbenv;
+ int *maxwritep;
+ db_timeout_t *maxwrite_sleepp;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ *maxwritep = mp->mp_maxwrite;
+ *maxwrite_sleepp = mp->mp_maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ *maxwritep = dbenv->mp_maxwrite;
+ *maxwrite_sleepp = dbenv->mp_maxwrite_sleep;
+ }
+ return (0);
+}
+
+/*
+ * __memp_set_mp_max_write --
+ * Set the maximum continuous I/O count.
+ *
+ * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
+ */
+int
+__memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
+ DB_ENV *dbenv;
+ int maxwrite;
+ db_timeout_t maxwrite_sleep;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ mp->mp_maxwrite = maxwrite;
+ mp->mp_maxwrite_sleep = maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ dbenv->mp_maxwrite = maxwrite;
+ dbenv->mp_maxwrite_sleep = maxwrite_sleep;
+ }
+ return (0);
+}
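+
+/*
+ * A minimal sketch (not part of this change): limiting background
+ * flushes to 32 pages per burst with a 1000-microsecond pause between
+ * bursts. The values are illustrative assumptions.
+ */
+#if 0
+#include <db.h>
+
+static int
+throttle_writes(dbenv)
+ DB_ENV *dbenv;
+{
+ return (dbenv->set_mp_max_write(dbenv, 32, 1000));
+}
+#endif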
+
+/*
+ * PUBLIC: int __memp_get_mp_mmapsize __P((DB_ENV *, size_t *));
+ */
+int
+__memp_get_mp_mmapsize(dbenv, mp_mmapsizep)
+ DB_ENV *dbenv;
+ size_t *mp_mmapsizep;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ *mp_mmapsizep = mp->mp_mmapsize;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *mp_mmapsizep = dbenv->mp_mmapsize;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_mmapsize --
+ * DB_ENV->set_mp_mmapsize.
+ *
+ * PUBLIC: int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+ */
+int
+__memp_set_mp_mmapsize(dbenv, mp_mmapsize)
+ DB_ENV *dbenv;
+ size_t mp_mmapsize;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ /*
+ * We need to cast here because size_t and db_size_t can be
+ * different on a 64 bit build, when building in 32 bit
+ * compatibility mode. The cast is safe, because we check for
+ * overflow when the fields are assigned.
+ */
+ mp->mp_mmapsize = (db_size_t)mp_mmapsize;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->mp_mmapsize = (db_size_t)mp_mmapsize;
+ return (0);
+}
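+
+/*
+ * A minimal sketch (not part of this change): capping the size of
+ * read-only files the pool will map into memory, the limit consulted
+ * by __memp_fopen. The 16MB value is an illustrative assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+cap_mmap(dbenv)
+ DB_ENV *dbenv;
+{
+ return (dbenv->set_mp_mmapsize(dbenv, 16 * 1024 * 1024));
+}
+#endif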
+
+/*
+ * PUBLIC: int __memp_get_mp_pagesize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__memp_get_mp_pagesize(dbenv, mp_pagesizep)
+ DB_ENV *dbenv;
+ u_int32_t *mp_pagesizep;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_pagesize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *mp_pagesizep = mp->pagesize;
+ } else {
+ *mp_pagesizep = dbenv->mp_pagesize;
+ }
+ return (0);
+}
+
+/*
+ * __memp_set_mp_pagesize --
+ * DB_ENV->set_mp_pagesize.
+ *
+ * PUBLIC: int __memp_set_mp_pagesize __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_set_mp_pagesize(dbenv, mp_pagesize)
+ DB_ENV *dbenv;
+ u_int32_t mp_pagesize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize");
+
+ dbenv->mp_pagesize = mp_pagesize;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_tablesize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__memp_get_mp_tablesize(dbenv, mp_tablesizep)
+ DB_ENV *dbenv;
+ u_int32_t *mp_tablesizep;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_tablesize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *mp_tablesizep = mp->htab_buckets;
+ } else
+ *mp_tablesizep = dbenv->mp_tablesize;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_tablesize --
+ * DB_ENV->set_mp_tablesize.
+ *
+ * PUBLIC: int __memp_set_mp_tablesize __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_set_mp_tablesize(dbenv, mp_tablesize)
+ DB_ENV *dbenv;
+ u_int32_t mp_tablesize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize");
+
+ dbenv->mp_tablesize = mp_tablesize;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_mtxcount __P((DB_ENV *, u_int32_t *));
+ */
+int
+__memp_get_mp_mtxcount(dbenv, mp_mtxcountp)
+ DB_ENV *dbenv;
+ u_int32_t *mp_mtxcountp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mtxcount", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *mp_mtxcountp = mp->htab_mutexes;
+ } else
+ *mp_mtxcountp = dbenv->mp_mtxcount;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_mtxcount --
+ * DB_ENV->set_mp_mtxcount.
+ *
+ * PUBLIC: int __memp_set_mp_mtxcount __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_set_mp_mtxcount(dbenv, mp_mtxcount)
+ DB_ENV *dbenv;
+ u_int32_t mp_mtxcount;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_mtxcount");
+
+ dbenv->mp_mtxcount = mp_mtxcount;
+ return (0);
+}
+
+/*
+ * __memp_nameop
+ * Remove or rename a file in the pool.
+ *
+ * PUBLIC: int __memp_nameop __P((ENV *,
+ * PUBLIC: u_int8_t *, const char *, const char *, const char *, int));
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+int
+__memp_nameop(env, fileid, newname, fullold, fullnew, inmem)
+ ENV *env;
+ u_int8_t *fileid;
+ const char *newname, *fullold, *fullnew;
+ int inmem;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp, *nhp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ roff_t newname_off;
+ u_int32_t bucket;
+ int locked, ret;
+ size_t nlen;
+ void *p;
+
+#undef op_is_remove
+#define op_is_remove (newname == NULL)
+
+ COMPQUIET(bucket, 0);
+ COMPQUIET(hp, NULL);
+ COMPQUIET(newname_off, 0);
+ COMPQUIET(nlen, 0);
+
+ dbmp = NULL;
+ mfp = NULL;
+ nhp = NULL;
+ p = NULL;
+ locked = ret = 0;
+
+ if (!MPOOL_ON(env))
+ goto fsop;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+
+ if (!op_is_remove) {
+ nlen = strlen(newname);
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, nlen + 1, &newname_off, &p)) != 0)
+ return (ret);
+ memcpy(p, newname, nlen + 1);
+ }
+
+ /*
+ * Remove or rename a file that the mpool might know about. We assume
+ * that the fop layer has the file locked for exclusive access, so we
+ * don't worry about locking except for the mpool mutexes. Checkpoint
+ * can happen at any time, independent of file locking, so we have to
+ * do the actual unlink or rename system call while holding
+ * all affected buckets locked.
+ *
+ * If this is a rename and this is an in-memory file, then we need
+ * to make sure that the new name does not exist. Since we
+ * are locking two buckets, lock them in ascending order.
+ */
+ if (inmem) {
+ DB_ASSERT(env, fullold != NULL);
+ hp += FNBUCKET(fullold, strlen(fullold));
+ if (!op_is_remove) {
+ bucket = FNBUCKET(newname, nlen);
+ nhp = R_ADDR(dbmp->reginfo, mp->ftab);
+ nhp += bucket;
+ }
+ } else
+ hp += FNBUCKET(fileid, DB_FILE_ID_LEN);
+
+ if (nhp != NULL && nhp < hp)
+ MUTEX_LOCK(env, nhp->mtx_hash);
+ MUTEX_LOCK(env, hp->mtx_hash);
+ if (nhp != NULL && nhp > hp)
+ MUTEX_LOCK(env, nhp->mtx_hash);
+ locked = 1;
+
+ if (!op_is_remove && inmem) {
+ SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
+ if (!mfp->deadfile &&
+ mfp->no_backing_file && strcmp(newname,
+ R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
+ break;
+ if (mfp != NULL) {
+ ret = EEXIST;
+ goto err;
+ }
+ }
+
+ /*
+ * Find the file -- if mpool doesn't know about this file, that may
+ * not be an error.
+ */
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Ignore non-active files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /* Try to match on fileid. */
+ if (memcmp(fileid, R_ADDR(
+ dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ break;
+ }
+
+ if (mfp == NULL) {
+ if (inmem) {
+ ret = ENOENT;
+ goto err;
+ }
+ goto fsop;
+ }
+
+ if (op_is_remove) {
+ MUTEX_LOCK(env, mfp->mutex);
+ /*
+ * In-memory dbs have an artificially incremented ref count so
+ * they do not get reclaimed as long as they exist. Since we
+ * are now deleting the database, we need to dec that count.
+ */
+ if (mfp->no_backing_file)
+ mfp->mpf_cnt--;
+ mfp->deadfile = 1;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ } else {
+ /*
+ * Else, it's a rename. We've allocated memory for the new
+ * name. Swap it with the old one. If it's in memory we
+ * need to move it to the right bucket.
+ */
+ p = R_ADDR(dbmp->reginfo, mfp->path_off);
+ mfp->path_off = newname_off;
+
+ if (inmem && hp != nhp) {
+ DB_ASSERT(env, nhp != NULL);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
+ mfp->bucket = bucket;
+ SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q);
+ }
+ }
+
+fsop: /*
+ * If this is a real file, then mfp could be NULL because
+ * mpool isn't turned on; we still need to do the file ops.
+ */
+ if (mfp == NULL || !mfp->no_backing_file) {
+ if (op_is_remove) {
+ /*
+ * !!!
+ * Replication may ask us to unlink a file that's been
+ * renamed. Don't complain if it doesn't exist.
+ */
+ if ((ret = __os_unlink(env, fullold, 0)) == ENOENT)
+ ret = 0;
+ } else {
+ /*
+ * Defensive only; fullnew should never be
+ * NULL.
+ */
+ DB_ASSERT(env, fullnew != NULL);
+ if (fullnew == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
+ ret = __os_rename(env, fullold, fullnew, 1);
+ }
+ }
+
+ /* Delete the memory we no longer need. */
+err: if (p != NULL) {
+ MPOOL_REGION_LOCK(env, &dbmp->reginfo[0]);
+ __memp_free(&dbmp->reginfo[0], p);
+ MPOOL_REGION_UNLOCK(env, &dbmp->reginfo[0]);
+ }
+
+ /* If we have buckets locked, unlock them when done moving files. */
+ if (locked == 1) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (nhp != NULL && nhp != hp)
+ MUTEX_UNLOCK(env, nhp->mtx_hash);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_ftruncate --
+ * Truncate the file.
+ *
+ * PUBLIC: int __memp_ftruncate __P((DB_MPOOLFILE *, DB_TXN *,
+ * PUBLIC: DB_THREAD_INFO *, db_pgno_t, u_int32_t));
+ */
+int
+__memp_ftruncate(dbmfp, txn, ip, pgno, flags)
+ DB_MPOOLFILE *dbmfp;
+ DB_TXN *txn;
+ DB_THREAD_INFO *ip;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *pagep;
+ db_pgno_t last_pgno, pg;
+ int ret;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+ ret = 0;
+
+ MUTEX_LOCK(env, mfp->mutex);
+ last_pgno = mfp->last_pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ if (pgno > last_pgno) {
+ if (LF_ISSET(MP_TRUNC_RECOVER))
+ return (0);
+ __db_errx(env, DB_STR("3005",
+ "Truncate beyond the end of file"));
+ return (EINVAL);
+ }
+
+ pg = pgno;
+ if (!LF_ISSET(MP_TRUNC_NOCACHE))
+ do {
+ if (mfp->block_cnt == 0)
+ break;
+ if ((ret = __memp_fget(dbmfp, &pg,
+ ip, txn, DB_MPOOL_FREE, &pagep)) != 0)
+ return (ret);
+ } while (pg++ < last_pgno);
+
+ /*
+ * If we are aborting an extend of a file, the call to __os_truncate
+ * could extend the file if the new page(s) had not yet been
+ * written to disk. We do not want to extend the file to pages
+ * whose log records are not yet flushed [#14031]. In addition if
+ * we are out of disk space we can generate an error [#12743].
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!F_ISSET(mfp, MP_TEMP) &&
+ !mfp->no_backing_file && pgno <= mfp->last_flushed_pgno)
+#ifdef HAVE_FTRUNCATE
+ ret = __os_truncate(env,
+ dbmfp->fhp, pgno, mfp->pagesize);
+#else
+ ret = __db_zero_extend(env,
+ dbmfp->fhp, pgno, mfp->last_pgno, mfp->pagesize);
+#endif
+
+ /*
+ * This set could race with another thread of control that is extending
+ * the file. It's not a problem because we should have the page
+ * locked at a higher level of the system.
+ */
+ if (ret == 0) {
+ mfp->last_pgno = pgno - 1;
+ if (mfp->last_flushed_pgno > mfp->last_pgno)
+ mfp->last_flushed_pgno = mfp->last_pgno;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * Support routines for maintaining a sorted freelist while we try to rearrange
+ * and truncate the file.
+ */
+
+/*
+ * __memp_alloc_freelist --
+ * Allocate mpool space for the freelist.
+ *
+ * PUBLIC: int __memp_alloc_freelist __P((DB_MPOOLFILE *,
+ * PUBLIC: u_int32_t, db_pgno_t **));
+ */
+int
+__memp_alloc_freelist(dbmfp, nelems, listp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t nelems;
+ db_pgno_t **listp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *retp;
+ int ret;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ *listp = NULL;
+
+ /*
+ * These fields are protected because the database layer
+ * has the metapage locked while manipulating them.
+ */
+ mfp->free_ref++;
+ if (mfp->free_size != 0)
+ return (EBUSY);
+
+ /* Allocate at least a few slots. */
+ mfp->free_cnt = nelems;
+ if (nelems == 0)
+ nelems = 50;
+
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, nelems * sizeof(db_pgno_t), &mfp->free_list, &retp)) != 0)
+ return (ret);
+
+ mfp->free_size = nelems * sizeof(db_pgno_t);
+ *listp = retp;
+ return (0);
+}
+
+/*
+ * __memp_free_freelist --
+ * Free the list.
+ *
+ * PUBLIC: int __memp_free_freelist __P((DB_MPOOLFILE *));
+ */
+int
+__memp_free_freelist(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ DB_ASSERT(env, mfp->free_ref > 0);
+ if (--mfp->free_ref > 0)
+ return (0);
+
+ DB_ASSERT(env, mfp->free_size != 0);
+
+ MPOOL_SYSTEM_LOCK(env);
+ __memp_free(dbmp->reginfo, R_ADDR(dbmp->reginfo, mfp->free_list));
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ mfp->free_cnt = 0;
+ mfp->free_list = 0;
+ mfp->free_size = 0;
+ return (0);
+}
+
+/*
+ * __memp_get_freelist --
+ *	Return the current free list.
+ *
+ * PUBLIC: int __memp_get_freelist __P((
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t *, db_pgno_t **));
+ */
+int
+__memp_get_freelist(dbmfp, nelemp, listp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *nelemp;
+ db_pgno_t **listp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ if (mfp->free_size == 0) {
+ *nelemp = 0;
+ *listp = NULL;
+ } else {
+ *nelemp = mfp->free_cnt;
+ *listp = R_ADDR(dbmp->reginfo, mfp->free_list);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_extend_freelist --
+ * Extend the list.
+ *
+ * PUBLIC: int __memp_extend_freelist __P((
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t , db_pgno_t **));
+ */
+int
+__memp_extend_freelist(dbmfp, count, listp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t count;
+ db_pgno_t **listp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ret;
+ size_t size;
+ void *retp;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ if (mfp->free_size == 0)
+ return (EINVAL);
+
+ if (count * sizeof(db_pgno_t) > mfp->free_size) {
+ size = (size_t)DB_ALIGN(count * sizeof(db_pgno_t), 512);
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+ if (size >= 0xFFFFFFFF) {
+ __db_errx(env, DB_STR("3006",
+ "Can't get the required free size while"
+ "operating in mixed-size-addressing mode"));
+ return EINVAL;
+ }
+#endif
+ *listp = R_ADDR(dbmp->reginfo, mfp->free_list);
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, size, &mfp->free_list, &retp)) != 0)
+ return (ret);
+ mfp->free_size = (db_size_t)size;
+
+ memcpy(retp, *listp, mfp->free_cnt * sizeof(db_pgno_t));
+
+ MPOOL_SYSTEM_LOCK(env);
+ __memp_free(dbmp->reginfo, *listp);
+ MPOOL_SYSTEM_UNLOCK(env);
+ }
+
+ mfp->free_cnt = count;
+ *listp = R_ADDR(dbmp->reginfo, mfp->free_list);
+
+ return (0);
+}
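+
+/*
+ * Editor's sketch (not part of the library): the rough order in which a
+ * caller such as the btree compaction code might drive these helpers.
+ * All names are illustrative.
+ *
+ *	db_pgno_t *list;
+ *	if ((ret = __memp_alloc_freelist(dbmfp, nelems, &list)) != 0)
+ *		return (ret);			reserve shared-memory space
+ *	... record freed pages in list[] ...
+ *	if ((ret = __memp_extend_freelist(dbmfp, count, &list)) != 0)
+ *		return (ret);			grow the array, set the count
+ *	...
+ *	ret = __memp_free_freelist(dbmfp);	drop the reference
+ */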
+#endif
+
+/*
+ * __memp_set_last_pgno -- set the last page of the file
+ *
+ * PUBLIC: int __memp_set_last_pgno __P((DB_MPOOLFILE *, db_pgno_t));
+ */
+int
+__memp_set_last_pgno(dbmfp, pgno)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+{
+ MPOOLFILE *mfp;
+
+ mfp = dbmfp->mfp;
+
+ if (mfp->mpf_cnt == 1) {
+ MUTEX_LOCK(dbmfp->env, mfp->mutex);
+ if (mfp->mpf_cnt == 1)
+ dbmfp->mfp->last_pgno = pgno;
+ MUTEX_UNLOCK(dbmfp->env, mfp->mutex);
+ }
+ return (0);
+}
diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c
new file mode 100644
index 00000000..47531528
--- /dev/null
+++ b/src/mp/mp_mvcc.c
@@ -0,0 +1,636 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __pgno_cmp __P((const void *, const void *));
+
+/*
+ * __memp_bh_settxn --
+ * Set the transaction that owns the given buffer.
+ *
+ * PUBLIC: int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *, BH *, void *));
+ */
+int
+__memp_bh_settxn(dbmp, mfp, bhp, vtd)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ BH *bhp;
+ void *vtd;
+{
+ ENV *env;
+ TXN_DETAIL *td;
+
+ env = dbmp->env;
+ td = (TXN_DETAIL *)vtd;
+
+ if (td == NULL) {
+ __db_errx(env, DB_STR_A("3002",
+ "%s: non-transactional update to a multiversion file",
+ "%s"), __memp_fns(dbmp, mfp));
+ return (EINVAL);
+ }
+
+ if (bhp->td_off != INVALID_ROFF) {
+ DB_ASSERT(env, BH_OWNER(env, bhp) == td);
+ return (0);
+ }
+
+ bhp->td_off = R_OFFSET(&env->tx_handle->reginfo, td);
+ return (__txn_add_buffer(env, td));
+}
+
+/*
+ * __memp_skip_curadj --
+ * Indicate whether a cursor adjustment can be skipped for a snapshot
+ * cursor.
+ *
+ * PUBLIC: int __memp_skip_curadj __P((DBC *, db_pgno_t));
+ */
+int
+__memp_skip_curadj(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ BH *bhp;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ DB_TXN *txn;
+ ENV *env;
+ MPOOLFILE *mfp;
+ REGINFO *infop;
+ roff_t mf_offset;
+ int ret, skip;
+ u_int32_t bucket;
+
+ env = dbc->env;
+ dbmp = env->mp_handle;
+ dbmfp = dbc->dbp->mpf;
+ mfp = dbmfp->mfp;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ skip = 0;
+
+ for (txn = dbc->txn; txn->parent != NULL; txn = txn->parent)
+ ;
+
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them.  They are reset on each pass through this
+ * code, since the page number can change.
+ */
+ MP_GET_BUCKET(env, mfp, pgno, &infop, hp, bucket, ret);
+ if (ret != 0) {
+ /* Panic: there is no way to return the error. */
+ (void)__env_panic(env, ret);
+ return (0);
+ }
+
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ if (bhp->pgno != pgno || bhp->mf_offset != mf_offset)
+ continue;
+
+ if (!BH_OWNED_BY(env, bhp, txn))
+ skip = 1;
+ break;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ return (skip);
+}
+
+#define DB_FREEZER_MAGIC 0x06102002
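+
+/*
+ * Editor's note: the reads and writes below imply the following freezer
+ * file layout.  Each file begins with a small header, followed by frozen
+ * page images:
+ *
+ *	u_int32_t magic		DB_FREEZER_MAGIC
+ *	db_pgno_t free		head of the free-page list (0 if empty)
+ *	db_pgno_t maxpgno	highest page number allocated so far
+ *
+ * Each free page stores the number of the next free page in its first
+ * db_pgno_t bytes, so the free list is threaded through the file itself.
+ */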
+
+/*
+ * __memp_bh_freeze --
+ * Save a buffer to temporary storage in case it is needed later by
+ * a snapshot transaction. This function should be called with the buffer
+ * locked and will exit with it locked. A BH_FROZEN buffer header is
+ * allocated to represent the frozen data in mpool.
+ *
+ * PUBLIC: int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *,
+ * PUBLIC: BH *, int *));
+ */
+int
+__memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ DB_MPOOL_HASH *hp;
+ BH *bhp;
+ int *need_frozenp;
+{
+ BH *frozen_bhp;
+ BH_FROZEN_ALLOC *frozen_alloc;
+ DB_FH *fhp;
+ ENV *env;
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ db_mutex_t mutex;
+ db_pgno_t maxpgno, newpgno, nextfree;
+ size_t nio;
+ int created, h_locked, ret, t_ret;
+ u_int32_t magic, nbucket, ncache, pagesize;
+ char filename[100], *real_name;
+
+ env = dbmp->env;
+ c_mp = infop->primary;
+ created = h_locked = ret = 0;
+ /* Find the associated MPOOLFILE. */
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ pagesize = mfp->pagesize;
+ real_name = NULL;
+ fhp = NULL;
+
+ MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE);
+
+ MPOOL_REGION_LOCK(env, infop);
+ frozen_bhp = SH_TAILQ_FIRST(&c_mp->free_frozen, __bh);
+ if (frozen_bhp != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->free_frozen, frozen_bhp, hq, __bh);
+ *need_frozenp = SH_TAILQ_EMPTY(&c_mp->free_frozen);
+ } else {
+ *need_frozenp = 1;
+
+ /* There might be a small amount of unallocated space. */
+ if (__env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
+ &frozen_alloc) == 0) {
+ frozen_bhp = (BH *)(frozen_alloc + 1);
+ frozen_bhp->mtx_buf = MUTEX_INVALID;
+ SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
+ frozen_alloc, links);
+ }
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ /*
+ * If we can't get a frozen buffer header, return ENOMEM immediately:
+ * we don't want to call __memp_alloc recursively. __memp_alloc will
+ * turn the next free page it finds into frozen buffer headers.
+ */
+ if (frozen_bhp == NULL) {
+ ret = ENOMEM;
+ goto err;
+ }
+
+ /*
+ * For now, keep things simple and have one file per page size per
+ * hash bucket. This improves concurrency but can mean lots of files
+ * if there is lots of freezing.
+ */
+ ncache = (u_int32_t)(infop - dbmp->reginfo);
+ nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab));
+ snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK",
+ (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024);
+
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, filename, NULL, &real_name)) != 0)
+ goto err;
+
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE) && !F_ISSET(bhp, BH_FROZEN));
+
+ if (BH_REFCOUNT(bhp) > 1 || F_ISSET(bhp, BH_DIRTY)) {
+ ret = EBUSY;
+ goto err;
+ }
+
+ if ((ret = __os_open(env, real_name, pagesize,
+ DB_OSO_CREATE | DB_OSO_EXCL, env->db_mode, &fhp)) == 0) {
+ /* We're creating the file -- initialize the metadata page. */
+ created = 1;
+ magic = DB_FREEZER_MAGIC;
+ maxpgno = newpgno = 0;
+ if ((ret = __os_write(env, fhp,
+ &magic, sizeof(u_int32_t), &nio)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &newpgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &maxpgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ } else if (ret == EEXIST)
+ ret = __os_open(env,
+ real_name, pagesize, 0, env->db_mode, &fhp);
+ if (ret != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp,
+ &magic, sizeof(u_int32_t), &nio)) != 0 ||
+ (ret = __os_read(env, fhp,
+ &newpgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_read(env, fhp,
+ &maxpgno, sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+ if (magic != DB_FREEZER_MAGIC) {
+ ret = EINVAL;
+ goto err;
+ }
+ if (newpgno == 0) {
+ newpgno = ++maxpgno;
+ if ((ret = __os_seek(env,
+ fhp, 0, 0, sizeof(u_int32_t) + sizeof(db_pgno_t))) != 0 ||
+ (ret = __os_write(env, fhp, &maxpgno, sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ } else {
+ if ((ret = __os_seek(env, fhp, newpgno, pagesize, 0)) != 0 ||
+ (ret = __os_read(env, fhp, &nextfree, sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ if ((ret =
+ __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
+ (ret = __os_write(env, fhp, &nextfree, sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ }
+
+ /* Write the buffer to the allocated page. */
+ if ((ret = __os_io(env, DB_IO_WRITE, fhp, newpgno, pagesize, 0,
+ pagesize, bhp->buf, &nio)) != 0)
+ goto err;
+
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Set up the frozen_bhp with the freezer page number. The original
+ * buffer header is about to be freed, so transfer resources to the
+ * frozen header here.
+ */
+ mutex = frozen_bhp->mtx_buf;
+#ifdef DIAG_MVCC
+ memcpy(frozen_bhp, bhp, SSZ(BH, align_off));
+#else
+ memcpy(frozen_bhp, bhp, SSZA(BH, buf));
+#endif
+ atomic_init(&frozen_bhp->ref, 0);
+ if (mutex != MUTEX_INVALID)
+ frozen_bhp->mtx_buf = mutex;
+ else if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
+ DB_MUTEX_SHARED, &frozen_bhp->mtx_buf)) != 0)
+ goto err;
+ F_SET(frozen_bhp, BH_FROZEN);
+ F_CLR(frozen_bhp, BH_EXCLUSIVE);
+ ((BH_FROZEN_PAGE *)frozen_bhp)->spgno = newpgno;
+
+ /*
+ * We're about to add the frozen buffer header to the version chain, so
+ * we have temporarily created another buffer for the owning
+ * transaction.
+ */
+ if (frozen_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_add_buffer(env, BH_OWNER(env, frozen_bhp))) != 0) {
+ (void)__env_panic(env, ret);
+ goto err;
+ }
+
+ STAT_INC(env, mpool, freeze, hp->hash_frozen, bhp->pgno);
+
+ /*
+ * Add the frozen buffer to the version chain and update the hash
+ * bucket if this is the head revision. The original buffer will be
+ * freed by __memp_alloc calling __memp_bhfree (assuming no other
+ * thread has blocked waiting for it while we were freezing).
+ */
+ SH_CHAIN_INSERT_AFTER(bhp, frozen_bhp, vc, __bh);
+ if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) {
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
+ bhp, frozen_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /*
+ * Increment the file's block count -- freeing the original buffer will
+ * decrement it.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ if (0) {
+err: if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (created) {
+ DB_ASSERT(env, h_locked);
+ if ((t_ret = __os_unlink(env, real_name, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret == 0)
+ ret = EIO;
+ if (frozen_bhp != NULL) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ frozen_bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+ }
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ if (ret != 0 && ret != EBUSY && ret != ENOMEM)
+ __db_err(env, ret, "__memp_bh_freeze");
+
+ return (ret);
+}
+
+static int
+__pgno_cmp(a, b)
+ const void *a, *b;
+{
+ db_pgno_t *ap, *bp;
+
+ ap = (db_pgno_t *)a;
+ bp = (db_pgno_t *)b;
+
+ /*
+ * Compare explicitly: subtracting unsigned page numbers can wrap and
+ * give the wrong sign for large differences.
+ */
+ return (*ap < *bp ? -1 : (*ap > *bp ? 1 : 0));
+}
+
+/*
+ * __memp_bh_thaw --
+ * Free a buffer header in temporary storage. Optionally restore the
+ * buffer (if alloc_bhp != NULL). This function should be
+ * called with the hash bucket locked and will return with it unlocked.
+ *
+ * PUBLIC: int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *,
+ * PUBLIC: DB_MPOOL_HASH *, BH *, BH *));
+ */
+int
+__memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ DB_MPOOL_HASH *hp;
+ BH *frozen_bhp, *alloc_bhp;
+{
+ DB_FH *fhp;
+ ENV *env;
+#ifdef DIAGNOSTIC
+ DB_LSN vlsn;
+#endif
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ db_mutex_t mutex;
+ db_pgno_t *freelist, *ppgno, freepgno, maxpgno, spgno;
+ size_t nio;
+ u_int32_t listsize, magic, nbucket, ncache, ntrunc, nfree, pagesize;
+#ifdef HAVE_FTRUNCATE
+ int i;
+#endif
+ int h_locked, needfree, ret, t_ret;
+ char filename[100], *real_name;
+
+ env = dbmp->env;
+ fhp = NULL;
+ c_mp = infop->primary;
+ mfp = R_ADDR(dbmp->reginfo, frozen_bhp->mf_offset);
+ freelist = NULL;
+ pagesize = mfp->pagesize;
+ ret = 0;
+ real_name = NULL;
+
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL);
+ h_locked = 1;
+
+ DB_ASSERT(env, F_ISSET(frozen_bhp, BH_FROZEN) &&
+ !F_ISSET(frozen_bhp, BH_THAWED));
+ DB_ASSERT(env, alloc_bhp != NULL ||
+ SH_CHAIN_SINGLETON(frozen_bhp, vc) ||
+ (SH_CHAIN_HASNEXT(frozen_bhp, vc) &&
+ BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)));
+ DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN));
+
+ spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno;
+
+ if (alloc_bhp != NULL) {
+ mutex = alloc_bhp->mtx_buf;
+#ifdef DIAG_MVCC
+ memcpy(alloc_bhp, frozen_bhp, SSZ(BH, align_off));
+#else
+ memcpy(alloc_bhp, frozen_bhp, SSZA(BH, buf));
+#endif
+ alloc_bhp->mtx_buf = mutex;
+ MUTEX_LOCK(env, alloc_bhp->mtx_buf);
+ atomic_init(&alloc_bhp->ref, 1);
+ F_CLR(alloc_bhp, BH_FROZEN);
+ }
+
+ /*
+ * For now, keep things simple and have one file per page size per
+ * hash bucket. This improves concurrency but can mean lots of files
+ * if there is lots of freezing.
+ */
+ ncache = (u_int32_t)(infop - dbmp->reginfo);
+ nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab));
+ snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK",
+ (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024);
+
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, filename, NULL, &real_name)) != 0)
+ goto err;
+ if ((ret = __os_open(env,
+ real_name, pagesize, 0, env->db_mode, &fhp)) != 0)
+ goto err;
+
+ /*
+ * Read the first free page number -- we're about to free the page
+ * after we read it.
+ */
+ if ((ret = __os_read(env, fhp, &magic, sizeof(u_int32_t), &nio)) != 0 ||
+ (ret =
+ __os_read(env, fhp, &freepgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_read(env, fhp, &maxpgno, sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+
+ if (magic != DB_FREEZER_MAGIC) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Read the buffer from the frozen page. */
+ if (alloc_bhp != NULL) {
+ DB_ASSERT(env, !F_ISSET(frozen_bhp, BH_FREED));
+ if ((ret = __os_io(env, DB_IO_READ, fhp,
+ spgno, pagesize, 0, pagesize, alloc_bhp->buf, &nio)) != 0)
+ goto err;
+ }
+
+ /*
+ * Free the page from the file. If it's the last page, truncate.
+ * Otherwise, update the free-page linked list.
+ */
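+	/*
+	 * Editor's example of the truncation scan below: with sorted free
+	 * pages {2, 3, 5, 6, 7} and maxpgno 7, the scan stops at 5 (since
+	 * 3 != 5 - 1), so ntrunc is 3: the file is truncated after page 4
+	 * and the remaining free list is {2, 3}.
+	 */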
+ needfree = 1;
+ if (spgno == maxpgno) {
+ listsize = 100;
+ if ((ret = __os_malloc(env,
+ listsize * sizeof(db_pgno_t), &freelist)) != 0)
+ goto err;
+ nfree = 0;
+ while (freepgno != 0) {
+ if (nfree == listsize - 1) {
+ listsize *= 2;
+ if ((ret = __os_realloc(env,
+ listsize * sizeof(db_pgno_t),
+ &freelist)) != 0)
+ goto err;
+ }
+ freelist[nfree++] = freepgno;
+ if ((ret = __os_seek(env, fhp,
+ freepgno, pagesize, 0)) != 0 ||
+ (ret = __os_read(env, fhp, &freepgno,
+ sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+ }
+ freelist[nfree++] = spgno;
+ qsort(freelist, nfree, sizeof(db_pgno_t), __pgno_cmp);
+ for (ppgno = &freelist[nfree - 1]; ppgno > freelist; ppgno--)
+ if (*(ppgno - 1) != *ppgno - 1)
+ break;
+ ntrunc = (u_int32_t)(&freelist[nfree] - ppgno);
+ if (ntrunc == (u_int32_t)maxpgno) {
+ needfree = 0;
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0 ||
+ (ret = __os_unlink(env, real_name, 0)) != 0)
+ goto err;
+ }
+#ifdef HAVE_FTRUNCATE
+ else {
+ maxpgno -= (db_pgno_t)ntrunc;
+ if ((ret = __os_truncate(env, fhp,
+ maxpgno + 1, pagesize)) != 0)
+ goto err;
+
+ /* Fix up the linked list */
+ freelist[nfree - ntrunc] = 0;
+ if ((ret = __os_seek(env, fhp,
+ 0, 0, sizeof(u_int32_t))) != 0 ||
+ (ret = __os_write(env, fhp, &freelist[0],
+ sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_write(env, fhp, &maxpgno,
+ sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+
+ for (i = 0; i < (int)(nfree - ntrunc); i++)
+ if ((ret = __os_seek(env,
+ fhp, freelist[i], pagesize, 0)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &freelist[i + 1], sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ needfree = 0;
+ }
+#endif
+ }
+ if (needfree) {
+ if ((ret = __os_seek(env, fhp, spgno, pagesize, 0)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &freepgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
+ (ret = __os_write(env, fhp,
+ &spgno, sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Add the thawed buffer (if any) to the version chain. We can't
+ * do this any earlier, because we can't guarantee that another thread
+ * won't be waiting for it, which means we can't clean up if there are
+ * errors reading from the freezer. We can't do it any later, because
+ * we're about to free frozen_bhp, and without it we would need to do
+ * another cache lookup to find out where the new page should live.
+ */
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ if (alloc_bhp != NULL) {
+ alloc_bhp->priority = c_mp->lru_priority;
+
+ SH_CHAIN_INSERT_AFTER(frozen_bhp, alloc_bhp, vc, __bh);
+ if (!SH_CHAIN_HASNEXT(alloc_bhp, vc)) {
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp,
+ alloc_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh);
+ }
+ } else if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) {
+ if (SH_CHAIN_HASPREV(frozen_bhp, vc))
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp,
+ SH_CHAIN_PREV(frozen_bhp, vc, __bh), hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh);
+ }
+ SH_CHAIN_REMOVE(frozen_bhp, vc, __bh);
+
+ if (alloc_bhp == NULL && frozen_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_remove_buffer(env,
+ BH_OWNER(env, frozen_bhp), MUTEX_INVALID)) != 0) {
+ (void)__env_panic(env, ret);
+ goto err;
+ }
+ frozen_bhp->td_off = INVALID_ROFF;
+
+ /*
+ * If other threads are waiting for this buffer as well, they will have
+ * incremented the reference count and will be waiting on the mutex.
+ * For that reason, we can't unconditionally free the memory here.
+ */
+ needfree = (atomic_dec(env, &frozen_bhp->ref) == 0);
+ if (!needfree)
+ F_SET(frozen_bhp, BH_THAWED);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (F_ISSET(frozen_bhp, BH_EXCLUSIVE))
+ MUTEX_UNLOCK(env, frozen_bhp->mtx_buf);
+ h_locked = 0;
+ if (needfree) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, frozen_bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+
+#ifdef HAVE_STATISTICS
+ if (alloc_bhp != NULL)
+ STAT_INC_VERB(env, mpool, thaw,
+ hp->hash_thawed, __memp_fns(dbmp, mfp), frozen_bhp->pgno);
+ else
+ STAT_INC_VERB(env, mpool, free_frozen, hp->hash_frozen_freed,
+ __memp_fns(dbmp, mfp), frozen_bhp->pgno);
+#endif
+
+ if (0) {
+err: if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret == 0)
+ ret = EIO;
+ }
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ if (freelist != NULL)
+ __os_free(env, freelist);
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ __db_err(env, ret, "__memp_bh_thaw");
+
+ return (ret);
+}
diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c
new file mode 100644
index 00000000..07134de7
--- /dev/null
+++ b/src/mp/mp_region.c
@@ -0,0 +1,620 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+
+static int __memp_init_config __P((ENV *, MPOOL *));
+static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *));
+
+#define MPOOL_DEFAULT_PAGESIZE (4 * 1024)
+
+/*
+ * __memp_open --
+ * Internal version of memp_open: only called from ENV->open.
+ *
+ * PUBLIC: int __memp_open __P((ENV *, int));
+ */
+int
+__memp_open(env, create_ok)
+ ENV *env;
+ int create_ok;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ MPOOL *mp, *mp_i;
+ REGINFO reginfo;
+ roff_t cache_size, max_size, reg_size;
+ u_int i, max_nreg;
+ u_int32_t htab_buckets, *regids;
+ int ret;
+
+ dbenv = env->dbenv;
+ cache_size = 0;
+
+ /* Calculate the region size and hash bucket count. */
+ __memp_region_size(env, &max_size, &htab_buckets);
+
+ /* Create and initialize the DB_MPOOL structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(*dbmp), &dbmp)) != 0)
+ return (ret);
+ LIST_INIT(&dbmp->dbregq);
+ TAILQ_INIT(&dbmp->dbmfq);
+ dbmp->env = env;
+
+ /* Join/create the first mpool region. */
+ memset(&reginfo, 0, sizeof(REGINFO));
+ reginfo.env = env;
+ reginfo.type = REGION_TYPE_MPOOL;
+ reginfo.id = INVALID_REGION_ID;
+ reginfo.flags = REGION_JOIN_OK;
+
+ /* Calculate the minimum allocation. */
+ reg_size = sizeof(MPOOL);
+ reg_size += MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH);
+ reg_size += htab_buckets * sizeof(DB_MPOOL_HASH);
+ reg_size += (dbenv->mp_pagesize == 0 ?
+ MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize) * 10;
+ if (reg_size > max_size)
+ reg_size = max_size;
+
+ if (create_ok)
+ F_SET(&reginfo, REGION_CREATE_OK);
+ if ((ret = __env_region_attach(env, &reginfo, reg_size, max_size)) != 0)
+ goto err;
+ cache_size = reginfo.rp->max;
+ if (F_ISSET(env, ENV_PRIVATE))
+ reginfo.max_alloc = reginfo.rp->max;
+
+ /*
+ * If we created the region, initialize it. Create or join any
+ * additional regions.
+ */
+ if (F_ISSET(&reginfo, REGION_CREATE)) {
+ /*
+ * We define how many regions there are going to be, allocate
+ * the REGINFO structures and create them. Make sure we don't
+ * clear the wrong entries on error.
+ */
+ max_nreg = __memp_max_regions(env);
+ if ((ret = __os_calloc(env,
+ max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ goto err;
+ /* Make sure we don't clear the wrong entries on error. */
+ dbmp->reginfo[0] = reginfo;
+ for (i = 1; i < max_nreg; ++i)
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
+
+ /* Initialize the first region. */
+ if ((ret = __memp_init(env, dbmp,
+ 0, htab_buckets, max_nreg)) != 0)
+ goto err;
+
+ /*
+ * Create/initialize remaining regions and copy their IDs into
+ * the first region.
+ */
+ mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ regids[0] = dbmp->reginfo[0].id;
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
+ dbmp->reginfo[i].env = env;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
+ dbmp->reginfo[i].flags = REGION_CREATE_OK;
+ if ((ret = __env_region_attach(
+ env, &dbmp->reginfo[i], reg_size, max_size)) != 0)
+ goto err;
+ if (F_ISSET(env, ENV_PRIVATE))
+ dbmp->reginfo[i].max_alloc = max_size;
+ cache_size += dbmp->reginfo[i].rp->max;
+ if ((ret = __memp_init(env, dbmp,
+ i, htab_buckets, max_nreg)) != 0)
+ goto err;
+
+ regids[i] = dbmp->reginfo[i].id;
+ }
+ mp->gbytes = (u_int32_t) (cache_size / GIGABYTE);
+ mp->bytes = (u_int32_t) (cache_size % GIGABYTE);
+ } else {
+ /*
+ * Determine how many regions there are going to be, allocate
+ * the REGINFO structures and fill in local copies of that
+ * information.
+ */
+ mp = R_ADDR(&reginfo, reginfo.rp->primary);
+ dbenv->mp_ncache = mp->nreg;
+ if ((ret = __os_calloc(env,
+ mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ goto err;
+ /* Make sure we don't clear the wrong entries on error. */
+ for (i = 0; i < dbenv->mp_ncache; ++i)
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
+ dbmp->reginfo[0] = reginfo;
+
+ /* Join remaining regions. */
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
+ dbmp->reginfo[i].env = env;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = regids[i];
+ dbmp->reginfo[i].flags = REGION_JOIN_OK;
+ if ((ret = __env_region_attach(
+ env, &dbmp->reginfo[i], 0, 0)) != 0)
+ goto err;
+ }
+ }
+
+ /* Set the local addresses for the regions. */
+ for (i = 0; i < dbenv->mp_ncache; ++i) {
+ mp_i = dbmp->reginfo[i].primary =
+ R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
+ dbmp->reginfo[i].mtx_alloc = mp_i->mtx_region;
+ }
+
+ /* If the region is threaded, allocate a mutex to lock the handles. */
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOL_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbmp->mutex)) != 0)
+ goto err;
+
+ env->mp_handle = dbmp;
+
+ /* A process joining the region may reset the mpool configuration. */
+ if ((ret = __memp_init_config(env, mp)) != 0)
+ return (ret);
+
+ return (0);
+
+err: env->mp_handle = NULL;
+ if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
+ for (i = 0; i < dbenv->mp_ncache; ++i)
+ if (dbmp->reginfo[i].id != INVALID_REGION_ID)
+ (void)__env_region_detach(
+ env, &dbmp->reginfo[i], 0);
+ __os_free(env, dbmp->reginfo);
+ }
+
+ (void)__mutex_free(env, &dbmp->mutex);
+ __os_free(env, dbmp);
+ return (ret);
+}
+
+/*
+ * __memp_init --
+ * Initialize a MPOOL structure in shared memory.
+ *
+ * PUBLIC: int __memp_init
+ * PUBLIC: __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
+ */
+int
+__memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
+ ENV *env;
+ DB_MPOOL *dbmp;
+ u_int reginfo_off, max_nreg;
+ u_int32_t htab_buckets;
+{
+ BH *frozen_bhp;
+ BH_FROZEN_ALLOC *frozen;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *htab, *hp;
+ MPOOL *mp, *main_mp;
+ REGINFO *infop;
+ db_mutex_t mtx_base, mtx_discard, mtx_prev;
+ u_int32_t i;
+ int ret;
+ void *p;
+
+ dbenv = env->dbenv;
+
+ infop = &dbmp->reginfo[reginfo_off];
+ if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0)
+ goto mem_err;
+ infop->rp->primary = R_OFFSET(infop, infop->primary);
+ mp = infop->primary;
+ memset(mp, 0, sizeof(*mp));
+
+ if ((ret =
+ __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0)
+ return (ret);
+
+ if (reginfo_off == 0) {
+ ZERO_LSN(mp->lsn);
+
+ mp->nreg = dbenv->mp_ncache;
+ mp->max_nreg = max_nreg;
+ if ((ret = __env_alloc(&dbmp->reginfo[0],
+ max_nreg * sizeof(u_int32_t), &p)) != 0)
+ goto mem_err;
+ mp->regids = R_OFFSET(dbmp->reginfo, p);
+ mp->nbuckets = dbenv->mp_ncache * htab_buckets;
+
+ /* Allocate file table space and initialize it. */
+ if ((ret = __env_alloc(infop,
+ MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0)
+ goto mem_err;
+ mp->ftab = R_OFFSET(infop, htab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++) {
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0)
+ return (ret);
+ SH_TAILQ_INIT(&htab[i].hash_bucket);
+ atomic_init(&htab[i].hash_page_dirty, 0);
+ }
+
+ /*
+ * Allocate all of the hash bucket mutexes up front. We do
+ * this so that we don't need to free and reallocate mutexes as
+ * the cache is resized.
+ */
+ mtx_base = mtx_prev = MUTEX_INVALID;
+ if (!MUTEX_ON(env) || F_ISSET(env, ENV_PRIVATE))
+ goto no_prealloc;
+ for (i = 0; i < mp->max_nreg * dbenv->mp_mtxcount; i++) {
+ if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
+ DB_MUTEX_SHARED, &mtx_discard)) != 0)
+ return (ret);
+ if (i == 0)
+ mtx_base = mtx_discard;
+ else
+ DB_ASSERT(env, mtx_base == MUTEX_INVALID ||
+ mtx_discard == mtx_prev + 1);
+ mtx_prev = mtx_discard;
+ }
+ } else {
+ main_mp = dbmp->reginfo[0].primary;
+ htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab);
+ mtx_base = htab[0].mtx_hash;
+ }
+
+ /*
+ * We preallocated all of the mutexes in a block, so for regions after
+ * the first, we skip mutexes in use in earlier regions. Each region
+ * has the same number of buckets.
+ */
+no_prealloc:
+ if (MUTEX_ON(env))
+ mtx_base += reginfo_off * dbenv->mp_mtxcount;
+
+ /* Allocate hash table space and initialize it. */
+ if ((ret = __env_alloc(infop,
+ htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0)
+ goto mem_err;
+ mp->htab = R_OFFSET(infop, htab);
+ for (i = 0; i < htab_buckets; i++) {
+ hp = &htab[i];
+ if (!MUTEX_ON(env) || dbenv->mp_mtxcount == 0)
+ hp->mtx_hash = MUTEX_INVALID;
+ else if (F_ISSET(env, ENV_PRIVATE)) {
+ if (i >= dbenv->mp_mtxcount)
+ hp->mtx_hash =
+ htab[i % dbenv->mp_mtxcount].mtx_hash;
+ else if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
+ DB_MUTEX_SHARED, &hp->mtx_hash)) != 0)
+ return (ret);
+ } else
+ hp->mtx_hash = mtx_base + (i % dbenv->mp_mtxcount);
+ SH_TAILQ_INIT(&hp->hash_bucket);
+ atomic_init(&hp->hash_page_dirty, 0);
+#ifdef HAVE_STATISTICS
+ hp->hash_io_wait = 0;
+ hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0;
+#endif
+ hp->flags = 0;
+ ZERO_LSN(hp->old_reader);
+ }
+ mp->htab_buckets = htab_buckets;
+ mp->htab_mutexes = dbenv->mp_mtxcount;
+ mp->pagesize = dbenv->mp_pagesize == 0 ?
+ MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize;
+
+ SH_TAILQ_INIT(&mp->free_frozen);
+ SH_TAILQ_INIT(&mp->alloc_frozen);
+
+ /*
+ * Pre-allocate one frozen buffer header. This avoids situations where
+ * the cache becomes full of pages and we don't even have the 28 bytes
+ * (or so) available to allocate a frozen buffer header.
+ */
+ if ((ret = __env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0)
+ goto mem_err;
+ SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links);
+ frozen_bhp = (BH *)(frozen + 1);
+ frozen_bhp->mtx_buf = MUTEX_INVALID;
+ SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq);
+
+ /*
+ * Only the environment creator knows the total cache size, so fill
+ * in those fields now.
+ */
+ mp->gbytes = dbenv->mp_gbytes;
+ mp->bytes = dbenv->mp_bytes;
+ infop->mtx_alloc = mp->mtx_region;
+ return (0);
+
+mem_err:__db_errx(env, DB_STR("3026",
+ "Unable to allocate memory for mpool region"));
+ return (ret);
+}
+
+/*
+ * PUBLIC: u_int32_t __memp_max_regions __P((ENV *));
+ */
+u_int32_t
+__memp_max_regions(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ roff_t reg_size, max_size;
+ size_t max_nreg;
+
+ dbenv = env->dbenv;
+
+ if (dbenv->mp_max_gbytes == 0 && dbenv->mp_max_bytes == 0)
+ return (dbenv->mp_ncache);
+ __memp_region_size(env, &reg_size, NULL);
+ max_size =
+ (roff_t)dbenv->mp_max_gbytes * GIGABYTE + dbenv->mp_max_bytes;
+ max_nreg = (max_size + reg_size / 2) / reg_size;
+
+ /* Sanity check that the number of regions fits in 32 bits. */
+ DB_ASSERT(env, max_nreg == (u_int32_t)max_nreg);
+
+ if (max_nreg <= dbenv->mp_ncache)
+ max_nreg = dbenv->mp_ncache;
+ return ((u_int32_t)max_nreg);
+}
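+
+/*
+ * Editor's example of the rounding above: with a 500MB region size and a
+ * 2GB cache maximum, max_nreg = (2GB + 250MB) / 500MB = 4 regions.
+ */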
+
+/*
+ * __memp_region_size --
+ * Size the region and figure out how many hash buckets we'll have.
+ */
+static void
+__memp_region_size(env, reg_sizep, htab_bucketsp)
+ ENV *env;
+ roff_t *reg_sizep;
+ u_int32_t *htab_bucketsp;
+{
+ DB_ENV *dbenv;
+ roff_t reg_size, cache_size;
+ u_int32_t pgsize;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Figure out how big each cache region is. Cast an operand to roff_t
+ * so we do 64-bit arithmetic as appropriate.
+ */
+ cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes;
+ reg_size = cache_size / dbenv->mp_ncache;
+ if (reg_sizep != NULL)
+ *reg_sizep = reg_size;
+
+ /*
+ * Figure out how many hash buckets each region will have.  Assume we
+ * want to keep each hash chain under 3 pages long.  We don't know the
+ * pagesize in advance, and it may differ for different files.  Use a
+ * pagesize of 4K for the calculation -- we walk these chains a lot, so
+ * they must be kept short.  We use 2.5 as this maintains compatibility
+ * with previous releases.
+ *
+ * XXX
+ * Cache sizes larger than 10TB would cause 32-bit wrapping in the
+ * calculation of the number of hash buckets. This probably isn't
+ * something we need to worry about right now, but is checked when the
+ * cache size is set.
+ */
+ if (htab_bucketsp != NULL) {
+ if (dbenv->mp_tablesize != 0)
+ *htab_bucketsp = __db_tablesize(dbenv->mp_tablesize);
+ else {
+ if ((pgsize = dbenv->mp_pagesize) == 0)
+ pgsize = MPOOL_DEFAULT_PAGESIZE;
+ *htab_bucketsp = __db_tablesize(
+ (u_int32_t)(reg_size / (2.5 * pgsize)));
+ }
+ }
+}
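+
+/*
+ * Editor's example: a single 32MB region with the default 4K pagesize gives
+ * (u_int32_t)(32MB / (2.5 * 4096)) = 3276 chains, which __db_tablesize then
+ * rounds for the hash table (to a nearby power of two, on this sketch's
+ * assumption).
+ */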
+
+/*
+ * __memp_region_mutex_count --
+ * Return the number of mutexes the mpool region will need.
+ *
+ * PUBLIC: u_int32_t __memp_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__memp_region_mutex_count(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t htab_buckets;
+ roff_t reg_size;
+ u_int32_t max_region, num_per_cache, pgsize;
+
+ dbenv = env->dbenv;
+
+ __memp_region_size(env, &reg_size, &htab_buckets);
+ /* MVCC's frozen buffer headers are the smallest per-buffer allocation. */
+ if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+ pgsize = sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE);
+ else if ((pgsize = dbenv->mp_pagesize) == 0)
+ pgsize = MPOOL_DEFAULT_PAGESIZE;
+ max_region = __memp_max_regions(env);
+
+ /*
+ * We need a couple of mutexes for the region itself, one for each
+ * file handle (MPOOLFILE) the application allocates, one for each
+ * of the MPOOL_FILE_BUCKETS, and each cache has one mutex per
+ * hash bucket. We then need one mutex per page in the cache,
+ * the worst case is really big if the pages are 512 bytes.
+ */
+ if (dbenv->mp_mtxcount != 0)
+ htab_buckets = dbenv->mp_mtxcount;
+ else
+ dbenv->mp_mtxcount = htab_buckets;
+ num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize);
+ return ((max_region * num_per_cache) + 50 + MPOOL_FILE_BUCKETS);
+}
+
+/*
+ * __memp_init_config --
+ * Initialize shared configuration information.
+ */
+static int
+__memp_init_config(env, mp)
+ ENV *env;
+ MPOOL *mp;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ MPOOL_SYSTEM_LOCK(env);
+ if (dbenv->mp_mmapsize != 0)
+ mp->mp_mmapsize = (db_size_t)dbenv->mp_mmapsize;
+ if (dbenv->mp_maxopenfd != 0)
+ mp->mp_maxopenfd = dbenv->mp_maxopenfd;
+ if (dbenv->mp_maxwrite != 0)
+ mp->mp_maxwrite = dbenv->mp_maxwrite;
+ if (dbenv->mp_maxwrite_sleep != 0)
+ mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __memp_env_refresh --
+ * Clean up after the mpool system on a close or failed open.
+ *
+ * PUBLIC: int __memp_env_refresh __P((ENV *));
+ */
+int
+__memp_env_refresh(env)
+ ENV *env;
+{
+ BH *bhp;
+ BH_FROZEN_ALLOC *frozen_alloc;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ DB_MPREG *mpreg;
+ MPOOL *mp, *c_mp;
+ REGINFO *infop;
+ u_int32_t bucket, i, nreg;
+ int ret, t_ret;
+
+ ret = 0;
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ nreg = mp->nreg;
+ hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+ * filesystem-backed or system shared memory regions, that memory isn't
+ * owned by any particular process.
+ */
+ if (!F_ISSET(env, ENV_PRIVATE))
+ goto not_priv;
+
+ /* Discard buffers. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
+ bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ while ((bhp = SH_TAILQ_FIRST(
+ &hp->hash_bucket, __bh)) != NULL)
+ if (F_ISSET(bhp, BH_FROZEN))
+ SH_TAILQ_REMOVE(
+ &hp->hash_bucket, bhp,
+ hq, __bh);
+ else {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ atomic_dec(env,
+ &hp->hash_page_dirty);
+ F_CLR(bhp,
+ BH_DIRTY | BH_DIRTY_CREATE);
+ }
+ atomic_inc(env, &bhp->ref);
+ if ((t_ret = __memp_bhfree(dbmp, infop,
+ R_ADDR(dbmp->reginfo,
+ bhp->mf_offset), hp, bhp,
+ BH_FREE_FREEMEM |
+ BH_FREE_UNLOCKED)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
+ MPOOL_REGION_LOCK(env, infop);
+ while ((frozen_alloc = SH_TAILQ_FIRST(
+ &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
+ links, __bh_frozen_a);
+ __env_alloc_free(infop, frozen_alloc);
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+
+not_priv:
+ /* Discard DB_MPOOLFILEs. */
+ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
+ if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard DB_MPREGs. */
+ if (dbmp->pg_inout != NULL)
+ __os_free(env, dbmp->pg_inout);
+ while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
+ LIST_REMOVE(mpreg, q);
+ __os_free(env, mpreg);
+ }
+
+ /* Discard the DB_MPOOL thread mutex. */
+ if ((t_ret = __mutex_free(env, &dbmp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ /* Discard REGION IDs. */
+ infop = &dbmp->reginfo[0];
+ infop->mtx_alloc = MUTEX_INVALID;
+ __memp_free(infop, R_ADDR(infop, mp->regids));
+
+ /* Discard all the MPOOLFILEs. */
+ if ((t_ret = __memp_discard_all_mpfs(env, mp)) != 0 && ret == 0)
+ ret = t_ret;
+ /* Discard the File table. */
+ __memp_free(infop, R_ADDR(infop, mp->ftab));
+
+ /* Discard Hash tables. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ infop->mtx_alloc = MUTEX_INVALID;
+ __memp_free(infop, R_ADDR(infop, c_mp->htab));
+ }
+ }
+
+ /* Detach from the region. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ if ((t_ret =
+ __env_region_detach(env, infop, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard DB_MPOOL. */
+ __os_free(env, dbmp->reginfo);
+ __os_free(env, dbmp);
+
+ env->mp_handle = NULL;
+ return (ret);
+}
diff --git a/src/mp/mp_register.c b/src/mp/mp_register.c
new file mode 100644
index 00000000..dc7015a7
--- /dev/null
+++ b/src/mp/mp_register.c
@@ -0,0 +1,116 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+/*
+ * memp_register_pp --
+ * ENV->memp_register pre/post processing.
+ *
+ * PUBLIC: int __memp_register_pp __P((DB_ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+ */
+int
+__memp_register_pp(dbenv, ftype, pgin, pgout)
+ DB_ENV *dbenv;
+ int ftype;
+ int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL);
+
+ if (REP_ON(env)) {
+ __db_errx(env, DB_STR_A("3001",
+ "%smethod not permitted when replication is configured",
+ "%s"), "DB_ENV->memp_register: ");
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = __memp_register(env, ftype, pgin, pgout);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * memp_register --
+ * ENV->memp_register.
+ *
+ * PUBLIC: int __memp_register __P((ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+ */
+int
+__memp_register(env, ftype, pgin, pgout)
+ ENV *env;
+ int ftype;
+ int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+{
+ DB_MPOOL *dbmp;
+ DB_MPREG *mpreg;
+ int ret;
+
+ dbmp = env->mp_handle;
+
+ /*
+ * We keep the DB pgin/pgout functions outside of the linked list
+ * to avoid locking/unlocking the linked list on every page I/O.
+ *
+ * The Berkeley DB I/O conversion functions are registered when the
+ * environment is first created, so there's no need for locking here.
+ */
+ if (ftype == DB_FTYPE_SET) {
+ if (dbmp->pg_inout != NULL)
+ return (0);
+ if ((ret =
+ __os_malloc(env, sizeof(DB_MPREG), &dbmp->pg_inout)) != 0)
+ return (ret);
+ dbmp->pg_inout->ftype = ftype;
+ dbmp->pg_inout->pgin = pgin;
+ dbmp->pg_inout->pgout = pgout;
+ return (0);
+ }
+
+ /*
+ * The item may already have been registered.  If so, just update the
+ * entry, although it's probably unchanged.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ LIST_FOREACH(mpreg, &dbmp->dbregq, q)
+ if (mpreg->ftype == ftype) {
+ mpreg->pgin = pgin;
+ mpreg->pgout = pgout;
+ break;
+ }
+
+ if (mpreg == NULL) { /* New entry. */
+ if ((ret = __os_malloc(env, sizeof(DB_MPREG), &mpreg)) != 0) {
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ return (ret);
+ }
+ mpreg->ftype = ftype;
+ mpreg->pgin = pgin;
+ mpreg->pgout = pgout;
+
+ LIST_INSERT_HEAD(&dbmp->dbregq, mpreg, q);
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (0);
+}
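+
+/*
+ * Editor's sketch (not part of the library): registering byte-swap
+ * conversion callbacks through the public DB_ENV->memp_register method.
+ * The ftype value and the callback bodies are illustrative placeholders.
+ */
+#if 0
+static int
+example_pgin(dbenv, pgno, pgaddr, pgcookie)
+	DB_ENV *dbenv;
+	db_pgno_t pgno;
+	void *pgaddr;
+	DBT *pgcookie;
+{
+	/* Convert the page at pgaddr from on-disk to in-memory byte order. */
+	return (0);
+}
+
+static int
+example_pgout(dbenv, pgno, pgaddr, pgcookie)
+	DB_ENV *dbenv;
+	db_pgno_t pgno;
+	void *pgaddr;
+	DBT *pgcookie;
+{
+	/* Convert the page at pgaddr back to on-disk byte order. */
+	return (0);
+}
+
+static int
+example_register(dbenv)
+	DB_ENV *dbenv;
+{
+	/* 42 is an arbitrary application-chosen file type code. */
+	return (dbenv->memp_register(dbenv, 42, example_pgin, example_pgout));
+}
+#endif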
diff --git a/src/mp/mp_resize.c b/src/mp/mp_resize.c
new file mode 100644
index 00000000..97719554
--- /dev/null
+++ b/src/mp/mp_resize.c
@@ -0,0 +1,605 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __memp_add_bucket __P((DB_MPOOL *));
+static int __memp_add_region __P((DB_MPOOL *));
+static int __memp_map_regions __P((DB_MPOOL *));
+static int __memp_merge_buckets
+ __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t));
+static int __memp_remove_bucket __P((DB_MPOOL *));
+static int __memp_remove_region __P((DB_MPOOL *));
+
+/*
+ * PUBLIC: int __memp_get_bucket __P((ENV *, MPOOLFILE *,
+ * PUBLIC: db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *));
+ */
+int
+__memp_get_bucket(env, mfp, pgno, infopp, hpp, bucketp)
+ ENV *env;
+ MPOOLFILE *mfp;
+ db_pgno_t pgno;
+ REGINFO **infopp;
+ DB_MPOOL_HASH **hpp;
+ u_int32_t *bucketp;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp, *mp;
+ REGINFO *infop;
+ roff_t mf_offset;
+ u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region;
+ u_int32_t *regids;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ for (;;) {
+ nbuckets = mp->nbuckets;
+ MP_BUCKET(mf_offset, pgno, nbuckets, bucket);
+
+ /*
+ * Once we work out which region we are looking in, we have to
+ * check that we have that region mapped, and that the version
+ * we have matches the ID in the main mpool region. Otherwise
+ * we have to go and map in any regions that don't match and
+ * retry.
+ */
+ region = NREGION(mp, bucket);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+
+ for (;;) {
+ infop = *infopp = &dbmp->reginfo[region];
+ c_mp = infop->primary;
+
+ /* If we have the correct region mapped, we're done. */
+ if (c_mp != NULL && regids[region] == infop->id)
+ break;
+ if ((ret = __memp_map_regions(dbmp)) != 0)
+ return (ret);
+ }
+
+ /* If our caller wants the hash bucket, lock it here. */
+ if (hpp != NULL) {
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bucket - region * mp->htab_buckets];
+
+ MUTEX_READLOCK(env, hp->mtx_hash);
+
+ /*
+ * Check that we still have the correct region mapped.
+ */
+ if (regids[region] != infop->id) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ continue;
+ }
+
+ /*
+ * Now that the bucket is locked, we need to check that
+ * the cache has not been resized while we waited.
+ */
+ new_nbuckets = mp->nbuckets;
+ if (nbuckets != new_nbuckets) {
+ MP_BUCKET(mf_offset, pgno, new_nbuckets,
+ new_bucket);
+
+ if (new_bucket != bucket) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ continue;
+ }
+ }
+
+ *hpp = hp;
+ }
+
+ break;
+ }
+
+ if (bucketp != NULL)
+ *bucketp = bucket - region * mp->htab_buckets;
+ return (ret);
+}
+
+static int
+__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
+ DB_MPOOL *dbmp;
+ u_int32_t new_nbuckets, old_bucket, new_bucket;
+{
+ BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp;
+ DB_LSN vlsn;
+ DB_MPOOL_HASH *new_hp, *old_hp;
+ ENV *env;
+ MPOOL *mp, *new_mp, *old_mp;
+ MPOOLFILE *mfp;
+ REGINFO *new_infop, *old_infop;
+ u_int32_t bucket, high_mask, new_region, old_region;
+ int ret;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ new_bhp = NULL;
+ ret = 0;
+
+ MP_MASK(new_nbuckets, high_mask);
+
+ old_region = NREGION(mp, old_bucket);
+ old_infop = &dbmp->reginfo[old_region];
+ old_mp = old_infop->primary;
+ old_hp = R_ADDR(old_infop, old_mp->htab);
+ old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets];
+
+ new_region = NREGION(mp, new_bucket);
+ new_infop = &dbmp->reginfo[new_region];
+ new_mp = new_infop->primary;
+ new_hp = R_ADDR(new_infop, new_mp->htab);
+ new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets];
+
+ /*
+ * Before merging, we need to check that there are no old buffers left
+ * in the target hash bucket after a previous split.
+ */
+free_old:
+ MUTEX_LOCK(env, new_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
+ MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+
+ if (bucket != new_bucket) {
+ /*
+ * There is no way that an old buffer can be locked
+ * after a split, since everyone will look for it in
+ * the new hash bucket.
+ */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) &&
+ atomic_read(&bhp->ref) == 0);
+ atomic_inc(env, &bhp->ref);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhfree(dbmp, new_infop,
+ mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+ return (ret);
+ }
+
+ /*
+ * The free has modified the list of buffers and
+ * dropped the mutex. We need to start again.
+ */
+ goto free_old;
+ }
+ }
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+
+ /*
+ * Before we begin, make sure that all of the buffers we care about are
+ * not in use and not frozen. We do this because we can't drop the old
+ * hash bucket mutex once we start moving buffers around.
+ */
+retry: MUTEX_LOCK(env, old_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+
+ if (bucket == new_bucket && atomic_read(&bhp->ref) != 0) {
+ MUTEX_UNLOCK(env, old_hp->mtx_hash);
+ __os_yield(env, 0, 0);
+ goto retry;
+ } else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) {
+ atomic_inc(env, &bhp->ref);
+ /*
+ * We need to drop the hash bucket mutex to avoid
+ * self-blocking when we allocate a new buffer.
+ */
+ MUTEX_UNLOCK(env, old_hp->mtx_hash);
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ F_SET(bhp, BH_EXCLUSIVE);
+ if (BH_OBSOLETE(bhp, old_hp->old_reader, vlsn))
+ alloc_bhp = NULL;
+ else {
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_alloc(dbmp,
+ old_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+ }
+ /*
+ * But we need to lock the hash bucket again before
+ * thawing the buffer. The call to __memp_bh_thaw
+ * will unlock the hash bucket mutex.
+ */
+ MUTEX_LOCK(env, old_hp->mtx_hash);
+ if (F_ISSET(bhp, BH_THAWED)) {
+ ret = __memp_bhfree(dbmp, old_infop, NULL, NULL,
+ alloc_bhp,
+ BH_FREE_FREEMEM | BH_FREE_UNLOCKED);
+ } else
+ ret = __memp_bh_thaw(dbmp,
+ old_infop, old_hp, bhp, alloc_bhp);
+
+ /*
+ * We've dropped the mutex in order to thaw, so we need
+ * to go back to the beginning and check that all of
+ * the buffers we care about are still unlocked and
+ * unreferenced.
+ */
+err: atomic_dec(env, &bhp->ref);
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (ret != 0)
+ return (ret);
+ goto retry;
+ }
+ }
+
+ /*
+ * We now know that all of the buffers we care about are unlocked and
+ * unreferenced. Go ahead and copy them.
+ */
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * We ignore buffers that don't hash to the new bucket. We
+ * could also ignore clean buffers which are not part of a
+ * multiversion chain as long as they have a backing file.
+ */
+ if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) &&
+ SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file))
+ continue;
+
+ for (current_bhp = bhp, next_bhp = NULL;
+ current_bhp != NULL;
+ current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh),
+ next_bhp = alloc_bhp) {
+ /* Allocate in the new region. */
+ if ((ret = __memp_alloc(dbmp,
+ new_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ break;
+
+ alloc_bhp->ref = current_bhp->ref;
+ alloc_bhp->priority = current_bhp->priority;
+ alloc_bhp->pgno = current_bhp->pgno;
+ alloc_bhp->mf_offset = current_bhp->mf_offset;
+ alloc_bhp->flags = current_bhp->flags;
+ alloc_bhp->td_off = current_bhp->td_off;
+
+ /*
+ * We've duplicated the buffer, so now we need to
+ * update reference counts, including the counts in the
+ * per-MPOOLFILE and the transaction detail (for MVCC
+ * buffers).
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ if (alloc_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_add_buffer(env,
+ R_ADDR(&env->tx_handle->reginfo,
+ alloc_bhp->td_off))) != 0)
+ break;
+
+ memcpy(alloc_bhp->buf, bhp->buf, mfp->pagesize);
+
+ /*
+ * We build up the MVCC chain first, then insert the
+ * head (stored in new_bhp) once.
+ */
+ if (next_bhp == NULL) {
+ SH_CHAIN_INIT(alloc_bhp, vc);
+ new_bhp = alloc_bhp;
+ } else
+ SH_CHAIN_INSERT_BEFORE(
+ next_bhp, alloc_bhp, vc, __bh);
+ }
+
+ DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash);
+ MUTEX_LOCK(env, new_hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
+ if (F_ISSET(new_bhp, BH_DIRTY))
+ atomic_inc(env, &new_hp->hash_page_dirty);
+
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY);
+ atomic_dec(env, &old_hp->hash_page_dirty);
+ }
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+ }
+
+ if (ret == 0)
+ mp->nbuckets = new_nbuckets;
+ MUTEX_UNLOCK(env, old_hp->mtx_hash);
+
+ return (ret);
+}
+
+static int
+__memp_add_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+
+ new_bucket = mp->nbuckets;
+ /* We should always be adding buckets to the last region. */
+ DB_ASSERT(env, NREGION(mp, new_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets, high_mask);
+ old_bucket = new_bucket & (high_mask >> 1);
+
+ /*
+ * With fixed-sized regions, the new region is always smaller than the
+ * existing total cache size, so buffers always need to be copied. If
+ * we implement variable region sizes, it's possible that we will be
+ * splitting a hash bucket in the new region. Catch that here.
+ */
+ DB_ASSERT(env, NREGION(mp, old_bucket) != NREGION(mp, new_bucket));
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets + 1,
+ old_bucket, new_bucket));
+}
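+
+/*
+ * Editor's example: growing from 8 buckets to 9 adds bucket 8.  Assuming
+ * MP_MASK yields the next power-of-two-minus-one (15 here), old_bucket =
+ * 8 & (15 >> 1) = 0, so bucket 0's buffers are redistributed between
+ * buckets 0 and 8 -- a classic linear-hashing split.
+ */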
+
+static int
+__memp_add_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ roff_t cache_size, reg_size;
+ u_int i;
+ u_int32_t *regids;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
+
+ /* All cache regions are the same size. */
+ reg_size = dbmp->reginfo[0].rp->size;
+ ret = 0;
+
+ infop = &dbmp->reginfo[mp->nreg];
+ infop->env = env;
+ infop->type = REGION_TYPE_MPOOL;
+ infop->id = INVALID_REGION_ID;
+ infop->flags = REGION_CREATE_OK;
+ if ((ret = __env_region_attach(env, infop, reg_size, reg_size)) != 0)
+ return (ret);
+ if ((ret = __memp_init(env,
+ dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
+ return (ret);
+ cache_size += reg_size;
+ mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
+ mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ regids[mp->nreg++] = infop->id;
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_add_bucket(dbmp)) != 0)
+ break;
+
+ return (ret);
+}
+
+static int
+__memp_remove_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+
+ old_bucket = mp->nbuckets - 1;
+
+ /* We should always be removing buckets from the last region. */
+ DB_ASSERT(env, NREGION(mp, old_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets - 1, high_mask);
+ new_bucket = old_bucket & (high_mask >> 1);
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets - 1,
+ old_bucket, new_bucket));
+}
+
+static int
+__memp_remove_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ roff_t cache_size, reg_size;
+ u_int i;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
+ ret = 0;
+
+ if (mp->nreg == 1) {
+ __db_errx(env, DB_STR("3019",
+ "cannot remove the last cache"));
+ return (EINVAL);
+ }
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_remove_bucket(dbmp)) != 0)
+ return (ret);
+
+ /* Detach from the region then destroy it. */
+ infop = &dbmp->reginfo[mp->nreg];
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ hp = R_ADDR(infop, ((MPOOL*)infop->primary)->htab);
+ for (i = 0; i < env->dbenv->mp_mtxcount; i++)
+ if ((ret = __mutex_free(env, &hp[i].mtx_hash)) != 0)
+ return (ret);
+ }
+
+ ret = __env_region_detach(env, infop, 1);
+ if (ret == 0) {
+ mp->nreg--;
+ cache_size -= reg_size;
+ mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
+ mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
+ }
+
+ return (ret);
+}
+
+static int
+__memp_map_regions(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ int ret;
+ u_int i;
+ u_int32_t *regids;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ ret = 0;
+
+ for (i = 1; i < mp->nreg; ++i) {
+ if (dbmp->reginfo[i].primary != NULL &&
+ dbmp->reginfo[i].id == regids[i])
+ continue;
+
+ if (dbmp->reginfo[i].primary != NULL)
+ ret = __env_region_detach(env, &dbmp->reginfo[i], 0);
+
+ dbmp->reginfo[i].env = env;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = regids[i];
+ dbmp->reginfo[i].flags = REGION_JOIN_OK;
+ if ((ret =
+ __env_region_attach(env, &dbmp->reginfo[i], 0, 0)) != 0)
+ return (ret);
+ dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i],
+ dbmp->reginfo[i].rp->primary);
+ }
+
+ for (; i < mp->max_nreg; i++)
+ if (dbmp->reginfo[i].primary != NULL &&
+ (ret = __env_region_detach(env,
+ &dbmp->reginfo[i], 0)) != 0)
+ break;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
+ */
+int
+__memp_resize(dbmp, gbytes, bytes)
+ DB_MPOOL *dbmp;
+ u_int32_t gbytes, bytes;
+{
+ ENV *env;
+ MPOOL *mp;
+ int ret;
+ u_int32_t ncache;
+ roff_t reg_size, total_size;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ total_size = (roff_t)gbytes * GIGABYTE + bytes;
+ ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);
+
+ if (ncache < 1)
+ ncache = 1;
+ else if (ncache > mp->max_nreg) {
+ __db_errx(env, DB_STR_A("3020",
+ "cannot resize to %lu cache regions: maximum is %lu",
+ "%lu %lu"), (u_long)ncache, (u_long)mp->max_nreg);
+ return (EINVAL);
+ }
+
+ ret = 0;
+ MUTEX_LOCK(env, mp->mtx_resize);
+ while (mp->nreg != ncache)
+ if ((ret = (mp->nreg < ncache ?
+ __memp_add_region(dbmp) :
+ __memp_remove_region(dbmp))) != 0)
+ break;
+ MUTEX_UNLOCK(env, mp->mtx_resize);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *max_gbytesp, *max_bytesp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+ roff_t reg_size, max_size;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ max_size = mp->max_nreg * reg_size;
+ *max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
+ *max_bytesp = (u_int32_t)(max_size % GIGABYTE);
+ } else {
+ *max_gbytesp = dbenv->mp_max_gbytes;
+ *max_bytesp = dbenv->mp_max_bytes;
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__memp_set_cache_max(dbenv, max_gbytes, max_bytes)
+ DB_ENV *dbenv;
+ u_int32_t max_gbytes, max_bytes;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_cache_max");
+ dbenv->mp_max_gbytes = max_gbytes;
+ dbenv->mp_max_bytes = max_bytes;
+
+ return (0);
+}
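+
+/*
+ * Editor's sketch (not part of the library): an application that wants an
+ * online-resizable cache sets a starting size and a maximum before opening
+ * the environment, then grows it through DB_ENV->set_cachesize.  The "home"
+ * variable and the surrounding error handling are assumptions.
+ */
+#if 0
+	DB_ENV *dbenv;
+	int ret;
+
+	if ((ret = db_env_create(&dbenv, 0)) != 0)
+		return (ret);
+	/* Start at 32MB in one region; allow growth up to 1GB. */
+	if ((ret = dbenv->set_cachesize(dbenv, 0, 32 * 1024 * 1024, 1)) != 0 ||
+	    (ret = dbenv->set_cache_max(dbenv, 1, 0)) != 0 ||
+	    (ret = dbenv->open(dbenv, home, DB_CREATE | DB_INIT_MPOOL, 0)) != 0)
+		return (ret);
+	/* Later: grow to 64MB -- mpool adds regions via __memp_resize. */
+	if ((ret = dbenv->set_cachesize(dbenv, 0, 64 * 1024 * 1024, 0)) != 0)
+		return (ret);
+#endif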
diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c
new file mode 100644
index 00000000..246b44d7
--- /dev/null
+++ b/src/mp/mp_stat.c
@@ -0,0 +1,905 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static void __memp_print_bh __P((ENV *,
+ DB_MPOOL *, const char *, BH *, roff_t *));
+static int __memp_print_all __P((ENV *, u_int32_t));
+static int __memp_print_stats __P((ENV *, u_int32_t));
+static int __memp_print_hash __P((ENV *,
+ DB_MPOOL *, REGINFO *, roff_t *, u_int32_t));
+static int __memp_stat __P((ENV *,
+ DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+static void __memp_stat_wait
+ __P((ENV *, REGINFO *, MPOOL *, DB_MPOOL_STAT *, u_int32_t));
+static int __memp_file_stats __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+static int __memp_count_files __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+static int __memp_get_files __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+static int __memp_print_files __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+
+/*
+ * __memp_stat_pp --
+ * DB_ENV->memp_stat pre/post processing.
+ *
+ * PUBLIC: int __memp_stat_pp
+ * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+ */
+int
+__memp_stat_pp(dbenv, gspp, fspp, flags)
+ DB_ENV *dbenv;
+ DB_MPOOL_STAT **gspp;
+ DB_MPOOL_FSTAT ***fspp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_stat", DB_INIT_MPOOL);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_stat(env, gspp, fspp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_stat --
+ * ENV->memp_stat
+ */
+static int
+__memp_stat(env, gspp, fspp, flags)
+ ENV *env;
+ DB_MPOOL_STAT **gspp;
+ DB_MPOOL_FSTAT ***fspp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_FSTAT **tfsp;
+ DB_MPOOL_STAT *sp;
+ MPOOL *c_mp, *mp;
+ size_t len;
+ int ret;
+ u_int32_t i;
+ uintmax_t tmp_wait, tmp_nowait;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ /* Global statistics. */
+ if (gspp != NULL) {
+ *gspp = NULL;
+
+ if ((ret = __os_umalloc(env, sizeof(**gspp), gspp)) != 0)
+ return (ret);
+ memset(*gspp, 0, sizeof(**gspp));
+ sp = *gspp;
+
+ /*
+ * Initialization and information that is not maintained on
+ * a per-cache basis. Note that configuration information
+ * may be modified at any time, and so we have to lock.
+ */
+ sp->st_gbytes = mp->gbytes;
+ sp->st_bytes = mp->bytes;
+ sp->st_pagesize = mp->pagesize;
+ sp->st_ncache = mp->nreg;
+ sp->st_max_ncache = mp->max_nreg;
+ sp->st_regsize = dbmp->reginfo[0].rp->size;
+ sp->st_regmax = dbmp->reginfo[0].rp->max;
+ sp->st_sync_interrupted = mp->stat.st_sync_interrupted;
+
+ MPOOL_SYSTEM_LOCK(env);
+ sp->st_mmapsize = mp->mp_mmapsize;
+ sp->st_maxopenfd = mp->mp_maxopenfd;
+ sp->st_maxwrite = mp->mp_maxwrite;
+ sp->st_maxwrite_sleep = mp->mp_maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ /* Walk the cache list and accumulate the global information. */
+ for (i = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+
+ sp->st_map += c_mp->stat.st_map;
+ sp->st_cache_hit += c_mp->stat.st_cache_hit;
+ sp->st_cache_miss += c_mp->stat.st_cache_miss;
+ sp->st_page_create += c_mp->stat.st_page_create;
+ sp->st_page_in += c_mp->stat.st_page_in;
+ sp->st_page_out += c_mp->stat.st_page_out;
+ sp->st_ro_evict += c_mp->stat.st_ro_evict;
+ sp->st_rw_evict += c_mp->stat.st_rw_evict;
+ sp->st_page_trickle += c_mp->stat.st_page_trickle;
+ sp->st_pages += c_mp->pages;
+ /*
+ * st_page_dirty calculated by __memp_stat_hash
+ * st_page_clean calculated here
+ */
+ __memp_stat_hash(
+ &dbmp->reginfo[i], c_mp, &sp->st_page_dirty);
+ sp->st_page_clean = sp->st_pages - sp->st_page_dirty;
+ sp->st_hash_buckets += c_mp->htab_buckets;
+ sp->st_hash_mutexes += c_mp->htab_mutexes;
+ sp->st_hash_searches += c_mp->stat.st_hash_searches;
+ sp->st_hash_longest += c_mp->stat.st_hash_longest;
+ sp->st_hash_examined += c_mp->stat.st_hash_examined;
+ /*
+ * st_hash_nowait and st_hash_wait are
+ * calculated by __memp_stat_wait
+ */
+ __memp_stat_wait(
+ env, &dbmp->reginfo[i], c_mp, sp, flags);
+ __mutex_set_wait_info(env,
+ c_mp->mtx_region, &tmp_wait, &tmp_nowait);
+ sp->st_region_nowait += tmp_nowait;
+ sp->st_region_wait += tmp_wait;
+ sp->st_alloc += c_mp->stat.st_alloc;
+ sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets;
+ if (sp->st_alloc_max_buckets <
+ c_mp->stat.st_alloc_max_buckets)
+ sp->st_alloc_max_buckets =
+ c_mp->stat.st_alloc_max_buckets;
+ sp->st_alloc_pages += c_mp->stat.st_alloc_pages;
+ if (sp->st_alloc_max_pages <
+ c_mp->stat.st_alloc_max_pages)
+ sp->st_alloc_max_pages =
+ c_mp->stat.st_alloc_max_pages;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ __mutex_clear(env, c_mp->mtx_region);
+
+ memset(&c_mp->stat, 0, sizeof(c_mp->stat));
+ }
+ }
+
+ /*
+ * We have duplicate statistics fields in per-file structures
+ * and the cache. The counters are only incremented in the
+ * per-file structures, except if a file is flushed from the
+ * mpool, at which time we copy its information into the cache
+ * statistics. We added the cache information above, now we
+ * add the per-file information.
+ */
+ if ((ret = __memp_walk_files(env, mp, __memp_file_stats,
+ sp, NULL, fspp == NULL ? LF_ISSET(DB_STAT_CLEAR) : 0)) != 0)
+ return (ret);
+ }
+
+ /* Per-file statistics. */
+ if (fspp != NULL) {
+ *fspp = NULL;
+
+ /* Count the MPOOLFILE structures. */
+ i = 0;
+ len = 0;
+ if ((ret = __memp_walk_files(env,
+ mp, __memp_count_files, &len, &i, flags)) != 0)
+ return (ret);
+
+ if (i == 0)
+ return (0);
+ len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */
+
+ /* Allocate space */
+ if ((ret = __os_umalloc(env, len, fspp)) != 0)
+ return (ret);
+
+ tfsp = *fspp;
+ *tfsp = NULL;
+
+ /*
+ * Files may have been opened since we counted; don't walk
+ * off the end of the allocated space.
+ */
+ if ((ret = __memp_walk_files(env,
+ mp, __memp_get_files, &tfsp, &i, flags)) != 0)
+ return (ret);
+
+ *++tfsp = NULL;
+ }
+
+ return (0);
+}
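+
+/*
+ * Illustrative sketch (not part of this build): a caller of the documented
+ * DB_ENV->memp_stat method. The returned structures are allocated on the
+ * caller's behalf; assuming no application allocator was configured with
+ * DB_ENV->set_alloc, they are released with free().
+ *
+ *	DB_MPOOL_STAT *gsp;
+ *	DB_MPOOL_FSTAT **fsp, **p;
+ *
+ *	if (dbenv->memp_stat(dbenv, &gsp, &fsp, 0) == 0) {
+ *		printf("cache hits: %lu\n", (u_long)gsp->st_cache_hit);
+ *		for (p = fsp; p != NULL && *p != NULL; ++p)
+ *			printf("%s: %lu pages read\n",
+ *			    (*p)->file_name, (u_long)(*p)->st_page_in);
+ *		free(gsp);
+ *		free(fsp);
+ *	}
+ */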
+
+static int
+__memp_file_stats(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL_STAT *sp;
+
+ COMPQUIET(env, NULL);
+ COMPQUIET(countp, NULL);
+
+ sp = argp;
+
+ sp->st_map += mfp->stat.st_map;
+ sp->st_cache_hit += mfp->stat.st_cache_hit;
+ sp->st_cache_miss += mfp->stat.st_cache_miss;
+ sp->st_page_create += mfp->stat.st_page_create;
+ sp->st_page_in += mfp->stat.st_page_in;
+ sp->st_page_out += mfp->stat.st_page_out;
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&mfp->stat, 0, sizeof(mfp->stat));
+
+ return (0);
+}
+
+static int
+__memp_count_files(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ size_t len;
+
+ COMPQUIET(flags, 0);
+ dbmp = env->mp_handle;
+ len = *(size_t *)argp;
+
+ (*countp)++;
+ len += sizeof(DB_MPOOL_FSTAT *) +
+ sizeof(DB_MPOOL_FSTAT) + strlen(__memp_fns(dbmp, mfp)) + 1;
+
+ *(size_t *)argp = len;
+ return (0);
+}
+
+/*
+ * __memp_get_files --
+ * get file specific statistics
+ *
+ * Build each individual entry. We assume that an array of pointers is
+ * aligned correctly to be followed by an array of structures, which should
+ * be safe (in this particular case, the first element of the structure
+ * is a pointer, so we're doubly safe). The array is followed by space
+ * for the text file names.
+ */
+static int
+__memp_get_files(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_FSTAT **tfsp, *tstruct;
+ char *name, *tname;
+ size_t nlen;
+
+ if (*countp == 0)
+ return (0);
+
+ dbmp = env->mp_handle;
+ tfsp = *(DB_MPOOL_FSTAT ***)argp;
+
+ if (*tfsp == NULL) {
+ /* Add 1 to count because we need to skip over the NULL. */
+ tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1);
+ tname = (char *)(tstruct + *countp);
+ *tfsp = tstruct;
+ } else {
+ tstruct = *tfsp + 1;
+ tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1;
+ *++tfsp = tstruct;
+ }
+
+ name = __memp_fns(dbmp, mfp);
+ nlen = strlen(name) + 1;
+ memcpy(tname, name, nlen);
+ memcpy(tstruct, &mfp->stat, sizeof(mfp->stat));
+ tstruct->file_name = tname;
+
+ /* Grab the pagesize from the mfp. */
+ tstruct->st_pagesize = mfp->pagesize;
+
+ *(DB_MPOOL_FSTAT ***)argp = tfsp;
+ (*countp)--;
+
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&mfp->stat, 0, sizeof(mfp->stat));
+
+ return (0);
+}
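+
+/*
+ * The single allocation filled in above, sketched for two files (sizes
+ * illustrative, not to scale):
+ *
+ *	*fspp --> [ptr 0][ptr 1][NULL][FSTAT 0][FSTAT 1]["name0\0name1\0"]
+ *
+ * Each leading pointer addresses the corresponding DB_MPOOL_FSTAT, and
+ * each structure's file_name field points into the trailing name space.
+ */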
+
+/*
+ * __memp_stat_print_pp --
+ * ENV->memp_stat_print pre/post processing.
+ *
+ * PUBLIC: int __memp_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_stat_print", DB_INIT_MPOOL);
+
+#define DB_STAT_MEMP_FLAGS \
+ (DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR | DB_STAT_MEMP_HASH)
+ if ((ret = __db_fchk(env,
+ "DB_ENV->memp_stat_print", flags, DB_STAT_MEMP_FLAGS)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
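+
+/*
+ * Note: this path backs the documented DB_ENV->memp_stat_print method,
+ * which is also how the db_stat utility's -m option displays the cache,
+ * e.g. (the home directory is hypothetical):
+ *
+ *	db_stat -h /env/home -m
+ */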
+
+#define FMAP_ENTRIES 200 /* Files we map. */
+
+/*
+ * __memp_stat_print --
+ * ENV->memp_stat_print method.
+ *
+ * PUBLIC: int __memp_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__memp_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __memp_print_stats(env,
+ LF_ISSET(DB_STAT_ALL) ? flags : orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_MEMP_HASH) &&
+ (ret = __memp_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __memp_print_stats --
+ * Display default mpool region statistics.
+ */
+static int
+__memp_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_MPOOL_FSTAT **fsp, **tfsp;
+ DB_MPOOL_STAT *gsp;
+ int ret;
+
+ if ((ret = __memp_stat(env, &gsp, &fsp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default cache region information:");
+ __db_dlbytes(env, "Total cache size",
+ (u_long)gsp->st_gbytes, (u_long)0, (u_long)gsp->st_bytes);
+ __db_dl(env, "Number of caches", (u_long)gsp->st_ncache);
+ __db_dl(env, "Maximum number of caches", (u_long)gsp->st_max_ncache);
+ __db_dlbytes(env, "Pool individual cache size",
+ (u_long)0, (u_long)0, (u_long)gsp->st_regsize);
+ __db_dlbytes(env, "Pool individual cache max",
+ (u_long)0, (u_long)0, (u_long)gsp->st_regmax);
+ __db_dlbytes(env, "Maximum memory-mapped file size",
+ (u_long)0, (u_long)0, (u_long)gsp->st_mmapsize);
+ STAT_LONG("Maximum open file descriptors", gsp->st_maxopenfd);
+ STAT_LONG("Maximum sequential buffer writes", gsp->st_maxwrite);
+ STAT_LONG("Sleep after writing maximum sequential buffers",
+ gsp->st_maxwrite_sleep);
+ __db_dl(env,
+ "Requested pages mapped into the process' address space",
+ (u_long)gsp->st_map);
+ __db_dl_pct(env, "Requested pages found in the cache",
+ (u_long)gsp->st_cache_hit, DB_PCT(
+ gsp->st_cache_hit, gsp->st_cache_hit + gsp->st_cache_miss), NULL);
+ __db_dl(env, "Requested pages not found in the cache",
+ (u_long)gsp->st_cache_miss);
+ __db_dl(env,
+ "Pages created in the cache", (u_long)gsp->st_page_create);
+ __db_dl(env, "Pages read into the cache", (u_long)gsp->st_page_in);
+ __db_dl(env, "Pages written from the cache to the backing file",
+ (u_long)gsp->st_page_out);
+ __db_dl(env, "Clean pages forced from the cache",
+ (u_long)gsp->st_ro_evict);
+ __db_dl(env, "Dirty pages forced from the cache",
+ (u_long)gsp->st_rw_evict);
+ __db_dl(env, "Dirty pages written by trickle-sync thread",
+ (u_long)gsp->st_page_trickle);
+ __db_dl(env, "Current total page count",
+ (u_long)gsp->st_pages);
+ __db_dl(env, "Current clean page count",
+ (u_long)gsp->st_page_clean);
+ __db_dl(env, "Current dirty page count",
+ (u_long)gsp->st_page_dirty);
+ __db_dl(env, "Number of hash buckets used for page location",
+ (u_long)gsp->st_hash_buckets);
+ __db_dl(env, "Number of mutexes for the hash buckets",
+ (u_long)gsp->st_hash_mutexes);
+ __db_dl(env, "Assumed page size used",
+ (u_long)gsp->st_pagesize);
+ __db_dl(env,
+ "Total number of times hash chains searched for a page",
+ (u_long)gsp->st_hash_searches);
+ __db_dl(env, "The longest hash chain searched for a page",
+ (u_long)gsp->st_hash_longest);
+ __db_dl(env,
+ "Total number of hash chain entries checked for page",
+ (u_long)gsp->st_hash_examined);
+ __db_dl_pct(env,
+ "The number of hash bucket locks that required waiting",
+ (u_long)gsp->st_hash_wait, DB_PCT(
+ gsp->st_hash_wait, gsp->st_hash_wait + gsp->st_hash_nowait), NULL);
+ __db_dl_pct(env,
+ "The maximum number of times any hash bucket lock was waited for",
+ (u_long)gsp->st_hash_max_wait, DB_PCT(gsp->st_hash_max_wait,
+ gsp->st_hash_max_wait + gsp->st_hash_max_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)gsp->st_region_wait, DB_PCT(gsp->st_region_wait,
+ gsp->st_region_wait + gsp->st_region_nowait), NULL);
+ __db_dl(env, "The number of buffers frozen",
+ (u_long)gsp->st_mvcc_frozen);
+ __db_dl(env, "The number of buffers thawed",
+ (u_long)gsp->st_mvcc_thawed);
+ __db_dl(env, "The number of frozen buffers freed",
+ (u_long)gsp->st_mvcc_freed);
+ __db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc);
+ __db_dl(env,
+ "The number of hash buckets examined during allocations",
+ (u_long)gsp->st_alloc_buckets);
+ __db_dl(env,
+ "The maximum number of hash buckets examined for an allocation",
+ (u_long)gsp->st_alloc_max_buckets);
+ __db_dl(env, "The number of pages examined during allocations",
+ (u_long)gsp->st_alloc_pages);
+ __db_dl(env, "The max number of pages examined for an allocation",
+ (u_long)gsp->st_alloc_max_pages);
+ __db_dl(env, "Threads waited on page I/O", (u_long)gsp->st_io_wait);
+ __db_dl(env, "The number of times a sync is interrupted",
+ (u_long)gsp->st_sync_interrupted);
+
+ for (tfsp = fsp; fsp != NULL && *tfsp != NULL; ++tfsp) {
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Pool File: %s", (*tfsp)->file_name);
+ __db_dl(env, "Page size", (u_long)(*tfsp)->st_pagesize);
+ __db_dl(env,
+ "Requested pages mapped into the process' address space",
+ (u_long)(*tfsp)->st_map);
+ __db_dl_pct(env, "Requested pages found in the cache",
+ (u_long)(*tfsp)->st_cache_hit, DB_PCT((*tfsp)->st_cache_hit,
+ (*tfsp)->st_cache_hit + (*tfsp)->st_cache_miss), NULL);
+ __db_dl(env, "Requested pages not found in the cache",
+ (u_long)(*tfsp)->st_cache_miss);
+ __db_dl(env, "Pages created in the cache",
+ (u_long)(*tfsp)->st_page_create);
+ __db_dl(env, "Pages read into the cache",
+ (u_long)(*tfsp)->st_page_in);
+ __db_dl(env,
+ "Pages written from the cache to the backing file",
+ (u_long)(*tfsp)->st_page_out);
+ if ((*tfsp)->st_backup_spins != 0)
+ __db_dl(env,
+ "Spins while trying to backup the file",
+ (u_long)(*tfsp)->st_backup_spins);
+ }
+
+ __os_ufree(env, fsp);
+ __os_ufree(env, gsp);
+ return (0);
+}
+
+/*
+ * __memp_print_all --
+ * Display debugging mpool region statistics.
+ */
+static int
+__memp_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN cfn[] = {
+ { DB_MPOOL_NOFILE, "DB_MPOOL_NOFILE" },
+ { DB_MPOOL_UNLINK, "DB_MPOOL_UNLINK" },
+ { 0, NULL }
+ };
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ MPOOL *mp;
+ roff_t fmap[FMAP_ENTRIES + 1];
+ u_int32_t i, cnt;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ MPOOL_SYSTEM_LOCK(env);
+
+ __db_print_reginfo(env, dbmp->reginfo, "Mpool", flags);
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+ __db_msg(env, "MPOOL structure:");
+ __mutex_print_debug_single(
+ env, "MPOOL region mutex", mp->mtx_region, flags);
+ STAT_LSN("Maximum checkpoint LSN", &mp->lsn);
+ STAT_ULONG("Hash table entries", mp->htab_buckets);
+ STAT_ULONG("Hash table mutexes", mp->htab_mutexes);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_MPOOL handle information:");
+ __mutex_print_debug_single(
+ env, "DB_MPOOL handle mutex", dbmp->mutex, flags);
+ STAT_ULONG("Underlying cache regions", mp->nreg);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_MPOOLFILE structures:");
+ for (cnt = 0, dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
+ __db_msg(env, "File #%lu: %s: per-process, %s",
+ (u_long)cnt + 1, __memp_fn(dbmfp),
+ F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write");
+ STAT_ULONG("Reference count", dbmfp->ref);
+ STAT_ULONG("Pinned block reference count", dbmfp->ref);
+ STAT_ULONG("Clear length", dbmfp->clear_len);
+ __db_print_fileid(env, dbmfp->fileid, "\tID");
+ STAT_ULONG("File type", dbmfp->ftype);
+ STAT_ULONG("LSN offset", dbmfp->lsn_offset);
+ STAT_ULONG("Max gbytes", dbmfp->gbytes);
+ STAT_ULONG("Max bytes", dbmfp->bytes);
+ STAT_ULONG("Cache priority", dbmfp->priority);
+ STAT_POINTER("mmap address", dbmfp->addr);
+ STAT_ULONG("mmap length", dbmfp->len);
+ __db_prflags(env, NULL, dbmfp->flags, cfn, NULL, "\tFlags");
+ __db_print_fh(env, "File handle", dbmfp->fhp, flags);
+ }
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "MPOOLFILE structures:");
+ cnt = 0;
+ ret = __memp_walk_files(env, mp, __memp_print_files, fmap, &cnt, flags);
+ MPOOL_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ return (ret);
+
+ if (cnt < FMAP_ENTRIES)
+ fmap[cnt] = INVALID_ROFF;
+ else
+ fmap[FMAP_ENTRIES] = INVALID_ROFF;
+
+ /* Dump the individual caches. */
+ for (i = 0; i < mp->nreg; ++i) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Cache #%d:", i + 1);
+ if (i > 0)
+ __env_alloc_print(&dbmp->reginfo[i], flags);
+ if ((ret = __memp_print_hash(
+ env, dbmp, &dbmp->reginfo[i], fmap, flags)) != 0)
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+__memp_print_files(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ roff_t *fmap;
+ DB_MPOOL *dbmp;
+ u_int32_t mfp_flags;
+ static const FN fn[] = {
+ { MP_CAN_MMAP, "MP_CAN_MMAP" },
+ { MP_DIRECT, "MP_DIRECT" },
+ { MP_EXTENT, "MP_EXTENT" },
+ { MP_FAKE_DEADFILE, "deadfile" },
+ { MP_FAKE_FILEWRITTEN, "file written" },
+ { MP_FAKE_NB, "no backing file" },
+ { MP_FAKE_UOC, "unlink on close" },
+ { MP_NOT_DURABLE, "not durable" },
+ { MP_TEMP, "MP_TEMP" },
+ { 0, NULL }
+ };
+
+ dbmp = env->mp_handle;
+ fmap = argp;
+
+ __db_msg(env, "File #%d: %s", *countp + 1, __memp_fns(dbmp, mfp));
+ __mutex_print_debug_single(env, "Mutex", mfp->mutex, flags);
+
+ MUTEX_LOCK(env, mfp->mutex);
+ STAT_ULONG("Revision count", mfp->revision);
+ STAT_ULONG("Reference count", mfp->mpf_cnt);
+ STAT_ULONG("Sync/read only open count", mfp->neutral_cnt);
+ STAT_ULONG("Block count", mfp->block_cnt);
+ STAT_ULONG("Last page number", mfp->last_pgno);
+ STAT_ULONG("Original last page number", mfp->orig_last_pgno);
+ STAT_ULONG("Maximum page number", mfp->maxpgno);
+ STAT_LONG("Type", mfp->ftype);
+ STAT_LONG("Priority", mfp->priority);
+ STAT_LONG("Page's LSN offset", mfp->lsn_off);
+ STAT_LONG("Page's clear length", mfp->clear_len);
+
+ __db_print_fileid(env,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off), "\tID");
+
+ mfp_flags = 0;
+ if (mfp->deadfile)
+ FLD_SET(mfp_flags, MP_FAKE_DEADFILE);
+ if (mfp->file_written)
+ FLD_SET(mfp_flags, MP_FAKE_FILEWRITTEN);
+ if (mfp->no_backing_file)
+ FLD_SET(mfp_flags, MP_FAKE_NB);
+ if (mfp->unlink_on_close)
+ FLD_SET(mfp_flags, MP_FAKE_UOC);
+ __db_prflags(env, NULL, mfp_flags, fn, NULL, "\tFlags");
+
+ if (*countp < FMAP_ENTRIES)
+ fmap[*countp] = R_OFFSET(dbmp->reginfo, mfp);
+ (*countp)++;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (0);
+}
+
+/*
+ * __memp_print_hash --
+ * Display hash bucket statistics for a cache.
+ */
+static int
+__memp_print_hash(env, dbmp, reginfo, fmap, flags)
+ ENV *env;
+ DB_MPOOL *dbmp;
+ REGINFO *reginfo;
+ roff_t *fmap;
+ u_int32_t flags;
+{
+ BH *bhp, *vbhp;
+ DB_MPOOL_HASH *hp;
+ DB_MSGBUF mb;
+ MPOOL *c_mp;
+ u_int32_t bucket;
+
+ c_mp = reginfo->primary;
+ DB_MSGBUF_INIT(&mb);
+ STAT_ULONG("Hash table last-checked", c_mp->last_checked);
+ STAT_ULONG("Hash table LRU priority", c_mp->lru_priority);
+ STAT_ULONG("Hash table LRU generation", c_mp->lru_generation);
+ STAT_ULONG("Put counter", c_mp->put_counter);
+
+ /* Display the hash table list of BH's. */
+ __db_msg(env,
+ "BH hash table (%lu hash slots)", (u_long)c_mp->htab_buckets);
+ __db_msg(env, "bucket #: priority, I/O wait, [mutex]");
+ __db_msg(env, "\tpageno, file, ref, LSN, address, priority, flags");
+
+ for (hp = R_ADDR(reginfo, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) {
+ __db_msgadd(env, &mb,
+ "bucket %lu: %lu (%lu dirty)",
+ (u_long)bucket, (u_long)hp->hash_io_wait,
+ (u_long)atomic_read(&hp->hash_page_dirty));
+ if (hp->hash_frozen != 0)
+ __db_msgadd(env, &mb, "(MVCC %lu/%lu/%lu) ",
+ (u_long)hp->hash_frozen,
+ (u_long)hp->hash_thawed,
+ (u_long)hp->hash_frozen_freed);
+ __mutex_print_debug_stats(
+ env, &mb, hp->mtx_hash, flags);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+ for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
+ __memp_print_bh(env, dbmp, NULL, bhp, fmap);
+
+ /* Print the version chain, if it exists. */
+ for (vbhp = SH_CHAIN_PREV(bhp, vc, __bh);
+ vbhp != NULL;
+ vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) {
+ __memp_print_bh(env, dbmp,
+ " next:\t", vbhp, fmap);
+ }
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_print_bh --
+ * Display a BH structure.
+ */
+static void
+__memp_print_bh(env, dbmp, prefix, bhp, fmap)
+ ENV *env;
+ DB_MPOOL *dbmp;
+ const char *prefix;
+ BH *bhp;
+ roff_t *fmap;
+{
+ static const FN fn[] = {
+ { BH_CALLPGIN, "callpgin" },
+ { BH_DIRTY, "dirty" },
+ { BH_DIRTY_CREATE, "created" },
+ { BH_DISCARD, "discard" },
+ { BH_EXCLUSIVE, "exclusive" },
+ { BH_FREED, "freed" },
+ { BH_FROZEN, "frozen" },
+ { BH_TRASH, "trash" },
+ { BH_THAWED, "thawed" },
+ { 0, NULL }
+ };
+ DB_MSGBUF mb;
+ int i;
+
+ DB_MSGBUF_INIT(&mb);
+
+ if (prefix != NULL)
+ __db_msgadd(env, &mb, "%s", prefix);
+ else
+ __db_msgadd(env, &mb, "\t");
+
+ for (i = 0; i < FMAP_ENTRIES; ++i)
+ if (fmap[i] == INVALID_ROFF || fmap[i] == bhp->mf_offset)
+ break;
+
+ if (fmap[i] == INVALID_ROFF)
+ __db_msgadd(env, &mb, "%5lu, %lu, ",
+ (u_long)bhp->pgno, (u_long)bhp->mf_offset);
+ else
+ __db_msgadd(
+ env, &mb, "%5lu, #%d, ", (u_long)bhp->pgno, i + 1);
+
+ __db_msgadd(env, &mb, "%2lu, %lu/%lu", (u_long)atomic_read(&bhp->ref),
+ F_ISSET(bhp, BH_FROZEN) ? 0 : (u_long)LSN(bhp->buf).file,
+ F_ISSET(bhp, BH_FROZEN) ? 0 : (u_long)LSN(bhp->buf).offset);
+ if (bhp->td_off != INVALID_ROFF)
+ __db_msgadd(env, &mb, " (@%lu/%lu 0x%x)",
+ (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset,
+ BH_OWNER(env, bhp)->txnid);
+ __db_msgadd(env, &mb, ", %#08lx, %lu",
+ (u_long)R_OFFSET(dbmp->reginfo, bhp), (u_long)bhp->priority);
+ __db_prflags(env, &mb, bhp->flags, fn, " (", ")");
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __memp_stat_wait --
+ * Total hash bucket wait stats into the region.
+ */
+static void
+__memp_stat_wait(env, reginfo, mp, mstat, flags)
+ ENV *env;
+ REGINFO *reginfo;
+ MPOOL *mp;
+ DB_MPOOL_STAT *mstat;
+ u_int32_t flags;
+{
+ DB_MPOOL_HASH *hp;
+ u_int32_t i;
+ uintmax_t tmp_nowait, tmp_wait;
+
+ mstat->st_hash_max_wait = 0;
+ hp = R_ADDR(reginfo, mp->htab);
+ for (i = 0; i < mp->htab_buckets; i++, hp++) {
+ __mutex_set_wait_info(
+ env, hp->mtx_hash, &tmp_wait, &tmp_nowait);
+ mstat->st_hash_nowait += tmp_nowait;
+ mstat->st_hash_wait += tmp_wait;
+ if (tmp_wait > mstat->st_hash_max_wait) {
+ mstat->st_hash_max_wait = tmp_wait;
+ mstat->st_hash_max_nowait = tmp_nowait;
+ }
+ if (LF_ISSET(DB_STAT_CLEAR |
+ DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR)
+ __mutex_clear(env, hp->mtx_hash);
+
+ mstat->st_io_wait += hp->hash_io_wait;
+ mstat->st_mvcc_frozen += hp->hash_frozen;
+ mstat->st_mvcc_thawed += hp->hash_thawed;
+ mstat->st_mvcc_freed += hp->hash_frozen_freed;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ hp->hash_io_wait = 0;
+ hp->hash_frozen = 0;
+ hp->hash_thawed = 0;
+ hp->hash_frozen_freed = 0;
+ }
+ }
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__memp_stat_pp(dbenv, gspp, fspp, flags)
+ DB_ENV *dbenv;
+ DB_MPOOL_STAT **gspp;
+ DB_MPOOL_FSTAT ***fspp;
+ u_int32_t flags;
+{
+ COMPQUIET(gspp, NULL);
+ COMPQUIET(fspp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__memp_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
+
+/*
+ * __memp_stat_hash --
+ * Total hash bucket stats (other than mutex wait) into the region.
+ *
+ * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
+ */
+void
+__memp_stat_hash(reginfo, mp, dirtyp)
+ REGINFO *reginfo;
+ MPOOL *mp;
+ u_int32_t *dirtyp;
+{
+ DB_MPOOL_HASH *hp;
+ u_int32_t dirty, i;
+
+ hp = R_ADDR(reginfo, mp->htab);
+ for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++)
+ dirty += (u_int32_t)atomic_read(&hp->hash_page_dirty);
+ *dirtyp = dirty;
+}
diff --git a/src/mp/mp_sync.c b/src/mp/mp_sync.c
new file mode 100644
index 00000000..fa06b1d4
--- /dev/null
+++ b/src/mp/mp_sync.c
@@ -0,0 +1,965 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+typedef struct {
+ DB_MPOOL_HASH *track_hp; /* Hash bucket. */
+
+ roff_t track_off; /* Page file offset. */
+ db_pgno_t track_pgno; /* Page number. */
+} BH_TRACK;
+
+static int __bhcmp __P((const void *, const void *));
+static int __memp_close_flush_files __P((ENV *, int));
+static int __memp_sync_files __P((ENV *));
+static int __memp_sync_file __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+
+/*
+ * __memp_walk_files --
+ * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
+ * PUBLIC: int (*) __P((ENV *, MPOOLFILE *, void *,
+ * PUBLIC: u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
+ */
+int
+__memp_walk_files(env, mp, func, arg, countp, flags)
+ ENV *env;
+ MPOOL *mp;
+ int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+ void *arg;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOLFILE *mfp;
+ int i, ret, t_ret;
+
+ dbmp = env->mp_handle;
+ ret = 0;
+
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ if ((t_ret = func(env,
+ mfp, arg, countp, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
+ break;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
+ break;
+ }
+ return (ret);
+}
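+
+/*
+ * Illustrative sketch (not part of this build): a walker callback matching
+ * the signature above. This hypothetical one counts files with written
+ * buffers, assuming the caller supplies countp; a non-zero return stops
+ * the walk unless DB_STAT_MEMP_NOERROR was passed.
+ *
+ *	static int
+ *	count_written(env, mfp, argp, countp, flags)
+ *		ENV *env;
+ *		MPOOLFILE *mfp;
+ *		void *argp;
+ *		u_int32_t *countp;
+ *		u_int32_t flags;
+ *	{
+ *		COMPQUIET(env, NULL);
+ *		COMPQUIET(argp, NULL);
+ *		COMPQUIET(flags, 0);
+ *
+ *		if (mfp->file_written)
+ *			++*countp;
+ *		return (0);
+ *	}
+ */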
+
+/*
+ * __memp_discard_all_mpfs --
+ * Force-discard all mpoolfiles. When closing a private environment, we
+ * always want to discard all mpoolfiles to avoid a memory leak.
+ *
+ * PUBLIC: int __memp_discard_all_mpfs __P((ENV *, MPOOL *));
+ */
+int
+__memp_discard_all_mpfs(env, mp)
+ ENV *env;
+ MPOOL *mp;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOLFILE *mfp;
+ int i, ret, t_ret;
+
+ ret = t_ret = 0;
+ mfp = NULL;
+ hp = NULL;
+ dbmp = env->mp_handle;
+
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ while ((mfp = SH_TAILQ_FIRST(
+ &hp->hash_bucket, __mpoolfile)) != NULL) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_sync_pp --
+ * ENV->memp_sync pre/post processing.
+ *
+ * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
+ */
+int
+__memp_sync_pp(dbenv, lsnp)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "memp_sync", DB_INIT_MPOOL);
+
+ /*
+ * If no LSN is provided, flush the entire cache (reasonable usage
+ * even if there's no log subsystem configured).
+ */
+ if (lsnp != NULL)
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "memp_sync", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_sync --
+ * ENV->memp_sync.
+ *
+ * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
+ */
+int
+__memp_sync(env, flags, lsnp)
+ ENV *env;
+ u_int32_t flags;
+ DB_LSN *lsnp;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+ int interrupted, ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ /* If we've flushed to the requested LSN, return that information. */
+ if (lsnp != NULL) {
+ MPOOL_SYSTEM_LOCK(env);
+ if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) {
+ *lsnp = mp->lsn;
+
+ MPOOL_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ MPOOL_SYSTEM_UNLOCK(env);
+ }
+
+ if ((ret =
+ __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0)
+ return (ret);
+
+ if (!interrupted && lsnp != NULL) {
+ MPOOL_SYSTEM_LOCK(env);
+ if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
+ mp->lsn = *lsnp;
+ MPOOL_SYSTEM_UNLOCK(env);
+ }
+
+ return (0);
+}
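+
+/*
+ * Illustrative sketch (not part of this build): the documented
+ * DB_ENV->memp_sync method reaches this function. Passing a NULL LSN
+ * flushes every dirty buffer, rather than only those needed to bring
+ * the pool up to a given log record:
+ *
+ *	if ((ret = dbenv->memp_sync(dbenv, NULL)) != 0)
+ *		dbenv->err(dbenv, ret, "DB_ENV->memp_sync");
+ */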
+
+/*
+ * __memp_fsync_pp --
+ * DB_MPOOLFILE->sync pre/post processing.
+ *
+ * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
+ */
+int
+__memp_fsync_pp(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbmfp->env;
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fsync --
+ * DB_MPOOLFILE->sync.
+ *
+ * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
+ */
+int
+__memp_fsync(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ MPOOLFILE *mfp;
+
+ mfp = dbmfp->mfp;
+
+ /*
+ * If this handle doesn't have a file descriptor that's open for
+ * writing, or if the file is a temporary, or if the file hasn't
+ * been written since it was flushed, there's no reason to proceed
+ * further.
+ */
+ if (F_ISSET(dbmfp, MP_READONLY))
+ return (0);
+
+ if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file)
+ return (0);
+
+ if (mfp->file_written == 0)
+ return (0);
+
+ return (__memp_sync_int(
+ dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
+}
+
+/*
+ * __mp_xxx_fh --
+ * Return a file descriptor for DB 1.85 compatibility locking.
+ *
+ * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
+ */
+int
+__mp_xxx_fh(dbmfp, fhp)
+ DB_MPOOLFILE *dbmfp;
+ DB_FH **fhp;
+{
+ int ret;
+
+ /*
+ * This is a truly spectacular layering violation, intended ONLY to
+ * support compatibility for the DB 1.85 DB->fd call.
+ *
+ * Sync the database file to disk, creating the file as necessary.
+ *
+ * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
+ * The MP_READONLY test isn't interesting because we will either
+ * already have a file descriptor (we opened the database file for
+ * reading) or we aren't readonly (we created the database which
+ * requires write privileges). The MP_TEMP test isn't interesting
+ * because we want to write to the backing file regardless so that
+ * we get a file descriptor to return.
+ */
+ if ((*fhp = dbmfp->fhp) != NULL)
+ return (0);
+
+ if ((ret = __memp_sync_int(
+ dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
+ *fhp = dbmfp->fhp;
+ return (ret);
+}
+
+/*
+ * __memp_sync_int --
+ * Mpool sync internal function.
+ *
+ * PUBLIC: int __memp_sync_int __P((ENV *,
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
+ */
+int
+__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
+ ENV *env;
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t trickle_max, flags, *wrote_totalp;
+ int *interruptedp;
+{
+ BH *bhp;
+ BH_TRACK *bharray;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp, *mp;
+ MPOOLFILE *mfp;
+ db_mutex_t mutex;
+ roff_t last_mf_offset;
+ u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote_total;
+ int32_t wrote_cnt;
+ int dirty, filecnt, maxopenfd, required_write, ret, t_ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ last_mf_offset = INVALID_ROFF;
+ filecnt = wrote_total = 0;
+
+ if (wrote_totalp != NULL)
+ *wrote_totalp = 0;
+ if (interruptedp != NULL)
+ *interruptedp = 0;
+
+ /*
+ * If we're flushing the entire cache, taking a checkpoint or flushing
+ * a specific file, we really have to write the blocks and we have to
+ * confirm they made it to disk. Otherwise, we can skip a block if
+ * it's hard to get.
+ */
+ required_write = LF_ISSET(DB_SYNC_CACHE |
+ DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);
+
+ /* Get shared configuration information. */
+ MPOOL_SYSTEM_LOCK(env);
+ maxopenfd = mp->mp_maxopenfd;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ /* Assume one dirty page per bucket. */
+ ar_max = mp->nreg * mp->htab_buckets;
+ if ((ret =
+ __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
+ return (ret);
+
+ /*
+ * Walk each cache's list of buffers and mark every dirty buffer to be
+ * written, or to be potentially written, depending on our flags.
+ */
+ for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
+ c_mp = dbmp->reginfo[n_cache].primary;
+
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
+ /*
+ * We can check for empty buckets before locking as
+ * we only care if the pointer is zero or non-zero.
+ * We can ignore empty or clean buckets because we
+ * only need write buffers that were dirty before
+ * we started.
+ */
+#ifdef DIAGNOSTIC
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+#else
+ if (atomic_read(&hp->hash_page_dirty) == 0)
+#endif
+ continue;
+
+ dirty = 0;
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ /* Always ignore clean pages. */
+ if (!F_ISSET(bhp, BH_DIRTY))
+ continue;
+
+ dirty++;
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * Ignore in-memory files, unless the file is
+ * specifically being flushed.
+ */
+ if (mfp->no_backing_file)
+ continue;
+ if (!LF_ISSET(DB_SYNC_FILE) &&
+ F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * Ignore files that aren't involved in DB's
+ * transactional operations during checkpoints.
+ */
+ if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
+ mfp->lsn_off == DB_LSN_OFF_NOTSET)
+ continue;
+
+ /*
+ * Ignore files that aren't Queue extent files
+ * if we're flushing a Queue file with extents.
+ */
+ if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
+ !F_ISSET(mfp, MP_EXTENT))
+ continue;
+
+ /*
+ * If we're flushing a specific file, see if
+ * this page is from that file.
+ */
+ if (dbmfp != NULL && mfp != dbmfp->mfp)
+ continue;
+
+ /* Track the buffer, we want it. */
+ bharray[ar_cnt].track_hp = hp;
+ bharray[ar_cnt].track_pgno = bhp->pgno;
+ bharray[ar_cnt].track_off = bhp->mf_offset;
+ ar_cnt++;
+
+ /*
+ * If we run out of space, double and continue.
+ * Don't stop at trickle_max, we want to sort
+ * as large a sample set as possible in order
+ * to minimize disk seeks.
+ */
+ if (ar_cnt >= ar_max) {
+ if ((ret = __os_realloc(env,
+ (ar_max * 2) * sizeof(BH_TRACK),
+ &bharray)) != 0)
+ break;
+ ar_max *= 2;
+ }
+ }
+
+ if (ret != 0)
+ goto err;
+ /*
+ * We are only checking this in diagnostic mode
+ * since it requires extra latching to keep the count
+ * in sync with the number of bits counted.
+ */
+ DB_ASSERT(env,
+ dirty == (int)atomic_read(&hp->hash_page_dirty));
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
+ mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ STAT(++mp->stat.st_sync_interrupted);
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
+ }
+ }
+
+ /* If there are no buffers to write, we're done. */
+ if (ar_cnt == 0)
+ goto done;
+
+ /*
+ * Write the buffers in file/page order, trying to reduce seeks by the
+ * filesystem and, when pages are smaller than filesystem block sizes,
+ * reduce the actual number of writes.
+ */
+ if (ar_cnt > 1)
+ qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
+
+ /*
+ * If we're trickling buffers, only write enough to reach the correct
+ * percentage.
+ */
+ if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
+ ar_cnt = trickle_max;
+
+ /*
+ * Flush the log. We have to ensure the log records reflecting the
+ * changes on the database pages we're writing have already made it
+ * to disk. We still have to check the log each time we write a page
+ * (because pages we are about to write may be modified after we have
+ * flushed the log), but in general this will at least avoid any I/O
+ * on the log's part.
+ */
+ if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
+ goto err;
+
+ /*
+ * Walk the array, writing buffers. When we write a buffer, we NULL
+ * out its hash bucket pointer so we don't process a slot more than
+ * once.
+ */
+ for (i = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
+ if (i >= ar_cnt) {
+ i = 0;
+ __os_yield(env, 1, 0);
+ }
+ if ((hp = bharray[i].track_hp) == NULL)
+ continue;
+
+ /* Lock the hash bucket and find the buffer. */
+ mutex = hp->mtx_hash;
+ MUTEX_READLOCK(env, mutex);
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
+ if (bhp->pgno == bharray[i].track_pgno &&
+ bhp->mf_offset == bharray[i].track_off)
+ break;
+
+ /*
+ * If we can't find the buffer, we're done; somebody else must
+ * have written it.
+ *
+ * If the buffer isn't dirty, we're done, there's no work
+ * needed.
+ */
+ if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) {
+ MUTEX_UNLOCK(env, mutex);
+ --remaining;
+ bharray[i].track_hp = NULL;
+ continue;
+ }
+
+ /*
+ * If the buffer is locked by another thread, ignore it, we'll
+ * come back to it.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE)) {
+ MUTEX_UNLOCK(env, mutex);
+ if (!required_write) {
+ --remaining;
+ bharray[i].track_hp = NULL;
+ }
+ continue;
+ }
+
+ /* Pin the buffer into memory. */
+ atomic_inc(env, &bhp->ref);
+ MUTEX_UNLOCK(env, mutex);
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+ DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE));
+
+ /*
+ * When swapping the hash bucket mutex for the buffer mutex,
+ * we may have raced with an MVCC update. In that case, we
+ * no longer have the most recent version, and need to retry
+ * (the buffer header we have pinned will no longer be marked
+ * dirty, so we can't just write it).
+ */
+ if (SH_CHAIN_HASNEXT(bhp, vc)) {
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ continue;
+ }
+
+ /*
+ * If we've switched files, check to see if we're configured
+ * to close file descriptors.
+ */
+ if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
+ if (++filecnt >= maxopenfd) {
+ filecnt = 0;
+ if ((t_ret = __memp_close_flush_files(
+ env, 1)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ last_mf_offset = bhp->mf_offset;
+ }
+
+ /*
+ * If the buffer is dirty, we write it. We only try to
+ * write the buffer once.
+ */
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((t_ret =
+ __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
+ ++wrote_cnt;
+ ++wrote_total;
+ } else {
+ /* The buffer is being backed up, try again. */
+ if (t_ret == EAGAIN) {
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ continue;
+ }
+ if (ret == 0)
+ ret = t_ret;
+ __db_errx(env, DB_STR_A("3027",
+ "%s: unable to flush page: %lu", "%s %lu"),
+ __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
+
+ }
+ }
+
+ /* We disposed of this buffer. */
+ --remaining;
+ bharray[i].track_hp = NULL;
+
+ /* Discard our buffer reference. */
+ DB_ASSERT(env, atomic_read(&bhp->ref) > 0);
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
+ FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ STAT(++mp->stat.st_sync_interrupted);
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
+
+ /*
+ * Sleep after some number of writes to avoid disk saturation.
+ * Don't cache the max writes value, an application shutting
+ * down might reset the value in order to do a fast flush or
+ * checkpoint.
+ */
+ if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
+ !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
+ mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
+ wrote_cnt = 0;
+ __os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep);
+ }
+ }
+
+done: /*
+ * If a write is required, we have to force the pages to disk. We
+ * don't do this as we go along because we want to give the OS as
+ * much time as possible to lazily flush, and because we have to flush
+ * files that might not even have had dirty buffers in the cache, so
+ * we have to walk the files list.
+ */
+ if (ret == 0 && required_write) {
+ if (dbmfp == NULL)
+ ret = __memp_sync_files(env);
+ else
+ ret = __os_fsync(env, dbmfp->fhp);
+ }
+
+ /* If we've opened files to flush pages, close them. */
+ if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: __os_free(env, bharray);
+ if (wrote_totalp != NULL)
+ *wrote_totalp = wrote_total;
+
+ return (ret);
+}
+
+static int
+__memp_sync_file(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ int ret, t_ret;
+
+ COMPQUIET(countp, NULL);
+ COMPQUIET(flags, 0);
+
+ if (!mfp->file_written || mfp->no_backing_file ||
+ mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ return (0);
+ /*
+ * Pin the MPOOLFILE structure into memory, and release the
+ * region mutex allowing us to walk the linked list. We'll
+ * re-acquire that mutex to move to the next entry in the list.
+ *
+ * This works because we only need to flush current entries: we
+ * don't care about new entries being added, and since the linked
+ * list is never re-ordered, a single pass is sufficient. It
+ * requires that MPOOLFILE structures removed before we get to
+ * them be flushed to disk, but that's nothing new; they could
+ * have been removed while the checkpoint was running, too.
+ *
+ * Once we have the MPOOLFILE lock, re-check that the MPOOLFILE is
+ * not being discarded. (A thread removing the MPOOLFILE will
+ * hold the MPOOLFILE mutex, set deadfile, drop the MPOOLFILE
+ * mutex and then acquire the region mutex to walk the linked
+ * list and remove the MPOOLFILE structure.) Make sure the
+ * MPOOLFILE wasn't marked dead while we waited for the mutex.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->file_written || mfp->deadfile) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (0);
+ }
+ ++mfp->mpf_cnt;
+ ++mfp->neutral_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /*
+ * Look for an already open, writable handle (fsync doesn't
+ * work on read-only Windows handles).
+ */
+ dbmp = env->mp_handle;
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
+ if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
+ continue;
+ /*
+ * We don't want to hold the mutex while calling sync.
+ * Increment the DB_MPOOLFILE handle ref count to pin
+ * it into memory.
+ */
+ ++dbmfp->ref;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ /* If we don't find a handle we can use, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
+ __db_err(env, ret, DB_STR_A("3028",
+ "%s: unable to flush", "%s"), (char *)
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ }
+ } else
+ ret = __os_fsync(env, dbmfp->fhp);
+
+ /*
+ * Re-acquire the MPOOLFILE mutex, we need it to modify the
+ * reference count.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+
+ /*
+ * If we wrote the file and there are no other references (or there
+ * is a single reference, and it's the one we opened to write
+ * buffers during checkpoint), clear the file_written flag. We
+ * do this so that applications opening thousands of files don't
+ * loop here opening and flushing those files during checkpoint.
+ *
+ * The danger here is if a buffer were to be written as part of
+ * a checkpoint, and then not be flushed to disk. This cannot
+ * happen because we only clear file_written when there are no
+ * other users of the MPOOLFILE in the system, and, as we hold
+ * the region lock, no possibility of another thread of control
+ * racing with us to open a MPOOLFILE.
+ */
+ if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
+ dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
+ mfp->file_written = 0;
+
+ /*
+ * We may be the last reference to an MPOOLFILE, as we
+ * weren't holding the MPOOLFILE mutex when flushing
+ * its buffers to disk. If we can discard it, set
+ * a flag to schedule a clean-out pass. (Not likely,
+ * I mean, what are the chances that there aren't any
+ * buffers in the pool? Regardless, it might happen.)
+ */
+ if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
+ *(int *)argp = 1;
+ }
+
+ /*
+ * If we found the file, we must close it in case we are the last
+ * reference to the dbmfp. NOTE: since we have incremented
+ * mfp->mpf_cnt this cannot be the last reference to the mfp.
+ * This is important since we are called with the hash bucket
+ * locked. The mfp will get freed via the cleanup pass.
+ */
+ if (dbmfp != NULL &&
+ (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
+ ret = t_ret;
+
+ --mfp->mpf_cnt;
+ DB_ASSERT(env, mfp->neutral_cnt != 0);
+ --mfp->neutral_cnt;
+
+ /* Unlock the MPOOLFILE. */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (ret);
+}
+
+/*
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
+ */
+static int
+__memp_sync_files(env)
+ ENV *env;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *mp;
+ MPOOLFILE *mfp, *next_mfp;
+ int i, need_discard_pass, ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ need_discard_pass = ret = 0;
+
+ ret = __memp_walk_files(env,
+ mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR);
+
+ /*
+ * We may need to do a last pass through the MPOOLFILE list -- if we
+ * were the last reference to an MPOOLFILE, we need to clean it out.
+ */
+ if (!need_discard_pass)
+ return (ret);
+
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+retry: MUTEX_LOCK(env, hp->mtx_hash);
+ for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
+ __mpoolfile); mfp != NULL; mfp = next_mfp) {
+ next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
+ /*
+ * Do a fast check -- we can check for zero/non-zero
+ * without a mutex on the MPOOLFILE. If likely to
+ * succeed, lock the MPOOLFILE down and look for real.
+ */
+ if (mfp->deadfile ||
+ mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
+ continue;
+
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->deadfile &&
+ mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ (void)__memp_mf_discard(dbmp, mfp, 0);
+ goto retry;
+ } else
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_mf_sync --
+ * Flush an MPOOLFILE, when no currently open handle is available.
+ *
+ * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
+ */
+int
+__memp_mf_sync(dbmp, mfp, locked)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ int locked;
+{
+ DB_FH *fhp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *mp;
+ int ret, t_ret;
+ char *rpath;
+
+ COMPQUIET(hp, NULL);
+ env = dbmp->env;
+
+ /*
+ * We need to be holding the hash lock: we're using the path name
+ * and __memp_nameop might try and rename the file.
+ */
+ if (!locked) {
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ hp += FNBUCKET(
+ R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
+ MUTEX_LOCK(env, hp->mtx_hash);
+ }
+
+ if ((ret = __db_appname(env, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) {
+ if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
+ ret = __os_fsync(env, fhp);
+ if ((t_ret =
+ __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, rpath);
+ }
+
+ if (!locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ return (ret);
+}
+
+/*
+ * __memp_close_flush_files --
+ * Close files opened only to flush buffers.
+ */
+static int
+__memp_close_flush_files(env, dosync)
+ ENV *env;
+ int dosync;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ int ret;
+
+ dbmp = env->mp_handle;
+
+ /*
+ * The routine exists because we must close files opened by sync to
+ * flush buffers. There are two cases: first, extent files have to
+ * be closed so they may be removed when empty. Second, regular
+ * files have to be closed so we don't run out of descriptors (for
+ * example, an application partitioning its data into databases
+ * based on timestamps, so there's a continually increasing set of
+ * files).
+ *
+ * We mark files opened in the __memp_bhwrite() function with the
+ * MP_FLUSH flag. Here we walk through our file descriptor list,
+ * and, if a file was opened by __memp_bhwrite(), we close it.
+ */
+retry: MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (dosync) {
+ /*
+ * If we have the only open handle on the file,
+ * clear the dirty flag so we don't re-open and
+ * sync it again when discarding the MPOOLFILE
+ * structure. Clear the flag before the sync
+ * so we can't race with a thread writing the file.
+ */
+ mfp = dbmfp->mfp;
+ if (mfp->mpf_cnt == 1) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (mfp->mpf_cnt == 1)
+ mfp->file_written = 0;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
+ return (ret);
+ }
+ if ((ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (0);
+}
+
+static int
+__bhcmp(p1, p2)
+ const void *p1, *p2;
+{
+ BH_TRACK *bhp1, *bhp2;
+
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
+
+ /* Sort by file (shared memory pool offset). */
+ if (bhp1->track_off < bhp2->track_off)
+ return (-1);
+ if (bhp1->track_off > bhp2->track_off)
+ return (1);
+
+ /*
+ * !!!
+ * Defend against badly written quicksort code calling the comparison
+ * function with two identical pointers (e.g., WATCOM C++ (Power++)).
+ */
+ if (bhp1->track_pgno < bhp2->track_pgno)
+ return (-1);
+ if (bhp1->track_pgno > bhp2->track_pgno)
+ return (1);
+ return (0);
+}
diff --git a/src/mp/mp_trickle.c b/src/mp/mp_trickle.c
new file mode 100644
index 00000000..fba528b3
--- /dev/null
+++ b/src/mp/mp_trickle.c
@@ -0,0 +1,112 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_trickle __P((ENV *, int, int *));
+
+/*
+ * __memp_trickle_pp --
+ * ENV->memp_trickle pre/post processing.
+ *
+ * PUBLIC: int __memp_trickle_pp __P((DB_ENV *, int, int *));
+ */
+int
+__memp_trickle_pp(dbenv, pct, nwrotep)
+ DB_ENV *dbenv;
+ int pct, *nwrotep;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "memp_trickle", DB_INIT_MPOOL);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_trickle(env, pct, nwrotep)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_trickle --
+ * ENV->memp_trickle.
+ */
+static int
+__memp_trickle(env, pct, nwrotep)
+ ENV *env;
+ int pct, *nwrotep;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *c_mp, *mp;
+ u_int32_t clean, dirty, i, need_clean, total, dtmp, wrote;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ if (nwrotep != NULL)
+ *nwrotep = 0;
+
+ if (pct < 1 || pct > 100) {
+ __db_errx(env, DB_STR_A("3007",
+ "DB_ENV->memp_trickle: %d: percent must be between 1 and 100",
+ "%d"), pct);
+ return (EINVAL);
+ }
+
+ /*
+ * Loop through the caches counting total/dirty buffers.
+ *
+ * XXX
+ * Using hash_page_dirty is our only choice at the moment, but it's not
+ * as correct as we might like in the presence of pools having more
+ * than one page size, as a free 512B buffer may not be equivalent to
+ * having a free 8KB buffer.
+ */
+ for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+ total += c_mp->pages;
+ __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+ dirty += dtmp;
+ }
+
+ /*
+ * If there are sufficient clean buffers, or no buffers at all, or no
+ * dirty buffers, we're done.
+ */
+ if (total == 0 || dirty == 0)
+ return (0);
+
+ /*
+ * The total number of pages is an exact number, but the dirty page
+ * count can change while we're walking the hash buckets, and it's
+ * even possible the dirty page count ends up larger than the total
+ * number of pages.
+ */
+ clean = total > dirty ? total - dirty : 0;
+ need_clean = (total * (u_int)pct) / 100;
+ if (clean >= need_clean)
+ return (0);
+
+ need_clean -= clean;
+ ret = __memp_sync_int(env, NULL,
+ need_clean, DB_SYNC_TRICKLE | DB_SYNC_INTERRUPT_OK, &wrote, NULL);
+ STAT((mp->stat.st_page_trickle += wrote));
+ if (nwrotep != NULL)
+ *nwrotep = (int)wrote;
+
+ return (ret);
+}
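+
+/*
+ * Illustrative sketch (not part of this build): a background thread using
+ * the documented DB_ENV->memp_trickle method to keep roughly 20% of the
+ * cache clean. The sleep interval is arbitrary and shutdown handling is
+ * omitted.
+ *
+ *	int nwrote;
+ *
+ *	for (;;) {
+ *		if (dbenv->memp_trickle(dbenv, 20, &nwrote) != 0)
+ *			break;
+ *		if (nwrote == 0)
+ *			sleep(1);
+ *	}
+ */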
diff --git a/src/mutex/README b/src/mutex/README
new file mode 100644
index 00000000..6e95c5fd
--- /dev/null
+++ b/src/mutex/README
@@ -0,0 +1,110 @@
+# $Id$
+
+Note: this only applies to locking using test-and-set and fcntl calls,
+pthreads were added after this was written.
+
+Resource locking routines: lock based on a DB_MUTEX. All this gunk
+(including trying to make assembly code portable) is necessary because
+System V semaphores require system calls for uncontested locks and we
+don't want to make two system calls per resource lock.
+
+First, this is how it works. The DB_MUTEX structure contains a resource
+test-and-set lock (tsl), a file offset, a pid for debugging, and statistics
+information.
+
+If HAVE_MUTEX_FCNTL is NOT defined (that is, we know how to do
+test-and-sets for this compiler/architecture combination), we try and
+lock the resource tsl some number of times (based on the number of
+processors). If we can't acquire the mutex that way, we use a system
+call to sleep for 1ms, 2ms, 4ms, etc. (The time is bounded at 10ms for
+mutexes backing logical locks and 25ms for data structures, just in
+case.) Using the timer backoff means making two assumptions: that
+mutexes are held for brief periods (never over system calls or I/O)
+and that mutexes are not hotly contested.
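+
+A minimal sketch of that acquire path, under the assumptions above (the
+name tsl_try_set and the spin/backoff constants are illustrative, not
+the real implementation's):
+
+	for (ms = 1;;) {
+		/* Spin: retries are cheap, scale them by CPU count. */
+		for (nspins = ncpus * 50; nspins > 0; --nspins)
+			if (tsl_try_set(&mutexp->tsl))
+				return (0);
+		/* Back off: sleep 1ms, 2ms, 4ms, ..., capped at 25ms. */
+		usleep(ms * 1000);
+		if ((ms <<= 1) > 25)
+			ms = 25;
+	}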
+
+If HAVE_MUTEX_FCNTL is defined, we use a file descriptor to do byte
+locking on a file at a specified offset. In this case, ALL of the
+locking is done in the kernel. Because file descriptors are allocated
+per process, we have to provide the file descriptor as part of the lock
+call. We still have to do timer backoff because we need to be able to
+block ourselves, that is, the lock manager causes processes to wait by
+having the process acquire a mutex and then attempting to re-acquire the
+mutex. There's no way to use kernel locking to block yourself, that is,
+if you hold a lock and attempt to re-acquire it, the attempt will
+succeed.
+
+Next, let's talk about why it doesn't work the way a reasonable person
+would think it should work.
+
+Ideally, we'd have the ability to try to lock the resource tsl, and if
+that fails, increment a counter of waiting processes, then block in the
+kernel until the tsl is released. The process holding the resource tsl
+would see the wait counter when it went to release the resource tsl, and
+would wake any waiting processes up after releasing the lock. This would
+actually require both another tsl (call it the mutex tsl) and
+synchronization between the call that blocks in the kernel and the actual
+resource tsl. The mutex tsl would be used to protect accesses to the
+DB_MUTEX itself. Locking the mutex tsl would be done by a busy loop,
+which is safe because processes would never block holding that tsl (all
+they would do is try to obtain the resource tsl and set/check the wait
+count). The problem in this model is that the blocking call into the
+kernel requires a blocking semaphore, i.e. one whose normal state is
+locked.
+
+The only portable forms of locking under UNIX are fcntl(2) on a file
+descriptor/offset, and System V semaphores. Neither of these locking
+methods is sufficient to solve the problem.
+
+The problem with fcntl locking is that only the process that obtained the
+lock can release it. Remember, we want the normal state of the kernel
+semaphore to be locked. So, if the creator of the DB_MUTEX were to
+initialize the lock to "locked", then a second process locks the resource
+tsl, and then a third process needs to block, waiting for the resource
+tsl, when the second process wants to wake up the third process, it can't
+because it's not the holder of the lock! For the second process to be
+the holder of the lock, we would have to make a system call per
+uncontested lock, which is what we were trying to get away from in the
+first place.
+
+There are some hybrid schemes, such as signaling the holder of the lock,
+or using a different blocking offset depending on which process is
+holding the lock, but it gets complicated fairly quickly. I'm open to
+suggestions, but I'm not holding my breath.
+
+Regardless, we use this form of locking when we don't have any other
+choice, because it doesn't have the limitations found in System V
+semaphores, and because the normal state of the kernel object in that
+case is unlocked, so the process releasing the lock is also the holder
+of the lock.
+
+The System V semaphore design has a number of other limitations that make
+it inappropriate for this task. Namely:
+
+First, the semaphore key name space is separate from the file system name
+space (although there exist methods for using file names to create
+semaphore keys). If we use a well-known key, there's no reason to believe
+that any particular key will not already be in use, either by another
+instance of the DB application or some other application, in which case
+the DB application will fail. If we create a key, then we have to use a
+file system name to rendezvous and pass around the key.
+
+Second, System V semaphores traditionally have compile-time, system-wide
+limits on the number of semaphore keys that you can have. Typically, that
+number is far too low for any practical purpose. Since the semaphores
+permit more than a single slot per semaphore key, we could try and get
+around that limit by using multiple slots, but that means that the file
+that we're using for rendezvous is going to have to contain slot
+information as well as semaphore key information, and we're going to be
+reading/writing it on every db_mutex_t init or destroy operation. Anyhow,
+similar compile-time, system-wide limits on the number of slots per
+semaphore key kick in, and you're right back where you started.
+
+My fantasy is that once POSIX.1 standard mutexes are in widespread use,
+we can switch to them. My guess is that it won't happen, because the
+POSIX semaphores are only required to work for threads within a process,
+and not independent processes.
+
+Note: there are races in the statistics code, but since it's just
+statistics, I didn't bother fixing them. (The fix requires a mutex tsl,
+so, when/if this code is fixed to do rational locking (see above), then
+change the statistics update code to acquire/release the mutex tsl.)
diff --git a/src/mutex/mut_alloc.c b/src/mutex/mut_alloc.c
new file mode 100644
index 00000000..5df3de53
--- /dev/null
+++ b/src/mutex/mut_alloc.c
@@ -0,0 +1,291 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __mutex_alloc --
+ * Allocate a mutex from the mutex region.
+ *
+ * PUBLIC: int __mutex_alloc __P((ENV *, int, u_int32_t, db_mutex_t *));
+ */
+int
+__mutex_alloc(env, alloc_id, flags, indxp)
+ ENV *env;
+ int alloc_id;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ /* The caller may depend on us to initialize. */
+ *indxp = MUTEX_INVALID;
+
+	/*
+	 * If this is not an application lock, and either we've turned off
+	 * locking, or the ENV handle isn't thread-safe and this is a
+	 * thread lock or the environment isn't multi-process by
+	 * definition, there's no need for a mutex at all.
+	 */
+ if (alloc_id != MTX_APPLICATION && alloc_id != MTX_MUTEX_TEST &&
+ (F_ISSET(env->dbenv, DB_ENV_NOLOCKING) ||
+ (!F_ISSET(env, ENV_THREAD) &&
+ (LF_ISSET(DB_MUTEX_PROCESS_ONLY) ||
+ F_ISSET(env, ENV_PRIVATE)))))
+ return (0);
+
+ /* Private environments never share mutexes. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ LF_SET(DB_MUTEX_PROCESS_ONLY);
+
+ /*
+ * If we have a region in which to allocate the mutexes, lock it and
+ * do the allocation.
+ */
+ if (!MUTEX_ON(env)) {
+ __db_errx(env, DB_STR("2033",
+ "Mutex allocated before mutex region."));
+ return (__env_panic(env, EINVAL));
+ }
+ return (__mutex_alloc_int(env, 1, alloc_id, flags, indxp));
+}
+
+/*
+ * __mutex_alloc_int --
+ * Internal routine to allocate a mutex.
+ *
+ * PUBLIC: int __mutex_alloc_int
+ * PUBLIC: __P((ENV *, int, int, u_int32_t, db_mutex_t *));
+ */
+int
+__mutex_alloc_int(env, locksys, alloc_id, flags, indxp)
+ ENV *env;
+ int locksys, alloc_id;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i;
+ size_t len;
+ u_int32_t cnt;
+ int ret;
+
+ dbenv = env->dbenv;
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ ret = 0;
+
+ /*
+ * If we're not initializing the mutex region, then lock the region to
+	 * allocate new mutexes. Drop the lock before initializing the
+	 * mutex; mutex initialization may require a system call.
+ */
+ if (locksys)
+ MUTEX_SYSTEM_LOCK(env);
+
+ if (mtxregion->mutex_next == MUTEX_INVALID) {
+ if (mtxregion->stat.st_mutex_max != 0 &&
+ mtxregion->stat.st_mutex_cnt >=
+ mtxregion->stat.st_mutex_max) {
+nomem: __db_errx(env, DB_STR("2034",
+ "unable to allocate memory for mutex; resize mutex region"));
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
+ return (ret == 0 ? ENOMEM : ret);
+ }
+ cnt = mtxregion->stat.st_mutex_cnt / 2;
+ if (cnt < 8)
+ cnt = 8;
+ if (mtxregion->stat.st_mutex_max != 0 &&
+ mtxregion->stat.st_mutex_cnt + cnt >
+ mtxregion->stat.st_mutex_max)
+ cnt = mtxregion->stat.st_mutex_max -
+ mtxregion->stat.st_mutex_cnt;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ F_SET(&mtxmgr->reginfo, REGION_TRACKED);
+ while (__env_alloc(&mtxmgr->reginfo,
+ (cnt * mtxregion->mutex_size) +
+ mtxregion->stat.st_mutex_align, &i) != 0)
+ if ((cnt >> 1) == 0)
+ break;
+ F_CLR(&mtxmgr->reginfo, REGION_TRACKED);
+ i = (db_mutex_t)ALIGNP_INC(i,
+ mtxregion->stat.st_mutex_align);
+ } else {
+ len = cnt * mtxregion->mutex_size;
+ if ((ret = __env_alloc_extend(&mtxmgr->reginfo,
+ R_ADDR(&mtxmgr->reginfo,
+ mtxregion->mutex_off_alloc), &len)) != 0)
+ goto nomem;
+ cnt = (u_int32_t)(len / mtxregion->mutex_size);
+ i = mtxregion->stat.st_mutex_cnt + 1;
+ }
+ if (cnt == 0)
+ goto nomem;
+ mutexp = MUTEXP_SET(env, i);
+ mtxregion->stat.st_mutex_free = cnt;
+ mtxregion->mutex_next = i;
+ mtxregion->stat.st_mutex_cnt += cnt;
+ while (--cnt > 0) {
+ mutexp->flags = 0;
+ if (F_ISSET(env, ENV_PRIVATE))
+ mutexp->mutex_next_link =
+ (uintptr_t)(mutexp + 1);
+ else
+ mutexp->mutex_next_link = ++i;
+ mutexp++;
+ }
+ mutexp->flags = 0;
+ mutexp->mutex_next_link = MUTEX_INVALID;
+ }
+
+ *indxp = mtxregion->mutex_next;
+ mutexp = MUTEXP_SET(env, *indxp);
+ DB_ASSERT(env,
+ ((uintptr_t)mutexp & (dbenv->mutex_align - 1)) == 0);
+ mtxregion->mutex_next = mutexp->mutex_next_link;
+
+ --mtxregion->stat.st_mutex_free;
+ ++mtxregion->stat.st_mutex_inuse;
+ if (mtxregion->stat.st_mutex_inuse > mtxregion->stat.st_mutex_inuse_max)
+ mtxregion->stat.st_mutex_inuse_max =
+ mtxregion->stat.st_mutex_inuse;
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ /* Initialize the mutex. */
+ memset(mutexp, 0, sizeof(*mutexp));
+ F_SET(mutexp, DB_MUTEX_ALLOCATED |
+ LF_ISSET(DB_MUTEX_LOGICAL_LOCK |
+ DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SHARED));
+
+ /*
+ * If the mutex is associated with a single process, set the process
+ * ID. If the application ever calls DbEnv::failchk, we'll need the
+ * process ID to know if the mutex is still in use.
+ */
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY))
+ dbenv->thread_id(dbenv, &mutexp->pid, NULL);
+
+#ifdef HAVE_STATISTICS
+ mutexp->alloc_id = alloc_id;
+#else
+ COMPQUIET(alloc_id, 0);
+#endif
+
+ if ((ret = __mutex_init(env, *indxp, flags)) != 0)
+ (void)__mutex_free_int(env, locksys, indxp);
+
+ return (ret);
+}
+
+/*
+ * __mutex_free --
+ * Free a mutex.
+ *
+ * PUBLIC: int __mutex_free __P((ENV *, db_mutex_t *));
+ */
+int
+__mutex_free(env, indxp)
+ ENV *env;
+ db_mutex_t *indxp;
+{
+ /*
+	 * There is no explicit ordering in how the regions are cleaned
+ * up and/or discarded when an environment is destroyed (either a
+ * private environment is closed or a public environment is removed).
+ * The way we deal with mutexes is to clean up all remaining mutexes
+ * when we close the mutex environment (because we have to be able to
+ * do that anyway, after a crash), which means we don't have to deal
+ * with region cleanup ordering on normal environment destruction.
+	 * All that said, what it really means is we can get here without an
+ * mpool region. It's OK, the mutex has been, or will be, destroyed.
+ *
+ * If the mutex has never been configured, we're done.
+ */
+ if (!MUTEX_ON(env) || *indxp == MUTEX_INVALID)
+ return (0);
+
+ return (__mutex_free_int(env, 1, indxp));
+}
+
+/*
+ * __mutex_free_int --
+ * Internal routine to free a mutex.
+ *
+ * PUBLIC: int __mutex_free_int __P((ENV *, int, db_mutex_t *));
+ */
+int
+__mutex_free_int(env, locksys, indxp)
+ ENV *env;
+ int locksys;
+ db_mutex_t *indxp;
+{
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t mutex;
+ int ret;
+
+ mutex = *indxp;
+ *indxp = MUTEX_INVALID;
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_ALLOCATED));
+ F_CLR(mutexp, DB_MUTEX_ALLOCATED);
+
+ ret = __mutex_destroy(env, mutex);
+
+ if (locksys)
+ MUTEX_SYSTEM_LOCK(env);
+
+ /* Link the mutex on the head of the free list. */
+ mutexp->mutex_next_link = mtxregion->mutex_next;
+ mtxregion->mutex_next = mutex;
+ ++mtxregion->stat.st_mutex_free;
+ --mtxregion->stat.st_mutex_inuse;
+
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __mutex_refresh --
+ * Reinitialize a mutex, if we are not sure of its state.
+ *
+ * PUBLIC: int __mutex_refresh __P((ENV *, db_mutex_t));
+ */
+int
+__mutex_refresh(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+ u_int32_t flags;
+ int ret;
+
+ mutexp = MUTEXP_SET(env, mutex);
+ flags = mutexp->flags;
+ if ((ret = __mutex_destroy(env, mutex)) == 0) {
+ memset(mutexp, 0, sizeof(*mutexp));
+ F_SET(mutexp, DB_MUTEX_ALLOCATED |
+ LF_ISSET(DB_MUTEX_LOGICAL_LOCK |
+ DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SHARED));
+ LF_CLR(DB_MUTEX_LOCKED);
+ ret = __mutex_init(env, mutex, flags);
+ }
+ return (ret);
+}
diff --git a/src/mutex/mut_failchk.c b/src/mutex/mut_failchk.c
new file mode 100644
index 00000000..1425389f
--- /dev/null
+++ b/src/mutex/mut_failchk.c
@@ -0,0 +1,76 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __mut_failchk --
+ * Check for mutexes held by dead processes.
+ *
+ * PUBLIC: int __mut_failchk __P((ENV *));
+ */
+int
+__mut_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i;
+ int ret;
+ char buf[DB_THREADID_STRLEN];
+ db_threadid_t unused;
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ return (0);
+
+ DB_THREADID_INIT(unused);
+
+ dbenv = env->dbenv;
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ ret = 0;
+
+ MUTEX_SYSTEM_LOCK(env);
+	for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ mutexp = MUTEXP_SET(env, i);
+
+ /*
+ * We're looking for per-process mutexes where the process
+ * has died.
+ */
+ if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED) ||
+ !F_ISSET(mutexp, DB_MUTEX_PROCESS_ONLY))
+ continue;
+
+ /*
+ * The thread that allocated the mutex may have exited, but
+ * we cannot reclaim the mutex if the process is still alive.
+ */
+ if (dbenv->is_alive(
+ dbenv, mutexp->pid, unused, DB_MUTEX_PROCESS_ONLY))
+ continue;
+
+ __db_msg(env, DB_STR_A("2017",
+ "Freeing mutex for process: %s", "%s"),
+ dbenv->thread_id_string(dbenv, mutexp->pid, unused, buf));
+
+ /* Unlock and free the mutex. */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ MUTEX_UNLOCK(env, i);
+
+ if ((ret = __mutex_free_int(env, 0, &i)) != 0)
+ break;
+ }
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
diff --git a/src/mutex/mut_fcntl.c b/src/mutex/mut_fcntl.c
new file mode 100644
index 00000000..0694aa59
--- /dev/null
+++ b/src/mutex/mut_fcntl.c
@@ -0,0 +1,248 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static inline int __db_fcntl_mutex_lock_int
+ __P((ENV *, db_mutex_t, db_timeout_t, int));
+
+/*
+ * __db_fcntl_mutex_init --
+ * Initialize a fcntl mutex.
+ *
+ * PUBLIC: int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_fcntl_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ COMPQUIET(flags, 0);
+
+ return (0);
+}
+
+/*
+ * __db_fcntl_mutex_lock_int
+ * Internal function to lock a mutex, blocking only when requested
+ */
+inline int
+__db_fcntl_mutex_lock_int(env, mutex, timeout, wait)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+ int wait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_THREAD_INFO *ip;
+ struct flock k_lock;
+ int locked, ms, ret;
+ db_timespec now, timespec;
+ db_timeout_t time_left;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#ifdef HAVE_STATISTICS
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ ++mutexp->mutex_set_wait;
+ else
+ ++mutexp->mutex_set_nowait;
+#endif
+
+ /* Initialize the lock. */
+ k_lock.l_whence = SEEK_SET;
+ k_lock.l_start = mutex;
+ k_lock.l_len = 1;
+
+ if (timeout != 0) {
+ timespecclear(&timespec);
+ __clock_set_expires(env, &timespec, timeout);
+ }
+
+ /*
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk
+ * thread, then ip will have a valid value subsequent times
+ * in the loop.
+ */
+ ip = NULL;
+
+ for (locked = 0;;) {
+ /*
+ * Wait for the lock to become available; wait 1ms initially,
+ * up to 1 second.
+ */
+ for (ms = 1; F_ISSET(mutexp, DB_MUTEX_LOCKED);) {
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ ip == NULL && dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+ if (!wait)
+ return (DB_LOCK_NOTGRANTED);
+ if (timeout != 0) {
+ timespecclear(&now);
+ if (__clock_expired(env, &now, &timespec))
+ return (DB_TIMEOUT);
+ DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0);
+ time_left = timeout - time_left;
+ if (ms * US_PER_MS > time_left)
+ ms = time_left / US_PER_MS;
+ }
+ __os_yield(NULL, 0, ms * US_PER_MS);
+ if ((ms <<= 1) > MS_PER_SEC)
+ ms = MS_PER_SEC;
+ }
+
+ /* Acquire an exclusive kernel lock on the byte. */
+ k_lock.l_type = F_WRLCK;
+ if (fcntl(env->lockfhp->fd, F_SETLKW, &k_lock))
+ goto err;
+
+ /* If the resource is still available, it's ours. */
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ locked = 1;
+
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ }
+
+ /* Release the kernel lock. */
+ k_lock.l_type = F_UNLCK;
+ if (fcntl(env->lockfhp->fd, F_SETLK, &k_lock))
+ goto err;
+
+ /*
+ * If we got the resource lock we're done.
+ *
+ * !!!
+ * We can't check to see if the lock is ours, because we may
+ * be trying to block ourselves in the lock manager, and so
+ * the holder of the lock that's preventing us from getting
+ * the lock may be us! (Seriously.)
+ */
+ if (locked)
+ break;
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (0);
+
+err: ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("2019", "fcntl lock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_fcntl_mutex_lock
+ * Lock a mutex, blocking if necessary.
+ *
+ * PUBLIC: int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ */
+int
+__db_fcntl_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ return (__db_fcntl_mutex_lock_int(env, mutex, timeout, 1));
+}
+
+/*
+ * __db_fcntl_mutex_trylock
+ * Try to lock a mutex, without blocking when it is busy.
+ *
+ * PUBLIC: int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t));
+ */
+int
+__db_fcntl_mutex_trylock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_fcntl_mutex_lock_int(env, mutex, 0, 0));
+}
+
+/*
+ * __db_fcntl_mutex_unlock --
+ * Release a mutex.
+ *
+ * PUBLIC: int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__db_fcntl_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ __db_errx(env, DB_STR("2020",
+ "fcntl unlock failed: lock already unlocked"));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+
+ /*
+ * Release the resource. We don't have to acquire any locks because
+ * processes trying to acquire the lock are waiting for the flag to
+ * go to 0. Once that happens the waiters will serialize acquiring
+ * an exclusive kernel lock before locking the mutex.
+ */
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+
+ return (0);
+}
+
+/*
+ * __db_fcntl_mutex_destroy --
+ * Destroy a mutex.
+ *
+ * PUBLIC: int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_fcntl_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+
+ return (0);
+}
diff --git a/src/mutex/mut_method.c b/src/mutex/mut_method.c
new file mode 100644
index 00000000..cb666082
--- /dev/null
+++ b/src/mutex/mut_method.c
@@ -0,0 +1,482 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __mutex_alloc_pp --
+ * Allocate a mutex, application method.
+ *
+ * PUBLIC: int __mutex_alloc_pp __P((DB_ENV *, u_int32_t, db_mutex_t *));
+ */
+int
+__mutex_alloc_pp(dbenv, flags, indxp)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if ((ret = __db_fchk(env, "DB_ENV->mutex_alloc",
+ flags, DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SELF_BLOCK)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __mutex_alloc(env, MTX_APPLICATION, flags, indxp);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __mutex_free_pp --
+ * Destroy a mutex, application method.
+ *
+ * PUBLIC: int __mutex_free_pp __P((DB_ENV *, db_mutex_t));
+ */
+int
+__mutex_free_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (indx == MUTEX_INVALID)
+ return (EINVAL);
+
+ /*
+ * Internally Berkeley DB passes around the db_mutex_t address on
+ * free, because we want to make absolutely sure the slot gets
+ * overwritten with MUTEX_INVALID. We don't export MUTEX_INVALID,
+ * so we don't export that part of the API, either.
+ */
+ ENV_ENTER(env, ip);
+ ret = __mutex_free(env, &indx);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __mutex_lock_pp --
+ * Lock a mutex, application method.
+ *
+ * PUBLIC: int __mutex_lock_pp __P((DB_ENV *, db_mutex_t));
+ */
+int
+__mutex_lock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (indx == MUTEX_INVALID)
+ return (EINVAL);
+
+ ENV_ENTER(env, ip);
+ ret = __mutex_lock(env, indx);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_unlock_pp --
+ * Unlock a mutex, application method.
+ *
+ * PUBLIC: int __mutex_unlock_pp __P((DB_ENV *, db_mutex_t));
+ */
+int
+__mutex_unlock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (indx == MUTEX_INVALID)
+ return (EINVAL);
+
+ ENV_ENTER(env, ip);
+ ret = __mutex_unlock(env, indx);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_get_align --
+ * DB_ENV->mutex_get_align.
+ *
+ * PUBLIC: int __mutex_get_align __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_align(dbenv, alignp)
+ DB_ENV *dbenv;
+ u_int32_t *alignp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *alignp = ((DB_MUTEXREGION *)
+ env->mutex_handle->reginfo.primary)->stat.st_mutex_align;
+ } else
+ *alignp = dbenv->mutex_align;
+ return (0);
+}
+
+/*
+ * __mutex_set_align --
+ * DB_ENV->mutex_set_align.
+ *
+ * PUBLIC: int __mutex_set_align __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_align(dbenv, align)
+ DB_ENV *dbenv;
+ u_int32_t align;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_align");
+
+ if (align == 0 || !POWER_OF_TWO(align)) {
+ __db_errx(env, DB_STR("2018",
+"DB_ENV->mutex_set_align: alignment value must be a non-zero power-of-two"));
+ return (EINVAL);
+ }
+
+ dbenv->mutex_align = align;
+ return (0);
+}
+
+/*
+ * __mutex_get_increment --
+ * DB_ENV->mutex_get_increment.
+ *
+ * PUBLIC: int __mutex_get_increment __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_increment(dbenv, incrementp)
+ DB_ENV *dbenv;
+ u_int32_t *incrementp;
+{
+ /*
+ * We don't maintain the increment in the region (it just makes
+ * no sense). Return whatever we have configured on this handle,
+ * nobody is ever going to notice.
+ */
+ *incrementp = dbenv->mutex_inc;
+ return (0);
+}
+
+/*
+ * __mutex_set_increment --
+ * DB_ENV->mutex_set_increment.
+ *
+ * PUBLIC: int __mutex_set_increment __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_increment(dbenv, increment)
+ DB_ENV *dbenv;
+ u_int32_t increment;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_increment");
+
+ dbenv->mutex_cnt = 0;
+ dbenv->mutex_inc = increment;
+ return (0);
+}
+
+/*
+ * __mutex_get_init --
+ * DB_ENV->mutex_get_init.
+ *
+ * PUBLIC: int __mutex_get_init __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_init(dbenv, initp)
+ DB_ENV *dbenv;
+ u_int32_t *initp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *initp = ((DB_MUTEXREGION *)
+ env->mutex_handle->reginfo.primary)->stat.st_mutex_init;
+ } else
+ *initp = dbenv->mutex_cnt;
+ return (0);
+}
+
+/*
+ * __mutex_set_init --
+ * DB_ENV->mutex_set_init.
+ *
+ * PUBLIC: int __mutex_set_init __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_init(dbenv, init)
+ DB_ENV *dbenv;
+ u_int32_t init;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_init");
+
+ dbenv->mutex_cnt = init;
+ dbenv->mutex_inc = 0;
+ return (0);
+}
+
+/*
+ * __mutex_get_max --
+ * DB_ENV->mutex_get_max.
+ *
+ * PUBLIC: int __mutex_get_max __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_max(dbenv, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *maxp = ((DB_MUTEXREGION *)
+ env->mutex_handle->reginfo.primary)->stat.st_mutex_max;
+ } else
+ *maxp = dbenv->mutex_max;
+ return (0);
+}
+
+/*
+ * __mutex_set_max --
+ * DB_ENV->mutex_set_max.
+ *
+ * PUBLIC: int __mutex_set_max __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_max(dbenv, max)
+ DB_ENV *dbenv;
+ u_int32_t max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_max");
+
+ dbenv->mutex_max = max;
+ dbenv->mutex_inc = 0;
+ return (0);
+}
+
+/*
+ * __mutex_get_tas_spins --
+ * DB_ENV->mutex_get_tas_spins.
+ *
+ * PUBLIC: int __mutex_get_tas_spins __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_tas_spins(dbenv, tas_spinsp)
+ DB_ENV *dbenv;
+ u_int32_t *tas_spinsp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *tas_spinsp = ((DB_MUTEXREGION *)env->
+ mutex_handle->reginfo.primary)->stat.st_mutex_tas_spins;
+ } else
+ *tas_spinsp = dbenv->mutex_tas_spins;
+ return (0);
+}
+
+/*
+ * __mutex_set_tas_spins --
+ * DB_ENV->mutex_set_tas_spins.
+ *
+ * PUBLIC: int __mutex_set_tas_spins __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_tas_spins(dbenv, tas_spins)
+ DB_ENV *dbenv;
+ u_int32_t tas_spins;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ /*
+ * Bound the value -- less than 1 makes no sense, greater than 1M
+ * makes no sense.
+ */
+ if (tas_spins == 0)
+ tas_spins = 1;
+ else if (tas_spins > 1000000)
+ tas_spins = 1000000;
+
+ /*
+ * There's a theoretical race here, but I'm not interested in locking
+ * the test-and-set spin count. The worst possibility is a thread
+ * reads out a bad spin count and spins until it gets the lock, but
+ * that's awfully unlikely.
+ */
+ if (MUTEX_ON(env))
+ ((DB_MUTEXREGION *)env->mutex_handle
+ ->reginfo.primary)->stat.st_mutex_tas_spins = tas_spins;
+ else
+ dbenv->mutex_tas_spins = tas_spins;
+ return (0);
+}
+
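+/*
+ * Taken together, the methods above tune the mutex region before the
+ * environment is opened. A sketch of a typical configuration sequence
+ * (the values here are hypothetical, not recommendations):
+ *
+ *	(void)dbenv->mutex_set_align(dbenv, 64);
+ *	(void)dbenv->mutex_set_init(dbenv, 2000);
+ *	(void)dbenv->mutex_set_max(dbenv, 20000);
+ *	(void)dbenv->mutex_set_tas_spins(dbenv, 50);
+ */
+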
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+/*
+ * Provide atomic operations for platforms which have mutexes yet do not have
+ * native atomic operations configured. They are emulated by protecting the
+ * operation with a mutex. The address of the atomic value selects which
+ * mutex to use.
+ */
+/*
+ * atomic_get_mutex -
+ * Map an address to the mutex to use to atomically modify it
+ */
+static inline db_mutex_t
+atomic_get_mutex(env, v)
+ ENV *env;
+ db_atomic_t *v;
+{
+ u_int index;
+ DB_MUTEXREGION *mtxreg;
+
+ if (!MUTEX_ON(env))
+ return (MUTEX_INVALID);
+ index = (u_int)(((uintptr_t) (v)) >> 6) % MAX_ATOMIC_MUTEXES;
+ mtxreg = (DB_MUTEXREGION *)env->mutex_handle->reginfo.primary;
+ return (mtxreg->mtx_atomic[index]);
+}
+
+/*
+ * __atomic_inc
+ * Use a mutex to provide an atomic increment function
+ *
+ * PUBLIC: #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ * PUBLIC: atomic_value_t __atomic_inc __P((ENV *, db_atomic_t *));
+ * PUBLIC: #endif
+ */
+atomic_value_t
+__atomic_inc(env, v)
+ ENV *env;
+ db_atomic_t *v;
+{
+ db_mutex_t mtx;
+ int ret;
+
+ mtx = atomic_get_mutex(env, v);
+ MUTEX_LOCK(env, mtx);
+ ret = ++v->value;
+ MUTEX_UNLOCK(env, mtx);
+
+ return (ret);
+}
+
+/*
+ * __atomic_dec
+ * Use a mutex to provide an atomic decrement function
+ *
+ * PUBLIC: #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ * PUBLIC: atomic_value_t __atomic_dec __P((ENV *, db_atomic_t *));
+ * PUBLIC: #endif
+ */
+atomic_value_t
+__atomic_dec(env, v)
+ ENV *env;
+ db_atomic_t *v;
+{
+ db_mutex_t mtx;
+ int ret;
+
+ mtx = atomic_get_mutex(env, v);
+ MUTEX_LOCK(env, mtx);
+ ret = --v->value;
+ MUTEX_UNLOCK(env, mtx);
+
+ return (ret);
+}
+
+/*
+ * atomic_compare_exchange
+ *	Use a mutex to provide an atomic compare-and-exchange function
+ *
+ * PUBLIC: #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ * PUBLIC: int atomic_compare_exchange
+ * PUBLIC: __P((ENV *, db_atomic_t *, atomic_value_t, atomic_value_t));
+ * PUBLIC: #endif
+ * Returns 1 if the *v was equal to oldval, else 0
+ *
+ * Side Effect:
+ * Sets the value to newval if and only if returning 1
+ */
+int
+atomic_compare_exchange(env, v, oldval, newval)
+ ENV *env;
+ db_atomic_t *v;
+ atomic_value_t oldval;
+ atomic_value_t newval;
+{
+ db_mutex_t mtx;
+ int ret;
+
+ if (atomic_read(v) != oldval)
+ return (0);
+
+ mtx = atomic_get_mutex(env, v);
+ MUTEX_LOCK(env, mtx);
+ ret = atomic_read(v) == oldval;
+ if (ret)
+ atomic_init(v, newval);
+ MUTEX_UNLOCK(env, mtx);
+
+ return (ret);
+}
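+
+/*
+ * A sketch of the expected usage pattern (the caller is hypothetical):
+ * retry an atomic increment until no other thread has raced us.
+ *
+ *	do
+ *		oldval = atomic_read(v);
+ *	while (!atomic_compare_exchange(env, v, oldval, oldval + 1));
+ */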
+#endif
diff --git a/src/mutex/mut_pthread.c b/src/mutex/mut_pthread.c
new file mode 100644
index 00000000..1ec4fb9c
--- /dev/null
+++ b/src/mutex/mut_pthread.c
@@ -0,0 +1,770 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * This is where we load in architecture/compiler specific mutex code.
+ */
+#define LOAD_ACTUAL_MUTEX_CODE
+
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+#define pthread_cond_destroy(x) 0
+#define pthread_cond_signal _lwp_cond_signal
+#define pthread_cond_broadcast _lwp_cond_broadcast
+#define pthread_cond_wait _lwp_cond_wait
+#define pthread_mutex_destroy(x) 0
+#define pthread_mutex_lock _lwp_mutex_lock
+#define pthread_mutex_trylock _lwp_mutex_trylock
+#define pthread_mutex_unlock _lwp_mutex_unlock
+#endif
+#ifdef HAVE_MUTEX_UI_THREADS
+#define	pthread_cond_destroy(x)	cond_destroy(x)
+#define	pthread_cond_signal	cond_signal
+#define pthread_cond_broadcast cond_broadcast
+#define pthread_cond_wait cond_wait
+#define pthread_mutex_destroy mutex_destroy
+#define pthread_mutex_lock mutex_lock
+#define pthread_mutex_trylock mutex_trylock
+#define pthread_mutex_unlock mutex_unlock
+#endif
+
+/*
+ * According to HP-UX engineers contacted by Netscape,
+ * pthread_mutex_unlock() will occasionally return EFAULT for no good reason
+ * on mutexes in shared memory regions, and the correct caller behavior
+ * is to try again. Do so, up to EFAULT_RETRY_ATTEMPTS consecutive times.
+ * Note that we don't bother to restrict this to HP-UX;
+ * it should be harmless elsewhere. [#2471]
+ */
+#define EFAULT_RETRY_ATTEMPTS 5
+#define RETRY_ON_EFAULT(func_invocation, ret) do { \
+ int i; \
+ i = EFAULT_RETRY_ATTEMPTS; \
+ do { \
+ RET_SET((func_invocation), ret); \
+ } while (ret == EFAULT && --i > 0); \
+} while (0)
+
+/*
+ * IBM's MVS pthread mutex implementation returns -1 and sets errno rather than
+ * returning errno itself. As -1 is not a valid errno value, assume functions
+ * returning -1 have set errno. If they haven't, fall back on an
+ * arbitrary but valid error value, EAGAIN.
+ */
+#define RET_SET(f, ret) do { \
+ if (((ret) = (f)) == -1 && ((ret) = errno) == 0) \
+ (ret) = EAGAIN; \
+} while (0)
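+
+/*
+ * For example, RET_SET((pthread_mutex_lock(&m)), ret) leaves the error
+ * number in ret whether pthread_mutex_lock returned it directly,
+ * returned -1 and set errno, or returned -1 without setting errno at
+ * all (in which case ret becomes EAGAIN).
+ */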
+
+/*
+ * __db_pthread_mutex_init --
+ * Initialize a pthread mutex: either a native one or
+ * just the mutex for block/wakeup of a hybrid test-and-set mutex
+ *
+ * PUBLIC: int __db_pthread_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_pthread_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_MUTEX *mutexp;
+ int ret;
+
+ mutexp = MUTEXP_SET(env, mutex);
+ ret = 0;
+
+#ifndef HAVE_MUTEX_HYBRID
+ /* Can't have self-blocking shared latches. */
+ DB_ASSERT(env, !LF_ISSET(DB_MUTEX_SELF_BLOCK) ||
+ !LF_ISSET(DB_MUTEX_SHARED));
+#endif
+
+#ifdef HAVE_MUTEX_PTHREADS
+ {
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ pthread_condattr_t condattr;
+ pthread_mutexattr_t mutexattr;
+#endif
+ pthread_condattr_t *condattrp = NULL;
+ pthread_mutexattr_t *mutexattrp = NULL;
+
+#ifndef HAVE_MUTEX_HYBRID
+ if (LF_ISSET(DB_MUTEX_SHARED)) {
+#if defined(HAVE_SHARED_LATCHES)
+ pthread_rwlockattr_t rwlockattr, *rwlockattrp = NULL;
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ RET_SET((pthread_rwlockattr_init(&rwlockattr)), ret);
+ if (ret != 0)
+ goto err;
+ RET_SET((pthread_rwlockattr_setpshared(
+ &rwlockattr, PTHREAD_PROCESS_SHARED)), ret);
+ rwlockattrp = &rwlockattr;
+ }
+#endif
+
+ if (ret == 0)
+ RET_SET((pthread_rwlock_init(&mutexp->u.rwlock,
+ rwlockattrp)), ret);
+ if (rwlockattrp != NULL)
+ (void)pthread_rwlockattr_destroy(rwlockattrp);
+
+ F_SET(mutexp, DB_MUTEX_SHARED);
+ /* For rwlocks, we're done - cannot use the mutex or cond */
+ goto err;
+#endif
+ }
+#endif
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ RET_SET((pthread_mutexattr_init(&mutexattr)), ret);
+ if (ret != 0)
+ goto err;
+ RET_SET((pthread_mutexattr_setpshared(
+ &mutexattr, PTHREAD_PROCESS_SHARED)), ret);
+ mutexattrp = &mutexattr;
+ }
+#endif
+
+ if (ret == 0)
+ RET_SET(
+ (pthread_mutex_init(&mutexp->u.m.mutex, mutexattrp)), ret);
+
+ if (mutexattrp != NULL)
+ (void)pthread_mutexattr_destroy(mutexattrp);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ RET_SET((pthread_condattr_init(&condattr)), ret);
+ if (ret != 0)
+ goto err;
+
+ condattrp = &condattr;
+ RET_SET((pthread_condattr_setpshared(
+ &condattr, PTHREAD_PROCESS_SHARED)), ret);
+ }
+#endif
+
+ if (ret == 0)
+ RET_SET((pthread_cond_init(
+ &mutexp->u.m.cond, condattrp)), ret);
+
+ F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
+ if (condattrp != NULL)
+ (void)pthread_condattr_destroy(condattrp);
+ }
+
+ }
+#endif
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+ /*
+ * XXX
+ * Gcc complains about missing braces in the static initializations of
+ * lwp_cond_t and lwp_mutex_t structures because the structures contain
+ * sub-structures/unions and the Solaris include file that defines the
+ * initialization values doesn't have surrounding braces. There's not
+ * much we can do.
+ */
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ static lwp_mutex_t mi = DEFAULTMUTEX;
+
+ mutexp->mutex = mi;
+ } else {
+ static lwp_mutex_t mi = SHAREDMUTEX;
+
+ mutexp->mutex = mi;
+ }
+ if (LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ static lwp_cond_t ci = DEFAULTCV;
+
+ mutexp->cond = ci;
+ } else {
+ static lwp_cond_t ci = SHAREDCV;
+
+ mutexp->cond = ci;
+ }
+ F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
+ }
+#endif
+#ifdef HAVE_MUTEX_UI_THREADS
+ {
+ int type;
+
+ type = LF_ISSET(DB_MUTEX_PROCESS_ONLY) ? USYNC_THREAD : USYNC_PROCESS;
+
+ ret = mutex_init(&mutexp->mutex, type, NULL);
+ if (ret == 0 && LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
+ ret = cond_init(&mutexp->cond, type, NULL);
+
+ F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
+ }}
+#endif
+
+err: if (ret != 0) {
+ __db_err(env, ret, DB_STR("2021",
+ "unable to initialize mutex"));
+ }
+ return (ret);
+}
+
+/*
+ * __db_pthread_mutex_prep
+ * Prepare to use a pthread-based DB_MUTEX.
+ *
+ * This exclusively locks a DB_MUTEX's pthread_mutex_t or pthread_rwlock_t,
+ * before locking, unlocking, or waiting for the DB mutex to be become
+ * available in the requested mode (exclusive == 1, shared == 0).
+ *
+ * Test for failchk concerns here too, to avoid hanging on a dead pid/tid.
+ */
+inline static int
+__db_pthread_mutex_prep(env, mutex, mutexp, exclusive)
+ ENV *env;
+ db_mutex_t mutex;
+ DB_MUTEX *mutexp;
+ int exclusive;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ dbenv = env->dbenv;
+ PERFMON4(env,
+ mutex, suspend, mutex, exclusive, mutexp->alloc_id, mutexp);
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ for (;;) {
+ RET_SET_PTHREAD_TRYLOCK(mutexp, ret);
+ if (ret != EBUSY)
+ break;
+ if (dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK) {
+ ret = DB_RUNRECOVERY;
+ } else {
+ /*
+ * Some thread other than the true
+ * FAILCHK thread in this process is
+ * asking for the mutex held by the
+ * dead process/thread. We will block
+ * here until someone else does the
+ * cleanup. Same behavior as if we
+ * hadn't gone down the 'if
+ * DB_ENV_FAILCHK' path to start with.
+ */
+ RET_SET_PTHREAD_LOCK(mutexp, ret);
+ break;
+ }
+ }
+ }
+ } else
+ RET_SET_PTHREAD_LOCK(mutexp, ret);
+
+ PERFMON4(env,
+ mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
+ COMPQUIET(mutex, 0);
+ COMPQUIET(exclusive, 0);
+ return (ret);
+}
+
+/*
+ * __db_pthread_mutex_condwait
+ * Perform a pthread condition wait for a DB_MUTEX.
+ *
+ * This will be a timed wait when a timespec has been specified. EINTR and
+ * spurious ETIME* values are mapped to 0, and hence success. The
+ * mutexp->u.m.mutex must be locked upon entry. When returning a success
+ * or timeout status it will have been locked again.
+ *
+ * Returns:
+ * 0 if it is safe to retry to get the mutex
+ * DB_TIMEOUT if the timeout exceeded
+ * <other> a fatal error. The mutexp->u.m.mutex has been unlocked.
+ */
+inline static int
+__db_pthread_mutex_condwait(env, mutex, mutexp, timespec)
+ ENV *env;
+ db_mutex_t mutex;
+ DB_MUTEX *mutexp;
+ db_timespec *timespec;
+{
+ int ret;
+
+#ifdef MUTEX_DIAG
+ printf("condwait %ld %x wait busy %x count %d\n",
+ mutex, pthread_self(), MUTEXP_BUSY_FIELD(mutexp), mutexp->wait);
+#endif
+ PERFMON4(env, mutex, suspend, mutex, TRUE, mutexp->alloc_id, mutexp);
+
+ if (timespec != NULL) {
+ RET_SET((pthread_cond_timedwait(&mutexp->u.m.cond,
+ &mutexp->u.m.mutex, (struct timespec *) timespec)), ret);
+ if (ret == ETIMEDOUT) {
+ ret = DB_TIMEOUT;
+ goto ret;
+ }
+ } else
+ RET_SET((pthread_cond_wait(&mutexp->u.m.cond,
+ &mutexp->u.m.mutex)), ret);
+#ifdef MUTEX_DIAG
+ printf("condwait %ld %x wait returns %d busy %x\n",
+ mutex, pthread_self(), ret, MUTEXP_BUSY_FIELD(mutexp));
+#endif
+ /*
+ * !!!
+ * Solaris bug workaround: pthread_cond_wait() sometimes returns ETIME
+ * -- out of sheer paranoia, check both ETIME and ETIMEDOUT. We
+ * believe this happens when the application uses SIGALRM for some
+ * purpose, e.g., the C library sleep call, and Solaris delivers the
+ * signal to the wrong LWP.
+ */
+ if (ret != 0) {
+ if (ret == ETIMEDOUT ||
+#ifdef ETIME
+ ret == ETIME ||
+#endif
+ ret == EINTR)
+ ret = 0;
+ else
+ /* Failure, caller shouldn't condwait again. */
+ (void)pthread_mutex_unlock(&mutexp->u.m.mutex);
+ }
+
+ret:
+ PERFMON4(env, mutex, resume, mutex, TRUE, mutexp->alloc_id, mutexp);
+
+ COMPQUIET(mutex, 0);
+ COMPQUIET(env, 0);
+ return (ret);
+}
+
+#ifndef HAVE_MUTEX_HYBRID
+/*
+ * __db_pthread_mutex_lock
+ * Lock on a mutex, blocking if necessary.
+ * Timeouts are supported only for self-blocking mutexes.
+ *
+ * Self-blocking shared latches are not supported.
+ *
+ * PUBLIC: #ifndef HAVE_MUTEX_HYBRID
+ * PUBLIC: int __db_pthread_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ * PUBLIC: #endif
+ */
+int
+__db_pthread_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ db_timespec timespec;
+ int ret, t_ret;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ t_ret = 0;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#if defined(HAVE_STATISTICS)
+ /*
+ * We want to know which mutexes are contentious, but don't want to
+ * do an interlocked test here -- that's slower when the underlying
+ * system has adaptive mutexes and can perform optimizations like
+ * spinning only if the thread holding the mutex is actually running
+ * on a CPU. Make a guess, using a normal load instruction.
+ */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env, mutex, set_wait, mutexp->mutex_set_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+#endif
+
+ /* Single-thread the next block, except during the possible condwait. */
+ if ((ret = __db_pthread_mutex_prep(env, mutex, mutexp, TRUE)) != 0)
+ goto err;
+
+ if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
+ if (timeout != 0)
+ timespecclear(&timespec);
+ while (MUTEXP_IS_BUSY(mutexp)) {
+ /* Set expiration timer upon first need. */
+ if (timeout != 0 && !timespecisset(&timespec)) {
+ timespecclear(&timespec);
+ __clock_set_expires(env, &timespec, timeout);
+ }
+ t_ret = __db_pthread_mutex_condwait(env,
+ mutex, mutexp, timeout == 0 ? NULL : &timespec);
+ if (t_ret != 0) {
+ if (t_ret == DB_TIMEOUT)
+ goto out;
+ ret = t_ret;
+ goto err;
+ }
+ }
+
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+out:
+ /* #2471: HP-UX can sporadically return EFAULT. See above */
+ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
+ if (ret != 0)
+ goto err;
+ } else {
+#ifdef DIAGNOSTIC
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ char buf[DB_THREADID_STRLEN];
+ (void)dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf);
+ __db_errx(env, DB_STR_A("2022",
+ "pthread lock failed: lock currently in use: pid/tid: %s",
+ "%s"), buf);
+ ret = EINVAL;
+ goto err;
+ }
+#endif
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (t_ret);
+
+err:
+ __db_err(env, ret, DB_STR("2023", "pthread lock failed"));
+ return (__env_panic(env, ret));
+}
+#endif
+
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+/*
+ * __db_pthread_mutex_readlock
+ * Take a shared lock on a mutex, blocking if necessary.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_pthread_mutex_readlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_pthread_mutex_readlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#if defined(HAVE_STATISTICS)
+ /*
+ * We want to know which mutexes are contentious, but don't want to
+ * do an interlocked test here -- that's slower when the underlying
+ * system has adaptive mutexes and can perform optimizations like
+ * spinning only if the thread holding the mutex is actually running
+ * on a CPU. Make a guess, using a normal load instruction.
+ */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env,
+ mutex, set_rd_wait, mutexp->mutex_set_rd_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex);
+#endif
+
+ PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
+ RET_SET((pthread_rwlock_rdlock(&mutexp->u.rwlock)), ret);
+ PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
+ DB_ASSERT(env, !F_ISSET(mutexp, DB_MUTEX_LOCKED));
+ if (ret != 0)
+ goto err;
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (0);
+
+err: __db_err(env, ret, DB_STR("2024", "pthread readlock failed"));
+ return (__env_panic(env, ret));
+}
+#endif
+
+#ifdef HAVE_MUTEX_HYBRID
+/*
+ * __db_hybrid_mutex_suspend
+ * Suspend this thread until the mutex is free enough to give the caller a
+ * good chance of getting the mutex in the requested exclusivity mode.
+ *
+ * The major difference between this and the old __db_pthread_mutex_lock()
+ * is the additional 'exclusive' parameter.
+ *
+ * PUBLIC: #ifdef HAVE_MUTEX_HYBRID
+ * PUBLIC: int __db_hybrid_mutex_suspend
+ * PUBLIC: __P((ENV *, db_mutex_t, db_timespec *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_hybrid_mutex_suspend(env, mutex, timespec, exclusive)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timespec *timespec;
+ int exclusive;
+{
+ DB_MUTEX *mutexp;
+ int ret, t_ret;
+
+ t_ret = 0;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ if (!exclusive)
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK));
+
+ if ((ret = __db_pthread_mutex_prep(env, mutex, mutexp, exclusive)) != 0)
+ goto err;
+
+ /*
+ * Since this is only for hybrid mutexes the pthread mutex
+ * is only used to wait after spinning on the TAS mutex.
+ * Set the wait flag before checking to see if the mutex
+ * is still locked. The holder will clear DB_MUTEX_LOCKED
+ * before checking the wait counter.
+ */
+ mutexp->wait++;
+ MUTEX_MEMBAR(mutexp->wait);
+ while (exclusive ? MUTEXP_IS_BUSY(mutexp) :
+ atomic_read(&mutexp->sharecount) == MUTEX_SHARE_ISEXCLUSIVE) {
+ t_ret = __db_pthread_mutex_condwait(env,
+ mutex, mutexp, timespec);
+ if (t_ret != 0) {
+ if (t_ret == DB_TIMEOUT)
+ break;
+ ret = t_ret;
+ goto err;
+ }
+ MUTEX_MEMBAR(mutexp->flags);
+ }
+
+ mutexp->wait--;
+
+ /* #2471: HP-UX can sporadically return EFAULT. See above */
+ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
+ if (ret != 0)
+ goto err;
+
+ PERFMON4(env,
+ mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (t_ret);
+
+err:
+ PERFMON4(env,
+ mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
+ __db_err(env, ret, "pthread suspend failed");
+ return (__env_panic(env, ret));
+}
+#endif
+
+/*
+ * __db_pthread_mutex_unlock --
+ * Release a mutex, or, if hybrid, wake a thread up from a suspend.
+ *
+ * PUBLIC: int __db_pthread_mutex_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__db_pthread_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ int ret;
+#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
+ int waiters;
+#endif
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
+ waiters = mutexp->wait;
+#endif
+
+#if !defined(HAVE_MUTEX_HYBRID) && defined(DIAGNOSTIC)
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED | DB_MUTEX_SHARED)) {
+ __db_errx(env, DB_STR("2025",
+ "pthread unlock failed: lock already unlocked"));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
+ ret = __db_pthread_mutex_prep(env, mutex, mutexp, TRUE);
+ if (ret != 0)
+ goto err;
+
+#ifdef HAVE_MUTEX_HYBRID
+ STAT_INC(env,
+ mutex, hybrid_wakeup, mutexp->hybrid_wakeup, mutex);
+#else
+ F_CLR(mutexp, DB_MUTEX_LOCKED); /* nop if DB_MUTEX_SHARED */
+#endif
+
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ RET_SET(
+ (pthread_cond_broadcast(&mutexp->u.m.cond)), ret);
+ else
+ RET_SET((pthread_cond_signal(&mutexp->u.m.cond)), ret);
+ if (ret != 0)
+ goto err;
+ } else {
+#ifndef HAVE_MUTEX_HYBRID
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+#endif
+ }
+
+ /* See comment above; workaround for [#2471]. */
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ RETRY_ON_EFAULT(pthread_rwlock_unlock(&mutexp->u.rwlock), ret);
+ else
+#endif
+ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
+
+err: if (ret != 0) {
+ __db_err(env, ret, "pthread unlock failed");
+ return (__env_panic(env, ret));
+ }
+#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
+ if (!MUTEXP_IS_BUSY(mutexp) && mutexp->wait != 0)
+ printf("unlock %ld %x busy %x waiters %d/%d\n",
+ mutex, pthread_self(), ret,
+ MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait);
+#endif
+ return (ret);
+}
+
+/*
+ * __db_pthread_mutex_destroy --
+ * Destroy a mutex.
+ * If it is a native shared latch (not hybrid) then
+ * destroy only one half of the rwlock/mutex&cond union,
+ * depending whether it was allocated as shared
+ *
+ * PUBLIC: int __db_pthread_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_pthread_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+ DB_THREAD_INFO *ip;
+ int ret, t_ret, failchk_thread;
+
+ if (!MUTEX_ON(env))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ ret = 0;
+ failchk_thread = FALSE;
+ /* Get information to determine if we are really the failchk thread. */
+ if (F_ISSET(env->dbenv, DB_ENV_FAILCHK)) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ip != NULL && ip->dbth_state == THREAD_FAILCHK)
+ failchk_thread = TRUE;
+ }
+
+#ifndef HAVE_MUTEX_HYBRID
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+#if defined(HAVE_SHARED_LATCHES)
+ /*
+ * If there were dead processes waiting on the condition
+	 * we may not be able to destroy it. Let the failchk thread skip
+ * this, unless destroy is required.
+ * XXX What operating system resources might this leak?
+ */
+#ifdef HAVE_PTHREAD_RWLOCK_REINIT_OKAY
+ if (!failchk_thread)
+#endif
+ RET_SET(
+ (pthread_rwlock_destroy(&mutexp->u.rwlock)), ret);
+ /* For rwlocks, we're done - must not destroy rest of union */
+ return (ret);
+#endif
+ }
+#endif
+ if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
+ /*
+ * If there were dead processes waiting on the condition
+	 * we may not be able to destroy it. Let the failchk thread
+ * skip this, unless destroy is required.
+ */
+#ifdef HAVE_PTHREAD_COND_REINIT_OKAY
+ if (!failchk_thread)
+#endif
+ RET_SET((pthread_cond_destroy(&mutexp->u.m.cond)), ret);
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("2026",
+ "unable to destroy cond"));
+ }
+ RET_SET((pthread_mutex_destroy(&mutexp->u.m.mutex)), t_ret);
+ if (t_ret != 0 && !failchk_thread) {
+ __db_err(env, t_ret, DB_STR("2027",
+ "unable to destroy mutex"));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
diff --git a/src/mutex/mut_region.c b/src/mutex/mut_region.c
new file mode 100644
index 00000000..26ae0a03
--- /dev/null
+++ b/src/mutex/mut_region.c
@@ -0,0 +1,468 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static db_size_t __mutex_align_size __P((ENV *));
+static int __mutex_region_init __P((ENV *, DB_MUTEXMGR *));
+static size_t __mutex_region_size __P((ENV *));
+static size_t __mutex_region_max __P((ENV *));
+
+/*
+ * __mutex_open --
+ * Open a mutex region.
+ *
+ * PUBLIC: int __mutex_open __P((ENV *, int));
+ */
+int
+__mutex_open(env, create_ok)
+ ENV *env;
+ int create_ok;
+{
+ DB_ENV *dbenv;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ size_t size;
+ u_int32_t cpu_count;
+ int ret;
+#ifndef HAVE_ATOMIC_SUPPORT
+ u_int i;
+#endif
+
+ dbenv = env->dbenv;
+ if (dbenv->mutex_max == 0 &&
+ dbenv->mutex_cnt == 0 && dbenv->mutex_inc == 0 &&
+ F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
+ return (0);
+
+ /*
+ * Initialize the ENV handle information if not already initialized.
+ *
+ * Align mutexes on the byte boundaries specified by the application.
+ */
+ if (dbenv->mutex_align == 0)
+ dbenv->mutex_align = MUTEX_ALIGN;
+ if (dbenv->mutex_tas_spins == 0) {
+ cpu_count = __os_cpu_count();
+ if ((ret = __mutex_set_tas_spins(dbenv, cpu_count == 1 ?
+ cpu_count : cpu_count * MUTEX_SPINS_PER_PROCESSOR)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If the user didn't set an absolute value on the number of mutexes
+ * we'll need, figure it out. We're conservative in our allocation,
+ * we need mutexes for DB handles, group-commit queues and other things
+ * applications allocate at run-time. The application may have kicked
+ * up our count to allocate its own mutexes, add that in.
+ */
+ if (dbenv->mutex_cnt == 0 &&
+ F_ISSET(env, ENV_PRIVATE | ENV_THREAD) != ENV_PRIVATE)
+ dbenv->mutex_cnt =
+ __lock_region_mutex_count(env) +
+ __log_region_mutex_count(env) +
+ __memp_region_mutex_count(env) +
+ __txn_region_mutex_count(env);
+
+ if (dbenv->mutex_max != 0 && dbenv->mutex_cnt > dbenv->mutex_max)
+ dbenv->mutex_cnt = dbenv->mutex_max;
+
+ /* Create/initialize the mutex manager structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_MUTEXMGR), &mtxmgr)) != 0)
+ return (ret);
+
+ /* Join/create the mutex region. */
+ mtxmgr->reginfo.env = env;
+ mtxmgr->reginfo.type = REGION_TYPE_MUTEX;
+ mtxmgr->reginfo.id = INVALID_REGION_ID;
+ mtxmgr->reginfo.flags = REGION_JOIN_OK;
+ size = __mutex_region_size(env);
+ if (create_ok)
+ F_SET(&mtxmgr->reginfo, REGION_CREATE_OK);
+ if ((ret = __env_region_attach(env,
+ &mtxmgr->reginfo, size, size + __mutex_region_max(env))) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&mtxmgr->reginfo, REGION_CREATE))
+ if ((ret = __mutex_region_init(env, mtxmgr)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ mtxregion = mtxmgr->reginfo.primary =
+ R_ADDR(&mtxmgr->reginfo, mtxmgr->reginfo.rp->primary);
+ mtxmgr->mutex_array = R_ADDR(&mtxmgr->reginfo, mtxregion->mutex_off);
+
+ env->mutex_handle = mtxmgr;
+
+#ifndef HAVE_ATOMIC_SUPPORT
+ /* If necessary allocate the atomic emulation mutexes. */
+ if (F_ISSET(&mtxmgr->reginfo, REGION_CREATE))
+ for (i = 0; i != MAX_ATOMIC_MUTEXES; i++)
+ if ((ret = __mutex_alloc_int(
+ env, 0, MTX_ATOMIC_EMULATION,
+ 0, &mtxregion->mtx_atomic[i])) != 0)
+ return (ret);
+#endif
+
+ return (0);
+
+err: env->mutex_handle = NULL;
+ if (mtxmgr->reginfo.addr != NULL)
+ (void)__env_region_detach(env, &mtxmgr->reginfo, 0);
+
+ __os_free(env, mtxmgr);
+ return (ret);
+}
+
+/*
+ * __mutex_region_init --
+ * Initialize a mutex region in shared memory.
+ */
+static int
+__mutex_region_init(env, mtxmgr)
+ ENV *env;
+ DB_MUTEXMGR *mtxmgr;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t mutex;
+ int ret;
+ void *mutex_array;
+
+ dbenv = env->dbenv;
+
+ COMPQUIET(mutexp, NULL);
+
+ if ((ret = __env_alloc(&mtxmgr->reginfo,
+ sizeof(DB_MUTEXREGION), &mtxmgr->reginfo.primary)) != 0) {
+ __db_errx(env, DB_STR("2013",
+ "Unable to allocate memory for the mutex region"));
+ return (ret);
+ }
+ mtxmgr->reginfo.rp->primary =
+ R_OFFSET(&mtxmgr->reginfo, mtxmgr->reginfo.primary);
+ mtxregion = mtxmgr->reginfo.primary;
+ memset(mtxregion, 0, sizeof(*mtxregion));
+
+ mtxregion->mutex_size = __mutex_align_size(env);
+
+ mtxregion->stat.st_mutex_align = dbenv->mutex_align;
+ if (dbenv->mutex_cnt == 0)
+ dbenv->mutex_cnt = 1;
+ mtxregion->stat.st_mutex_init =
+ mtxregion->stat.st_mutex_cnt = dbenv->mutex_cnt;
+ mtxregion->stat.st_mutex_max = dbenv->mutex_max;
+ if (mtxregion->stat.st_mutex_max != 0)
+ mtxregion->stat.st_mutex_max += dbenv->mutex_inc;
+ mtxregion->stat.st_mutex_tas_spins = dbenv->mutex_tas_spins;
+
+ /*
+ * Get a chunk of memory to be used for the mutexes themselves. Each
+ * piece of the memory must be properly aligned, and that alignment
+ * may be more restrictive than the memory alignment returned by the
+ * underlying allocation code. We already know how much memory each
+ * mutex in the array will take up, but we need to offset the first
+ * mutex in the array so the array begins properly aligned.
+ *
+ * The OOB mutex (MUTEX_INVALID) is 0. To make this work, we ignore
+ * the first allocated slot when we build the free list. We have to
+ * correct the count by 1 here, though, otherwise our counter will be
+ * off by 1.
+ */
+ if ((ret = __env_alloc(&mtxmgr->reginfo,
+ mtxregion->stat.st_mutex_align +
+ (mtxregion->stat.st_mutex_cnt + 1) * mtxregion->mutex_size,
+ &mutex_array)) != 0) {
+ __db_errx(env, DB_STR("2014",
+ "Unable to allocate memory for mutexes from the region"));
+ return (ret);
+ }
+
+ mtxregion->mutex_off_alloc = R_OFFSET(&mtxmgr->reginfo, mutex_array);
+ mutex_array = ALIGNP_INC(mutex_array, mtxregion->stat.st_mutex_align);
+ mtxregion->mutex_off = R_OFFSET(&mtxmgr->reginfo, mutex_array);
+ mtxmgr->mutex_array = mutex_array;
+
+ /*
+ * Put the mutexes on a free list and clear the allocated flag.
+ *
+ * The OOB mutex (MUTEX_INVALID) is 0, skip it.
+ *
+ * The comparison is <, not <=, because we're looking ahead one
+ * in each link.
+ */
+ env->mutex_handle = mtxmgr;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mutex_array;
+ mutexp++;
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ mtxregion->mutex_next = (db_mutex_t)mutexp;
+ } else {
+ mtxregion->mutex_next = 1;
+ mutexp = MUTEXP_SET(env, 1);
+ }
+ for (mutex = 1; mutex < mtxregion->stat.st_mutex_cnt; ++mutex) {
+ mutexp->flags = 0;
+ if (F_ISSET(env, ENV_PRIVATE))
+ mutexp->mutex_next_link = (db_mutex_t)(mutexp + 1);
+ else
+ mutexp->mutex_next_link = mutex + 1;
+ mutexp++;
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+ mutexp->flags = 0;
+ mutexp->mutex_next_link = MUTEX_INVALID;
+ mtxregion->stat.st_mutex_free = mtxregion->stat.st_mutex_cnt;
+ mtxregion->stat.st_mutex_inuse = mtxregion->stat.st_mutex_inuse_max = 0;
+ if ((ret = __mutex_alloc(env, MTX_MUTEX_REGION, 0, &mutex)) != 0)
+ return (ret);
+ mtxmgr->reginfo.mtx_alloc = mtxregion->mtx_region = mutex;
+
+ /*
+ * This is the first place we can test mutexes and we need to
+ * know if they're working. (They CAN fail, for example on
+ * SunOS, when using fcntl(2) for locking and using an
+ * in-memory filesystem as the database environment directory.
+ * But you knew that, I'm sure -- it probably wasn't worth
+ * mentioning.)
+ */
+ mutex = MUTEX_INVALID;
+	if ((ret =
+	    __mutex_alloc(env, MTX_MUTEX_TEST, 0, &mutex)) != 0 ||
+ (ret = __mutex_lock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_trylock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_free(env, &mutex)) != 0) {
+ __db_errx(env, DB_STR("2015",
+ "Unable to acquire/release a mutex; check configuration"));
+ return (ret);
+ }
+#ifdef HAVE_SHARED_LATCHES
+	if ((ret = __mutex_alloc(env,
+	    MTX_MUTEX_TEST, DB_MUTEX_SHARED, &mutex)) != 0 ||
+ (ret = __mutex_lock(env, mutex)) != 0 ||
+ (ret = __mutex_tryrdlock(env, mutex)) != DB_LOCK_NOTGRANTED ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_rdlock(env, mutex)) != 0 ||
+ (ret = __mutex_rdlock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_free(env, &mutex)) != 0) {
+ __db_errx(env, DB_STR("2016",
+ "Unable to acquire/release a shared latch; check configuration"));
+ return (ret);
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * __mutex_env_refresh --
+ * Clean up after the mutex region on a close or failed open.
+ *
+ * PUBLIC: int __mutex_env_refresh __P((ENV *));
+ */
+int
+__mutex_env_refresh(env)
+ ENV *env;
+{
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ REGINFO *reginfo;
+ int ret;
+
+ mtxmgr = env->mutex_handle;
+ reginfo = &mtxmgr->reginfo;
+ mtxregion = mtxmgr->reginfo.primary;
+
+ /*
+	 * If a private region, return the memory to the heap. This isn't
+	 * needed for filesystem-backed or system shared memory regions;
+	 * that memory isn't owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ reginfo->mtx_alloc = MUTEX_INVALID;
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+ /*
+ * If destroying the mutex region, return any system resources
+ * to the system.
+ */
+ __mutex_resource_return(env, reginfo);
+#endif
+ /* Discard the mutex array. */
+ __env_alloc_free(
+ reginfo, R_ADDR(reginfo, mtxregion->mutex_off_alloc));
+ }
+
+ /* Detach from the region. */
+ ret = __env_region_detach(env, reginfo, 0);
+
+ __os_free(env, mtxmgr);
+
+ env->mutex_handle = NULL;
+
+ return (ret);
+}
+
+/*
+ * __mutex_align_size --
+ *	Return how much memory each mutex will take up when each element of
+ *	an array of mutexes is individually aligned within the array.
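+ *
+ *	For example (hypothetical sizes): a 40-byte DB_MUTEX with a
+ *	16-byte mutex_align rounds up to 48 bytes per array slot.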
+ */
+static db_size_t
+__mutex_align_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ return ((db_size_t)DB_ALIGN(sizeof(DB_MUTEX), dbenv->mutex_align));
+}
+
+/*
+ * __mutex_region_size --
+ * Return the amount of space needed for the mutex region.
+ */
+static size_t
+__mutex_region_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+
+ s = sizeof(DB_MUTEXMGR) + 1024;
+
+	/* One extra mutex is allocated for the never-used OOB slot. */
+ s += __env_alloc_size(
+	    (dbenv->mutex_cnt + 1) * __mutex_align_size(env));
+
+ return (s);
+}
+
+/*
+ * __mutex_region_max --
+ * Return the amount of space needed to reach the maximum size.
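+ *
+ *	For example (hypothetical settings): with mutex_max == 5000 and
+ *	mutex_cnt == 2000, the region may still grow by room for another
+ *	3000 aligned mutexes; with mutex_max == 0, the bound is instead
+ *	derived from the lock, txn and log subsystems' own estimates.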
+ */
+static size_t
+__mutex_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t max;
+
+ dbenv = env->dbenv;
+
+ if ((max = dbenv->mutex_max) == 0) {
+ if (F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
+ max = dbenv->mutex_inc + 1;
+ else
+ max = __lock_region_mutex_max(env) +
+ __txn_region_mutex_max(env) +
+ __log_region_mutex_max(env) +
+ dbenv->mutex_inc + 100;
+ } else if (max <= dbenv->mutex_cnt)
+ return (0);
+ else
+ max -= dbenv->mutex_cnt;
+
+	return (__env_alloc_size(max * __mutex_align_size(env)));
+}
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+/*
+ * __mutex_resource_return --
+ * Return any system-allocated mutex resources to the system.
+ *
+ * PUBLIC: void __mutex_resource_return __P((ENV *, REGINFO *));
+ */
+void
+__mutex_resource_return(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr, mtxmgr_st;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i, indx;
+ void *orig_handle, *chunk;
+ uintmax_t size;
+
+ /*
+	 * This routine is called in two cases: first, during recovery, when
+	 * discarding the regions from a previous Berkeley DB run; and second,
+	 * when discarding regions as we shut down the database environment.
+ *
+ * Walk the list of mutexes and destroy any live ones.
+ *
+ * This is just like joining a region -- the REGINFO we're handed is
+	 * the same as the one returned by __env_region_attach(); all we have
+ * to do is fill in the links.
+ *
+ * !!!
+ * The region may be corrupted, of course. We're safe because the
+ * only things we look at are things that are initialized when the
+ * region is created, and never modified after that.
+ */
+ memset(&mtxmgr_st, 0, sizeof(mtxmgr_st));
+ mtxmgr = &mtxmgr_st;
+ mtxmgr->reginfo = *infop;
+ mtxregion = mtxmgr->reginfo.primary =
+ R_ADDR(&mtxmgr->reginfo, mtxmgr->reginfo.rp->primary);
+ mtxmgr->mutex_array = R_ADDR(&mtxmgr->reginfo, mtxregion->mutex_off);
+
+ /*
+ * This is a little strange, but the mutex_handle is what all of the
+ * underlying mutex routines will use to determine if they should do
+ * any work and to find their information. Save/restore the handle
+ * around the work loop.
+ *
+ * The OOB mutex (MUTEX_INVALID) is 0, skip it.
+ */
+ orig_handle = env->mutex_handle;
+ env->mutex_handle = mtxmgr;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ chunk = NULL;
+ size = __env_elem_size(env,
+		    ROFF_TO_P(mtxregion->mutex_off_alloc));
+ size -= sizeof(*mutexp);
+ } else
+ mutexp = MUTEXP_SET(env, 1);
+ for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ if (F_ISSET(env, ENV_PRIVATE))
+ indx = (db_mutex_t)mutexp;
+ else
+ indx = i;
+ if (F_ISSET(mutexp, DB_MUTEX_ALLOCATED))
+ (void)__mutex_destroy(env, indx);
+ mutexp++;
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ mutexp = __env_get_chunk(&mtxmgr->reginfo,
+ &chunk, &size);
+ }
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+ env->mutex_handle = orig_handle;
+}
+#endif
diff --git a/src/mutex/mut_stat.c b/src/mutex/mut_stat.c
new file mode 100644
index 00000000..b64207fa
--- /dev/null
+++ b/src/mutex/mut_stat.c
@@ -0,0 +1,579 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __mutex_print_all __P((ENV *, u_int32_t));
+static const char *__mutex_print_id __P((int));
+static int __mutex_print_stats __P((ENV *, u_int32_t));
+static void __mutex_print_summary __P((ENV *));
+static int __mutex_stat __P((ENV *, DB_MUTEX_STAT **, u_int32_t));
+
+/*
+ * __mutex_stat_pp --
+ * ENV->mutex_stat pre/post processing.
+ *
+ * PUBLIC: int __mutex_stat_pp __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
+ */
+int
+__mutex_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mutex_handle, "DB_ENV->mutex_stat", DB_INIT_MUTEX);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->mutex_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__mutex_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_stat --
+ * ENV->mutex_stat.
+ */
+static int
+__mutex_stat(env, statp, flags)
+ ENV *env;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_MUTEX_STAT *stats;
+ int ret;
+
+ *statp = NULL;
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+
+ if ((ret = __os_umalloc(env, sizeof(DB_MUTEX_STAT), &stats)) != 0)
+ return (ret);
+
+ MUTEX_SYSTEM_LOCK(env);
+
+ /*
+ * Most fields are maintained in the underlying region structure.
+ * Region size and region mutex are not.
+ */
+ *stats = mtxregion->stat;
+ stats->st_regsize = mtxmgr->reginfo.rp->size;
+ stats->st_regmax = mtxmgr->reginfo.rp->max;
+ __mutex_set_wait_info(env, mtxregion->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ if (LF_ISSET(DB_STAT_CLEAR))
+ __mutex_clear(env, mtxregion->mtx_region);
+
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __mutex_stat_print_pp --
+ * ENV->mutex_stat_print pre/post processing.
+ *
+ * PUBLIC: int __mutex_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mutex_handle, "DB_ENV->mutex_stat_print", DB_INIT_MUTEX);
+
+ if ((ret = __db_fchk(env, "DB_ENV->mutex_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__mutex_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_stat_print --
+ * ENV->mutex_stat_print method.
+ *
+ * PUBLIC: int __mutex_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__mutex_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __mutex_print_stats(env, orig_flags);
+ __mutex_print_summary(env);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+	if (LF_ISSET(DB_STAT_ALL) &&
+	    (ret = __mutex_print_all(env, orig_flags)) != 0)
+		return (ret);
+
+	return (0);
+}
+
+static void
+__mutex_print_summary(env)
+ ENV *env;
+{
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ void *chunk;
+ db_mutex_t i;
+ u_int32_t counts[MTX_MAX_ENTRY + 2];
+ uintmax_t size;
+ int alloc_id;
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ memset(counts, 0, sizeof(counts));
+ size = 0;
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ chunk = NULL;
+ size = __env_elem_size(env,
+ ROFF_TO_P(mtxregion->mutex_off_alloc));
+ size -= sizeof(*mutexp);
+ } else
+ mutexp = MUTEXP_SET(env, 1);
+ for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED))
+ counts[0]++;
+ else if (mutexp->alloc_id > MTX_MAX_ENTRY)
+ counts[MTX_MAX_ENTRY + 1]++;
+ else
+ counts[mutexp->alloc_id]++;
+
+ mutexp++;
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ mutexp =
+ __env_get_chunk(&mtxmgr->reginfo, &chunk, &size);
+ }
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+ __db_msg(env, "Mutex counts");
+	__db_msg(env, "%lu\tUnallocated", (u_long)counts[0]);
+	for (alloc_id = 1; alloc_id <= MTX_MAX_ENTRY + 1; alloc_id++)
+		if (counts[alloc_id] != 0)
+			__db_msg(env, "%lu\t%s",
+			    (u_long)counts[alloc_id],
+			    __mutex_print_id(alloc_id));
+}
+
+/*
+ * __mutex_print_stats --
+ * Display default mutex region statistics.
+ */
+static int
+__mutex_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_MUTEX_STAT *sp;
+ int ret;
+
+ if ((ret = __mutex_stat(env, &sp, LF_ISSET(DB_STAT_CLEAR))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default mutex region information:");
+
+ __db_dlbytes(env, "Mutex region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dlbytes(env, "Mutex region max size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regmax);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+ STAT_ULONG("Mutex alignment", sp->st_mutex_align);
+ STAT_ULONG("Mutex test-and-set spins", sp->st_mutex_tas_spins);
+ STAT_ULONG("Mutex initial count", sp->st_mutex_init);
+ STAT_ULONG("Mutex total count", sp->st_mutex_cnt);
+ STAT_ULONG("Mutex max count", sp->st_mutex_max);
+ STAT_ULONG("Mutex free count", sp->st_mutex_free);
+ STAT_ULONG("Mutex in-use count", sp->st_mutex_inuse);
+ STAT_ULONG("Mutex maximum in-use count", sp->st_mutex_inuse_max);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __mutex_print_all --
+ * Display debugging mutex region statistics.
+ */
+static int
+__mutex_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_MUTEX_ALLOCATED, "alloc" },
+ { DB_MUTEX_LOCKED, "locked" },
+ { DB_MUTEX_LOGICAL_LOCK, "logical" },
+ { DB_MUTEX_PROCESS_ONLY, "process-private" },
+ { DB_MUTEX_SELF_BLOCK, "self-block" },
+ { 0, NULL }
+ };
+ DB_MSGBUF mb, *mbp;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i;
+ uintmax_t size;
+ void *chunk;
+
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+
+ __db_print_reginfo(env, &mtxmgr->reginfo, "Mutex", flags);
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+ __db_msg(env, "DB_MUTEXREGION structure:");
+ __mutex_print_debug_single(env,
+ "DB_MUTEXREGION region mutex", mtxregion->mtx_region, flags);
+ STAT_ULONG("Size of the aligned mutex", mtxregion->mutex_size);
+ STAT_ULONG("Next free mutex", mtxregion->mutex_next);
+
+ /*
+ * The OOB mutex (MUTEX_INVALID) is 0, skip it.
+ *
+ * We're not holding the mutex region lock, so we're racing threads of
+	 * control allocating mutexes. That's OK; it just means we display or
+ * clear statistics while mutexes are moving.
+ */
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "mutex\twait/nowait, pct wait, holder, flags");
+ size = 0;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ chunk = NULL;
+ size = __env_elem_size(env,
+ ROFF_TO_P(mtxregion->mutex_off_alloc));
+ size -= sizeof(*mutexp);
+ } else
+ mutexp = MUTEXP_SET(env, 1);
+ for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ if (F_ISSET(mutexp, DB_MUTEX_ALLOCATED)) {
+ __db_msgadd(env, mbp, "%5lu\t", (u_long)i);
+
+ __mutex_print_debug_stats(env, mbp,
+ F_ISSET(env, ENV_PRIVATE) ?
+ (db_mutex_t)mutexp : i, flags);
+
+ if (mutexp->alloc_id != 0)
+ __db_msgadd(env, mbp,
+ ", %s", __mutex_print_id(mutexp->alloc_id));
+
+ __db_prflags(env, mbp, mutexp->flags, fn, " (", ")");
+
+ DB_MSGBUF_FLUSH(env, mbp);
+ }
+
+ mutexp++;
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ mutexp =
+ __env_get_chunk(&mtxmgr->reginfo, &chunk, &size);
+ }
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+
+ return (0);
+}
+
+/*
+ * __mutex_print_debug_single --
+ * Print mutex internal debugging statistics for a single mutex on a
+ * single output line.
+ *
+ * PUBLIC: void __mutex_print_debug_single
+ * PUBLIC: __P((ENV *, const char *, db_mutex_t, u_int32_t));
+ */
+void
+__mutex_print_debug_single(env, tag, mutex, flags)
+ ENV *env;
+ const char *tag;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb, *mbp;
+
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+
+ if (LF_ISSET(DB_STAT_SUBSYSTEM))
+ LF_CLR(DB_STAT_CLEAR);
+ __db_msgadd(env, mbp, "%lu\t%s ", (u_long)mutex, tag);
+ __mutex_print_debug_stats(env, mbp, mutex, flags);
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __mutex_print_debug_stats --
+ * Print mutex internal debugging statistics, that is, the statistics
+ * in the [] square brackets.
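+ *
+ *	A line might render as, e.g. (illustrative values):
+ *	[12/3456 0% !Own], i.e. wait/nowait acquisition counts, the
+ *	percentage of acquisitions that waited, and the holder's thread
+ *	ID or ownership state.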
+ *
+ * PUBLIC: void __mutex_print_debug_stats
+ * PUBLIC: __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t));
+ */
+void
+__mutex_print_debug_stats(env, mbp, mutex, flags)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ u_long value;
+ char buf[DB_THREADID_STRLEN];
+#if defined(HAVE_SHARED_LATCHES) && (defined(HAVE_MUTEX_HYBRID) || \
+ !defined(HAVE_MUTEX_PTHREADS))
+ int sharecount;
+#endif
+
+ if (mutex == MUTEX_INVALID) {
+ __db_msgadd(env, mbp, "[!Set]");
+ return;
+ }
+
+ dbenv = env->dbenv;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ __db_msgadd(env, mbp, "[");
+ if ((value = mutexp->mutex_set_wait) < 10000000)
+ __db_msgadd(env, mbp, "%lu", value);
+ else
+ __db_msgadd(env, mbp, "%luM", value / 1000000);
+ if ((value = mutexp->mutex_set_nowait) < 10000000)
+ __db_msgadd(env, mbp, "/%lu", value);
+ else
+ __db_msgadd(env, mbp, "/%luM", value / 1000000);
+
+ __db_msgadd(env, mbp, " %d%% ",
+ DB_PCT(mutexp->mutex_set_wait,
+ mutexp->mutex_set_wait + mutexp->mutex_set_nowait));
+
+#if defined(HAVE_SHARED_LATCHES)
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ __db_msgadd(env, mbp, " rd ");
+ if ((value = mutexp->mutex_set_rd_wait) < 10000000)
+ __db_msgadd(env, mbp, "%lu", value);
+ else
+ __db_msgadd(env, mbp, "%luM", value / 1000000);
+ if ((value = mutexp->mutex_set_rd_nowait) < 10000000)
+ __db_msgadd(env, mbp, "/%lu", value);
+ else
+ __db_msgadd(env, mbp, "/%luM", value / 1000000);
+ __db_msgadd(env, mbp, " %d%% ",
+ DB_PCT(mutexp->mutex_set_rd_wait,
+ mutexp->mutex_set_rd_wait + mutexp->mutex_set_rd_nowait));
+ }
+#endif
+
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ __db_msgadd(env, mbp, "%s]",
+ dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ /* Pthreads-based shared latches do not expose the share count. */
+#if defined(HAVE_SHARED_LATCHES) && (defined(HAVE_MUTEX_HYBRID) || \
+ !defined(HAVE_MUTEX_PTHREADS))
+ else if (F_ISSET(mutexp, DB_MUTEX_SHARED) &&
+ (sharecount = atomic_read(&mutexp->sharecount)) != 0) {
+ if (sharecount == 1)
+ __db_msgadd(env, mbp, "1 reader");
+ else
+ __db_msgadd(env, mbp, "%d readers", sharecount);
+ /* Show the thread which last acquired the latch. */
+ __db_msgadd(env, mbp, " %s]",
+ dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ }
+#endif
+ else
+ __db_msgadd(env, mbp, "!Own]");
+
+#ifdef HAVE_MUTEX_HYBRID
+ if (mutexp->hybrid_wait != 0 || mutexp->hybrid_wakeup != 0)
+ __db_msgadd(env, mbp, " <wakeups %d/%d>",
+ mutexp->hybrid_wait, mutexp->hybrid_wakeup);
+#endif
+
+ if (LF_ISSET(DB_STAT_CLEAR))
+ __mutex_clear(env, mutex);
+}
+
+static const char *
+__mutex_print_id(alloc_id)
+ int alloc_id;
+{
+ switch (alloc_id) {
+ case MTX_APPLICATION: return ("application allocated");
+ case MTX_ATOMIC_EMULATION: return ("atomic emulation");
+ case MTX_DB_HANDLE: return ("db handle");
+ case MTX_ENV_DBLIST: return ("env dblist");
+ case MTX_ENV_EXCLDBLIST: return ("env exclusive dblist");
+ case MTX_ENV_HANDLE: return ("env handle");
+ case MTX_ENV_REGION: return ("env region");
+ case MTX_LOCK_REGION: return ("lock region");
+ case MTX_LOGICAL_LOCK: return ("logical lock");
+ case MTX_LOG_FILENAME: return ("log filename");
+ case MTX_LOG_FLUSH: return ("log flush");
+ case MTX_LOG_HANDLE: return ("log handle");
+ case MTX_LOG_REGION: return ("log region");
+ case MTX_MPOOLFILE_HANDLE: return ("mpoolfile handle");
+ case MTX_MPOOL_BH: return ("mpool buffer");
+ case MTX_MPOOL_FH: return ("mpool filehandle");
+ case MTX_MPOOL_FILE_BUCKET: return ("mpool file bucket");
+ case MTX_MPOOL_HANDLE: return ("mpool handle");
+ case MTX_MPOOL_HASH_BUCKET: return ("mpool hash bucket");
+ case MTX_MPOOL_REGION: return ("mpool region");
+ case MTX_MUTEX_REGION: return ("mutex region");
+ case MTX_MUTEX_TEST: return ("mutex test");
+ case MTX_REPMGR: return ("replication manager");
+ case MTX_REP_CHKPT: return ("replication checkpoint");
+ case MTX_REP_DATABASE: return ("replication database");
+ case MTX_REP_DIAG: return ("replication diagnostics");
+ case MTX_REP_EVENT: return ("replication event");
+ case MTX_REP_REGION: return ("replication region");
+ case MTX_REP_START: return ("replication role config");
+ case MTX_REP_WAITER: return ("replication txn apply");
+ case MTX_SEQUENCE: return ("sequence");
+ case MTX_TWISTER: return ("twister");
+ case MTX_TCL_EVENTS: return ("Tcl events");
+ case MTX_TXN_ACTIVE: return ("txn active list");
+ case MTX_TXN_CHKPT: return ("transaction checkpoint");
+ case MTX_TXN_COMMIT: return ("txn commit");
+ case MTX_TXN_MVCC: return ("txn mvcc");
+ case MTX_TXN_REGION: return ("txn region");
+ default: return ("unknown mutex type");
+ /* NOTREACHED */
+ }
+}
+
+/*
+ * __mutex_set_wait_info --
+ * Return mutex statistics.
+ *
+ * PUBLIC: void __mutex_set_wait_info
+ * PUBLIC: __P((ENV *, db_mutex_t, uintmax_t *, uintmax_t *));
+ */
+void
+__mutex_set_wait_info(env, mutex, waitp, nowaitp)
+ ENV *env;
+ db_mutex_t mutex;
+ uintmax_t *waitp, *nowaitp;
+{
+ DB_MUTEX *mutexp;
+
+ if (mutex == MUTEX_INVALID) {
+ *waitp = 0;
+ *nowaitp = 0;
+ return;
+ }
+ mutexp = MUTEXP_SET(env, mutex);
+
+ *waitp = mutexp->mutex_set_wait;
+ *nowaitp = mutexp->mutex_set_nowait;
+}
+
+/*
+ * __mutex_clear --
+ * Clear mutex statistics.
+ *
+ * PUBLIC: void __mutex_clear __P((ENV *, db_mutex_t));
+ */
+void
+__mutex_clear(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+
+ if (!MUTEX_ON(env))
+ return;
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ mutexp->mutex_set_wait = mutexp->mutex_set_nowait = 0;
+#ifdef HAVE_SHARED_LATCHES
+ mutexp->mutex_set_rd_wait = mutexp->mutex_set_rd_nowait = 0;
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+ mutexp->hybrid_wait = mutexp->hybrid_wakeup = 0;
+#endif
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__mutex_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__mutex_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/mutex/mut_stub.c b/src/mutex/mut_stub.c
new file mode 100644
index 00000000..61ecc80c
--- /dev/null
+++ b/src/mutex/mut_stub.c
@@ -0,0 +1,252 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#ifndef HAVE_MUTEX_SUPPORT
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * If the library wasn't compiled with mutex support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_nomutex __P((ENV *));
+
+/*
+ * __db_nomutex --
+ * Error when a Berkeley DB build doesn't include mutexes.
+ */
+static int
+__db_nomutex(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("2001",
+ "library build did not include support for mutexes"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__mutex_alloc_pp(dbenv, flags, indxp)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ COMPQUIET(flags, 0);
+ COMPQUIET(indxp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_alloc(env, alloc_id, flags, indxp)
+ ENV *env;
+ int alloc_id;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(alloc_id, 0);
+ COMPQUIET(flags, 0);
+ *indxp = MUTEX_INVALID;
+ return (0);
+}
+
+void
+__mutex_clear(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+}
+
+int
+__mutex_free_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ COMPQUIET(indx, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_free(env, indxp)
+ ENV *env;
+ db_mutex_t *indxp;
+{
+ COMPQUIET(env, NULL);
+ *indxp = MUTEX_INVALID;
+ return (0);
+}
+
+int
+__mutex_get_align(dbenv, alignp)
+ DB_ENV *dbenv;
+ u_int32_t *alignp;
+{
+ COMPQUIET(alignp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_increment(dbenv, incrementp)
+ DB_ENV *dbenv;
+ u_int32_t *incrementp;
+{
+ COMPQUIET(incrementp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_max(dbenv, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *maxp;
+{
+ COMPQUIET(maxp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_tas_spins(dbenv, tas_spinsp)
+ DB_ENV *dbenv;
+ u_int32_t *tas_spinsp;
+{
+ COMPQUIET(tas_spinsp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_lock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ COMPQUIET(indx, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+void
+__mutex_print_debug_single(env, tag, mutex, flags)
+ ENV *env;
+ const char *tag;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(tag, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ COMPQUIET(flags, 0);
+}
+
+void
+__mutex_print_debug_stats(env, mbp, mutex, flags)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mbp, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ COMPQUIET(flags, 0);
+}
+
+int
+__mutex_set_align(dbenv, align)
+ DB_ENV *dbenv;
+ u_int32_t align;
+{
+ COMPQUIET(align, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_increment(dbenv, increment)
+ DB_ENV *dbenv;
+ u_int32_t increment;
+{
+ COMPQUIET(increment, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_init(dbenv, initp)
+ DB_ENV *dbenv;
+ u_int32_t *initp;
+{
+ COMPQUIET(initp, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_init(dbenv, init)
+ DB_ENV *dbenv;
+ u_int32_t init;
+{
+ COMPQUIET(init, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_max(dbenv, max)
+ DB_ENV *dbenv;
+ u_int32_t max;
+{
+ COMPQUIET(max, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_tas_spins(dbenv, tas_spins)
+ DB_ENV *dbenv;
+ u_int32_t tas_spins;
+{
+ COMPQUIET(tas_spins, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+void
+__mutex_set_wait_info(env, mutex, waitp, nowaitp)
+ ENV *env;
+ db_mutex_t mutex;
+ uintmax_t *waitp, *nowaitp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ *waitp = *nowaitp = 0;
+}
+
+int
+__mutex_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_unlock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ COMPQUIET(indx, 0);
+ return (__db_nomutex(dbenv->env));
+}
+#endif /* !HAVE_MUTEX_SUPPORT */
diff --git a/src/mutex/mut_tas.c b/src/mutex/mut_tas.c
new file mode 100644
index 00000000..0899d237
--- /dev/null
+++ b/src/mutex/mut_tas.c
@@ -0,0 +1,608 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+static inline int __db_tas_mutex_lock_int
+ __P((ENV *, db_mutex_t, db_timeout_t, int));
+static inline int __db_tas_mutex_readlock_int __P((ENV *, db_mutex_t, int));
+
+/*
+ * __db_tas_mutex_init --
+ * Initialize a test-and-set mutex.
+ *
+ * PUBLIC: int __db_tas_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_tas_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ int ret;
+
+#ifndef HAVE_MUTEX_HYBRID
+ COMPQUIET(flags, 0);
+#endif
+
+ dbenv = env->dbenv;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ /* Check alignment. */
+ if (((uintptr_t)mutexp & (dbenv->mutex_align - 1)) != 0) {
+ __db_errx(env, DB_STR("2028",
+ "TAS: mutex not appropriately aligned"));
+ return (EINVAL);
+ }
+
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ atomic_init(&mutexp->sharecount, 0);
+ else
+#endif
+ if (MUTEX_INIT(&mutexp->tas)) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("2029",
+ "TAS: mutex initialize"));
+ return (__os_posix_err(ret));
+ }
+#ifdef HAVE_MUTEX_HYBRID
+ if ((ret = __db_pthread_mutex_init(env,
+ mutex, flags | DB_MUTEX_SELF_BLOCK)) != 0)
+ return (ret);
+#endif
+ return (0);
+}
+
+/*
+ * __db_tas_mutex_lock_int --
+ *	Internal function to lock a mutex, or just try to lock it without
+ *	waiting.
+ */
+static inline int
+__db_tas_mutex_lock_int(env, mutex, timeout, nowait)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+ int nowait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_THREAD_INFO *ip;
+ db_timespec now, timespec;
+ u_int32_t nspins;
+ int ret;
+#ifdef HAVE_MUTEX_HYBRID
+ const u_long micros = 0;
+#else
+ u_long micros, max_micros;
+ db_timeout_t time_left;
+#endif
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#ifdef HAVE_STATISTICS
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env, mutex, set_wait, mutexp->mutex_set_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+#endif
+
+#ifndef HAVE_MUTEX_HYBRID
+ /*
+ * Wait 1ms initially, up to 10ms for mutexes backing logical database
+ * locks, and up to 25 ms for mutual exclusion data structure mutexes.
+ * SR: #7675
+ */
+ micros = 1000;
+ max_micros = F_ISSET(mutexp, DB_MUTEX_LOGICAL_LOCK) ? 10000 : 25000;
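+	/*
+	 * The yield below then doubles the wait on each pass, so the
+	 * backoff runs 1, 2, 4, ... ms until it reaches the cap above.
+	 */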
+#endif
+
+	/* Clear the ending timespec so it'll be initialized upon first need. */
+ if (timeout != 0)
+ timespecclear(&timespec);
+
+ /*
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk
+ * thread, then ip will have a valid value subsequent times
+ * in the loop.
+ */
+ ip = NULL;
+
+loop: /* Attempt to acquire the resource for N spins. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+#ifdef HAVE_MUTEX_S390_CC_ASSEMBLY
+ tsl_t zero;
+
+ zero = 0;
+#endif
+
+#ifdef HAVE_MUTEX_HPPA_MSEM_INIT
+ relock:
+#endif
+ /*
+ * Avoid interlocked instructions until they're likely to
+		 * succeed: first check whether the mutex appears to be
+		 * held (the classic test-and-test-and-set pattern).
+ */
+ if (MUTEXP_IS_BUSY(mutexp) || !MUTEXP_ACQUIRE(mutexp)) {
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ ip == NULL && dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+ if (nowait)
+ return (DB_LOCK_NOTGRANTED);
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause here. [#6975]
+ */
+ MUTEX_PAUSE
+ continue;
+ }
+
+ MEMBAR_ENTER();
+
+#ifdef HAVE_MUTEX_HPPA_MSEM_INIT
+ /*
+ * HP semaphores are unlocked automatically when a holding
+ * process exits. If the mutex appears to be locked
+ * (F_ISSET(DB_MUTEX_LOCKED)) but we got here, assume this
+ * has happened. Set the pid and tid into the mutex and
+ * lock again. (The default state of the mutexes used to
+ * block in __lock_get_internal is locked, so exiting with
+ * a locked mutex is reasonable behavior for a process that
+ * happened to initialize or use one of them.)
+ */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ goto relock;
+ }
+ /*
+ * If we make it here, the mutex isn't locked, the diagnostic
+ * won't fire, and we were really unlocked by someone calling
+ * the DB mutex unlock function.
+ */
+#endif
+#ifdef DIAGNOSTIC
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ char buf[DB_THREADID_STRLEN];
+ __db_errx(env, DB_STR_A("2030",
+ "TAS lock failed: lock %ld currently in use: ID: %s",
+ "%ld %s"), (long)mutex,
+ dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield
+ * every time we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (0);
+ }
+
+ /*
+ * We need to wait for the lock to become available.
+	 * Possibly set up timeouts if this is the first wait, or
+ * check expiration times for the second and subsequent waits.
+ */
+ if (timeout != 0) {
+		/* Set the expiration time if this is the first sleep. */
+ if (!timespecisset(&timespec))
+ __clock_set_expires(env, &timespec, timeout);
+ else {
+ timespecclear(&now);
+ if (__clock_expired(env, &now, &timespec))
+ return (DB_TIMEOUT);
+#ifndef HAVE_MUTEX_HYBRID
+ timespecsub(&now, &timespec);
+ DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0);
+ time_left = timeout - time_left;
+ if (micros > time_left)
+ micros = time_left;
+#endif
+ }
+ }
+
+ /*
+	 * This yields for a while for tas mutexes, and just gives up the
+	 * processor for hybrid mutexes. By yielding here we can often get
+	 * the other thread to release the mutex before we make the more
+	 * expensive library mutex call. Tests have shown this to be a big
+	 * win when there is contention.
+ */
+ PERFMON4(env, mutex, suspend, mutex, TRUE, mutexp->alloc_id, mutexp);
+ __os_yield(env, 0, micros);
+ PERFMON4(env, mutex, resume, mutex, TRUE, mutexp->alloc_id, mutexp);
+
+#if defined(HAVE_MUTEX_HYBRID)
+ if (!MUTEXP_IS_BUSY(mutexp))
+ goto loop;
+ /* Wait until the mutex can be obtained exclusively or it times out. */
+ if ((ret = __db_hybrid_mutex_suspend(env,
+ mutex, timeout == 0 ? NULL : &timespec, TRUE)) != 0)
+ return (ret);
+#else
+ if ((micros <<= 1) > max_micros)
+ micros = max_micros;
+#endif
+
+ /*
+ * We're spinning. The environment might be hung, and somebody else
+ * has already recovered it. The first thing recovery does is panic
+ * the environment. Check to see if we're never going to get this
+ * mutex.
+ */
+ PANIC_CHECK(env);
+
+ goto loop;
+}
+
+/*
+ * __db_tas_mutex_lock
+ * Lock on a mutex, blocking if necessary.
+ *
+ * PUBLIC: int __db_tas_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ */
+int
+__db_tas_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ return (__db_tas_mutex_lock_int(env, mutex, timeout, 0));
+}
+
+/*
+ * __db_tas_mutex_trylock
+ * Try to exclusively lock a mutex without ever blocking - ever!
+ *
+ * Returns 0 on success,
+ * DB_LOCK_NOTGRANTED on timeout
+ * Possibly DB_RUNRECOVERY if DB_ENV_FAILCHK or panic.
+ *
+ * This will work for DB_MUTEX_SHARED, though it always tries
+ * for exclusive access.
+ *
+ * PUBLIC: int __db_tas_mutex_trylock __P((ENV *, db_mutex_t));
+ */
+int
+__db_tas_mutex_trylock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_tas_mutex_lock_int(env, mutex, 0, 1));
+}
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * __db_tas_mutex_readlock_int
+ * Internal function to get a shared lock on a latch, blocking if necessary.
+ *
+ */
+static inline int
+__db_tas_mutex_readlock_int(env, mutex, nowait)
+ ENV *env;
+ db_mutex_t mutex;
+ int nowait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_THREAD_INFO *ip;
+ int lock;
+ u_int32_t nspins;
+ int ret;
+#ifndef HAVE_MUTEX_HYBRID
+ u_long micros, max_micros;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
+#ifdef HAVE_STATISTICS
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env,
+ mutex, set_rd_wait, mutexp->mutex_set_rd_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex);
+#endif
+
+#ifndef HAVE_MUTEX_HYBRID
+ /*
+ * Wait 1ms initially, up to 10ms for mutexes backing logical database
+ * locks, and up to 25 ms for mutual exclusion data structure mutexes.
+ * SR: #7675
+ */
+ micros = 1000;
+ max_micros = F_ISSET(mutexp, DB_MUTEX_LOGICAL_LOCK) ? 10000 : 25000;
+#endif
+
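+	/*
+	 * The share count doubles as the lock word: MUTEX_SHARE_ISEXCLUSIVE
+	 * marks a write lock; otherwise it holds the current reader count,
+	 * and a reader enters by atomically incrementing it.
+	 */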
+loop: /* Attempt to acquire the resource for N spins. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+ lock = atomic_read(&mutexp->sharecount);
+ if (lock == MUTEX_SHARE_ISEXCLUSIVE ||
+ !atomic_compare_exchange(env,
+ &mutexp->sharecount, lock, lock + 1)) {
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause here. [#6975]
+ */
+ MUTEX_PAUSE
+ continue;
+ }
+
+ MEMBAR_ENTER();
+		/* For shared latches the threadid is the last requestor's. */
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+
+ return (0);
+ }
+
+ /*
+	 * Waiting for the latch must be avoided when it could allow a
+ * 'failchk'ing thread to hang.
+ */
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ dbenv->is_alive(dbenv, mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 || ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+
+ /*
+ * It is possible to spin out when the latch is just shared, due to
+ * many threads or interrupts interfering with the compare&exchange.
+ * Avoid spurious DB_LOCK_NOTGRANTED returns by retrying.
+ */
+ if (nowait) {
+ if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE)
+ goto loop;
+ return (DB_LOCK_NOTGRANTED);
+ }
+
+ /* Wait for the lock to become available. */
+#ifdef HAVE_MUTEX_HYBRID
+ /*
+ * By yielding here we can get the other thread to give up the
+ * mutex before calling the more expensive library mutex call.
+ * Tests have shown this to be a big win when there is contention.
+ */
+ PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
+ __os_yield(env, 0, 0);
+ PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
+ if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE)
+ goto loop;
+ /* Wait until the mutex is no longer exclusively locked. */
+ if ((ret = __db_hybrid_mutex_suspend(env, mutex, NULL, FALSE)) != 0)
+ return (ret);
+#else
+ PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
+ __os_yield(env, 0, micros);
+ PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
+ if ((micros <<= 1) > max_micros)
+ micros = max_micros;
+#endif
+
+ /*
+ * We're spinning. The environment might be hung, and somebody else
+ * has already recovered it. The first thing recovery does is panic
+ * the environment. Check to see if we're never going to get this
+ * mutex.
+ */
+ PANIC_CHECK(env);
+
+ goto loop;
+}
+
+/*
+ * __db_tas_mutex_readlock
+ * Get a shared lock on a latch, waiting if necessary.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_tas_mutex_readlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_tas_mutex_readlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_tas_mutex_readlock_int(env, mutex, 0));
+}
+
+/*
+ * __db_tas_mutex_tryreadlock
+ * Try to get a shared lock on a latch; don't wait when busy.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_tas_mutex_tryreadlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_tas_mutex_tryreadlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_tas_mutex_readlock_int(env, mutex, 1));
+}
+#endif
+
+/*
+ * __db_tas_mutex_unlock --
+ * Release a mutex.
+ *
+ * PUBLIC: int __db_tas_mutex_unlock __P((ENV *, db_mutex_t));
+ *
+ * Hybrid shared latch wakeup
+ * When an exclusive requester waits for the last shared holder to
+ * release, it increments mutexp->wait and pthread_cond_wait()'s. The
+ * last shared unlock calls __db_pthread_mutex_unlock() to wake it.
+ */
+int
+__db_tas_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+#ifdef HAVE_MUTEX_HYBRID
+ int ret;
+#ifdef MUTEX_DIAG
+ int waiters;
+#endif
+#endif
+#ifdef HAVE_SHARED_LATCHES
+ int sharecount;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+#if defined(HAVE_MUTEX_HYBRID) && defined(MUTEX_DIAG)
+ waiters = mutexp->wait;
+#endif
+
+#if defined(DIAGNOSTIC)
+#if defined(HAVE_SHARED_LATCHES)
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ if (atomic_read(&mutexp->sharecount) == 0) {
+ __db_errx(env, DB_STR_A("2031",
+ "shared unlock %ld already unlocked", "%ld"),
+ (long)mutex);
+ return (__env_panic(env, EACCES));
+ }
+ } else
+#endif
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ __db_errx(env, DB_STR_A("2032",
+ "unlock %ld already unlocked", "%ld"), (long)mutex);
+ return (__env_panic(env, EACCES));
+ }
+#endif
+
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ sharecount = atomic_read(&mutexp->sharecount);
+ /*MUTEX_MEMBAR(mutexp->sharecount);*/ /* XXX why? */
+ if (sharecount == MUTEX_SHARE_ISEXCLUSIVE) {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ /* Flush flag update before zeroing count */
+ MEMBAR_EXIT();
+ atomic_init(&mutexp->sharecount, 0);
+ } else {
+ DB_ASSERT(env, sharecount > 0);
+ MEMBAR_EXIT();
+ sharecount = atomic_dec(env, &mutexp->sharecount);
+ DB_ASSERT(env, sharecount >= 0);
+ if (sharecount > 0)
+ return (0);
+ }
+ } else
+#endif
+ {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ MUTEX_UNSET(&mutexp->tas);
+ }
+
+#ifdef HAVE_MUTEX_HYBRID
+#ifdef DIAGNOSTIC
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ /* Prevent the load of wait from being hoisted before MUTEX_UNSET */
+ MUTEX_MEMBAR(mutexp->flags);
+ if (mutexp->wait &&
+ (ret = __db_pthread_mutex_unlock(env, mutex)) != 0)
+ return (ret);
+
+#ifdef MUTEX_DIAG
+ if (mutexp->wait)
+ printf("tas_unlock %ld %x waiters! busy %x waiters %d/%d\n",
+ mutex, pthread_self(),
+ MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait);
+#endif
+#endif
+
+ return (0);
+}
+
+/*
+ * __db_tas_mutex_destroy --
+ * Destroy a mutex.
+ *
+ * PUBLIC: int __db_tas_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_tas_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+#ifdef HAVE_MUTEX_HYBRID
+ int ret;
+#endif
+
+ if (!MUTEX_ON(env))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ MUTEX_DESTROY(&mutexp->tas);
+
+#ifdef HAVE_MUTEX_HYBRID
+ if ((ret = __db_pthread_mutex_destroy(env, mutex)) != 0)
+ return (ret);
+#endif
+
+ COMPQUIET(mutexp, NULL); /* MUTEX_DESTROY may not be defined. */
+ return (0);
+}
diff --git a/src/mutex/mut_win32.c b/src/mutex/mut_win32.c
new file mode 100644
index 00000000..07d5a8dd
--- /dev/null
+++ b/src/mutex/mut_win32.c
@@ -0,0 +1,589 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#define LOAD_ACTUAL_MUTEX_CODE
+#include "db_int.h"
+
+#include "dbinc/atomic.h"
+/*
+ * This is where we load in the actual mutex declarations.
+ */
+#include "dbinc/mutex_int.h"
+
+/*
+ * Common code to get an event handle. This is executed whenever a mutex
+ * blocks, or when unlocking a mutex that a thread is waiting on. We can't
+ * keep these handles around, since the mutex structure is in shared memory,
+ * and each process gets its own handle value.
+ *
+ * We pass security attributes so that the created event is accessible by all
+ * users, in case a Windows service is sharing an environment with a local
+ * process run as a different user.
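+ *
+ * For example (illustrative): a mutex whose id is 0x1a2b yields the
+ * event name "db.m00001a2b"; every process waiting on that mutex
+ * recreates the same named event on demand.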
+ */
+static _TCHAR hex_digits[] = _T("0123456789abcdef");
+
+static __inline int get_handle(env, mutexp, eventp)
+ ENV *env;
+ DB_MUTEX *mutexp;
+ HANDLE *eventp;
+{
+ _TCHAR idbuf[] = _T("db.m00000000");
+ _TCHAR *p = idbuf + 12;
+ int ret = 0;
+ u_int32_t id;
+
+ for (id = (mutexp)->id; id != 0; id >>= 4)
+ *--p = hex_digits[id & 0xf];
+
+#ifndef DB_WINCE
+ if (DB_GLOBAL(win_sec_attr) == NULL) {
+ InitializeSecurityDescriptor(&DB_GLOBAL(win_default_sec_desc),
+ SECURITY_DESCRIPTOR_REVISION);
+ SetSecurityDescriptorDacl(&DB_GLOBAL(win_default_sec_desc),
+ TRUE, 0, FALSE);
+ DB_GLOBAL(win_default_sec_attr).nLength =
+ sizeof(SECURITY_ATTRIBUTES);
+ DB_GLOBAL(win_default_sec_attr).bInheritHandle = FALSE;
+ DB_GLOBAL(win_default_sec_attr).lpSecurityDescriptor =
+ &DB_GLOBAL(win_default_sec_desc);
+ DB_GLOBAL(win_sec_attr) = &DB_GLOBAL(win_default_sec_attr);
+ }
+#endif
+
+ if ((*eventp = CreateEvent(DB_GLOBAL(win_sec_attr),
+ FALSE, FALSE, idbuf)) == NULL) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("2002",
+ "Win32 create event failed"));
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_win32_mutex_lock_int
+ * Internal function to lock a win32 mutex
+ *
+ * If the wait parameter is 0, this function will return DB_LOCK_NOTGRANTED
+ * rather than wait.
+ *
+ */
+static __inline int
+__db_win32_mutex_lock_int(env, mutex, timeout, wait)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+ int wait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_THREAD_INFO *ip;
+ HANDLE event;
+ u_int32_t ms, nspins;
+ db_timespec now, tempspec, timeoutspec;
+ db_timeout_t time_left;
+ int ret;
+#ifdef MUTEX_DIAG
+	LARGE_INTEGER diag_now;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+ if (timeout != 0) {
+ timespecclear(&timeoutspec);
+ __clock_set_expires(env, &timeoutspec, timeout);
+ }
+
+ /*
+ * See WINCE_ATOMIC_MAGIC definition for details.
+ * Use sharecount, because the value just needs to be a db_atomic_t
+ * memory mapped onto the same page as those being Interlocked*.
+ */
+ WINCE_ATOMIC_MAGIC(&mutexp->sharecount);
+
+ event = NULL;
+ ms = 50;
+ ret = 0;
+
+ /*
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk
+ * thread, then ip will have a valid value subsequent times
+ * in the loop.
+ */
+ ip = NULL;
+
+loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+ /*
+ * We can avoid the (expensive) interlocked instructions if
+ * the mutex is already busy.
+ */
+ if (MUTEXP_IS_BUSY(mutexp) || !MUTEXP_ACQUIRE(mutexp)) {
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ ip == NULL && dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+ if (!wait)
+ return (DB_LOCK_NOTGRANTED);
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause before retrying. [#6975]
+ */
+ MUTEX_PAUSE
+ continue;
+ }
+
+#ifdef DIAGNOSTIC
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ char buf[DB_THREADID_STRLEN];
+ __db_errx(env, DB_STR_A("2003",
+ "Win32 lock failed: mutex already locked by %s",
+ "%s"), dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+
+#ifdef HAVE_STATISTICS
+ if (event == NULL)
+ ++mutexp->mutex_set_nowait;
+ else
+ ++mutexp->mutex_set_wait;
+#endif
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+#ifdef MUTEX_DIAG
+ if (ret != WAIT_OBJECT_0) {
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2004",
+ "[%I64d]: Lost signal on mutex %p, "
+ "id %d, ms %d\n", "%I64d %p %d %d"),
+ diag_now.QuadPart, mutexp, mutexp->id, ms);
+ }
+#endif
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield
+ * every time we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ return (0);
+ }
+
+ /*
+ * Yield the processor; wait 50 ms initially, up to 1 second. This
+ * loop is needed to work around a race where the signal from the
+ * unlocking thread gets lost. We start at 50 ms because it's unlikely
+ * to happen often and we want to avoid wasting CPU.
+ */
+ if (timeout != 0) {
+ timespecclear(&now);
+ if (__clock_expired(env, &now, &timeoutspec)) {
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+ }
+ return (DB_TIMEOUT);
+ }
+ /* Reduce the event wait if the timeout would happen first. */
+ tempspec = timeoutspec;
+ timespecsub(&tempspec, &now);
+ DB_TIMESPEC_TO_TIMEOUT(time_left, &tempspec, 0);
+ time_left /= US_PER_MS;
+ if (ms > time_left)
+ ms = time_left;
+ }
+ if (event == NULL) {
+#ifdef MUTEX_DIAG
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2005",
+ "[%I64d]: Waiting on mutex %p, id %d\n",
+ "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+#endif
+ InterlockedIncrement(&mutexp->nwaiters);
+ if ((ret = get_handle(env, mutexp, &event)) != 0)
+ goto err;
+ }
+ if ((ret = WaitForSingleObject(event, ms)) == WAIT_FAILED) {
+ ret = __os_get_syserr();
+ goto err;
+ }
+ if ((ms <<= 1) > MS_PER_SEC)
+ ms = MS_PER_SEC;
+
+ PANIC_CHECK(env);
+ goto loop;
+
+err: __db_syserr(env, ret, DB_STR("2006", "Win32 lock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_win32_mutex_init --
+ * Initialize a Win32 mutex.
+ *
+ * PUBLIC: int __db_win32_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_win32_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_MUTEX *mutexp;
+
+ mutexp = MUTEXP_SET(env, mutex);
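+	/*
+	 * Build a per-mutex identifier from the low 16 bits of the pid
+	 * mixed with the mutex's address; get_handle() uses it to name
+	 * the event that waiters block on.
+	 */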
+ mutexp->id = ((getpid() & 0xffff) << 16) ^ P_TO_UINT32(mutexp);
+ F_SET(mutexp, flags);
+
+ return (0);
+}
+
+/*
+ * __db_win32_mutex_lock
+ * Lock on a mutex, blocking if necessary.
+ *
+ * PUBLIC: int __db_win32_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ */
+int
+__db_win32_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ return (__db_win32_mutex_lock_int(env, mutex, timeout, 1));
+}
+
+/*
+ * __db_win32_mutex_trylock
+ *	Try to lock a mutex, returning without waiting if it is busy.
+ *
+ * PUBLIC: int __db_win32_mutex_trylock __P((ENV *, db_mutex_t));
+ */
+int
+__db_win32_mutex_trylock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+	return (__db_win32_mutex_lock_int(env, mutex, 0, 0));
+}
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * __db_win32_mutex_readlock_int
+ *	Try to get a shared lock on a latch, possibly waiting if requested
+ *	and necessary.
+ */
+int
+__db_win32_mutex_readlock_int(env, mutex, nowait)
+ ENV *env;
+ db_mutex_t mutex;
+ int nowait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ HANDLE event;
+ u_int32_t nspins;
+ int ms, ret;
+	long mtx_val;
+#ifdef MUTEX_DIAG
+ LARGE_INTEGER diag_now;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+ /*
+ * See WINCE_ATOMIC_MAGIC definition for details.
+ * Use sharecount, because the value just needs to be a db_atomic_t
+ * memory mapped onto the same page as those being Interlocked*.
+ */
+ WINCE_ATOMIC_MAGIC(&mutexp->sharecount);
+
+ event = NULL;
+ ms = 50;
+ ret = 0;
+
+loop: /* Attempt to acquire the resource for N spins. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+ /*
+ * We can avoid the (expensive) interlocked instructions if
+ * the mutex is already "set".
+ */
+retry: mtx_val = atomic_read(&mutexp->sharecount);
+ if (mtx_val == MUTEX_SHARE_ISEXCLUSIVE) {
+ if (nowait)
+ return (DB_LOCK_NOTGRANTED);
+
+ continue;
+ } else if (!atomic_compare_exchange(env, &mutexp->sharecount,
+ mtx_val, mtx_val + 1)) {
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause here. [#6975]
+ */
+ MUTEX_PAUSE
+ goto retry;
+ }
+
+#ifdef HAVE_STATISTICS
+ if (event == NULL)
+ ++mutexp->mutex_set_rd_nowait;
+ else
+ ++mutexp->mutex_set_rd_wait;
+#endif
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+#ifdef MUTEX_DIAG
+ if (ret != WAIT_OBJECT_0) {
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2007",
+ "[%I64d]: Lost signal on mutex %p, "
+ "id %d, ms %d\n", "%I64d %p %d %d"),
+ diag_now.QuadPart, mutexp, mutexp->id, ms);
+ }
+#endif
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield
+ * every time we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ return (0);
+ }
+
+ /*
+ * Yield the processor; wait 50 ms initially, up to 1 second. This
+ * loop is needed to work around a race where the signal from the
+ * unlocking thread gets lost. We start at 50 ms because it's unlikely
+ * to happen often and we want to avoid wasting CPU.
+ */
+ if (event == NULL) {
+#ifdef MUTEX_DIAG
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2008",
+ "[%I64d]: Waiting on mutex %p, id %d\n",
+ "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+#endif
+ InterlockedIncrement(&mutexp->nwaiters);
+ if ((ret = get_handle(env, mutexp, &event)) != 0)
+ goto err;
+ }
+ if ((ret = WaitForSingleObject(event, ms)) == WAIT_FAILED) {
+ ret = __os_get_syserr();
+ goto err;
+ }
+ if ((ms <<= 1) > MS_PER_SEC)
+ ms = MS_PER_SEC;
+
+ PANIC_CHECK(env);
+ goto loop;
+
+err: __db_syserr(env, ret, DB_STR("2009",
+ "Win32 read lock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_win32_mutex_readlock
+ *	Get a shared lock on a latch.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_win32_mutex_readlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_win32_mutex_readlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_win32_mutex_readlock_int(env, mutex, 0));
+}
+
+/*
+ * __db_win32_mutex_tryreadlock
+ *	Try to get a shared lock on a latch.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_win32_mutex_tryreadlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_win32_mutex_tryreadlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_win32_mutex_readlock_int(env, mutex, 1));
+}
+#endif
+
+/*
+ * __db_win32_mutex_unlock --
+ * Release a mutex.
+ *
+ * PUBLIC: int __db_win32_mutex_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__db_win32_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ HANDLE event;
+ int ret;
+#ifdef MUTEX_DIAG
+ LARGE_INTEGER diag_now;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+#ifdef DIAGNOSTIC
+ if (!MUTEXP_IS_BUSY(mutexp) || !(F_ISSET(mutexp, DB_MUTEX_SHARED) ||
+ F_ISSET(mutexp, DB_MUTEX_LOCKED))) {
+ __db_errx(env, DB_STR_A("2010",
+ "Win32 unlock failed: lock already unlocked: mutex %d busy %d",
+ "%d %d"), mutex, MUTEXP_BUSY_FIELD(mutexp));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ /*
+ * If we have a shared latch, and a read lock (DB_MUTEX_LOCKED is only
+ * set for write locks), then decrement the latch. If the readlock is
+ * still held by other threads, just return. Otherwise go ahead and
+ * notify any waiting threads.
+ */
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ if ((ret = InterlockedExchange(
+ (interlocked_val)(&atomic_read(
+ &mutexp->sharecount)), 0)) !=
+ MUTEX_SHARE_ISEXCLUSIVE) {
+ ret = DB_RUNRECOVERY;
+ goto err;
+ }
+ } else if (InterlockedDecrement(
+ (interlocked_val)(&atomic_read(&mutexp->sharecount))) > 0)
+ return (0);
+ } else
+#endif
+ {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ MUTEX_UNSET(&mutexp->tas);
+ }
+
+ if (mutexp->nwaiters > 0) {
+ if ((ret = get_handle(env, mutexp, &event)) != 0)
+ goto err;
+
+#ifdef MUTEX_DIAG
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2011",
+ "[%I64d]: Signalling mutex %p, id %d\n",
+ "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+#endif
+ if (!PulseEvent(event)) {
+ ret = __os_get_syserr();
+ CloseHandle(event);
+ goto err;
+ }
+
+ CloseHandle(event);
+ }
+
+ return (0);
+
+err: __db_syserr(env, ret, DB_STR("2012", "Win32 unlock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_win32_mutex_destroy --
+ * Destroy a mutex.
+ *
+ * PUBLIC: int __db_win32_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_win32_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(mutex, MUTEX_INVALID);
+	return (0);
+}
+
+#ifndef DB_WINCE
+/*
+ * db_env_set_win_security
+ *
+ * Set the SECURITY_ATTRIBUTES to be used by BDB on Windows.
+ * It should not be called while any BDB mutexes are locked.
+ *
+ * EXTERN: #if defined(DB_WIN32) && !defined(DB_WINCE)
+ * EXTERN: int db_env_set_win_security __P((SECURITY_ATTRIBUTES *sa));
+ * EXTERN: #endif
+ */
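+/*
+ * A minimal usage sketch (hypothetical application code, not part of
+ * the library):
+ *
+ *	SECURITY_ATTRIBUTES sa;
+ *	... initialize sa, e.g. with a DACL granting the service access ...
+ *	(void)db_env_set_win_security(&sa);
+ *	... then create and open DB_ENV handles as usual ...
+ */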
+int
+db_env_set_win_security(sa)
+ SECURITY_ATTRIBUTES *sa;
+{
+ DB_GLOBAL(win_sec_attr) = sa;
+ return (0);
+}
+#endif
diff --git a/src/mutex/test_mutex.c b/src/mutex/test_mutex.c
new file mode 100644
index 00000000..24c18016
--- /dev/null
+++ b/src/mutex/test_mutex.c
@@ -0,0 +1,1051 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Standalone mutex tester for Berkeley DB mutexes.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef DB_WIN32
+#define MUTEX_THREAD_TEST 1
+
+extern int getopt(int, char * const *, const char *);
+
+typedef HANDLE os_pid_t;
+typedef HANDLE os_thread_t;
+
+#define os_thread_create(thrp, attr, func, arg) \
+ (((*(thrp) = CreateThread(NULL, 0, \
+ (LPTHREAD_START_ROUTINE)(func), (arg), 0, NULL)) == NULL) ? -1 : 0)
+#define os_thread_join(thr, statusp) \
+ ((WaitForSingleObject((thr), INFINITE) == WAIT_OBJECT_0) && \
+ GetExitCodeThread((thr), (LPDWORD)(statusp)) ? 0 : -1)
+#define os_thread_self() GetCurrentThreadId()
+
+#else /* !DB_WIN32 */
+
+#include <sys/wait.h>
+
+typedef pid_t os_pid_t;
+
+/*
+ * There's only one mutex implementation that can't support thread-level
+ * locking: UNIX/fcntl mutexes.
+ *
+ * The general Berkeley DB library configuration doesn't look for the POSIX
+ * pthread functions, with one exception -- pthread_yield.
+ *
+ * Use these two facts to decide if we're going to build with or without
+ * threads.
+ */
+#if !defined(HAVE_MUTEX_FCNTL) && defined(HAVE_PTHREAD_YIELD)
+#define MUTEX_THREAD_TEST 1
+
+#include <pthread.h>
+
+typedef pthread_t os_thread_t;
+
+#define os_thread_create(thrp, attr, func, arg) \
+ pthread_create((thrp), (attr), (func), (arg))
+#define os_thread_join(thr, statusp) pthread_join((thr), (statusp))
+#define os_thread_self() pthread_self()
+#endif /* HAVE_PTHREAD_YIELD */
+#endif /* !DB_WIN32 */
+
+#define OS_BAD_PID ((os_pid_t)-1)
+
+#define TESTDIR "TESTDIR" /* Working area */
+#define MT_FILE "TESTDIR/mutex.file"
+#define MT_FILE_QUIT "TESTDIR/mutex.file.quit"
+
+/*
+ * The backing data layout:
+ *	TM[1]			global mutex, locking the per-thread array
+ *	TM[nthreads * nprocs]	per-thread mutex array
+ *	TM[maxlocks]		per-lock mutex array
+ */
+typedef struct {
+ db_mutex_t mutex; /* Mutex. */
+ u_long id; /* Holder's ID. */
+ u_int wakeme; /* Request to awake. */
+} TM;
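+
+/*
+ * Illustrative note (not in the original): data_on(), below, carves the
+ * backing chunk into the three regions with plain pointer arithmetic:
+ *
+ *	gm_addr = base;
+ *	tm_addr = base + sizeof(TM);
+ *	lm_addr = tm_addr + sizeof(TM) * (nthreads * nprocs);
+ */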
+
+DB_ENV *dbenv; /* Backing environment */
+ENV *env;
+size_t len; /* Backing data chunk size. */
+
+u_int8_t *gm_addr; /* Global mutex */
+u_int8_t *lm_addr; /* Locker mutexes */
+u_int8_t *tm_addr; /* Thread mutexes */
+
+#ifdef MUTEX_THREAD_TEST
+os_thread_t *kidsp; /* Locker threads */
+os_thread_t wakep; /* Wakeup thread */
+#endif
+
+#ifndef HAVE_MMAP
+u_int nprocs = 1; /* -p: Processes. */
+u_int nthreads = 20; /* -t: Threads. */
+#elif MUTEX_THREAD_TEST
+u_int nprocs = 5; /* -p: Processes. */
+u_int nthreads = 4; /* -t: Threads. */
+#else
+u_int nprocs = 20; /* -p: Processes. */
+u_int nthreads = 1; /* -t: Threads. */
+#endif
+
+u_int maxlocks = 20; /* -l: Backing locks. */
+u_int nlocks = 10000; /* -n: Locks per process. */
+int verbose; /* -v: Verbosity. */
+
+const char *progname;
+
+void data_off(u_int8_t *, DB_FH *);
+void data_on(u_int8_t **, u_int8_t **, u_int8_t **, DB_FH **, int);
+int locker_start(u_long);
+int locker_wait(void);
+os_pid_t os_spawn(const char *, char *const[]);
+int os_wait(os_pid_t *, u_int);
+void *run_lthread(void *);
+void *run_wthread(void *);
+os_pid_t spawn_proc(u_long, char *, char *);
+void tm_env_close(void);
+int tm_env_init(void);
+void tm_mutex_destroy(void);
+void tm_mutex_init(void);
+void tm_mutex_stats(void);
+int usage(void);
+int wakeup_start(u_long);
+int wakeup_wait(void);
+
+int
+main(argc, argv)
+ int argc;
+ char *argv[];
+{
+ enum {LOCKER, WAKEUP, PARENT} rtype;
+ extern int optind;
+ extern char *optarg;
+ os_pid_t wakeup_pid, *pids;
+ u_long id;
+ u_int i;
+ DB_FH *fhp, *map_fhp;
+ int ch, err;
+ char *p, *tmpath, cmd[1024];
+
+ if ((progname = __db_rpath(argv[0])) == NULL)
+ progname = argv[0];
+ else
+ ++progname;
+
+ rtype = PARENT;
+ id = 0;
+ tmpath = argv[0];
+ while ((ch = getopt(argc, argv, "l:n:p:T:t:v")) != EOF)
+ switch (ch) {
+ case 'l':
+ maxlocks = (u_int)atoi(optarg);
+ break;
+ case 'n':
+ nlocks = (u_int)atoi(optarg);
+ break;
+ case 'p':
+ nprocs = (u_int)atoi(optarg);
+ break;
+ case 't':
+ if ((nthreads = (u_int)atoi(optarg)) == 0)
+ nthreads = 1;
+#if !defined(MUTEX_THREAD_TEST)
+ if (nthreads != 1) {
+ fprintf(stderr,
+ "%s: thread support not available or not compiled for this platform.\n",
+ progname);
+ return (EXIT_FAILURE);
+ }
+#endif
+ break;
+ case 'T':
+ if (!memcmp(optarg, "locker", sizeof("locker") - 1))
+ rtype = LOCKER;
+ else if (
+ !memcmp(optarg, "wakeup", sizeof("wakeup") - 1))
+ rtype = WAKEUP;
+ else
+ return (usage());
+ if ((p = strchr(optarg, '=')) == NULL)
+ return (usage());
+ id = (u_long)atoi(p + 1);
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= optind;
+ argv += optind;
+
+ /*
+ * If we're not running a multi-process test, we should be running
+ * a multi-thread test.
+ */
+ if (nprocs == 1 && nthreads == 1) {
+ fprintf(stderr,
+ "%s: running in a single process requires multiple threads\n",
+ progname);
+ return (EXIT_FAILURE);
+ }
+
+ len = sizeof(TM) * (1 + nthreads * nprocs + maxlocks);
+
+ /*
+ * In the multi-process test, the parent spawns processes that exec
+ * the original binary, ending up here. Each process joins the DB
+ * environment separately and then calls the supporting function.
+ */
+ if (rtype == LOCKER || rtype == WAKEUP) {
+ __os_yield(env, 3, 0); /* Let everyone catch up. */
+ /* Initialize random numbers. */
+ srand((u_int)time(NULL) % (u_int)getpid());
+
+ if (tm_env_init() != 0) /* Join the environment. */
+ exit(EXIT_FAILURE);
+ /* Join the backing data. */
+ data_on(&gm_addr, &tm_addr, &lm_addr, &map_fhp, 0);
+ if (verbose)
+ printf(
+ "Backing file: global (%#lx), threads (%#lx), locks (%#lx)\n",
+ (u_long)gm_addr, (u_long)tm_addr, (u_long)lm_addr);
+
+ if ((rtype == LOCKER ?
+ locker_start(id) : wakeup_start(id)) != 0)
+ exit(EXIT_FAILURE);
+ if ((rtype == LOCKER ? locker_wait() : wakeup_wait()) != 0)
+ exit(EXIT_FAILURE);
+
+ data_off(gm_addr, map_fhp); /* Detach from backing data. */
+
+ tm_env_close(); /* Detach from environment. */
+
+ exit(EXIT_SUCCESS);
+ }
+
+ /*
+ * The following code is only executed by the original parent process.
+ *
+ * Clean up from any previous runs.
+ */
+ snprintf(cmd, sizeof(cmd), "rm -rf %s", TESTDIR);
+ (void)system(cmd);
+ snprintf(cmd, sizeof(cmd), "mkdir %s", TESTDIR);
+ (void)system(cmd);
+
+ printf(
+ "%s: %u processes, %u threads/process, %u lock requests from %u locks\n",
+ progname, nprocs, nthreads, nlocks, maxlocks);
+ printf("%s: backing data %lu bytes\n", progname, (u_long)len);
+
+ if (tm_env_init() != 0) /* Create the environment. */
+ exit(EXIT_FAILURE);
+ /* Create the backing data. */
+ data_on(&gm_addr, &tm_addr, &lm_addr, &map_fhp, 1);
+ if (verbose)
+ printf(
+ "backing data: global (%#lx), threads (%#lx), locks (%#lx)\n",
+ (u_long)gm_addr, (u_long)tm_addr, (u_long)lm_addr);
+
+ tm_mutex_init(); /* Initialize mutexes. */
+
+ if (nprocs > 1) { /* Run the multi-process test. */
+ /* Allocate array of locker process IDs. */
+ if ((pids = calloc(nprocs, sizeof(os_pid_t))) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ goto fail;
+ }
+
+ /* Spawn locker processes and threads. */
+ for (i = 0; i < nprocs; ++i) {
+ if ((pids[i] =
+ spawn_proc(id, tmpath, "locker")) == OS_BAD_PID) {
+ fprintf(stderr,
+ "%s: failed to spawn a locker\n", progname);
+ goto fail;
+ }
+ id += nthreads;
+ }
+
+ /* Spawn wakeup process/thread. */
+ if ((wakeup_pid =
+ spawn_proc(id, tmpath, "wakeup")) == OS_BAD_PID) {
+ fprintf(stderr,
+ "%s: failed to spawn waker\n", progname);
+ goto fail;
+ }
+ ++id;
+
+ /* Wait for all lockers to exit. */
+ if ((err = os_wait(pids, nprocs)) != 0) {
+ fprintf(stderr, "%s: locker wait failed with %d\n",
+ progname, err);
+ goto fail;
+ }
+
+ /* Signal wakeup process to exit. */
+ if ((err = __os_open(
+ env, MT_FILE_QUIT, 0, DB_OSO_CREATE, 0664, &fhp)) != 0) {
+ fprintf(stderr,
+ "%s: open %s\n", progname, db_strerror(err));
+ goto fail;
+ }
+ (void)__os_closehandle(env, fhp);
+
+ /* Wait for wakeup process/thread. */
+ if ((err = os_wait(&wakeup_pid, 1)) != 0) {
+ fprintf(stderr, "%s: %lu: exited %d\n",
+ progname, (u_long)wakeup_pid, err);
+ goto fail;
+ }
+ } else { /* Run the single-process test. */
+ /* Spawn locker threads. */
+ if (locker_start(0) != 0)
+ goto fail;
+
+ /* Spawn wakeup thread. */
+ if (wakeup_start(nthreads) != 0)
+ goto fail;
+
+ /* Wait for all lockers to exit. */
+ if (locker_wait() != 0)
+ goto fail;
+
+ /* Signal wakeup process to exit. */
+ if ((err = __os_open(
+ env, MT_FILE_QUIT, 0, DB_OSO_CREATE, 0664, &fhp)) != 0) {
+ fprintf(stderr,
+ "%s: open %s\n", progname, db_strerror(err));
+ goto fail;
+ }
+ (void)__os_closehandle(env, fhp);
+
+ /* Wait for wakeup thread. */
+ if (wakeup_wait() != 0)
+ goto fail;
+ }
+
+ tm_mutex_stats(); /* Display run statistics. */
+ tm_mutex_destroy(); /* Destroy mutexes. */
+
+ data_off(gm_addr, map_fhp); /* Detach from backing data. */
+
+ tm_env_close(); /* Detach from environment. */
+
+ printf("%s: test succeeded\n", progname);
+ return (EXIT_SUCCESS);
+
+fail: printf("%s: FAILED!\n", progname);
+ return (EXIT_FAILURE);
+}
+
+int
+locker_start(id)
+ u_long id;
+{
+#if defined(MUTEX_THREAD_TEST)
+ u_int i;
+ int err;
+
+ /*
+ * Spawn off threads. We have nthreads all locking and going to
+ * sleep, and one other thread cycling through and waking them up.
+ */
+	if ((kidsp =
+	    (os_thread_t *)calloc(nthreads, sizeof(os_thread_t))) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ return (1);
+ }
+ for (i = 0; i < nthreads; i++)
+ if ((err = os_thread_create(
+ &kidsp[i], NULL, run_lthread, (void *)(id + i))) != 0) {
+ fprintf(stderr, "%s: failed spawning thread: %s\n",
+ progname, db_strerror(err));
+ return (1);
+ }
+ return (0);
+#else
+ return (run_lthread((void *)id) == NULL ? 0 : 1);
+#endif
+}
+
+int
+locker_wait()
+{
+#if defined(MUTEX_THREAD_TEST)
+ u_int i;
+ void *retp;
+
+ /* Wait for the threads to exit. */
+ for (i = 0; i < nthreads; i++) {
+ (void)os_thread_join(kidsp[i], &retp);
+ if (retp != NULL) {
+ fprintf(stderr,
+ "%s: thread exited with error\n", progname);
+ return (1);
+ }
+ }
+ free(kidsp);
+#endif
+ return (0);
+}
+
+void *
+run_lthread(arg)
+ void *arg;
+{
+ TM *gp, *mp, *tp;
+ u_long id, tid;
+ u_int lock, nl;
+ int err, i;
+
+ id = (u_long)arg;
+#if defined(MUTEX_THREAD_TEST)
+ tid = (u_long)os_thread_self();
+#else
+ tid = 0;
+#endif
+ printf("Locker: ID %03lu (PID: %lu; TID: %lx)\n",
+ id, (u_long)getpid(), tid);
+
+ gp = (TM *)gm_addr;
+ tp = (TM *)(tm_addr + id * sizeof(TM));
+
+ for (nl = nlocks; nl > 0;) {
+ /* Select and acquire a data lock. */
+ lock = (u_int)rand() % maxlocks;
+ mp = (TM *)(lm_addr + lock * sizeof(TM));
+ if (verbose)
+ printf("%03lu: lock %d (mtx: %lu)\n",
+ id, lock, (u_long)mp->mutex);
+
+ if ((err = dbenv->mutex_lock(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr, "%s: %03lu: never got lock %d: %s\n",
+ progname, id, lock, db_strerror(err));
+ return ((void *)1);
+ }
+ if (mp->id != 0) {
+ fprintf(stderr,
+ "%s: RACE! (%03lu granted lock %d held by %03lu)\n",
+ progname, id, lock, mp->id);
+ return ((void *)1);
+ }
+ mp->id = id;
+
+ /*
+ * Pretend to do some work, periodically checking to see if
+ * we still hold the mutex.
+ */
+ for (i = 0; i < 3; ++i) {
+ __os_yield(env, 0, (u_long)rand() % 3);
+ if (mp->id != id) {
+ fprintf(stderr,
+ "%s: RACE! (%03lu stole lock %d from %03lu)\n",
+ progname, mp->id, lock, id);
+ return ((void *)1);
+ }
+ }
+
+ /*
+ * Test self-blocking and unlocking by other threads/processes:
+ *
+ * acquire the global lock
+ * set our wakeup flag
+ * release the global lock
+ * acquire our per-thread lock
+ *
+ * The wakeup thread will wake us up.
+ */
+ if ((err = dbenv->mutex_lock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: %03lu: global lock: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+ if (tp->id != 0 && tp->id != id) {
+ fprintf(stderr,
+ "%s: %03lu: per-thread mutex isn't mine, owned by %03lu\n",
+ progname, id, tp->id);
+ return ((void *)1);
+ }
+ tp->id = id;
+ if (verbose)
+ printf("%03lu: self-blocking (mtx: %lu)\n",
+ id, (u_long)tp->mutex);
+ if (tp->wakeme) {
+ fprintf(stderr,
+ "%s: %03lu: wakeup flag incorrectly set\n",
+ progname, id);
+ return ((void *)1);
+ }
+ tp->wakeme = 1;
+ if ((err = dbenv->mutex_unlock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: %03lu: global unlock: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+ if ((err = dbenv->mutex_lock(dbenv, tp->mutex)) != 0) {
+ fprintf(stderr, "%s: %03lu: per-thread lock: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+ /* Time passes... */
+ if (tp->wakeme) {
+ fprintf(stderr, "%s: %03lu: wakeup flag not cleared\n",
+ progname, id);
+ return ((void *)1);
+ }
+
+ if (verbose)
+ printf("%03lu: release %d (mtx: %lu)\n",
+ id, lock, (u_long)mp->mutex);
+
+ /* Release the data lock. */
+ mp->id = 0;
+ if ((err = dbenv->mutex_unlock(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: %03lu: lock release: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+
+ if (--nl % 1000 == 0)
+ printf("%03lu: %d\n", id, nl);
+ }
+
+ return (NULL);
+}
+
+int
+wakeup_start(id)
+ u_long id;
+{
+#if defined(MUTEX_THREAD_TEST)
+ int err;
+
+ /*
+ * Spawn off wakeup thread.
+ */
+ if ((err = os_thread_create(
+ &wakep, NULL, run_wthread, (void *)id)) != 0) {
+ fprintf(stderr, "%s: failed spawning wakeup thread: %s\n",
+ progname, db_strerror(err));
+ return (1);
+ }
+ return (0);
+#else
+ return (run_wthread((void *)id) == NULL ? 0 : 1);
+#endif
+}
+
+int
+wakeup_wait()
+{
+#if defined(MUTEX_THREAD_TEST)
+ void *retp;
+
+ /*
+ * A file is created when the wakeup thread is no longer needed.
+ */
+ (void)os_thread_join(wakep, &retp);
+ if (retp != NULL) {
+ fprintf(stderr,
+ "%s: wakeup thread exited with error\n", progname);
+ return (1);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * run_wthread --
+ * Thread to wake up other threads that are sleeping.
+ */
+void *
+run_wthread(arg)
+ void *arg;
+{
+ TM *gp, *tp;
+ u_long id, tid;
+ u_int check_id;
+ int err, quitcheck;
+
+ id = (u_long)arg;
+ quitcheck = 0;
+#if defined(MUTEX_THREAD_TEST)
+ tid = (u_long)os_thread_self();
+#else
+ tid = 0;
+#endif
+ printf("Wakeup: ID %03lu (PID: %lu; TID: %lx)\n",
+ id, (u_long)getpid(), tid);
+
+ gp = (TM *)gm_addr;
+
+ /* Loop, waking up sleepers and periodically sleeping ourselves. */
+ for (check_id = 0;; ++check_id) {
+ /* Check to see if the locking threads have finished. */
+ if (++quitcheck >= 100) {
+ quitcheck = 0;
+ if (__os_exists(env, MT_FILE_QUIT, NULL) == 0)
+ break;
+ }
+
+ /* Check for ID wraparound. */
+ if (check_id == nthreads * nprocs)
+ check_id = 0;
+
+ /* Check for a thread that needs a wakeup. */
+ tp = (TM *)(tm_addr + check_id * sizeof(TM));
+ if (!tp->wakeme)
+ continue;
+
+ if (verbose) {
+ printf("%03lu: wakeup thread %03lu (mtx: %lu)\n",
+ id, tp->id, (u_long)tp->mutex);
+ (void)fflush(stdout);
+ }
+
+ /* Acquire the global lock. */
+ if ((err = dbenv->mutex_lock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: wakeup: global lock: %s\n",
+ progname, db_strerror(err));
+ return ((void *)1);
+ }
+
+ tp->wakeme = 0;
+ if ((err = dbenv->mutex_unlock(dbenv, tp->mutex)) != 0) {
+ fprintf(stderr, "%s: wakeup: unlock: %s\n",
+ progname, db_strerror(err));
+ return ((void *)1);
+ }
+
+ if ((err = dbenv->mutex_unlock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: wakeup: global unlock: %s\n",
+ progname, db_strerror(err));
+ return ((void *)1);
+ }
+
+ __os_yield(env, 0, (u_long)rand() % 3);
+ }
+ return (NULL);
+}
+
+/*
+ * tm_env_init --
+ * Create the backing database environment.
+ */
+int
+tm_env_init()
+{
+ u_int32_t flags;
+ int ret;
+ char *home;
+
+ /*
+ * Create an environment object and initialize it for error
+ * reporting.
+ */
+ if ((ret = db_env_create(&dbenv, 0)) != 0) {
+ fprintf(stderr, "%s: %s\n", progname, db_strerror(ret));
+ return (1);
+ }
+ env = dbenv->env;
+ dbenv->set_errfile(dbenv, stderr);
+ dbenv->set_errpfx(dbenv, progname);
+
+ /* Allocate enough mutexes. */
+ if ((ret = dbenv->mutex_set_increment(dbenv,
+ 1 + nthreads * nprocs + maxlocks)) != 0) {
+ dbenv->err(dbenv, ret, "dbenv->mutex_set_increment");
+ return (1);
+ }
+
+ flags = DB_CREATE;
+ if (nprocs == 1) {
+ home = NULL;
+ flags |= DB_PRIVATE;
+ } else
+ home = TESTDIR;
+ if (nthreads != 1)
+ flags |= DB_THREAD;
+ if ((ret = dbenv->open(dbenv, home, flags, 0)) != 0) {
+ dbenv->err(dbenv, ret, "environment open: %s", home);
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * tm_env_close --
+ * Close the backing database environment.
+ */
+void
+tm_env_close()
+{
+ (void)dbenv->close(dbenv, 0);
+}
+
+/*
+ * tm_mutex_init --
+ * Initialize the mutexes.
+ */
+void
+tm_mutex_init()
+{
+ TM *mp;
+ u_int i;
+ int err;
+
+ if (verbose)
+ printf("Allocate the global mutex: ");
+ mp = (TM *)gm_addr;
+ if ((err = dbenv->mutex_alloc(dbenv, 0, &mp->mutex)) != 0) {
+ fprintf(stderr, "%s: DB_ENV->mutex_alloc (global): %s\n",
+ progname, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if (verbose)
+ printf("%lu\n", (u_long)mp->mutex);
+
+ if (verbose)
+ printf(
+ "Allocate %d per-thread, self-blocking mutexes: ",
+ nthreads * nprocs);
+ for (i = 0; i < nthreads * nprocs; ++i) {
+ mp = (TM *)(tm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_alloc(
+ dbenv, DB_MUTEX_SELF_BLOCK, &mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_alloc (per-thread %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if ((err = dbenv->mutex_lock(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_lock (per-thread %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if (verbose)
+ printf("%lu ", (u_long)mp->mutex);
+ }
+ if (verbose)
+ printf("\n");
+
+ if (verbose)
+ printf("Allocate %d per-lock mutexes: ", maxlocks);
+ for (i = 0; i < maxlocks; ++i) {
+ mp = (TM *)(lm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_alloc(dbenv, 0, &mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_alloc (per-lock: %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if (verbose)
+ printf("%lu ", (u_long)mp->mutex);
+ }
+ if (verbose)
+ printf("\n");
+}
+
+/*
+ * tm_mutex_destroy --
+ * Destroy the mutexes.
+ */
+void
+tm_mutex_destroy()
+{
+ TM *gp, *mp;
+ u_int i;
+ int err;
+
+ if (verbose)
+ printf("Destroy the global mutex.\n");
+ gp = (TM *)gm_addr;
+ if ((err = dbenv->mutex_free(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: DB_ENV->mutex_free (global): %s\n",
+ progname, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+
+ if (verbose)
+ printf("Destroy the per-thread mutexes.\n");
+ for (i = 0; i < nthreads * nprocs; ++i) {
+ mp = (TM *)(tm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_free(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_free (per-thread %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (verbose)
+ printf("Destroy the per-lock mutexes.\n");
+ for (i = 0; i < maxlocks; ++i) {
+ mp = (TM *)(lm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_free(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_free (per-lock: %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+/*
+ * tm_mutex_stats --
+ * Display mutex statistics.
+ */
+void
+tm_mutex_stats()
+{
+#ifdef HAVE_STATISTICS
+ TM *mp;
+ uintmax_t set_wait, set_nowait;
+ u_int i;
+
+ printf("Per-lock mutex statistics.\n");
+ for (i = 0; i < maxlocks; ++i) {
+ mp = (TM *)(lm_addr + i * sizeof(TM));
+ __mutex_set_wait_info(env, mp->mutex, &set_wait, &set_nowait);
+ printf("mutex %2d: wait: %lu; no wait %lu\n", i,
+ (u_long)set_wait, (u_long)set_nowait);
+ }
+#endif
+}
+
+/*
+ * data_on --
+ * Map in or allocate the backing data space.
+ */
+void
+data_on(gm_addrp, tm_addrp, lm_addrp, fhpp, init)
+ u_int8_t **gm_addrp, **tm_addrp, **lm_addrp;
+ DB_FH **fhpp;
+ int init;
+{
+ DB_FH *fhp;
+ size_t nwrite;
+ int err;
+ void *addr;
+
+ fhp = NULL;
+
+ /*
+ * In a single process, use heap memory.
+ */
+ if (nprocs == 1) {
+ if (init) {
+ if ((err =
+ __os_calloc(env, (size_t)len, 1, &addr)) != 0)
+ exit(EXIT_FAILURE);
+ } else {
+ fprintf(stderr,
+ "%s: init should be set for single process call\n",
+ progname);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ if (init) {
+ if (verbose)
+ printf("Create the backing file.\n");
+
+ if ((err = __os_open(env, MT_FILE, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, 0666, &fhp)) == -1) {
+ fprintf(stderr, "%s: %s: open: %s\n",
+ progname, MT_FILE, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+
+ if ((err =
+ __os_seek(env, fhp, 0, 0, (u_int32_t)len)) != 0 ||
+ (err =
+ __os_write(env, fhp, &err, 1, &nwrite)) != 0 ||
+ nwrite != 1) {
+ fprintf(stderr, "%s: %s: seek/write: %s\n",
+ progname, MT_FILE, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ } else
+ if ((err = __os_open(env, MT_FILE, 0, 0, 0, &fhp)) != 0)
+ exit(EXIT_FAILURE);
+
+ if ((err =
+ __os_mapfile(env, MT_FILE, fhp, len, 0, &addr)) != 0)
+ exit(EXIT_FAILURE);
+ }
+
+ *gm_addrp = (u_int8_t *)addr;
+ addr = (u_int8_t *)addr + sizeof(TM);
+ *tm_addrp = (u_int8_t *)addr;
+ addr = (u_int8_t *)addr + sizeof(TM) * (nthreads * nprocs);
+ *lm_addrp = (u_int8_t *)addr;
+
+ if (fhpp != NULL)
+ *fhpp = fhp;
+}
+
+/*
+ * data_off --
+ * Discard or de-allocate the backing data space.
+ */
+void
+data_off(addr, fhp)
+ u_int8_t *addr;
+ DB_FH *fhp;
+{
+ if (nprocs == 1)
+ __os_free(env, addr);
+ else {
+ if (__os_unmapfile(env, addr, len) != 0)
+ exit(EXIT_FAILURE);
+ if (__os_closehandle(env, fhp) != 0)
+ exit(EXIT_FAILURE);
+ }
+}
+
+/*
+ * usage --
+ *	Display a usage message.
+ */
+int
+usage()
+{
+ fprintf(stderr, "usage: %s %s\n\t%s\n", progname,
+ "[-v] [-l maxlocks]",
+ "[-n locks] [-p procs] [-T locker=ID|wakeup=ID] [-t threads]");
+ return (EXIT_FAILURE);
+}
+
+/*
+ * os_wait --
+ * Wait for an array of N procs.
+ */
+int
+os_wait(procs, n)
+ os_pid_t *procs;
+ u_int n;
+{
+ u_int i;
+ int status;
+#if defined(DB_WIN32)
+ DWORD ret;
+#endif
+
+ status = 0;
+
+#if defined(DB_WIN32)
+ do {
+ ret = WaitForMultipleObjects(n, procs, FALSE, INFINITE);
+		i = ret - WAIT_OBJECT_0;
+		/* i is unsigned: any failure return wraps well past n. */
+		if (i >= n)
+ return (__os_posix_err(__os_get_syserr()));
+
+ if ((GetExitCodeProcess(procs[i], &ret) == 0) || (ret != 0))
+ return (ret);
+
+ /* remove the process handle from the list */
+ while (++i < n)
+ procs[i - 1] = procs[i];
+ } while (--n);
+#elif !defined(HAVE_VXWORKS)
+ do {
+ if (wait(&status) == -1)
+ return (__os_posix_err(__os_get_syserr()));
+
+ if (WIFEXITED(status) == 0 || WEXITSTATUS(status) != 0) {
+ for (i = 0; i < n; i++)
+ (void)kill(procs[i], SIGKILL);
+ return (WEXITSTATUS(status));
+ }
+ } while (--n);
+#endif
+
+ return (0);
+}
+
+os_pid_t
+spawn_proc(id, tmpath, typearg)
+ u_long id;
+ char *tmpath, *typearg;
+{
+ char *const vbuf = verbose ? "-v" : NULL;
+ char *args[13], lbuf[16], nbuf[16], pbuf[16], tbuf[16], Tbuf[256];
+
+ args[0] = tmpath;
+ args[1] = "-l";
+ snprintf(lbuf, sizeof(lbuf), "%d", maxlocks);
+ args[2] = lbuf;
+ args[3] = "-n";
+ snprintf(nbuf, sizeof(nbuf), "%d", nlocks);
+ args[4] = nbuf;
+ args[5] = "-p";
+ snprintf(pbuf, sizeof(pbuf), "%d", nprocs);
+ args[6] = pbuf;
+ args[7] = "-t";
+ snprintf(tbuf, sizeof(tbuf), "%d", nthreads);
+ args[8] = tbuf;
+ args[9] = "-T";
+ snprintf(Tbuf, sizeof(Tbuf), "%s=%lu", typearg, id);
+ args[10] = Tbuf;
+ args[11] = vbuf;
+ args[12] = NULL;
+
+ return (os_spawn(tmpath, args));
+}
+
+os_pid_t
+os_spawn(path, argv)
+ const char *path;
+ char *const argv[];
+{
+ os_pid_t pid;
+ int status;
+
+ COMPQUIET(pid, 0);
+ COMPQUIET(status, 0);
+
+#ifdef HAVE_VXWORKS
+ fprintf(stderr, "%s: os_spawn not supported for VxWorks.\n", progname);
+ return (OS_BAD_PID);
+#elif defined(HAVE_QNX)
+ /*
+ * For QNX, we cannot fork if we've ever used threads. So
+ * we'll use their spawn function. We use 'spawnl' which
+ * is NOT a POSIX function.
+ *
+ * The return value of spawnl is just what we want depending
+ * on the value of the 'wait' arg.
+ */
+ return (spawnv(P_NOWAIT, path, argv));
+#elif defined(DB_WIN32)
+ return (os_pid_t)(_spawnv(P_NOWAIT, path, argv));
+#else
+ if ((pid = fork()) != 0) {
+ if (pid == -1)
+ return (OS_BAD_PID);
+ return (pid);
+ } else {
+ (void)execv(path, argv);
+ exit(EXIT_FAILURE);
+ }
+#endif
+}
diff --git a/src/mutex/uts4_cc.s b/src/mutex/uts4_cc.s
new file mode 100644
index 00000000..4f59e9c8
--- /dev/null
+++ b/src/mutex/uts4_cc.s
@@ -0,0 +1,26 @@
+ / See the file LICENSE for redistribution information.
+ /
+ / Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ /
+ / $Id$
+ /
+ / int uts_lock ( int *p, int i );
+ / Update the lock word pointed to by p with the
+ / value i, using compare-and-swap.
+ / Returns 0 if update was successful.
+ / Returns 1 if update failed.
+ /
+ entry uts_lock
+ uts_lock:
+ using .,r15
+ st r2,8(sp) / Save R2
+ l r2,64+0(sp) / R2 -> word to update
+ slr r0, r0 / R0 = current lock value must be 0
+ l r1,64+4(sp) / R1 = new lock value
+ cs r0,r1,0(r2) / Try the update ...
+ be x / ... Success. Return 0
+ la r0,1 / ... Failure. Return 1
+ x: /
+ l r2,8(sp) / Restore R2
+ b 2(,r14) / Return to caller
+ drop r15
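+ /
+ / Illustrative C equivalent (an assumption, not part of this file),
+ / written with a hypothetical compare_and_swap(p, oldval, newval)
+ / primitive that returns nonzero on success:
+ /
+ /	int uts_lock(int *p, int i) {
+ /		return (compare_and_swap(p, 0, i) ? 0 : 1);
+ /	}
+ /
+ / That is, the lock is acquired only if the word was 0 beforehand.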
diff --git a/src/os/os_abort.c b/src/os/os_abort.c
new file mode 100644
index 00000000..68b4bc05
--- /dev/null
+++ b/src/os/os_abort.c
@@ -0,0 +1,33 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abort --
+ *
+ * PUBLIC: void __os_abort __P((ENV *));
+ */
+void
+__os_abort(env)
+ ENV *env;
+{
+ __os_stack(env); /* Try and get a stack trace. */
+
+#ifdef HAVE_ABORT
+ abort(); /* Try and drop core. */
+ /* NOTREACHED */
+#endif
+#ifdef SIGABRT
+ (void)raise(SIGABRT); /* Try and drop core. */
+#endif
+ exit(1); /* Quit anyway. */
+ /* NOTREACHED */
+}
diff --git a/src/os/os_abs.c b/src/os/os_abs.c
new file mode 100644
index 00000000..4a1a5abd
--- /dev/null
+++ b/src/os/os_abs.c
@@ -0,0 +1,24 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abspath --
+ * Return if a path is an absolute path.
+ *
+ * PUBLIC: int __os_abspath __P((const char *));
+ */
+int
+__os_abspath(path)
+ const char *path;
+{
+ return (path[0] == '/');
+}
diff --git a/src/os/os_addrinfo.c b/src/os/os_addrinfo.c
new file mode 100644
index 00000000..205f41ec
--- /dev/null
+++ b/src/os/os_addrinfo.c
@@ -0,0 +1,179 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getaddrinfo and __os_freeaddrinfo wrap the getaddrinfo and freeaddrinfo
+ * calls, as well as the associated platform dependent error handling, mapping
+ * the error return to a ANSI C/POSIX error return.
+ */
+
+/*
+ * __os_getaddrinfo --
+ *
+ * PUBLIC: #if defined(HAVE_REPLICATION_THREADS)
+ * PUBLIC: int __os_getaddrinfo __P((ENV *, const char *, u_int,
+ * PUBLIC: const char *, const ADDRINFO *, ADDRINFO **));
+ * PUBLIC: #endif
+ */
+int
+__os_getaddrinfo(env, nodename, port, servname, hints, res)
+ ENV *env;
+ const char *nodename, *servname;
+ u_int port;
+ const ADDRINFO *hints;
+ ADDRINFO **res;
+{
+#ifdef HAVE_GETADDRINFO
+ int ret;
+
+ if ((ret = getaddrinfo(nodename, servname, hints, res)) == 0)
+ return (0);
+
+ __db_errx(env, DB_STR_A("0153",
+ "%s(%u): host lookup failed: %s", "%s %u %s"),
+ nodename == NULL ? "" : nodename, port,
+#ifdef DB_WIN32
+ gai_strerrorA(ret));
+#else
+ gai_strerror(ret));
+#endif
+ return (__os_posix_err(ret));
+#else
+ ADDRINFO *answer;
+ struct hostent *hostaddr;
+ struct sockaddr_in sin;
+ u_int32_t tmpaddr;
+ int ret;
+
+ COMPQUIET(hints, NULL);
+ COMPQUIET(servname, NULL);
+
+ /* INADDR_NONE is not defined on Solaris 2.6, 2.7 or 2.8. */
+#ifndef INADDR_NONE
+#define INADDR_NONE ((u_long)0xffffffff)
+#endif
+
+ /*
+ * Basic implementation of IPv4 component of getaddrinfo.
+ * Limited to the functionality used by repmgr.
+ */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ if (nodename) {
+ if (nodename[0] == '\0')
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ else if ((tmpaddr = inet_addr(CHAR_STAR_CAST nodename)) !=
+ INADDR_NONE) {
+ sin.sin_addr.s_addr = tmpaddr;
+ } else {
+ hostaddr = gethostbyname(nodename);
+ if (hostaddr == NULL) {
+#ifdef DB_WIN32
+ ret = __os_get_neterr();
+ __db_syserr(env, ret, DB_STR_A("0154",
+ "%s(%u): host lookup failed", "%s %u"),
+ nodename == NULL ? "" : nodename, port);
+ return (__os_posix_err(ret));
+#else
+ /*
+ * Historic UNIX systems used the h_errno
+ * global variable to return gethostbyname
+ * errors. The only function we currently
+ * use that needs h_errno is gethostbyname,
+ * so we deal with it here.
+ *
+ * hstrerror is not available on Solaris 2.6
+ * (it is in libresolv but is a private,
+ * unexported symbol).
+ */
+#ifdef HAVE_HSTRERROR
+ __db_errx(env, DB_STR_A("0155",
+ "%s(%u): host lookup failed: %s",
+ "%s %u %s"),
+ nodename == NULL ? "" : nodename, port,
+ hstrerror(h_errno));
+#else
+ __db_errx(env, DB_STR_A("0156",
+ "%s(%u): host lookup failed: %d",
+ "%s %u %d"),
+ nodename == NULL ? "" : nodename, port,
+ h_errno);
+#endif
+ switch (h_errno) {
+ case HOST_NOT_FOUND:
+ case NO_DATA:
+ return (EHOSTUNREACH);
+ case TRY_AGAIN:
+ return (EAGAIN);
+ case NO_RECOVERY:
+ default:
+ return (EFAULT);
+ }
+ /* NOTREACHED */
+#endif
+ }
+ memcpy(&(sin.sin_addr),
+ hostaddr->h_addr, (size_t)hostaddr->h_length);
+ }
+ } else /* No host specified. */
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons((u_int16_t)port);
+
+ if ((ret = __os_calloc(env, 1, sizeof(ADDRINFO), &answer)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env, sizeof(sin), &answer->ai_addr)) != 0) {
+ __os_free(env, answer);
+ return (ret);
+ }
+
+ answer->ai_family = AF_INET;
+ answer->ai_protocol = IPPROTO_TCP;
+ answer->ai_socktype = SOCK_STREAM;
+ answer->ai_addrlen = sizeof(sin);
+ memcpy(answer->ai_addr, &sin, sizeof(sin));
+ *res = answer;
+
+ return (0);
+#endif /* HAVE_GETADDRINFO */
+}
+
+/*
+ * __os_freeaddrinfo --
+ *
+ * PUBLIC: #if defined(HAVE_REPLICATION_THREADS)
+ * PUBLIC: void __os_freeaddrinfo __P((ENV *, ADDRINFO *));
+ * PUBLIC: #endif
+ */
+void
+__os_freeaddrinfo(env, ai)
+ ENV *env;
+ ADDRINFO *ai;
+{
+#ifdef HAVE_GETADDRINFO
+ COMPQUIET(env, NULL);
+
+ freeaddrinfo(ai);
+#else
+ ADDRINFO *next, *tmpaddr;
+
+ for (next = ai; next != NULL; next = tmpaddr) {
+ if (next->ai_canonname != NULL)
+ __os_free(env, next->ai_canonname);
+
+ if (next->ai_addr != NULL)
+ __os_free(env, next->ai_addr);
+
+ tmpaddr = next->ai_next;
+ __os_free(env, next);
+ }
+#endif
+}
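+
+/*
+ * Usage sketch (hypothetical caller, not part of the source): resolve a
+ * listen address, then release the result with the matching free call:
+ *
+ *	ADDRINFO *res;
+ *	int ret;
+ *
+ *	if ((ret = __os_getaddrinfo(env,
+ *	    "localhost", 6000, "6000", NULL, &res)) != 0)
+ *		return (ret);
+ *	... bind/connect using res->ai_addr and res->ai_addrlen ...
+ *	__os_freeaddrinfo(env, res);
+ */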
diff --git a/src/os/os_alloc.c b/src/os/os_alloc.c
new file mode 100644
index 00000000..fb7bf109
--- /dev/null
+++ b/src/os/os_alloc.c
@@ -0,0 +1,464 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef DIAGNOSTIC
+static void __os_guard __P((ENV *));
+
+typedef union {
+ size_t size;
+ uintmax_t align;
+} db_allocinfo_t;
+#endif
+
+/*
+ * !!!
+ * Correct for systems that return NULL when you allocate 0 bytes of memory.
+ * There are several places in DB where we allocate the number of bytes held
+ * by the key/data item, and it can be 0. Correct here so that malloc never
+ * returns a NULL for that reason (which behavior is permitted by ANSI). We
+ * could make these calls macros on non-Alpha architectures (that's where we
+ * saw the problem), but it's probably not worth the autoconf complexity.
+ *
+ * !!!
+ * Correct for systems that don't set errno when malloc and friends fail.
+ *
+ * Out of memory.
+ * We wish to hold the whole sky,
+ * But we never will.
+ */
+
+/*
+ * __os_umalloc --
+ * Allocate memory to be used by the application.
+ *
+ * Use, in order of preference, the allocation function specified to the
+ * ENV handle, the allocation function specified as a replacement for
+ * the library malloc, or the library malloc().
+ *
+ * PUBLIC: int __os_umalloc __P((ENV *, size_t, void *));
+ */
+int
+__os_umalloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+ if (dbenv == NULL || dbenv->db_malloc == NULL) {
+ if (DB_GLOBAL(j_malloc) != NULL)
+ *(void **)storep = DB_GLOBAL(j_malloc)(size);
+ else
+ *(void **)storep = malloc(size);
+ if (*(void **)storep == NULL) {
+ /*
+ * Correct error return, see __os_malloc.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0143", "malloc: %lu",
+ "%lu"), (u_long)size);
+ return (ret);
+ }
+ return (0);
+ }
+
+ if ((*(void **)storep = dbenv->db_malloc(size)) == NULL) {
+ __db_errx(env, DB_STR("0144",
+ "user-specified malloc function returned NULL"));
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+/*
+ * __os_urealloc --
+ * Allocate memory to be used by the application.
+ *
+ * A realloc(3) counterpart to __os_umalloc's malloc(3).
+ *
+ * PUBLIC: int __os_urealloc __P((ENV *, size_t, void *));
+ */
+int
+__os_urealloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ DB_ENV *dbenv;
+ int ret;
+ void *ptr;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ptr = *(void **)storep;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+ if (dbenv == NULL || dbenv->db_realloc == NULL) {
+ if (ptr == NULL)
+ return (__os_umalloc(env, size, storep));
+
+ if (DB_GLOBAL(j_realloc) != NULL)
+ *(void **)storep = DB_GLOBAL(j_realloc)(ptr, size);
+ else
+ *(void **)storep = realloc(ptr, size);
+ if (*(void **)storep == NULL) {
+ /*
+ * Correct errno, see __os_realloc.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0145",
+ "realloc: %lu", "%lu"), (u_long)size);
+ return (ret);
+ }
+ return (0);
+ }
+
+ if ((*(void **)storep = dbenv->db_realloc(ptr, size)) == NULL) {
+ __db_errx(env, DB_STR("0146",
+ "User-specified realloc function returned NULL"));
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+/*
+ * __os_ufree --
+ * Free memory used by the application.
+ *
+ * A free(3) counterpart to __os_umalloc's malloc(3).
+ *
+ * PUBLIC: void __os_ufree __P((ENV *, void *));
+ */
+void
+__os_ufree(env, ptr)
+ ENV *env;
+ void *ptr;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL && dbenv->db_free != NULL)
+ dbenv->db_free(ptr);
+ else if (DB_GLOBAL(j_free) != NULL)
+ DB_GLOBAL(j_free)(ptr);
+ else
+ free(ptr);
+}
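+
+/*
+ * Calling-convention sketch (illustrative, not from the source): storep is
+ * declared "void *" but is really the address of the caller's pointer:
+ *
+ *	char *p;
+ *	int ret;
+ *
+ *	if ((ret = __os_umalloc(env, 100, &p)) != 0)
+ *		return (ret);		// p is NULL on failure
+ *	...
+ *	__os_ufree(env, p);
+ */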
+
+/*
+ * __os_strdup --
+ * The strdup(3) function for DB.
+ *
+ * PUBLIC: int __os_strdup __P((ENV *, const char *, void *));
+ */
+int
+__os_strdup(env, str, storep)
+ ENV *env;
+ const char *str;
+ void *storep;
+{
+ size_t size;
+ int ret;
+ void *p;
+
+ *(void **)storep = NULL;
+
+ size = strlen(str) + 1;
+ if ((ret = __os_malloc(env, size, &p)) != 0)
+ return (ret);
+
+ memcpy(p, str, size);
+
+ *(void **)storep = p;
+ return (0);
+}
+
+/*
+ * __os_calloc --
+ * The calloc(3) function for DB.
+ *
+ * PUBLIC: int __os_calloc __P((ENV *, size_t, size_t, void *));
+ */
+int
+__os_calloc(env, num, size, storep)
+ ENV *env;
+ size_t num, size;
+ void *storep;
+{
+ int ret;
+
+ size *= num;
+ if ((ret = __os_malloc(env, size, storep)) != 0)
+ return (ret);
+
+ memset(*(void **)storep, 0, size);
+
+ return (0);
+}
+
+/*
+ * __os_malloc --
+ * The malloc(3) function for DB.
+ *
+ * PUBLIC: int __os_malloc __P((ENV *, size_t, void *));
+ */
+int
+__os_malloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ int ret;
+ void *p;
+
+ *(void **)storep = NULL;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+#ifdef DIAGNOSTIC
+ /* Add room for size and a guard byte. */
+ size += sizeof(db_allocinfo_t) + 1;
+#endif
+
+ if (DB_GLOBAL(j_malloc) != NULL)
+ p = DB_GLOBAL(j_malloc)(size);
+ else
+ p = malloc(size);
+ if (p == NULL) {
+ /*
+ * Some C libraries don't correctly set errno when malloc(3)
+ * fails. We'd like to 0 out errno before calling malloc,
+ * but it turns out that setting errno is quite expensive on
+ * Windows/NT in an MT environment.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0147", "malloc: %lu", "%lu"),
+ (u_long)size);
+ return (ret);
+ }
+
+#ifdef DIAGNOSTIC
+ /* Overwrite memory. */
+ memset(p, CLEAR_BYTE, size);
+
+ /*
+ * Guard bytes: if #DIAGNOSTIC is defined, we allocate an additional
+ * byte after the memory and set it to a special value that we check
+ * for when the memory is free'd.
+ */
+ ((u_int8_t *)p)[size - 1] = CLEAR_BYTE;
+
+ ((db_allocinfo_t *)p)->size = size;
+ p = &((db_allocinfo_t *)p)[1];
+#endif
+ *(void **)storep = p;
+
+ return (0);
+}
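+
+/*
+ * Illustrative layout (not in the source): with DIAGNOSTIC defined, an
+ * N-byte request is laid out as
+ *
+ *	[db_allocinfo_t: total size][N user bytes ...][guard byte]
+ *	^-- address malloc returned ^-- address handed to the caller
+ *
+ * letting __os_free() step back one db_allocinfo_t to recover the size and
+ * check that the trailing guard byte is still CLEAR_BYTE.
+ */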
+
+/*
+ * __os_realloc --
+ * The realloc(3) function for DB.
+ *
+ * PUBLIC: int __os_realloc __P((ENV *, size_t, void *));
+ */
+int
+__os_realloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ int ret;
+ void *p, *ptr;
+
+ ptr = *(void **)storep;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+	/* If we haven't allocated anything yet, simply call malloc. */
+ if (ptr == NULL)
+ return (__os_malloc(env, size, storep));
+
+#ifdef DIAGNOSTIC
+ /* Add room for size and a guard byte. */
+ size += sizeof(db_allocinfo_t) + 1;
+
+ /* Back up to the real beginning */
+ ptr = &((db_allocinfo_t *)ptr)[-1];
+
+ {
+ size_t s;
+
+ s = ((db_allocinfo_t *)ptr)->size;
+ if (((u_int8_t *)ptr)[s - 1] != CLEAR_BYTE)
+ __os_guard(env);
+ }
+#endif
+
+ /*
+ * Don't overwrite the original pointer, there are places in DB we
+ * try to continue after realloc fails.
+ */
+ if (DB_GLOBAL(j_realloc) != NULL)
+ p = DB_GLOBAL(j_realloc)(ptr, size);
+ else
+ p = realloc(ptr, size);
+ if (p == NULL) {
+ /*
+ * Some C libraries don't correctly set errno when malloc(3)
+ * fails. We'd like to 0 out errno before calling malloc,
+ * but it turns out that setting errno is quite expensive on
+ * Windows/NT in an MT environment.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0148", "realloc: %lu", "%lu"),
+ (u_long)size);
+ return (ret);
+ }
+#ifdef DIAGNOSTIC
+ ((u_int8_t *)p)[size - 1] = CLEAR_BYTE; /* Initialize guard byte. */
+
+ ((db_allocinfo_t *)p)->size = size;
+ p = &((db_allocinfo_t *)p)[1];
+#endif
+
+ *(void **)storep = p;
+
+ return (0);
+}
+
+/*
+ * __os_free --
+ * The free(3) function for DB.
+ *
+ * PUBLIC: void __os_free __P((ENV *, void *));
+ */
+void
+__os_free(env, ptr)
+ ENV *env;
+ void *ptr;
+{
+#ifdef DIAGNOSTIC
+ size_t size;
+#endif
+
+ /*
+ * ANSI C requires free(NULL) work. Don't depend on the underlying
+ * library.
+ */
+ if (ptr == NULL)
+ return;
+
+#ifdef DIAGNOSTIC
+ /*
+ * Check that the guard byte (one past the end of the memory) is
+ * still CLEAR_BYTE.
+ */
+ ptr = &((db_allocinfo_t *)ptr)[-1];
+ size = ((db_allocinfo_t *)ptr)->size;
+ if (((u_int8_t *)ptr)[size - 1] != CLEAR_BYTE)
+ __os_guard(env);
+
+ /* Overwrite memory. */
+ if (size != 0)
+ memset(ptr, CLEAR_BYTE, size);
+#else
+ COMPQUIET(env, NULL);
+#endif
+
+ if (DB_GLOBAL(j_free) != NULL)
+ DB_GLOBAL(j_free)(ptr);
+ else
+ free(ptr);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __os_guard --
+ * Complain and abort.
+ */
+static void
+__os_guard(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0149",
+ "Guard byte incorrect during free"));
+ __os_abort(env);
+ /* NOTREACHED */
+}
+#endif
+
+/*
+ * __ua_memcpy --
+ * Copy memory to memory without relying on any kind of alignment.
+ *
+ * There are places in DB that we have unaligned data, for example,
+ * when we've stored a structure in a log record as a DBT, and now
+ * we want to look at it. Unfortunately, if you have code like:
+ *
+ * struct a {
+ * int x;
+ * } *p;
+ *
+ * void *func_argument;
+ * int local;
+ *
+ * p = (struct a *)func_argument;
+ * memcpy(&local, p->x, sizeof(local));
+ *
+ * compilers optimize to use inline instructions requiring alignment,
+ * and records in the log don't have any particular alignment. (This
+ * isn't a compiler bug: because the access is through a structure, the
+ * compiler is allowed to assume alignment.)
+ *
+ * Casting the memcpy arguments to (u_int8_t *) appears to work most
+ * of the time, but we've seen examples where it wasn't sufficient
+ * and there's nothing in ANSI C that requires that work.
+ *
+ * PUBLIC: void *__ua_memcpy __P((void *, const void *, size_t));
+ */
+void *
+__ua_memcpy(dst, src, len)
+ void *dst;
+ const void *src;
+ size_t len;
+{
+ return ((void *)memcpy(dst, src, len));
+}
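+
+/*
+ * Usage sketch (hypothetical): pulling a possibly-unaligned u_int32_t out
+ * of a log record buffer:
+ *
+ *	u_int32_t val;
+ *	u_int8_t *recp;		// points into an unaligned log record
+ *
+ *	(void)__ua_memcpy(&val, recp, sizeof(val));
+ */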
diff --git a/src/os/os_clock.c b/src/os/os_clock.c
new file mode 100644
index 00000000..25eeb704
--- /dev/null
+++ b/src/os/os_clock.c
@@ -0,0 +1,73 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_gettime --
+ * Return the current time-of-day clock in seconds and nanoseconds.
+ *
+ * PUBLIC: void __os_gettime __P((ENV *, db_timespec *, int));
+ */
+void
+__os_gettime(env, tp, monotonic)
+ ENV *env;
+ db_timespec *tp;
+ int monotonic;
+{
+ const char *sc;
+ int ret;
+
+#if defined(HAVE_CLOCK_GETTIME)
+#if defined(HAVE_CLOCK_MONOTONIC)
+ if (monotonic)
+ RETRY_CHK((clock_gettime(
+ CLOCK_MONOTONIC, (struct timespec *)tp)), ret);
+ else
+#endif
+ RETRY_CHK((clock_gettime(
+ CLOCK_REALTIME, (struct timespec *)tp)), ret);
+
+ if (ret != 0) {
+ sc = "clock_gettime";
+ goto err;
+ }
+#elif defined(HAVE_GETTIMEOFDAY)
+ struct timeval v;
+
+ RETRY_CHK((gettimeofday(&v, NULL)), ret);
+ if (ret != 0) {
+ sc = "gettimeofday";
+ goto err;
+ }
+
+ tp->tv_sec = v.tv_sec;
+ tp->tv_nsec = v.tv_usec * NS_PER_US;
+#elif defined(HAVE_TIME)
+ time_t now;
+
+ RETRY_CHK((time(&now) == (time_t)-1 ? 1 : 0), ret);
+ if (ret != 0) {
+ sc = "time";
+ goto err;
+ }
+
+ tp->tv_sec = now;
+ tp->tv_nsec = 0;
+#else
+ NO AVAILABLE CLOCK IMPLEMENTATION
+#endif
+ COMPQUIET(monotonic, 0);
+ return;
+
+err: __db_syserr(env, ret, "%s", sc);
+ (void)__env_panic(env, __os_posix_err(ret));
+}
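+
+/*
+ * Usage sketch (an assumption, not from the source): timing an interval,
+ * preferring the monotonic clock where one exists:
+ *
+ *	db_timespec start, end;
+ *
+ *	__os_gettime(env, &start, 1);	// nonzero: monotonic if available
+ *	... work ...
+ *	__os_gettime(env, &end, 1);
+ */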
diff --git a/src/os/os_config.c b/src/os/os_config.c
new file mode 100644
index 00000000..c455a349
--- /dev/null
+++ b/src/os/os_config.c
@@ -0,0 +1,70 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fs_notzero --
+ * Return 1 if allocated filesystem blocks are not zeroed.
+ *
+ * PUBLIC: int __os_fs_notzero __P((void));
+ */
+int
+__os_fs_notzero()
+{
+ /* Most filesystems zero out implicitly created pages. */
+ return (0);
+}
+
+/*
+ * __os_support_direct_io --
+ * Return 1 if we support direct I/O.
+ *
+ * PUBLIC: int __os_support_direct_io __P((void));
+ */
+int
+__os_support_direct_io()
+{
+ int ret;
+
+ ret = 0;
+
+#ifdef HAVE_O_DIRECT
+ ret = 1;
+#endif
+#if defined(HAVE_DIRECTIO) && defined(DIRECTIO_ON)
+ ret = 1;
+#endif
+ return (ret);
+}
+
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ *
+ * PUBLIC: int __os_support_db_register __P((void));
+ */
+int
+__os_support_db_register()
+{
+ return (1);
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ *
+ * PUBLIC: int __os_support_replication __P((void));
+ */
+int
+__os_support_replication()
+{
+ return (1);
+}
diff --git a/src/os/os_cpu.c b/src/os/os_cpu.c
new file mode 100644
index 00000000..6b7f9f1e
--- /dev/null
+++ b/src/os/os_cpu.c
@@ -0,0 +1,47 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+#include <sys/pstat.h>
+#endif
+#endif
+
+/*
+ * __os_cpu_count --
+ * Return the number of CPUs.
+ *
+ * PUBLIC: u_int32_t __os_cpu_count __P((void));
+ */
+u_int32_t
+__os_cpu_count()
+{
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+ /*
+ * HP/UX.
+ */
+ struct pst_dynamic psd;
+
+ return ((u_int32_t)pstat_getdynamic(&psd,
+ sizeof(psd), (size_t)1, 0) == -1 ? 1 : psd.psd_proc_cnt);
+#elif defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
+ /*
+ * Solaris, Linux.
+ */
+ long nproc;
+
+ nproc = sysconf(_SC_NPROCESSORS_ONLN);
+ return ((u_int32_t)(nproc > 1 ? nproc : 1));
+#else
+ return (1);
+#endif
+}
diff --git a/src/os/os_ctime.c b/src/os/os_ctime.c
new file mode 100644
index 00000000..3f656c32
--- /dev/null
+++ b/src/os/os_ctime.c
@@ -0,0 +1,47 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_ctime --
+ * Format a time-stamp.
+ *
+ * PUBLIC: char *__os_ctime __P((const time_t *, char *));
+ */
+char *
+__os_ctime(tod, time_buf)
+ const time_t *tod;
+ char *time_buf;
+{
+ time_buf[CTIME_BUFLEN - 1] = '\0';
+
+ /*
+ * The ctime_r interface is the POSIX standard, thread-safe version of
+ * ctime. However, it was implemented in three different ways (with
+ * and without a buffer length argument, and where the buffer length
+ * argument was an int vs. a size_t *). Also, you can't depend on a
+ * return of (char *) from ctime_r, HP-UX 10.XX's version returned an
+ * int.
+ */
+#if defined(HAVE_VXWORKS)
+ {
+ size_t buflen = CTIME_BUFLEN;
+ (void)ctime_r(tod, time_buf, &buflen);
+ }
+#elif defined(HAVE_CTIME_R_3ARG)
+ (void)ctime_r(tod, time_buf, CTIME_BUFLEN);
+#elif defined(HAVE_CTIME_R)
+ (void)ctime_r(tod, time_buf);
+#else
+ (void)strncpy(time_buf, ctime(tod), CTIME_BUFLEN - 1);
+#endif
+ return (time_buf);
+}
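+
+/*
+ * Usage sketch (illustrative): the caller supplies a buffer of at least
+ * CTIME_BUFLEN bytes; the formatted string includes a trailing newline:
+ *
+ *	time_t now;
+ *	char buf[CTIME_BUFLEN];
+ *
+ *	(void)time(&now);
+ *	(void)printf("%s", __os_ctime(&now, buf));
+ */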
diff --git a/src/os/os_dir.c b/src/os/os_dir.c
new file mode 100644
index 00000000..42bad194
--- /dev/null
+++ b/src/os/os_dir.c
@@ -0,0 +1,140 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+# include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+# include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+# include <ndir.h>
+# endif
+#endif
+
+#include "db_int.h"
+
+/*
+ * __os_dirlist --
+ * Return a list of the files in a directory.
+ *
+ * PUBLIC: int __os_dirlist __P((ENV *, const char *, int, char ***, int *));
+ */
+int
+__os_dirlist(env, dir, returndir, namesp, cntp)
+ ENV *env;
+ const char *dir;
+ int returndir, *cntp;
+ char ***namesp;
+{
+ DB_ENV *dbenv;
+ struct dirent *dp;
+ DIR *dirp;
+ struct stat sb;
+ int arraysz, cnt, ret;
+ char **names, buf[DB_MAXPATHLEN];
+
+ *namesp = NULL;
+ *cntp = 0;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0159",
+ "fileops: directory list %s", "%s"), dir);
+
+ if (DB_GLOBAL(j_dirlist) != NULL)
+ return (DB_GLOBAL(j_dirlist)(dir, namesp, cntp));
+
+ if ((dirp = opendir(CHAR_STAR_CAST dir)) == NULL)
+ return (__os_get_errno());
+ names = NULL;
+ for (arraysz = cnt = 0; (dp = readdir(dirp)) != NULL;) {
+ snprintf(buf, sizeof(buf), "%s/%s", dir, dp->d_name);
+
+ RETRY_CHK(stat(buf, &sb), ret);
+ if (ret != 0) {
+ ret = __os_posix_err(ret);
+ /* Ignore entries that no longer exist. */
+ if (ret == ENOENT)
+ continue;
+
+ goto err;
+ }
+
+ /*
+ * We return regular files, and optionally return directories
+ * (except for dot and dot-dot).
+ *
+ * Shared memory files are of a different type on QNX, and we
+ * return those as well.
+ */
+#ifdef HAVE_QNX
+ if (!S_ISREG(sb.st_mode) && !S_TYPEISSHM(&sb)) {
+#else
+ if (!S_ISREG(sb.st_mode)) {
+#endif
+ if (!returndir || !S_ISDIR(sb.st_mode))
+ continue;
+ if (dp->d_name[0] == '.' && (dp->d_name[1] == '\0' ||
+ (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
+ continue;
+ }
+
+ if (cnt >= arraysz) {
+ arraysz += 100;
+ if ((ret = __os_realloc(env,
+ (u_int)arraysz * sizeof(names[0]), &names)) != 0)
+ goto err;
+ }
+ if ((ret = __os_strdup(env, dp->d_name, &names[cnt])) != 0)
+ goto err;
+ cnt++;
+ }
+ (void)closedir(dirp);
+
+ *namesp = names;
+ *cntp = cnt;
+ return (0);
+
+err: if (names != NULL)
+ __os_dirfree(env, names, cnt);
+ if (dirp != NULL)
+ (void)closedir(dirp);
+ return (ret);
+}
+
+/*
+ * __os_dirfree --
+ * Free the list of files.
+ *
+ * PUBLIC: void __os_dirfree __P((ENV *, char **, int));
+ */
+void
+__os_dirfree(env, names, cnt)
+ ENV *env;
+ char **names;
+ int cnt;
+{
+ if (DB_GLOBAL(j_dirfree) != NULL)
+ DB_GLOBAL(j_dirfree)(names, cnt);
+ else {
+ while (cnt > 0)
+ __os_free(env, names[--cnt]);
+ __os_free(env, names);
+ }
+}
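+
+/*
+ * Usage sketch (hypothetical caller): a successful __os_dirlist() is paired
+ * with __os_dirfree() of the returned array:
+ *
+ *	char **names;
+ *	int cnt, i, ret;
+ *
+ *	if ((ret = __os_dirlist(env, "TESTDIR", 0, &names, &cnt)) != 0)
+ *		return (ret);
+ *	for (i = 0; i < cnt; ++i)
+ *		(void)printf("%s\n", names[i]);
+ *	__os_dirfree(env, names, cnt);
+ */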
diff --git a/src/os/os_errno.c b/src/os/os_errno.c
new file mode 100644
index 00000000..a8219f90
--- /dev/null
+++ b/src/os/os_errno.c
@@ -0,0 +1,129 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_get_errno_ret_zero --
+ * Return the last system error, including an error of zero.
+ *
+ * PUBLIC: int __os_get_errno_ret_zero __P((void));
+ */
+int
+__os_get_errno_ret_zero()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (errno);
+}
+
+/*
+ * We've seen cases where system calls failed but errno was never set. For
+ * that reason, __os_get_errno() and __os_get_syserr() set errno to EAGAIN if
+ * it's not already set, to work around the problem. For obvious reasons,
+ * we can only call this function if we know an error has occurred, that
+ * is, we can't test the return for a non-zero value after the get call.
+ *
+ * __os_get_errno --
+ * Return the last ANSI C "errno" value or EAGAIN if the last error
+ * is zero.
+ *
+ * PUBLIC: int __os_get_errno __P((void));
+ */
+int
+__os_get_errno()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (__os_get_syserr());
+}
+
+#if 0
+/*
+ * __os_get_neterr --
+ * Return the last network-related error or EAGAIN if the last
+ * error is zero.
+ *
+ * PUBLIC: int __os_get_neterr __P((void));
+ */
+int
+__os_get_neterr()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (__os_get_syserr());
+}
+#endif
+
+/*
+ * __os_get_syserr --
+ * Return the last system error or EAGAIN if the last error is zero.
+ *
+ * PUBLIC: int __os_get_syserr __P((void));
+ */
+int
+__os_get_syserr()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ if (errno == 0)
+ __os_set_errno(EAGAIN);
+ return (errno);
+}
+
+/*
+ * __os_set_errno --
+ * Set the value of errno.
+ *
+ * PUBLIC: void __os_set_errno __P((int));
+ */
+void
+__os_set_errno(evalue)
+ int evalue;
+{
+ /*
+ * This routine is called by the compatibility interfaces (DB 1.85,
+ * dbm and hsearch). Force values > 0, that is, not one of DB 2.X
+ * and later's public error returns. If something bad has happened,
+ * default to EFAULT -- a nasty return. Otherwise, default to EINVAL.
+ * As the compatibility APIs aren't included on Windows, the Windows
+ * version of this routine doesn't need this behavior.
+ */
+ errno =
+ evalue >= 0 ? evalue : (evalue == DB_RUNRECOVERY ? EFAULT : EINVAL);
+}
+
+/*
+ * __os_strerror --
+ * Return a string associated with the system error.
+ *
+ * PUBLIC: char *__os_strerror __P((int, char *, size_t));
+ */
+char *
+__os_strerror(error, buf, len)
+ int error;
+ char *buf;
+ size_t len;
+{
+ /* No translation is needed in the POSIX layer. */
+ (void)strncpy(buf, strerror(error), len - 1);
+ buf[len - 1] = '\0';
+
+ return (buf);
+}
+
+/*
+ * __os_posix_err
+ * Convert a system error to a POSIX error.
+ *
+ * PUBLIC: int __os_posix_err __P((int));
+ */
+int
+__os_posix_err(error)
+ int error;
+{
+ return (error);
+}
diff --git a/src/os/os_fid.c b/src/os/os_fid.c
new file mode 100644
index 00000000..f2d80e25
--- /dev/null
+++ b/src/os/os_fid.c
@@ -0,0 +1,135 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fileid --
+ * Return a unique identifier for a file.
+ *
+ * PUBLIC: int __os_fileid __P((ENV *, const char *, int, u_int8_t *));
+ */
+int
+__os_fileid(env, fname, unique_okay, fidp)
+ ENV *env;
+ const char *fname;
+ int unique_okay;
+ u_int8_t *fidp;
+{
+ pid_t pid;
+ size_t i;
+ u_int32_t tmp;
+ u_int8_t *p;
+
+#ifdef HAVE_STAT
+ struct stat sb;
+ int ret;
+
+ /*
+ * The structure of a fileid on a POSIX/UNIX system is:
+ *
+ * ino[4] dev[4] unique-ID[4] serial-counter[4] empty[4].
+ *
+ * For real files, which have a backing inode and device, the first
+ * 8 bytes are filled in and the following bytes are left 0. For
+ * temporary files, the following 12 bytes are filled in.
+ *
+ * Clear the buffer.
+ */
+ memset(fidp, 0, DB_FILE_ID_LEN);
+ RETRY_CHK((stat(CHAR_STAR_CAST fname, &sb)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0158",
+ "stat: %s", "%s"), fname);
+ return (__os_posix_err(ret));
+ }
+
+ /*
+ * !!!
+ * Nothing is ever big enough -- on Sparc V9, st_ino, st_dev and the
+ * time_t types are all 8 bytes. As DB_FILE_ID_LEN is only 20 bytes,
+ * we convert to a (potentially) smaller fixed-size type and use it.
+ *
+ * We don't worry about byte sexing or the actual variable sizes.
+ *
+ * When this routine is called from the DB access methods, it's only
+ * called once -- whatever ID is generated when a database is created
+ * is stored in the database file's metadata, and that is what is
+ * saved in the mpool region's information to uniquely identify the
+ * file.
+ *
+ * When called from the mpool layer this routine will be called each
+ * time a new thread of control wants to share the file, which makes
+ * things tougher. As far as byte sexing goes, since the mpool region
+ * lives on a single host, there's no issue of that -- the entire
+ * region is byte sex dependent. As far as variable sizes go, we make
+ * the simplifying assumption that 32-bit and 64-bit processes will
+ * get the same 32-bit values if we truncate any returned 64-bit value
+ * to a 32-bit value. When we're called from the mpool layer, though,
+ * we need to be careful not to include anything that isn't
+ * reproducible for a given file, such as the timestamp or serial
+ * number.
+ */
+ tmp = (u_int32_t)sb.st_ino;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ tmp = (u_int32_t)sb.st_dev;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+#else
+ /*
+ * Use the file name.
+ *
+ * XXX
+ * Cast the first argument, the BREW ARM compiler is unhappy if
+ * we don't.
+ */
+ (void)strncpy((char *)fidp, fname, DB_FILE_ID_LEN);
+#endif /* HAVE_STAT */
+
+ if (unique_okay) {
+ /* Add in 32-bits of (hopefully) unique number. */
+ __os_unique_id(env, &tmp);
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ /*
+ * Initialize/increment the serial number we use to help
+ * avoid fileid collisions. Note we don't bother with
+ * locking; it's unpleasant to do from down in here, and
+ * if we race on this no real harm will be done, since the
+ * finished fileid has so many other components.
+ *
+ * We use the bottom 32-bits of the process ID, hoping they
+ * are more random than the top 32-bits (should we be on a
+ * machine with 64-bit process IDs).
+ *
+ * We increment by 100000 on each call as a simple way of
+ * randomizing; simply incrementing seems potentially less
+ * useful if pids are also simply incremented, since this
+ * is process-local and we may be one of a set of processes
+ * starting up. 100000 pushes us out of pid space on most
+ * 32-bit platforms, and has few interesting properties in
+ * base 2.
+ */
+ if (DB_GLOBAL(fid_serial) == 0) {
+ __os_id(env->dbenv, &pid, NULL);
+ DB_GLOBAL(fid_serial) = (u_int32_t)pid;
+ } else
+ DB_GLOBAL(fid_serial) += 100000;
+
+ for (p = (u_int8_t *)
+ &DB_GLOBAL(fid_serial), i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+ }
+
+ return (0);
+}
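+
+/*
+ * Illustrative summary (not in the source) of the DB_FILE_ID_LEN (20-byte)
+ * buffer filled in above for a regular file:
+ *
+ *	bytes  0-3	low 32 bits of st_ino
+ *	bytes  4-7	low 32 bits of st_dev
+ *	bytes  8-11	unique ID	(unique_okay only)
+ *	bytes 12-15	serial counter	(unique_okay only)
+ *	bytes 16-19	zero
+ */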
diff --git a/src/os/os_flock.c b/src/os/os_flock.c
new file mode 100644
index 00000000..904d5efe
--- /dev/null
+++ b/src/os/os_flock.c
@@ -0,0 +1,64 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fdlock --
+ * Acquire/release a lock on a byte in a file.
+ *
+ * PUBLIC: int __os_fdlock __P((ENV *, DB_FH *, off_t, int, int));
+ */
+int
+__os_fdlock(env, fhp, offset, acquire, nowait)
+ ENV *env;
+ DB_FH *fhp;
+ int acquire, nowait;
+ off_t offset;
+{
+#ifdef HAVE_FCNTL
+ DB_ENV *dbenv;
+ struct flock fl;
+ int ret, t_ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0138",
+ "fileops: flock %s %s offset %lu", "%s %s %lu"), fhp->name,
+ acquire ? DB_STR_P("acquire"): DB_STR_P("release"),
+ (u_long)offset);
+
+ fl.l_start = offset;
+ fl.l_len = 1;
+ fl.l_type = acquire ? F_WRLCK : F_UNLCK;
+ fl.l_whence = SEEK_SET;
+
+ RETRY_CHK_EINTR_ONLY(
+ (fcntl(fhp->fd, nowait ? F_SETLK : F_SETLKW, &fl)), ret);
+
+ if (ret == 0)
+ return (0);
+
+ if ((t_ret = __os_posix_err(ret)) != EACCES && t_ret != EAGAIN)
+ __db_syserr(env, ret, DB_STR("0139", "fcntl"));
+ return (t_ret);
+#else
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(acquire, 0);
+ COMPQUIET(nowait, 0);
+ COMPQUIET(offset, 0);
+ __db_syserr(env, DB_OPNOTSUP, DB_STR("0140",
+ "advisory file locking unavailable"));
+ return (DB_OPNOTSUP);
+#endif
+}
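+
+/*
+ * Illustrative caller-side sketch (the handle and offset are
+ * hypothetical): acquire an exclusive lock on byte 0 without
+ * blocking, then release it.
+ *
+ *	if ((ret = __os_fdlock(env, fhp, 0, 1, 1)) != 0) {
+ *		if (ret == EACCES || ret == EAGAIN)
+ *			-- another process holds the lock; retry later
+ *	}
+ *	...
+ *	(void)__os_fdlock(env, fhp, 0, 0, 0);
+ */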
diff --git a/src/os/os_fsync.c b/src/os/os_fsync.c
new file mode 100644
index 00000000..4b757b2c
--- /dev/null
+++ b/src/os/os_fsync.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_VXWORKS
+#include "ioLib.h"
+
+#define fsync(fd) __vx_fsync(fd)
+
+int
+__vx_fsync(fd)
+ int fd;
+{
+ int ret;
+
+ /*
+ * The results of ioctl are driver dependent. Some will return the
+ * number of bytes sync'ed. Only if it returns 'ERROR' should we
+ * flag it.
+ */
+ if ((ret = ioctl(fd, FIOSYNC, 0)) != ERROR)
+ return (0);
+ return (ret);
+}
+#endif
+
+#ifdef __hp3000s900
+#define fsync(fd) __mpe_fsync(fd)
+
+int
+__mpe_fsync(fd)
+ int fd;
+{
+ extern FCONTROL(short, short, void *);
+
+ FCONTROL(_MPE_FILENO(fd), 2, NULL); /* Flush the buffers */
+ FCONTROL(_MPE_FILENO(fd), 6, NULL); /* Write the EOF */
+ return (0);
+}
+#endif
+
+/*
+ * __os_fsync --
+ * Flush a file descriptor.
+ *
+ * PUBLIC: int __os_fsync __P((ENV *, DB_FH *));
+ */
+int
+__os_fsync(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ /*
+ * Do nothing if the file descriptor has been marked as not requiring
+ * any sync to disk.
+ */
+ if (F_ISSET(fhp, DB_FH_NOSYNC))
+ return (0);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0150", "fileops: flush %s", "%s"),
+ fhp->name);
+
+ if (DB_GLOBAL(j_fsync) != NULL)
+ ret = DB_GLOBAL(j_fsync)(fhp->fd);
+ else {
+#if defined(F_FULLFSYNC)
+ RETRY_CHK((fcntl(fhp->fd, F_FULLFSYNC, 0)), ret);
+ /*
+		 * On OS X, F_FULLFSYNC only works on HFS+, so we need to
+		 * fall back to regular fsync on other filesystems.
+ */
+ if (ret == ENOTSUP)
+ RETRY_CHK((fsync(fhp->fd)), ret);
+#elif defined(HAVE_QNX)
+ ret = __qnx_fsync(fhp);
+#elif defined(HAVE_FDATASYNC)
+ RETRY_CHK((fdatasync(fhp->fd)), ret);
+#else
+ RETRY_CHK((fsync(fhp->fd)), ret);
+#endif
+ }
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0151", "fsync"));
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
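+
+/*
+ * Illustrative note: the flush primitive above is selected at compile
+ * time, with the run-time j_fsync replacement taking precedence:
+ *
+ *	j_fsync		application-supplied replacement, if any
+ *	F_FULLFSYNC	OS X; falls back to fsync when unsupported
+ *	__qnx_fsync	QNX region files
+ *	fdatasync	where available; may skip unneeded metadata
+ *	fsync		the POSIX default
+ */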
diff --git a/src/os/os_getenv.c b/src/os/os_getenv.c
new file mode 100644
index 00000000..05972112
--- /dev/null
+++ b/src/os/os_getenv.c
@@ -0,0 +1,58 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getenv --
+ * Retrieve an environment variable.
+ *
+ * PUBLIC: int __os_getenv __P((ENV *, const char *, char **, size_t));
+ */
+int
+__os_getenv(env, name, bpp, buflen)
+ ENV *env;
+ const char *name;
+ char **bpp;
+ size_t buflen;
+{
+ /*
+ * If we have getenv, there's a value and the buffer is large enough:
+ * copy value into the pointer, return 0
+ * If we have getenv, there's a value and the buffer is too short:
+ * set pointer to NULL, return EINVAL
+ * If we have getenv and there's no value:
+ * set pointer to NULL, return 0
+ * If we don't have getenv:
+ * set pointer to NULL, return 0
+ */
+#ifdef HAVE_GETENV
+ char *p;
+
+ if ((p = getenv(name)) != NULL) {
+ if (strlen(p) < buflen) {
+ (void)strcpy(*bpp, p);
+ return (0);
+ }
+
+ *bpp = NULL;
+ __db_errx(env, DB_STR_A("0157",
+ "%s: buffer too small to hold environment variable %s",
+ "%s %s"), name, p);
+ return (EINVAL);
+ }
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(buflen, 0);
+#endif
+ *bpp = NULL;
+ return (0);
+}
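+
+/*
+ * Illustrative caller-side sketch of the contract described above
+ * (__os_tmpdir, later in this change, is a real caller following this
+ * pattern):
+ *
+ *	char *tdir, tdir_buf[DB_MAXPATHLEN];
+ *
+ *	tdir = tdir_buf;
+ *	if ((ret = __os_getenv(env,
+ *	    "TMPDIR", &tdir, sizeof(tdir_buf))) != 0)
+ *		return (ret);		-- value too long: EINVAL
+ *	if (tdir != NULL && tdir[0] != '\0')
+ *		-- set and copied into tdir_buf
+ */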
diff --git a/src/os/os_handle.c b/src/os/os_handle.c
new file mode 100644
index 00000000..8ae9dc7f
--- /dev/null
+++ b/src/os/os_handle.c
@@ -0,0 +1,243 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_openhandle --
+ * Open a file, using POSIX 1003.1 open flags.
+ *
+ * PUBLIC: int __os_openhandle
+ * PUBLIC: __P((ENV *, const char *, int, int, DB_FH **));
+ */
+int
+__os_openhandle(env, name, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ int flags, mode;
+ DB_FH **fhpp;
+{
+ DB_FH *fhp;
+ u_int nrepeat, retries;
+ int fcntl_flags, ret;
+#ifdef HAVE_VXWORKS
+ int newflags;
+#endif
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ /* If the application specified an interface, use it. */
+ if (DB_GLOBAL(j_open) != NULL) {
+ if ((fhp->fd = DB_GLOBAL(j_open)(name, flags, mode)) == -1) {
+ ret = __os_posix_err(__os_get_syserr());
+ goto err;
+ }
+ goto done;
+ }
+
+ retries = 0;
+ for (nrepeat = 1; nrepeat < 4; ++nrepeat) {
+ ret = 0;
+#ifdef HAVE_VXWORKS
+ /*
+ * VxWorks does not support O_CREAT on open, you have to use
+ * creat() instead. (It does not support O_EXCL or O_TRUNC
+ * either, even though they are defined "for future support".)
+ * We really want the POSIX behavior that if O_CREAT is set,
+ * we open if it exists, or create it if it doesn't exist.
+		 * If O_CREAT is specified, single thread and try to open the
+		 * file.  If that succeeds and O_EXCL is set, return EEXIST.
+		 * If it fails, call creat() and then end single threading.
+ */
+ if (LF_ISSET(O_CREAT)) {
+ DB_BEGIN_SINGLE_THREAD;
+ newflags = flags & ~(O_CREAT | O_EXCL);
+ if ((fhp->fd = open(name, newflags, mode)) != -1) {
+ /*
+ * We need to mark the file opened at this
+ * point so that if we get any error below
+ * we will properly close the fd we just
+ * opened on the error path.
+ */
+ F_SET(fhp, DB_FH_OPENED);
+ if (LF_ISSET(O_EXCL)) {
+ /*
+				 * If we get here, the caller wants an
+				 * O_EXCL create and the file exists.
+				 * Close and return EEXIST.
+ */
+ DB_END_SINGLE_THREAD;
+ ret = EEXIST;
+ goto err;
+ }
+ /*
+ * XXX
+ * Assume any error means non-existence.
+ * Unfortunately return values (even for
+ * non-existence) are driver specific so
+ * there is no single error we can use to
+ * verify we truly got the equivalent of
+ * ENOENT.
+ */
+ } else
+ fhp->fd = creat(name, newflags);
+ DB_END_SINGLE_THREAD;
+ } else
+ /* FALLTHROUGH */
+#endif
+#ifdef __VMS
+ /*
+ * !!!
+ * Open with full sharing on VMS.
+ *
+ * We use these flags because they are the ones set by the VMS
+ * CRTL mmap() call when it opens a file, and we have to be
+ * able to open files that mmap() has previously opened, e.g.,
+ * when we're joining already existing DB regions.
+ */
+ fhp->fd = open(name, flags, mode, "shr=get,put,upd,del,upi");
+#else
+ fhp->fd = open(name, flags, mode);
+#endif
+ if (fhp->fd != -1) {
+ ret = 0;
+ break;
+ }
+
+ switch (ret = __os_posix_err(__os_get_syserr())) {
+ case EMFILE:
+ case ENFILE:
+ case ENOSPC:
+ /*
+ * If it's a "temporary" error, we retry up to 3 times,
+ * waiting up to 12 seconds. While it's not a problem
+ * if we can't open a database, an inability to open a
+ * log file is cause for serious dismay.
+ */
+ __os_yield(env, nrepeat * 2, 0);
+ break;
+ case EAGAIN:
+ case EBUSY:
+ case EINTR:
+ /*
+ * If an EAGAIN, EBUSY or EINTR, retry immediately for
+ * DB_RETRY times.
+ */
+ if (++retries < DB_RETRY)
+ --nrepeat;
+ break;
+ default:
+ /* Open is silent on error. */
+ goto err;
+ }
+ }
+
+ if (ret == 0) {
+#if defined(HAVE_FCNTL_F_SETFD)
+ /* Deny file descriptor access to any child process. */
+ if ((fcntl_flags = fcntl(fhp->fd, F_GETFD)) == -1 ||
+ fcntl(fhp->fd, F_SETFD, fcntl_flags | FD_CLOEXEC) == -1) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0162",
+ "fcntl(F_SETFD)"));
+ ret = __os_posix_err(ret);
+ goto err;
+ }
+#else
+ COMPQUIET(fcntl_flags, 0);
+#endif
+
+done: F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+ }
+
+err: (void)__os_closehandle(env, fhp);
+ return (ret);
+}
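+
+/*
+ * Illustrative note: for the "temporary" errors above, the open is
+ * attempted with nrepeat running 1..3 and the loop sleeps nrepeat * 2
+ * seconds after each failure, so the total wait is 2 + 4 + 6 = 12
+ * seconds, matching the comment in the EMFILE/ENFILE/ENOSPC case.
+ */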
+
+/*
+ * __os_closehandle --
+ * Close a file.
+ *
+ * PUBLIC: int __os_closehandle __P((ENV *, DB_FH *));
+ */
+int
+__os_closehandle(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * If we linked the DB_FH handle into the ENV, it needs to be
+ * unlinked.
+ */
+ DB_ASSERT(env, env != NULL || !F_ISSET(fhp, DB_FH_ENVLINK));
+
+ if (env != NULL) {
+ dbenv = env->dbenv;
+ if (fhp->name != NULL && FLD_ISSET(
+ dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0163",
+ "fileops: close %s", "%s"), fhp->name);
+
+ if (F_ISSET(fhp, DB_FH_ENVLINK)) {
+ /*
+ * Lock the ENV handle and remove this file
+ * handle from the list.
+ */
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_REMOVE(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ }
+ }
+
+ /* Discard any underlying system file reference. */
+ if (F_ISSET(fhp, DB_FH_OPENED)) {
+ if (DB_GLOBAL(j_close) != NULL)
+ ret = DB_GLOBAL(j_close)(fhp->fd);
+ else
+ RETRY_CHK((close(fhp->fd)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0164", "close"));
+ ret = __os_posix_err(ret);
+ }
+ }
+
+ /* Unlink the file if we haven't already done so. */
+ if (F_ISSET(fhp, DB_FH_UNLINK))
+ (void)__os_unlink(env, fhp->name, 0);
+
+ if (fhp->name != NULL)
+ __os_free(env, fhp->name);
+ __os_free(env, fhp);
+
+ return (ret);
+}
diff --git a/src/os/os_map.c b/src/os/os_map.c
new file mode 100644
index 00000000..0528f473
--- /dev/null
+++ b/src/os/os_map.c
@@ -0,0 +1,607 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_SHMGET
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+#endif
+
+#ifdef HAVE_MMAP
+static int __os_map __P((ENV *, char *, DB_FH *, size_t, int, int, void **));
+#endif
+#ifdef HAVE_SHMGET
+static int __shm_mode __P((ENV *));
+#else
+static int __no_system_mem __P((ENV *));
+#endif
+
+/*
+ * __os_attach --
+ * Create/join a shared memory region.
+ *
+ * PUBLIC: int __os_attach __P((ENV *, REGINFO *, REGION *));
+ */
+int
+__os_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ int create_ok, ret;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement map function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ if (DB_GLOBAL(j_region_map) != NULL) {
+ /*
+ * We have to find out if the region is being created. Ask
+ * the underlying map function, and use the REGINFO structure
+ * to pass that information back to our caller.
+ */
+ create_ok = F_ISSET(infop, REGION_CREATE) ? 1 : 0;
+ ret = DB_GLOBAL(j_region_map)
+ (dbenv, infop->name, rp->max, &create_ok, &infop->addr);
+ if (create_ok)
+ F_SET(infop, REGION_CREATE);
+ else
+ F_CLR(infop, REGION_CREATE);
+ return (ret);
+ }
+
+ if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+ /*
+ * If the region is in system memory on UNIX, we use shmget(2).
+ *
+ * !!!
+ * There exist spinlocks that don't work in shmget memory, e.g.,
+ * the HP/UX msemaphore interface. If we don't have locks that
+ * will work in shmget memory, we better be private and not be
+ * threaded. If we reach this point, we know we're public, so
+ * it's an error.
+ */
+#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
+ __db_errx(env, DB_STR("0114",
+ "architecture does not support locks inside system shared memory"));
+ return (EINVAL);
+#endif
+#if defined(HAVE_SHMGET)
+ {
+ key_t segid;
+ int id, mode;
+
+ /*
+ * We could potentially create based on REGION_CREATE_OK, but
+ * that's dangerous -- we might get crammed in sideways if
+ * some of the expected regions exist but others do not. Also,
+ * if the requested size differs from an existing region's
+ * actual size, then all sorts of nasty things can happen.
+ * Basing create solely on REGION_CREATE is much safer -- a
+ * recovery will get us straightened out.
+ */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ /*
+ * The application must give us a base System V IPC key
+ * value. Adjust that value based on the region's ID,
+ * and correct so the user's original value appears in
+ * the ipcs output.
+ */
+ if (dbenv->shm_key == INVALID_REGION_SEGID) {
+ __db_errx(env, DB_STR("0115",
+ "no base system shared memory ID specified"));
+ return (EINVAL);
+ }
+
+ /*
+ * !!!
+ * The BDB API takes a "long" as the base segment ID,
+ * then adds an unsigned 32-bit value and stores it
+ * in a key_t. Wrong, admittedly, but not worth an
+ * API change to fix.
+ */
+ segid = (key_t)
+ ((u_long)dbenv->shm_key + (infop->id - 1));
+
+ /*
+ * If map to an existing region, assume the application
+ * crashed and we're restarting. Delete the old region
+ * and re-try. If that fails, return an error, the
+ * application will have to select a different segment
+ * ID or clean up some other way.
+ */
+ if ((id = shmget(segid, 0, 0)) != -1) {
+ (void)shmctl(id, IPC_RMID, NULL);
+ if ((id = shmget(segid, 0, 0)) != -1) {
+ __db_errx(env, DB_STR_A("0116",
+ "shmget: key: %ld: shared system memory region already exists",
+ "%ld"), (long)segid);
+ return (EAGAIN);
+ }
+ }
+
+ /*
+ * Map the DbEnv::open method file mode permissions to
+ * shmget call permissions.
+ */
+ mode = IPC_CREAT | __shm_mode(env);
+ if ((id = shmget(segid, rp->max, mode)) == -1) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0117",
+ "shmget: key: %ld: unable to create shared system memory region",
+ "%ld"), (long)segid);
+ return (__os_posix_err(ret));
+ }
+ rp->size = rp->max;
+ rp->segid = id;
+ } else
+ id = rp->segid;
+
+ if ((infop->addr = shmat(id, NULL, 0)) == (void *)-1) {
+ infop->addr = NULL;
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0118",
+ "shmat: id %d: unable to attach to shared system memory region",
+ "%d"), id);
+ return (__os_posix_err(ret));
+ }
+
+ /* Optionally lock the memory down. */
+ if (F_ISSET(env, ENV_LOCKDOWN)) {
+#ifdef HAVE_SHMCTL_SHM_LOCK
+ ret = shmctl(
+ id, SHM_LOCK, NULL) == 0 ? 0 : __os_get_syserr();
+#else
+ ret = DB_OPNOTSUP;
+#endif
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0119",
+ "shmctl/SHM_LOCK: id %d: unable to lock down shared memory region",
+ "%d"), id);
+ return (__os_posix_err(ret));
+ }
+ }
+
+ return (0);
+ }
+#else
+ return (__no_system_mem(env));
+#endif
+ }
+
+#ifdef HAVE_MMAP
+ {
+ infop->fhp = NULL;
+
+ /*
+ * Try to open/create the shared region file. We DO NOT need to ensure
+ * that multiple threads/processes attempting to simultaneously create
+ * the region are properly ordered, our caller has already taken care
+ * of that.
+ */
+ if ((ret = __os_open(env, infop->name, 0,
+ DB_OSO_REGION |
+ (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
+ env->db_mode, &infop->fhp)) != 0)
+ __db_err(env, ret, "%s", infop->name);
+
+ /*
+ * If we created the file, grow it before mapping it in. We really want
+ * to avoid touching the buffer cache after mmap() is called, doing
+ * anything else confuses the hell out of systems without merged
+ * VM/buffer cache systems, or, more to the point, *badly* merged
+ * VM/buffer cache systems.
+ */
+ if (rp->max < rp->size)
+ rp->max = rp->size;
+ if (ret == 0 && F_ISSET(infop, REGION_CREATE)) {
+ if (F_ISSET(dbenv, DB_ENV_REGION_INIT))
+ ret = __db_file_write(env, infop->fhp,
+ rp->size / MEGABYTE, rp->size % MEGABYTE, 0x00);
+ else
+ ret = __db_file_extend(env, infop->fhp, rp->size);
+ }
+
+ /* Map the file in. */
+ if (ret == 0)
+ ret = __os_map(env,
+ infop->name, infop->fhp, rp->max, 1, 0, &infop->addr);
+
+ if (ret != 0 && infop->fhp != NULL) {
+ (void)__os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ }
+
+ return (ret);
+ }
+#else
+ COMPQUIET(infop, NULL);
+ COMPQUIET(rp, NULL);
+ __db_errx(env, DB_STR("0120",
+ "architecture lacks mmap(2), shared environments not possible"));
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __os_detach --
+ * Detach from a shared memory region.
+ *
+ * PUBLIC: int __os_detach __P((ENV *, REGINFO *, int));
+ */
+int
+__os_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ DB_ENV *dbenv;
+ REGION *rp;
+ int ret;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement unmap function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ rp = infop->rp;
+
+ /* If the user replaced the unmap call, call through their interface. */
+ if (DB_GLOBAL(j_region_unmap) != NULL)
+ return (DB_GLOBAL(j_region_unmap)(dbenv, infop->addr));
+
+ if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+#ifdef HAVE_SHMGET
+ int segid;
+
+ /*
+ * We may be about to remove the memory referenced by rp,
+ * save the segment ID, and (optionally) wipe the original.
+ */
+ segid = rp->segid;
+ if (destroy)
+ rp->segid = INVALID_REGION_SEGID;
+
+ if (shmdt(infop->addr) != 0) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0121", "shmdt"));
+ return (__os_posix_err(ret));
+ }
+
+ if (destroy && shmctl(segid, IPC_RMID,
+ NULL) != 0 && (ret = __os_get_syserr()) != EINVAL) {
+ __db_syserr(env, ret, DB_STR_A("0122",
+ "shmctl: id %d: unable to delete system shared memory region",
+ "%d"), segid);
+ return (__os_posix_err(ret));
+ }
+
+ return (0);
+#else
+ return (__no_system_mem(env));
+#endif
+ }
+
+#ifdef HAVE_MMAP
+#ifdef HAVE_MUNLOCK
+ if (F_ISSET(env, ENV_LOCKDOWN))
+ (void)munlock(infop->addr, rp->max);
+#endif
+ if (infop->fhp != NULL) {
+ ret = __os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (munmap(infop->addr, rp->max) != 0) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0123", "munmap"));
+ return (__os_posix_err(ret));
+ }
+
+ if (destroy && (ret = __os_unlink(env, infop->name, 1)) != 0)
+ return (ret);
+
+ return (0);
+#else
+ COMPQUIET(destroy, 0);
+ COMPQUIET(ret, 0);
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ *
+ * PUBLIC: int __os_mapfile __P((ENV *, char *, DB_FH *, size_t, int, void **));
+ */
+int
+__os_mapfile(env, path, fhp, len, is_rdonly, addrp)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addrp;
+{
+#if defined(HAVE_MMAP) && !defined(HAVE_QNX)
+ DB_ENV *dbenv;
+
+ /* If the user replaced the map call, call through their interface. */
+ if (DB_GLOBAL(j_file_map) != NULL) {
+ /*
+ * We pass a DB_ENV handle to the user's replacement map
+ * function, so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ return (
+ DB_GLOBAL(j_file_map)(dbenv, path, len, is_rdonly, addrp));
+ }
+
+ return (__os_map(env, path, fhp, len, 0, is_rdonly, addrp));
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(path, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(is_rdonly, 0);
+ COMPQUIET(len, 0);
+ COMPQUIET(addrp, NULL);
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __os_unmapfile --
+ * Unmap the shared memory file.
+ *
+ * PUBLIC: int __os_unmapfile __P((ENV *, void *, size_t));
+ */
+int
+__os_unmapfile(env, addr, len)
+ ENV *env;
+ void *addr;
+ size_t len;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement unmap function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR("0124", "fileops: munmap"));
+
+ /* If the user replaced the map call, call through their interface. */
+ if (DB_GLOBAL(j_file_unmap) != NULL)
+ return (DB_GLOBAL(j_file_unmap)(dbenv, addr));
+
+#ifdef HAVE_MMAP
+#ifdef HAVE_MUNLOCK
+ if (F_ISSET(env, ENV_LOCKDOWN))
+ RETRY_CHK((munlock(addr, len)), ret);
+ /*
+ * !!!
+ * The return value is ignored.
+ */
+#else
+ COMPQUIET(env, NULL);
+#endif
+ RETRY_CHK((munmap(addr, len)), ret);
+ ret = __os_posix_err(ret);
+#else
+ COMPQUIET(env, NULL);
+ ret = EINVAL;
+#endif
+ return (ret);
+}
+
+#ifdef HAVE_MMAP
+/*
+ * __os_map --
+ * Call the mmap(2) function.
+ */
+static int
+__os_map(env, path, fhp, len, is_region, is_rdonly, addrp)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_region, is_rdonly;
+ size_t len;
+ void **addrp;
+{
+ DB_ENV *dbenv;
+ int flags, prot, ret;
+ void *p;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement map function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0125", "fileops: mmap %s",
+ "%s"), path);
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ /*
+ * If it's read-only, it's private, and if it's not, it's shared.
+ * Don't bother with an additional parameter.
+ */
+ flags = is_rdonly ? MAP_PRIVATE : MAP_SHARED;
+
+#ifdef MAP_FILE
+ /*
+ * Historically, MAP_FILE was required for mapping regular files,
+ * even though it was the default. Some systems have it, some
+ * don't, some that have it set it to 0.
+ */
+ flags |= MAP_FILE;
+#endif
+
+ /*
+ * I know of no systems that implement the flag to tell the system
+ * that the region contains semaphores, but it's not an unreasonable
+ * thing to do, and has been part of the design since forever. I
+	 * don't think anyone will object, but we don't set it for read-only
+	 * files; it doesn't make sense.
+ */
+#ifdef MAP_HASSEMAPHORE
+ if (is_region && !is_rdonly)
+ flags |= MAP_HASSEMAPHORE;
+#else
+ COMPQUIET(is_region, 0);
+#endif
+
+ /*
+ * FreeBSD:
+ * Causes data dirtied via this VM map to be flushed to physical media
+	 * only when necessary (usually by the pager) rather than gratuitously.
+ * Typically this prevents the update daemons from flushing pages
+ * dirtied through such maps and thus allows efficient sharing of
+ * memory across unassociated processes using a file-backed shared
+ * memory map.
+ */
+#ifdef MAP_NOSYNC
+ flags |= MAP_NOSYNC;
+#endif
+
+ prot = PROT_READ | (is_rdonly ? 0 : PROT_WRITE);
+
+ /*
+ * XXX
+ * Work around a bug in the VMS V7.1 mmap() implementation. To map
+ * a file into memory on VMS it needs to be opened in a certain way,
+	 * a file into memory on VMS, it originally has to be opened in a
+	 * certain way.  To get the file opened in that certain way, the VMS
+ * doesn't flush any caches out to disk before closing. The problem
+ * this causes us is that when the memory cache doesn't get written
+ * out, the file isn't big enough to match the memory chunk and the
+ * mmap() call fails. This call to fsync() fixes the problem. DEC
+ * thinks this isn't a bug because of language in XPG5 discussing user
+ * responsibility for on-disk and in-memory synchronization.
+ */
+#ifdef VMS
+ if (__os_fsync(env, fhp) == -1)
+ return (__os_posix_err(__os_get_syserr()));
+#endif
+
+ /* MAP_FAILED was not defined in early mmap implementations. */
+#ifndef MAP_FAILED
+#define MAP_FAILED -1
+#endif
+ if ((p = mmap(NULL,
+ len, prot, flags, fhp->fd, (off_t)0)) == (void *)MAP_FAILED) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0126", "mmap"));
+ return (__os_posix_err(ret));
+ }
+
+ /*
+ * If it's a region, we want to make sure that the memory isn't paged.
+ * For example, Solaris will page large mpools because it thinks that
+ * I/O buffer memory is more important than we are. The mlock system
+ * call may or may not succeed (mlock is restricted to the super-user
+ * on some systems). Currently, the only other use of mmap in DB is
+ * to map read-only databases -- we don't want them paged, either, so
+ * the call isn't conditional.
+ */
+ if (F_ISSET(env, ENV_LOCKDOWN)) {
+#ifdef HAVE_MLOCK
+ ret = mlock(p, len) == 0 ? 0 : __os_get_syserr();
+#else
+ ret = DB_OPNOTSUP;
+#endif
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0127", "mlock"));
+ return (__os_posix_err(ret));
+ }
+ }
+
+ *addrp = p;
+ return (0);
+}
+#endif
+
+#ifdef HAVE_SHMGET
+#ifndef SHM_R
+#define SHM_R 0400
+#endif
+#ifndef SHM_W
+#define SHM_W 0200
+#endif
+
+/*
+ * __shm_mode --
+ * Map the DbEnv::open method file mode permissions to shmget call
+ * permissions.
+ */
+static int
+__shm_mode(env)
+ ENV *env;
+{
+ int mode;
+
+ /* Default to r/w owner, r/w group. */
+ if (env->db_mode == 0)
+ return (SHM_R | SHM_W | SHM_R >> 3 | SHM_W >> 3);
+
+ mode = 0;
+ if (env->db_mode & S_IRUSR)
+ mode |= SHM_R;
+ if (env->db_mode & S_IWUSR)
+ mode |= SHM_W;
+ if (env->db_mode & S_IRGRP)
+ mode |= SHM_R >> 3;
+ if (env->db_mode & S_IWGRP)
+ mode |= SHM_W >> 3;
+ if (env->db_mode & S_IROTH)
+ mode |= SHM_R >> 6;
+ if (env->db_mode & S_IWOTH)
+ mode |= SHM_W >> 6;
+ return (mode);
+}
+#else
+/*
+ * __no_system_mem --
+ * No system memory environments error message.
+ */
+static int
+__no_system_mem(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0128",
+ "architecture doesn't support environments in system memory"));
+ return (DB_OPNOTSUP);
+}
+#endif /* HAVE_SHMGET */
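+
+/*
+ * Illustrative example: __shm_mode shifts the owner bits SHM_R/SHM_W
+ * into the group and other positions, so env->db_mode == 0640 maps to
+ * SHM_R | SHM_W | (SHM_R >> 3), which is 0640 in IPC terms as well.
+ */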
diff --git a/src/os/os_mkdir.c b/src/os/os_mkdir.c
new file mode 100644
index 00000000..800d445c
--- /dev/null
+++ b/src/os/os_mkdir.c
@@ -0,0 +1,52 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_mkdir --
+ * Create a directory.
+ *
+ * PUBLIC: int __os_mkdir __P((ENV *, const char *, int));
+ */
+int
+__os_mkdir(env, name, mode)
+ ENV *env;
+ const char *name;
+ int mode;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0129", "fileops: mkdir %s",
+ "%s"), name);
+
+ /* Make the directory, with paranoid permissions. */
+#if defined(HAVE_VXWORKS)
+ RETRY_CHK((mkdir(CHAR_STAR_CAST name)), ret);
+#else
+ RETRY_CHK((mkdir(name, DB_MODE_700)), ret);
+#endif
+ if (ret != 0)
+ return (__os_posix_err(ret));
+
+ /* Set the absolute permissions, if specified. */
+#if !defined(HAVE_VXWORKS)
+ if (mode != 0) {
+ RETRY_CHK((chmod(name, mode)), ret);
+ if (ret != 0)
+ ret = __os_posix_err(ret);
+ }
+#endif
+ return (ret);
+}
diff --git a/src/os/os_open.c b/src/os/os_open.c
new file mode 100644
index 00000000..5090c8e1
--- /dev/null
+++ b/src/os/os_open.c
@@ -0,0 +1,162 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_open --
+ * Open a file descriptor (including page size and log size information).
+ *
+ * PUBLIC: int __os_open __P((ENV *,
+ * PUBLIC: const char *, u_int32_t, u_int32_t, int, DB_FH **));
+ */
+int
+__os_open(env, name, page_size, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ u_int32_t page_size, flags;
+ int mode;
+ DB_FH **fhpp;
+{
+ DB_ENV *dbenv;
+ DB_FH *fhp;
+ int oflags, ret;
+
+ COMPQUIET(page_size, 0);
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ *fhpp = NULL;
+ oflags = 0;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0152",
+ "fileops: open %s", "%s"), name);
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_OSO_ABSMODE | DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_DSYNC |\
+ DB_OSO_EXCL | DB_OSO_RDONLY | DB_OSO_REGION | DB_OSO_SEQ | \
+ DB_OSO_TEMP | DB_OSO_TRUNC)
+ if ((ret = __db_fchk(env, "__os_open", flags, OKFLAGS)) != 0)
+ return (ret);
+
+#if defined(O_BINARY)
+ /*
+ * If there's a binary-mode open flag, set it, we never want any
+ * kind of translation. Some systems do translations by default,
+ * e.g., with Cygwin, the default mode for an open() is set by the
+ * mode of the mount that underlies the file.
+ */
+ oflags |= O_BINARY;
+#endif
+
+ /*
+ * DB requires the POSIX 1003.1 semantic that two files opened at the
+ * same time with DB_OSO_CREATE/O_CREAT and DB_OSO_EXCL/O_EXCL flags
+ * set return an EEXIST failure in at least one.
+ */
+ if (LF_ISSET(DB_OSO_CREATE))
+ oflags |= O_CREAT;
+
+ if (LF_ISSET(DB_OSO_EXCL))
+ oflags |= O_EXCL;
+
+#ifdef HAVE_O_DIRECT
+ if (LF_ISSET(DB_OSO_DIRECT))
+ oflags |= O_DIRECT;
+#endif
+#ifdef O_DSYNC
+ if (LF_ISSET(DB_OSO_DSYNC))
+ oflags |= O_DSYNC;
+#endif
+
+ if (LF_ISSET(DB_OSO_RDONLY))
+ oflags |= O_RDONLY;
+ else
+ oflags |= O_RDWR;
+
+ if (LF_ISSET(DB_OSO_TRUNC))
+ oflags |= O_TRUNC;
+
+ /*
+ * Undocumented feature: allow applications to create intermediate
+ * directories whenever a file is opened.
+ */
+ if (dbenv != NULL &&
+ env->dir_mode != 0 && LF_ISSET(DB_OSO_CREATE) &&
+ (ret = __db_mkpath(env, name)) != 0)
+ return (ret);
+
+ /* Open the file. */
+#ifdef HAVE_QNX
+ if (LF_ISSET(DB_OSO_REGION))
+ ret = __os_qnx_region_open(env, name, oflags, mode, &fhp);
+ else
+#endif
+ ret = __os_openhandle(env, name, oflags, mode, &fhp);
+ if (ret != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_OSO_REGION))
+ F_SET(fhp, DB_FH_REGION);
+#ifdef HAVE_FCHMOD
+ /*
+ * If the code using Berkeley DB is a library, that code may not be able
+ * to control the application's umask value. Allow applications to set
+ * absolute file modes. We can't fix the race between file creation and
+ * the fchmod call -- we can't modify the process' umask here since the
+ * process may be multi-threaded and the umask value is per-process, not
+ * per-thread.
+ */
+ if (LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_ABSMODE))
+ (void)fchmod(fhp->fd, mode);
+#endif
+
+#ifdef O_DSYNC
+ /*
+ * If we can configure the file descriptor to flush on write, the
+ * file descriptor does not need to be explicitly sync'd.
+ */
+ if (LF_ISSET(DB_OSO_DSYNC))
+ F_SET(fhp, DB_FH_NOSYNC);
+#endif
+
+#if defined(HAVE_DIRECTIO) && defined(DIRECTIO_ON)
+ /*
+ * The Solaris C library includes directio, but you have to set special
+ * compile flags to #define DIRECTIO_ON. Require both in order to call
+ * directio.
+ */
+ if (LF_ISSET(DB_OSO_DIRECT))
+ (void)directio(fhp->fd, DIRECTIO_ON);
+#endif
+
+ /*
+ * Delete any temporary file.
+ *
+ * !!!
+ * There's a race here, where we've created a file and we crash before
+	 * we can unlink it.  Temporary files aren't common in DB; regardless,
+ * it's not a security problem because the file is empty. There's no
+ * reasonable way to avoid the race (playing signal games isn't worth
+ * the portability nightmare), so we just live with it.
+ */
+ if (LF_ISSET(DB_OSO_TEMP)) {
+#if defined(HAVE_UNLINK_WITH_OPEN_FAILURE) || defined(CONFIG_TEST)
+ F_SET(fhp, DB_FH_UNLINK);
+#else
+ (void)__os_unlink(env, name, 0);
+#endif
+ }
+
+ *fhpp = fhp;
+ return (0);
+}
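+
+/*
+ * Illustrative summary of the flag mapping above:
+ *
+ *	DB_OSO_CREATE -> O_CREAT	DB_OSO_EXCL   -> O_EXCL
+ *	DB_OSO_TRUNC  -> O_TRUNC	DB_OSO_DSYNC  -> O_DSYNC
+ *	DB_OSO_RDONLY -> O_RDONLY	(otherwise O_RDWR)
+ *	DB_OSO_DIRECT -> O_DIRECT or directio(), where available
+ */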
diff --git a/src/os/os_path.c b/src/os/os_path.c
new file mode 100644
index 00000000..478fdf45
--- /dev/null
+++ b/src/os/os_path.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_concat_path --
+ *	Concatenate two elements of a path.
+ * PUBLIC: int __os_concat_path __P((char *,
+ * PUBLIC:	size_t, const char *, const char *));
+ */
+int
+__os_concat_path(dest, destsize, path, file)
+ char *dest;
+ size_t destsize;
+ const char *path, *file;
+{
+ if ((size_t)snprintf(dest, destsize,
+ "%s%c%s", path, PATH_SEPARATOR[0], file) >= destsize)
+ return (EINVAL);
+ return (0);
+}
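+
+/*
+ * Illustrative example: with a UNIX-style PATH_SEPARATOR, the call
+ *
+ *	__os_concat_path(buf, sizeof(buf), "/var/tmp", "envdir")
+ *
+ * writes "/var/tmp/envdir" into buf; if the result does not fit in
+ * destsize, EINVAL is returned rather than a truncated path.
+ */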
diff --git a/src/os/os_pid.c b/src/os/os_pid.c
new file mode 100644
index 00000000..b1b94d60
--- /dev/null
+++ b/src/os/os_pid.c
@@ -0,0 +1,63 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_id --
+ * Return the current process ID.
+ *
+ * PUBLIC: void __os_id __P((DB_ENV *, pid_t *, db_threadid_t*));
+ */
+void
+__os_id(dbenv, pidp, tidp)
+ DB_ENV *dbenv;
+ pid_t *pidp;
+ db_threadid_t *tidp;
+{
+ /*
+ * We can't depend on dbenv not being NULL, this routine is called
+ * from places where there's no DB_ENV handle.
+ *
+ * We cache the pid in the ENV handle, getting the process ID is a
+ * fairly slow call on lots of systems.
+ */
+ if (pidp != NULL) {
+ if (dbenv == NULL) {
+#if defined(HAVE_VXWORKS)
+ *pidp = taskIdSelf();
+#else
+ *pidp = getpid();
+#endif
+ } else
+ *pidp = dbenv->env->pid_cache;
+ }
+
+	/*
+	 * When building on MinGW, we define both HAVE_PTHREAD_SELF and
+	 * DB_WIN32, and we use pthreads instead of the Windows threads
+	 * implementation.  So here, we need to check the thread
+	 * implementation before checking the platform.
+	 */
+ if (tidp != NULL) {
+#if defined(HAVE_PTHREAD_SELF)
+ *tidp = pthread_self();
+#elif defined(HAVE_MUTEX_UI_THREADS)
+ *tidp = thr_self();
+#elif defined(DB_WIN32)
+ *tidp = GetCurrentThreadId();
+#else
+ /*
+ * Default to just getpid.
+ */
+ DB_THREADID_INIT(*tidp);
+#endif
+ }
+}
diff --git a/src/os/os_rename.c b/src/os/os_rename.c
new file mode 100644
index 00000000..63aac7bb
--- /dev/null
+++ b/src/os/os_rename.c
@@ -0,0 +1,53 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rename --
+ * Rename a file.
+ *
+ * PUBLIC: int __os_rename __P((ENV *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__os_rename(env, oldname, newname, silent)
+ ENV *env;
+ const char *oldname, *newname;
+ u_int32_t silent;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0168", "fileops: rename %s to %s",
+ "%s %s"), oldname, newname);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (DB_GLOBAL(j_rename) != NULL)
+ ret = DB_GLOBAL(j_rename)(oldname, newname);
+ else
+ RETRY_CHK((rename(oldname, newname)), ret);
+
+ /*
+	 * If "silent" is set, the error is expected and we should not output
+ * an error message.
+ */
+ if (ret != 0) {
+ if (!silent)
+ __db_syserr(env, ret, DB_STR_A("0169",
+ "rename %s %s", "%s %s"), oldname, newname);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
diff --git a/src/os/os_root.c b/src/os/os_root.c
new file mode 100644
index 00000000..77e7a72c
--- /dev/null
+++ b/src/os/os_root.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_isroot --
+ * Return if user has special permissions.
+ *
+ * PUBLIC: int __os_isroot __P((void));
+ */
+int
+__os_isroot()
+{
+#ifdef HAVE_GETUID
+ return (getuid() == 0);
+#else
+ return (0);
+#endif
+}
diff --git a/src/os/os_rpath.c b/src/os/os_rpath.c
new file mode 100644
index 00000000..16f3e54c
--- /dev/null
+++ b/src/os/os_rpath.c
@@ -0,0 +1,36 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_rpath --
+ * Return the last path separator in the path or NULL if none found.
+ *
+ * PUBLIC: char *__db_rpath __P((const char *));
+ */
+char *
+__db_rpath(path)
+ const char *path;
+{
+ const char *s, *last;
+
+ s = path;
+ last = NULL;
+ if (PATH_SEPARATOR[1] != '\0') {
+ for (; s[0] != '\0'; ++s)
+ if (strchr(PATH_SEPARATOR, s[0]) != NULL)
+ last = s;
+ } else
+ for (; s[0] != '\0'; ++s)
+ if (s[0] == PATH_SEPARATOR[0])
+ last = s;
+ return ((char *)last);
+}
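+
+/*
+ * Illustrative example: where PATH_SEPARATOR is "/",
+ * __db_rpath("/a/b/c") returns a pointer to the "/" before "c" and
+ * __db_rpath("abc") returns NULL.  The strchr loop exists because
+ * some builds (e.g., Windows) define several separator characters.
+ */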
diff --git a/src/os/os_rw.c b/src/os/os_rw.c
new file mode 100644
index 00000000..c0967514
--- /dev/null
+++ b/src/os/os_rw.c
@@ -0,0 +1,291 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_io --
+ * Do an I/O.
+ *
+ * PUBLIC: int __os_io __P((ENV *, int, DB_FH *, db_pgno_t,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
+ */
+int
+__os_io(env, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
+ ENV *env;
+ int op;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize, relative, io_len;
+ u_int8_t *buf;
+ size_t *niop;
+{
+#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
+ DB_ENV *dbenv;
+ off_t offset;
+ ssize_t nio;
+#endif
+ int ret;
+
+ /*
+ * Check for illegal usage.
+ *
+ * This routine is used in one of two ways: reading bytes from an
+ * absolute offset and reading a specific database page. All of
+ * our absolute offsets are known to fit into a u_int32_t, while
+ * our database pages might be at offsets larger than a u_int32_t.
+ */
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+ DB_ASSERT(env, (pgno == 0 && pgsize == 0) || relative == 0);
+
+#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if ((offset = relative) == 0)
+ offset = (off_t)pgno * pgsize;
+ switch (op) {
+ case DB_IO_READ:
+ if (DB_GLOBAL(j_read) != NULL)
+ goto slow;
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0130",
+ "fileops: read %s: %lu bytes at offset %lu",
+ "%s %lu %lu"), fhp->name, (u_long)io_len,
+ (u_long)offset);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ nio = DB_GLOBAL(j_pread) != NULL ?
+ DB_GLOBAL(j_pread)(fhp->fd, buf, io_len, offset) :
+ pread(fhp->fd, buf, io_len, offset);
+ break;
+ case DB_IO_WRITE:
+ if (DB_GLOBAL(j_write) != NULL)
+ goto slow;
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (__os_fs_notzero())
+ goto slow;
+#endif
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0131",
+ "fileops: write %s: %lu bytes at offset %lu",
+ "%s %lu %lu"), fhp->name, (u_long)io_len,
+ (u_long)offset);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ nio = DB_GLOBAL(j_pwrite) != NULL ?
+ DB_GLOBAL(j_pwrite)(fhp->fd, buf, io_len, offset) :
+ pwrite(fhp->fd, buf, io_len, offset);
+ break;
+ default:
+ return (EINVAL);
+ }
+ if (nio == (ssize_t)io_len) {
+ *niop = io_len;
+ return (0);
+ }
+slow:
+#endif
+ MUTEX_LOCK(env, fhp->mtx_fh);
+
+ if ((ret = __os_seek(env, fhp, pgno, pgsize, relative)) != 0)
+ goto err;
+ switch (op) {
+ case DB_IO_READ:
+ ret = __os_read(env, fhp, buf, io_len, niop);
+ break;
+ case DB_IO_WRITE:
+ ret = __os_write(env, fhp, buf, io_len, niop);
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+err: MUTEX_UNLOCK(env, fhp->mtx_fh);
+
+ return (ret);
+
+}
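+
+/*
+ * Illustrative note: the pread/pwrite fast path above computes the
+ * offset as (off_t)pgno * pgsize (or uses the absolute "relative"
+ * offset) and needs no mutex, because the offset travels with the
+ * call.  The slow path must hold fhp->mtx_fh so the seek and the
+ * read/write stay atomic for threads sharing the file handle.
+ */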
+
+/*
+ * __os_read --
+ * Read from a file handle.
+ *
+ * PUBLIC: int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
+ */
+int
+__os_read(env, fhp, addr, len, nrp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nrp;
+{
+ DB_ENV *dbenv;
+ size_t offset;
+ ssize_t nr;
+ int ret;
+ u_int8_t *taddr;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0132",
+ "fileops: read %s: %lu bytes", "%s %lu"),
+ fhp->name, (u_long)len);
+
+ if (DB_GLOBAL(j_read) != NULL) {
+ *nrp = len;
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0133",
+ "read: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(addr), (u_long)len);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+ }
+
+ for (taddr = addr, offset = 0;
+ offset < len; taddr += nr, offset += (u_int32_t)nr) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK(((nr = read(fhp->fd,
+ CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
+ if (nr == 0 || ret != 0)
+ break;
+ }
+ *nrp = (size_t)(taddr - (u_int8_t *)addr);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0134",
+ "read: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
+
+/*
+ * __os_write --
+ * Write to a file handle.
+ *
+ * PUBLIC: int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
+ */
+int
+__os_write(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /* Zero-fill as necessary. */
+ if (__os_fs_notzero()) {
+ int ret;
+ if ((ret = __db_zero_fill(env, fhp)) != 0)
+ return (ret);
+ }
+#endif
+ return (__os_physwrite(env, fhp, addr, len, nwp));
+}
+
+/*
+ * __os_physwrite --
+ * Physical write to a file handle.
+ *
+ * PUBLIC: int __os_physwrite
+ * PUBLIC: __P((ENV *, DB_FH *, void *, size_t, size_t *));
+ */
+int
+__os_physwrite(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ DB_ENV *dbenv;
+ size_t offset;
+ ssize_t nw;
+ int ret;
+ u_int8_t *taddr;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0135",
+ "fileops: write %s: %lu bytes", "%s %lu"),
+ fhp->name, (u_long)len);
+
+#if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
+ if (__os_fs_notzero()) {
+ struct stat sb;
+ off_t cur_off;
+
+ DB_ASSERT(env, fstat(fhp->fd, &sb) != -1 &&
+ (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
+ cur_off <= sb.st_size);
+ }
+#endif
+ if (DB_GLOBAL(j_write) != NULL) {
+ *nwp = len;
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0136",
+ "write: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(addr), (u_long)len);
+ ret = __os_posix_err(ret);
+
+ DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
+ }
+ return (ret);
+ }
+
+ for (taddr = addr, offset = 0;
+ offset < len; taddr += nw, offset += (u_int32_t)nw) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK(((nw = write(fhp->fd,
+ CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
+ if (ret != 0)
+ break;
+ }
+ *nwp = len;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0137",
+ "write: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+
+ DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
+ }
+ return (ret);
+}
diff --git a/src/os/os_seek.c b/src/os/os_seek.c
new file mode 100644
index 00000000..4676d33a
--- /dev/null
+++ b/src/os/os_seek.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_seek --
+ * Seek to a page/byte offset in the file.
+ *
+ * PUBLIC: int __os_seek __P((ENV *,
+ * PUBLIC: DB_FH *, db_pgno_t, u_int32_t, off_t));
+ */
+int
+__os_seek(env, fhp, pgno, pgsize, relative)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+ off_t relative;
+{
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->seek_count;
+#endif
+
+ offset = (off_t)pgsize * pgno + relative;
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0170",
+ "fileops: seek %s to %lu", "%s %lu"),
+ fhp->name, (u_long)offset);
+
+ if (DB_GLOBAL(j_seek) != NULL)
+ ret = DB_GLOBAL(j_seek)(fhp->fd, offset, SEEK_SET);
+ else
+ RETRY_CHK((lseek(
+ fhp->fd, offset, SEEK_SET) == -1 ? 1 : 0), ret);
+
+ if (ret == 0) {
+ fhp->pgsize = pgsize;
+ fhp->pgno = pgno;
+ fhp->offset = relative;
+ } else {
+ __db_syserr(env, ret, DB_STR_A("0171",
+ "seek: %lu: (%lu * %lu) + %lu", "%lu %lu %lu %lu"),
+ (u_long)offset, (u_long)pgno, (u_long)pgsize,
+ (u_long)relative);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
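+
+/*
+ * Illustrative example: the target offset is
+ * (off_t)pgsize * pgno + relative, so seeking to page 7 of a file
+ * with 4096-byte pages positions the handle at byte 28672.
+ */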
diff --git a/src/os/os_stack.c b/src/os/os_stack.c
new file mode 100644
index 00000000..037080f3
--- /dev/null
+++ b/src/os/os_stack.c
@@ -0,0 +1,45 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#if defined(HAVE_SYSTEM_INCLUDE_FILES) && defined(HAVE_BACKTRACE) && \
+ defined(HAVE_BACKTRACE_SYMBOLS) && defined(HAVE_EXECINFO_H)
+#include <execinfo.h>
+#endif
+
+/*
+ * __os_stack --
+ * Output a stack trace to the message file handle.
+ *
+ * PUBLIC: void __os_stack __P((ENV *));
+ */
+void
+__os_stack(env)
+ ENV *env;
+{
+#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS)
+ void *array[200];
+ size_t i, size;
+ char **strings;
+
+ /*
+ * Solaris and the GNU C library support this interface. Solaris
+	 * has additional interfaces (printstack and walkcontext); I don't
+	 * know whether they offer any additional value.
+ */
+ size = backtrace(array, sizeof(array) / sizeof(array[0]));
+ strings = backtrace_symbols(array, size);
+
+ for (i = 0; i < size; ++i)
+ __db_errx(env, "%s", strings[i]);
+ free(strings);
+#endif
+ COMPQUIET(env, NULL);
+}
diff --git a/src/os/os_stat.c b/src/os/os_stat.c
new file mode 100644
index 00000000..43c66075
--- /dev/null
+++ b/src/os/os_stat.c
@@ -0,0 +1,108 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_exists --
+ * Return if the file exists.
+ *
+ * PUBLIC: int __os_exists __P((ENV *, const char *, int *));
+ */
+int
+__os_exists(env, path, isdirp)
+ ENV *env;
+ const char *path;
+ int *isdirp;
+{
+ DB_ENV *dbenv;
+ struct stat sb;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0165",
+ "fileops: stat %s", "%s"), path);
+
+ if (DB_GLOBAL(j_exists) != NULL)
+ return (DB_GLOBAL(j_exists)(path, isdirp));
+
+ RETRY_CHK((stat(CHAR_STAR_CAST path, &sb)), ret);
+ if (ret != 0)
+ return (__os_posix_err(ret));
+
+#if !defined(S_ISDIR) || defined(STAT_MACROS_BROKEN)
+#undef S_ISDIR
+#ifdef _S_IFDIR
+#define S_ISDIR(m) (_S_IFDIR & (m))
+#else
+#define S_ISDIR(m) (((m) & 0170000) == 0040000)
+#endif
+#endif
+ if (isdirp != NULL)
+ *isdirp = S_ISDIR(sb.st_mode);
+
+ return (0);
+}
+
+/*
+ * __os_ioinfo --
+ * Return file size and I/O size; abstracted to make it easier
+ * to replace.
+ *
+ * PUBLIC: int __os_ioinfo __P((ENV *, const char *,
+ * PUBLIC: DB_FH *, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__os_ioinfo(env, path, fhp, mbytesp, bytesp, iosizep)
+ ENV *env;
+ const char *path;
+ DB_FH *fhp;
+ u_int32_t *mbytesp, *bytesp, *iosizep;
+{
+ struct stat sb;
+ int ret;
+
+ if (DB_GLOBAL(j_ioinfo) != NULL)
+ return (DB_GLOBAL(j_ioinfo)(path,
+ fhp->fd, mbytesp, bytesp, iosizep));
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ RETRY_CHK((fstat(fhp->fd, &sb)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0166", "fstat"));
+ return (__os_posix_err(ret));
+ }
+
+ /* Return the size of the file. */
+ if (mbytesp != NULL)
+ *mbytesp = (u_int32_t)(sb.st_size / MEGABYTE);
+ if (bytesp != NULL)
+ *bytesp = (u_int32_t)(sb.st_size % MEGABYTE);
+
+ /*
+ * Return the underlying filesystem I/O size, if available.
+ *
+ * XXX
+ * Check for a 0 size -- the HP MPE/iX architecture has st_blksize,
+ * but it's always 0.
+ */
+#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
+ if (iosizep != NULL && (*iosizep = sb.st_blksize) == 0)
+ *iosizep = DB_DEF_IOSIZE;
+#else
+ if (iosizep != NULL)
+ *iosizep = DB_DEF_IOSIZE;
+#endif
+ return (0);
+}
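+
+/*
+ * Illustrative example: the size is returned as a megabyte/byte pair
+ * so files larger than 4GB fit in two u_int32_t values; a
+ * 5,242,881-byte file comes back as *mbytesp == 5, *bytesp == 1.
+ */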
diff --git a/src/os/os_tmpdir.c b/src/os/os_tmpdir.c
new file mode 100644
index 00000000..06d35ba9
--- /dev/null
+++ b/src/os/os_tmpdir.c
@@ -0,0 +1,141 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#ifdef macintosh
+#include <TFileSpec.h>
+#endif
+#endif
+
+/*
+ * __os_tmpdir --
+ * Set the temporary directory path.
+ *
+ * The order of items in the list structure and the order of checks in
+ * the environment are documented.
+ *
+ * PUBLIC: int __os_tmpdir __P((ENV *, u_int32_t));
+ */
+int
+__os_tmpdir(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ int isdir, ret;
+ char *tdir, tdir_buf[DB_MAXPATHLEN];
+
+ dbenv = env->dbenv;
+
+ /* Use the environment if it's permitted and initialized. */
+ if (LF_ISSET(DB_USE_ENVIRON) ||
+ (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot())) {
+ /* POSIX: TMPDIR */
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TMPDIR", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+ if (tdir != NULL && tdir[0] != '\0')
+ goto found;
+
+ /*
+ * Windows: TEMP, TMP
+ */
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TEMP", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+ if (tdir != NULL && tdir[0] != '\0')
+ goto found;
+
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TMP", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+ if (tdir != NULL && tdir[0] != '\0')
+ goto found;
+
+ /* Macintosh */
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TempFolder", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+
+ if (tdir != NULL && tdir[0] != '\0')
+found: return (__os_strdup(env, tdir, &dbenv->db_tmp_dir));
+ }
+
+#ifdef macintosh
+ /* Get the path to the temporary folder. */
+ {FSSpec spec;
+
+ if (!Special2FSSpec(kTemporaryFolderType,
+ kOnSystemDisk, 0, &spec))
+ return (__os_strdup(env,
+ FSp2FullPath(&spec), &dbenv->db_tmp_dir));
+ }
+#endif
+#ifdef DB_WIN32
+ /* Get the path to the temporary directory. */
+ {
+ _TCHAR tpath[DB_MAXPATHLEN + 1];
+ char *path, *eos;
+
+ if (GetTempPath(DB_MAXPATHLEN, tpath) > 2) {
+ FROM_TSTRING(env, tpath, path, ret);
+ if (ret != 0)
+ return (ret);
+
+ eos = path + strlen(path) - 1;
+ if (*eos == '\\' || *eos == '/')
+ *eos = '\0';
+ if (__os_exists(env, path, &isdir) == 0 && isdir) {
+ ret = __os_strdup(env,
+ path, &dbenv->db_tmp_dir);
+ FREE_STRING(env, path);
+ return (ret);
+ }
+ FREE_STRING(env, path);
+ }
+ }
+#endif
+
+ /*
+ * Step through the static list looking for a possibility.
+ *
+ * We don't use the obvious data structure because some C compilers
+ * (and I use the phrase loosely) don't like static data arrays.
+ */
+#define DB_TEMP_DIRECTORY(n) { \
+ char *__p = n; \
+ if (__os_exists(env, __p, &isdir) == 0 && isdir != 0) \
+ return (__os_strdup(env, __p, &dbenv->db_tmp_dir)); \
+ }
+#ifdef DB_WIN32
+ DB_TEMP_DIRECTORY("/temp");
+ DB_TEMP_DIRECTORY("C:/temp");
+ DB_TEMP_DIRECTORY("C:/tmp");
+#else
+ DB_TEMP_DIRECTORY("/var/tmp");
+ DB_TEMP_DIRECTORY("/usr/tmp");
+ DB_TEMP_DIRECTORY("/tmp");
+#if defined(ANDROID) || defined(DB_ANDROID)
+ DB_TEMP_DIRECTORY("/cache");
+#endif
+#endif
+
+ /*
+ * If we don't have any other place to store temporary files, store
+ * them in the current directory.
+ */
+ return (__os_strdup(env, "", &dbenv->db_tmp_dir));
+}
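+
+/*
+ * Illustrative summary of the resulting search order: TMPDIR, TEMP,
+ * TMP and TempFolder in the environment (when DB_USE_ENVIRON is
+ * permitted), then the platform temporary directory, then the static
+ * list above, and finally the current directory as a last resort.
+ */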
diff --git a/src/os/os_truncate.c b/src/os/os_truncate.c
new file mode 100644
index 00000000..f559e9cb
--- /dev/null
+++ b/src/os/os_truncate.c
@@ -0,0 +1,63 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_truncate --
+ * Truncate the file.
+ *
+ * PUBLIC: int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t));
+ */
+int
+__os_truncate(env, fhp, pgno, pgsize)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+{
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * Truncate a file so that "pgno" is discarded from the end of the
+ * file.
+ */
+ offset = (off_t)pgsize * pgno;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0141",
+ "fileops: truncate %s to %lu", "%s %lu"),
+ fhp->name, (u_long)offset);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (DB_GLOBAL(j_ftruncate) != NULL)
+ ret = DB_GLOBAL(j_ftruncate)(fhp->fd, offset);
+ else {
+#ifdef HAVE_FTRUNCATE
+ RETRY_CHK((ftruncate(fhp->fd, offset)), ret);
+#else
+ ret = DB_OPNOTSUP;
+#endif
+ }
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0142",
+ "ftruncate: %lu", "%lu"), (u_long)offset);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os/os_uid.c b/src/os/os_uid.c
new file mode 100644
index 00000000..2e5c9f87
--- /dev/null
+++ b/src/os/os_uid.c
@@ -0,0 +1,55 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_unique_id --
+ * Return a unique 32-bit value.
+ *
+ * PUBLIC: void __os_unique_id __P((ENV *, u_int32_t *));
+ */
+void
+__os_unique_id(env, idp)
+ ENV *env;
+ u_int32_t *idp;
+{
+ DB_ENV *dbenv;
+ db_timespec v;
+ pid_t pid;
+ u_int32_t id;
+
+ *idp = 0;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+	 * Our randomized value combines our process ID, the current
+ * time of day and a stack address, all XOR'd together.
+ */
+ __os_id(dbenv, &pid, NULL);
+ __os_gettime(env, &v, 1);
+
+ id = (u_int32_t)pid ^
+ (u_int32_t)v.tv_sec ^ (u_int32_t)v.tv_nsec ^ P_TO_UINT32(&pid);
+
+ /*
+ * We could try and find a reasonable random-number generator, but
+ * that's not all that easy to do. Seed and use srand()/rand(), if
+ * we can find them.
+ */
+ if (DB_GLOBAL(uid_init) == 0) {
+ DB_GLOBAL(uid_init) = 1;
+ srand((u_int)id);
+ }
+ id ^= (u_int)rand();
+
+ *idp = id;
+}
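+
+/*
+ * Illustrative note: the value is built as
+ *
+ *	id  = pid ^ tv_sec ^ tv_nsec ^ (stack address of pid)
+ *	id ^= rand()		-- srand() seeded with the first id
+ *
+ * so no single weak source (a small pid, a coarse clock) dominates
+ * the result.
+ */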
diff --git a/src/os/os_unlink.c b/src/os/os_unlink.c
new file mode 100644
index 00000000..f9a0b688
--- /dev/null
+++ b/src/os/os_unlink.c
@@ -0,0 +1,80 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_unlink --
+ * Remove a file.
+ *
+ * PUBLIC: int __os_unlink __P((ENV *, const char *, int));
+ */
+int
+__os_unlink(env, path, overwrite_test)
+ ENV *env;
+ const char *path;
+ int overwrite_test;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0160", "fileops: unlink %s",
+ "%s"), path);
+
+ /* Optionally overwrite the contents of the file to enhance security. */
+ if (dbenv != NULL && overwrite_test && F_ISSET(dbenv, DB_ENV_OVERWRITE))
+ (void)__db_file_multi_write(env, path);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (DB_GLOBAL(j_unlink) != NULL)
+ ret = DB_GLOBAL(j_unlink)(path);
+ else {
+ RETRY_CHK((unlink(CHAR_STAR_CAST path)), ret);
+#ifdef HAVE_QNX
+ /*
+ * The file may be a region file created by shm_open, not a
+ * regular file. Try and delete using unlink, and if that
+ * fails for an unexpected reason, try a shared memory unlink.
+ */
+ if (ret != 0 && __os_posix_err(ret) != ENOENT)
+ RETRY_CHK((shm_unlink(path)), ret);
+#endif
+ }
+
+ /*
+ * !!!
+ * The results of unlink are file system driver specific on VxWorks.
+ * In the case of removing a file that did not exist, some, at least,
+ * return an error, but with an errno of 0, not ENOENT. We do not
+ * have to test for that explicitly, the RETRY_CHK macro resets "ret"
+ * to be the errno, and so we'll just slide right on through.
+ *
+ * XXX
+ * We shouldn't be testing for an errno of ENOENT here, but ENOENT
+ * signals that a file is missing, and we attempt to unlink things
+ * (such as v. 2.x environment regions, in ENV->remove) that we
+ * are expecting not to be there. Reporting errors in these cases
+ * is annoying.
+ */
+ if (ret != 0) {
+ t_ret = __os_posix_err(ret);
+ if (t_ret != ENOENT)
+ __db_syserr(env, ret, DB_STR_A("0161",
+ "unlink: %s", "%s"), path);
+ ret = t_ret;
+ }
+
+ return (ret);
+}
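
The DB_ENV_OVERWRITE path above calls __db_file_multi_write, which does not
appear in this diff. A single-pass sketch of the overwrite-before-unlink
idea, assuming POSIX file I/O; the real routine makes multiple passes:

    /*
     * Sketch: rewrite a file's bytes before unlinking it so stale data
     * is less likely to survive on disk. One pass only; illustrative.
     */
    #include <fcntl.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int
    overwrite_and_unlink(const char *path)
    {
        struct stat sb;
        char buf[8192];
        ssize_t nw;
        off_t left;
        int fd;

        if ((fd = open(path, O_WRONLY)) == -1)
            return (-1);
        if (fstat(fd, &sb) == -1) {
            (void)close(fd);
            return (-1);
        }
        memset(buf, 0xff, sizeof(buf));
        for (left = sb.st_size; left > 0; left -= nw)
            if ((nw = write(fd, buf, left > (off_t)sizeof(buf) ?
                sizeof(buf) : (size_t)left)) <= 0)
                break;
        (void)fsync(fd);                /* force the pattern to disk */
        (void)close(fd);
        return (unlink(path));
    }

    int
    main(int argc, char *argv[])
    {
        return (argc == 2 && overwrite_and_unlink(argv[1]) == 0 ? 0 : 1);
    }
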
diff --git a/src/os/os_yield.c b/src/os/os_yield.c
new file mode 100644
index 00000000..f0e170f0
--- /dev/null
+++ b/src/os/os_yield.c
@@ -0,0 +1,95 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#if defined(HAVE_SYSTEM_INCLUDE_FILES) && defined(HAVE_SCHED_YIELD)
+#include <sched.h>
+#endif
+
+static void __os_sleep __P((ENV *, u_long, u_long));
+
+/*
+ * __os_yield --
+ * Yield the processor, optionally pausing until running again.
+ *
+ * PUBLIC: void __os_yield __P((ENV *, u_long, u_long));
+ */
+void
+__os_yield(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ /*
+ * Don't require the values be normalized (some operating systems
+ * return an error if the usecs argument to select is too large).
+ */
+ for (; usecs >= US_PER_SEC; usecs -= US_PER_SEC)
+ ++secs;
+
+ if (DB_GLOBAL(j_yield) != NULL) {
+ (void)DB_GLOBAL(j_yield)(secs, usecs);
+ return;
+ }
+
+ /*
+ * Yield the processor so other processes or threads can run. Use
+ * the local yield call if not pausing, otherwise call the select
+ * function.
+ */
+ if (secs != 0 || usecs != 0)
+ __os_sleep(env, secs, usecs);
+ else {
+#if defined(HAVE_MUTEX_UI_THREADS)
+ thr_yield();
+#elif defined(HAVE_PTHREAD_YIELD)
+ pthread_yield();
+#elif defined(HAVE_SCHED_YIELD)
+ (void)sched_yield();
+#elif defined(HAVE_YIELD)
+ yield();
+#else
+ __os_sleep(env, 0, 0);
+#endif
+ }
+}
+
+/*
+ * __os_sleep --
+ * Pause the thread of control.
+ */
+static void
+__os_sleep(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ struct timeval t;
+ int ret;
+
+ /*
+ * Sheer raving paranoia -- don't select for 0 time, in case some
+ * implementation doesn't yield the processor in that case.
+ */
+ t.tv_sec = (long)secs;
+ t.tv_usec = (long)usecs + 1;
+
+ /*
+ * We don't catch interrupts and restart the system call here, unlike
+ * other Berkeley DB system calls. This may be a user attempting to
+ * interrupt a sleeping DB utility (for example, db_checkpoint), and
+ * we want the utility to see the signal and quit. This assumes it's
+ * always OK for DB to sleep for less time than originally scheduled.
+ */
+ if (select(0, NULL, NULL, NULL, &t) == -1) {
+ ret = __os_get_syserr();
+ if (__os_posix_err(ret) != EINTR)
+ __db_syserr(env, ret, DB_STR("0167", "select"));
+ }
+}
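
The select()-with-no-descriptors trick used by __os_sleep is worth seeing in
isolation: it pauses with microsecond granularity and, unlike most DB system
calls, deliberately returns early on signals. A standalone POSIX sketch:

    /* Sketch of select()-as-sleep, as used by __os_sleep. */
    #include <errno.h>
    #include <stdio.h>
    #include <sys/select.h>

    static void
    sleep_usecs(long secs, long usecs)
    {
        struct timeval t;

        t.tv_sec = secs;
        t.tv_usec = usecs + 1;          /* never select for 0 time */
        if (select(0, NULL, NULL, NULL, &t) == -1 && errno != EINTR)
            perror("select");
    }

    int
    main(void)
    {
        printf("sleeping 250ms...\n");
        sleep_usecs(0, 250000);
        return (0);
    }
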
diff --git a/src/os_qnx/os_qnx_fsync.c b/src/os_qnx/os_qnx_fsync.c
new file mode 100644
index 00000000..827fa446
--- /dev/null
+++ b/src/os_qnx/os_qnx_fsync.c
@@ -0,0 +1,73 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * QNX has a special requirement for fsync: if the file is a shared memory
+ * object, we cannot call fsync because it is not implemented for such
+ * objects. Instead, we set the O_DSYNC flag on the file descriptor and
+ * then do an empty write so that all data are synced. We sync this way
+ * only if the file is a shared memory object; ordinary files are still
+ * synced with a regular fdatasync call, which is both faster and atomic.
+ * We don't simply set the O_DSYNC flag at open time, since that would
+ * force every write to be synchronous, and we remove the O_DSYNC flag
+ * afterward if it was not originally set on the file descriptor before
+ * it was passed to this function.
+ * This differs slightly from the VxWorks and HP code, since QNX does
+ * supply an fsync call; it merely has a unique requirement.
+ */
+int
+__qnx_fsync(fhp)
+ DB_FH *fhp;
+{
+ int ret;
+ int fd, unset, flags;
+
+ fd = fhp->fd;
+ unset = 1;
+ ret = flags = 0;
+	if (F_ISSET(fhp, DB_FH_REGION)) {
+ RETRY_CHK(fcntl(fd, F_GETFL), ret);
+ if (ret == -1)
+ goto err;
+ /*
+		 * If the descriptor already has the O_DSYNC flag set, we
+		 * must not remove it after the empty write.
+		 */
+		if ((ret & O_DSYNC) != 0)
+ unset = 0;
+ else {
+ ret |= O_DSYNC;
+ flags = ret;
+ RETRY_CHK(fcntl(fd, F_SETFL, flags), ret);
+ if (ret == -1)
+ goto err;
+ }
+ /* Do an empty write, to force a sync */
+ RETRY_CHK(write(fd, "", 0), ret);
+ if (ret == -1)
+ goto err;
+ /* remove the O_DSYNC flag if necessary */
+ if (unset) {
+ RETRY_CHK(fcntl(fd, F_GETFL), ret);
+ if (ret == -1)
+ goto err;
+ ret &= ~O_DSYNC;
+ flags = ret;
+ RETRY_CHK(fcntl(fd, F_SETFL, flags), ret);
+ if (ret == -1)
+ goto err;
+ }
+ } else
+ RETRY_CHK(fdatasync(fd), ret);
+
+err: return (ret);
+}
diff --git a/src/os_qnx/os_qnx_open.c b/src/os_qnx/os_qnx_open.c
new file mode 100644
index 00000000..d0214a0d
--- /dev/null
+++ b/src/os_qnx/os_qnx_open.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_qnx_region_open --
+ * Open a shared memory region file using POSIX shm_open.
+ *
+ * PUBLIC: #ifdef HAVE_QNX
+ * PUBLIC: int __os_qnx_region_open
+ * PUBLIC: __P((ENV *, const char *, int, int, DB_FH **));
+ * PUBLIC: #endif
+ */
+int
+__os_qnx_region_open(env, name, oflags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ int oflags, mode;
+ DB_FH **fhpp;
+{
+ DB_FH *fhp;
+ int fcntl_flags;
+ int ret;
+
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ /*
+ * Once we have created the object, we don't need the name
+ * anymore. Other callers of this will convert themselves.
+ */
+ if ((fhp->fd = shm_open(name, oflags, mode)) == -1) {
+ ret = __os_posix_err(__os_get_syserr());
+err: (void)__os_closehandle(env, fhp);
+ return (ret);
+ }
+
+ F_SET(fhp, DB_FH_OPENED);
+
+#ifdef HAVE_FCNTL_F_SETFD
+ /* Deny file descriptor access to any child process. */
+ if ((fcntl_flags = fcntl(fhp->fd, F_GETFD)) == -1 ||
+ fcntl(fhp->fd, F_SETFD, fcntl_flags | FD_CLOEXEC) == -1) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0001", "fcntl(F_SETFD)"));
+ (void)__os_closehandle(env, fhp);
+ return (__os_posix_err(ret));
+ }
+#else
+ COMPQUIET(fcntl_flags, 0);
+#endif
+ F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+}
diff --git a/src/os_vxworks/os_vx_abs.c b/src/os_vxworks/os_vx_abs.c
new file mode 100644
index 00000000..69413ee5
--- /dev/null
+++ b/src/os_vxworks/os_vx_abs.c
@@ -0,0 +1,42 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "iosLib.h"
+
+/*
+ * __os_abspath --
+ * Return if a path is an absolute path.
+ */
+int
+__os_abspath(path)
+ const char *path;
+{
+ DEV_HDR *dummy;
+ char *ptail;
+
+ /*
+ * VxWorks devices can be rooted at any name at all.
+ * Use iosDevFind() to see if name matches any of our devices.
+ */
+ if ((dummy = iosDevFind(path, (const char**)&ptail)) == NULL)
+ return (0);
+ /*
+ * If the routine used a device, then ptail points to the
+ * rest and we are an abs path.
+ */
+ if (ptail != path)
+ return (1);
+ /*
+ * If the path starts with a '/', then we are an absolute path,
+ * using the host machine, otherwise we are not.
+ */
+ return (path[0] == '/');
+}
diff --git a/src/os_vxworks/os_vx_config.c b/src/os_vxworks/os_vx_config.c
new file mode 100644
index 00000000..649a3b4a
--- /dev/null
+++ b/src/os_vxworks/os_vx_config.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fs_notzero --
+ * Return 1 if allocated filesystem blocks are not zeroed.
+ */
+int
+__os_fs_notzero()
+{
+ /*
+ * Some VxWorks FS drivers do not zero-fill pages that were never
+ * explicitly written to the file, they give you random garbage,
+ * and that breaks Berkeley DB.
+ */
+ return (1);
+}
+
+/*
+ * __os_support_direct_io --
+ * Return 1 if we support direct I/O.
+ */
+int
+__os_support_direct_io()
+{
+ return (0);
+}
+
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ */
+int
+__os_support_db_register()
+{
+ return (0);
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ */
+int
+__os_support_replication()
+{
+ return (1);
+}
diff --git a/src/os_vxworks/os_vx_map.c b/src/os_vxworks/os_vx_map.c
new file mode 100644
index 00000000..517cadae
--- /dev/null
+++ b/src/os_vxworks/os_vx_map.c
@@ -0,0 +1,436 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This code is derived from software contributed to Sleepycat Software by
+ * Frederick G.M. Roeber of Netscape Communications Corp.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * DB uses memory-mapped files for two things:
+ * faster access of read-only databases, and
+ * shared memory for process synchronization and locking.
+ * The code carefully does not mix the two uses. The first-case uses are
+ * actually written such that memory-mapping isn't really required -- it's
+ * merely a convenience -- so we don't have to worry much about it. In the
+ * second case, it's solely used as a shared memory mechanism, so that's
+ * all we have to replace.
+ *
+ * All memory in VxWorks is shared, and a task can allocate memory and keep
+ * notes. So I merely have to allocate memory, remember the "filename" for
+ * that memory, and issue small-integer segment IDs which index the list of
+ * these shared-memory segments. Subsequent opens are checked against the
+ * list of already open segments.
+ */
+typedef struct {
+ void *segment; /* Segment address. */
+ u_int32_t size; /* Segment size. */
+ char *name; /* Segment name. */
+ long segid; /* Segment ID. */
+} os_segdata_t;
+
+static os_segdata_t *__os_segdata; /* Segment table. */
+static int __os_segdata_size; /* Segment table size. */
+
+#define OS_SEGDATA_STARTING_SIZE 16
+#define OS_SEGDATA_INCREMENT 16
+
+static int __os_segdata_allocate
+ __P((ENV *, const char *, REGINFO *, REGION *));
+static int __os_segdata_find_byname
+ __P((ENV *, const char *, REGINFO *, REGION *));
+static int __os_segdata_init __P((ENV *));
+static int __os_segdata_new __P((ENV *, int *));
+static int __os_segdata_release __P((ENV *, REGION *, int));
+
+/*
+ * __os_attach --
+ * Create/join a shared memory region.
+ */
+int
+__os_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env->dbenv;
+
+	if (__os_segdata == NULL &&
+	    (ret = __os_segdata_init(env)) != 0)
+		return (ret);
+
+ DB_BEGIN_SINGLE_THREAD;
+
+ /* Try to find an already existing segment. */
+ ret = __os_segdata_find_byname(env, infop->name, infop, rp);
+
+ /*
+ * If we are trying to join a region, it is easy, either we
+ * found it and we return, or we didn't find it and we return
+ * an error that it doesn't exist.
+ */
+ if (!F_ISSET(infop, REGION_CREATE)) {
+ if (ret != 0) {
+ __db_errx(env, DB_STR_A("0197",
+ "segment %s does not exist", "%s"),
+ infop->name);
+ ret = EAGAIN;
+ }
+ goto out;
+ }
+
+ /*
+ * If we get here, we are trying to create the region.
+ * There are several things to consider:
+ * - if we have an error (not a found or not-found value), return.
+ * - they better have shm_key set.
+ * - if the region is already there (ret == 0 from above),
+ * assume the application crashed and we're restarting.
+ * Delete the old region.
+ * - try to create the region.
+ */
+ if (ret != 0 && ret != ENOENT)
+ goto out;
+
+ if (dbenv->shm_key == INVALID_REGION_SEGID) {
+ __db_errx(env, DB_STR("0198",
+ "no base shared memory ID specified"));
+ ret = EAGAIN;
+ goto out;
+ }
+ if (ret == 0 && __os_segdata_release(env, rp, 1) != 0) {
+ __db_errx(env,DB_STR_A("0199",
+ "key: %ld: shared memory region already exists", "%ld"),
+ dbenv->shm_key + (infop->id - 1));
+ ret = EAGAIN;
+ goto out;
+ }
+
+ ret = __os_segdata_allocate(env, infop->name, infop, rp);
+out:
+ DB_END_SINGLE_THREAD;
+ return (ret);
+}
+
+/*
+ * __os_detach --
+ * Detach from a shared region.
+ */
+int
+__os_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ /*
+ * If just detaching, there is no mapping to discard.
+ * If destroying, remove the region.
+ */
+ if (destroy)
+ return (__os_segdata_release(env, infop->rp, 0));
+ return (0);
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ */
+int
+__os_mapfile(env, path, fhp, len, is_rdonly, addrp)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addrp;
+{
+ /* We cannot map in regular files in VxWorks. */
+ COMPQUIET(env, NULL);
+ COMPQUIET(path, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(is_rdonly, 0);
+ COMPQUIET(len, 0);
+ COMPQUIET(addrp, NULL);
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * __os_unmapfile --
+ * Unmap the shared memory file.
+ */
+int
+__os_unmapfile(env, addr, len)
+ ENV *env;
+ void *addr;
+ size_t len;
+{
+ /* We cannot map in regular files in VxWorks. */
+ COMPQUIET(env, NULL);
+ COMPQUIET(addr, NULL);
+ COMPQUIET(len, 0);
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * __os_segdata_init --
+ * Initializes the library's table of shared memory segments.
+ * Called once on the first time through __os_segdata_new().
+ */
+static int
+__os_segdata_init(env)
+ ENV *env;
+{
+ int ret;
+
+ if (__os_segdata != NULL) {
+ __db_errx(env, DB_STR("0200",
+ "shared memory segment already exists"));
+ return (EEXIST);
+ }
+
+ /*
+	 * Serialize creation and initialization of the segment table.
+ */
+ DB_BEGIN_SINGLE_THREAD;
+ __os_segdata_size = OS_SEGDATA_STARTING_SIZE;
+ ret = __os_calloc(env,
+ __os_segdata_size, sizeof(os_segdata_t), &__os_segdata);
+ DB_END_SINGLE_THREAD;
+ return (ret);
+}
+
+/*
+ * __os_segdata_destroy --
+ * Destroys the library's table of shared memory segments. It also
+ * frees all linked data: the segments themselves, and their names.
+ * Currently not called. This function should be called if the
+ * user creates a function to unload or shutdown.
+ */
+int
+__os_segdata_destroy(env)
+ ENV *env;
+{
+ os_segdata_t *p;
+ int i;
+
+ if (__os_segdata == NULL)
+ return (0);
+
+ DB_BEGIN_SINGLE_THREAD;
+ for (i = 0; i < __os_segdata_size; i++) {
+ p = &__os_segdata[i];
+ if (p->name != NULL) {
+ __os_free(env, p->name);
+ p->name = NULL;
+ }
+ if (p->segment != NULL) {
+ __os_free(env, p->segment);
+ p->segment = NULL;
+ }
+ p->size = 0;
+ }
+
+ __os_free(env, __os_segdata);
+ __os_segdata = NULL;
+ __os_segdata_size = 0;
+ DB_END_SINGLE_THREAD;
+
+ return (0);
+}
+
+/*
+ * __os_segdata_allocate --
+ * Creates a new segment of the specified size, optionally with the
+ * specified name.
+ *
+ * Assumes it is called with the SEGDATA lock taken.
+ */
+static int
+__os_segdata_allocate(env, name, infop, rp)
+ ENV *env;
+ const char *name;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ os_segdata_t *p;
+ int id, ret;
+
+ dbenv = env->dbenv;
+
+ if ((ret = __os_segdata_new(env, &id)) != 0)
+ return (ret);
+
+ p = &__os_segdata[id];
+ if ((ret = __os_calloc(env, 1, rp->size, &p->segment)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &p->name)) != 0) {
+ __os_free(env, p->segment);
+ p->segment = NULL;
+ return (ret);
+ }
+ p->size = rp->size;
+ p->segid = dbenv->shm_key + infop->id - 1;
+
+ infop->addr = p->segment;
+ rp->segid = id;
+
+ return (0);
+}
+
+/*
+ * __os_segdata_new --
+ *	Finds a new segdata slot. Does not initialise it, so the slot id
+ *	returned is only valid until this function is called again.
+ *
+ * Assumes it is called with the SEGDATA lock taken.
+ */
+static int
+__os_segdata_new(env, segidp)
+ ENV *env;
+ int *segidp;
+{
+ os_segdata_t *p;
+ int i, newsize, ret;
+
+ if (__os_segdata == NULL) {
+ __db_errx(env, DB_STR("0201",
+ "shared memory segment not initialized"));
+ return (EAGAIN);
+ }
+
+ for (i = 0; i < __os_segdata_size; i++) {
+ p = &__os_segdata[i];
+ if (p->segment == NULL) {
+ *segidp = i;
+ return (0);
+ }
+ }
+
+ /*
+ * No more free slots, expand.
+ */
+ newsize = __os_segdata_size + OS_SEGDATA_INCREMENT;
+ if ((ret = __os_realloc(env, newsize * sizeof(os_segdata_t),
+ &__os_segdata)) != 0)
+ return (ret);
+ memset(&__os_segdata[__os_segdata_size],
+ 0, OS_SEGDATA_INCREMENT * sizeof(os_segdata_t));
+
+ *segidp = __os_segdata_size;
+ __os_segdata_size = newsize;
+
+ return (0);
+}
+
+/*
+ * __os_segdata_find_byname --
+ * Finds a segment by its name and shm_key.
+ *
+ * Assumes it is called with the SEGDATA lock taken.
+ */
+static int
+__os_segdata_find_byname(env, name, infop, rp)
+ ENV *env;
+ const char *name;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ os_segdata_t *p;
+ long segid;
+ int i;
+
+ dbenv = env->dbenv;
+
+ if (__os_segdata == NULL) {
+ __db_errx(env, DB_STR("0202",
+ "shared memory segment not initialized"));
+ return (EAGAIN);
+ }
+
+ if (name == NULL) {
+ __db_errx(env, DB_STR("0203", "no segment name given"));
+ return (EAGAIN);
+ }
+
+ /*
+ * If we are creating the region, compute the segid.
+ * If we are joining the region, we use the segid in the
+ * index we are given.
+ */
+ if (F_ISSET(infop, REGION_CREATE))
+ segid = dbenv->shm_key + (infop->id - 1);
+ else {
+ if (rp->segid >= __os_segdata_size ||
+ rp->segid == INVALID_REGION_SEGID) {
+ __db_errx(env, DB_STR("0204",
+ "Invalid segment id given"));
+ return (EAGAIN);
+ }
+ segid = __os_segdata[rp->segid].segid;
+ }
+ for (i = 0; i < __os_segdata_size; i++) {
+ p = &__os_segdata[i];
+ if (p->name != NULL && strcmp(name, p->name) == 0 &&
+ p->segid == segid) {
+ infop->addr = p->segment;
+ rp->segid = i;
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+/*
+ * __os_segdata_release --
+ * Free a segdata entry.
+ */
+static int
+__os_segdata_release(env, rp, is_locked)
+ ENV *env;
+ REGION *rp;
+ int is_locked;
+{
+ os_segdata_t *p;
+
+ if (__os_segdata == NULL) {
+ __db_errx(env, DB_STR("0205",
+ "shared memory segment not initialized"));
+ return (EAGAIN);
+ }
+
+ if (rp->segid < 0 || rp->segid >= __os_segdata_size) {
+ __db_errx(env, DB_STR_A("0206",
+ "segment id %ld out of range", "%ld"), rp->segid);
+ return (EINVAL);
+ }
+
+ if (is_locked == 0)
+ DB_BEGIN_SINGLE_THREAD;
+ p = &__os_segdata[rp->segid];
+ if (p->name != NULL) {
+ __os_free(env, p->name);
+ p->name = NULL;
+ }
+ if (p->segment != NULL) {
+ __os_free(env, p->segment);
+ p->segment = NULL;
+ }
+ p->size = 0;
+ if (is_locked == 0)
+ DB_END_SINGLE_THREAD;
+
+ /* Any shrink-table logic could go here */
+
+ return (0);
+}
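
The design described in the comment at the top of this file reduces to a
small find-or-create table. A minimal sketch of that idea with illustrative
names and sizes; unlike the real code there is no locking, no table growth,
and no segment IDs:

    /* Sketch: named "shared" segments as heap memory plus a table. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        void *segment;          /* Segment address. */
        size_t size;            /* Segment size. */
        char *name;             /* Segment name. */
    } seg_t;

    #define TABLE_SIZE 16
    static seg_t table[TABLE_SIZE];

    /* Return an existing segment by name, or create a new one. */
    static void *
    seg_attach(const char *name, size_t size)
    {
        int i, slot;

        for (i = 0, slot = -1; i < TABLE_SIZE; i++) {
            if (table[i].name != NULL &&
                strcmp(table[i].name, name) == 0)
                return (table[i].segment);      /* join */
            if (table[i].name == NULL && slot == -1)
                slot = i;
        }
        if (slot == -1)
            return (NULL);                      /* table full */
        if ((table[slot].segment = calloc(1, size)) == NULL)
            return (NULL);
        table[slot].name = strdup(name);
        table[slot].size = size;
        return (table[slot].segment);
    }

    int
    main(void)
    {
        void *a = seg_attach("mpool", 4096);    /* creates */
        void *b = seg_attach("mpool", 4096);    /* joins: same address */

        printf("create %p, join %p\n", a, b);
        return (0);
    }
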
diff --git a/src/os_vxworks/os_vx_rpath.c b/src/os_vxworks/os_vx_rpath.c
new file mode 100644
index 00000000..1ffd3549
--- /dev/null
+++ b/src/os_vxworks/os_vx_rpath.c
@@ -0,0 +1,55 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#include "iosLib.h"
+
+/*
+ * __db_rpath --
+ * Return the last path separator in the path or NULL if none found.
+ */
+char *
+__db_rpath(path)
+ const char *path;
+{
+ const char *s, *last;
+ DEV_HDR *dummy;
+ char *ptail;
+
+ /*
+ * VxWorks devices can be rooted at any name. We want to
+ * skip over the device name and not take into account any
+ * PATH_SEPARATOR characters that might be in that name.
+ *
+ * XXX [#2393]
+ * VxWorks supports having a filename directly follow a device
+ * name with no separator. I.e. to access a file 'xxx' in
+ * the top level directory of a device mounted at "mydrive"
+ * you could say "mydrivexxx" or "mydrive/xxx" or "mydrive\xxx".
+ * We do not support the first usage here.
+ * XXX
+ */
+ if ((dummy = iosDevFind(path, (const char**)&ptail)) == NULL)
+ s = path;
+ else
+ s = ptail;
+
+ last = NULL;
+ if (PATH_SEPARATOR[1] != '\0') {
+ for (; s[0] != '\0'; ++s)
+ if (strchr(PATH_SEPARATOR, s[0]) != NULL)
+ last = s;
+ } else
+ for (; s[0] != '\0'; ++s)
+ if (s[0] == PATH_SEPARATOR[0])
+ last = s;
+ return ((char *)last);
+}
diff --git a/src/os_vxworks/os_vx_yield.c b/src/os_vxworks/os_vx_yield.c
new file mode 100644
index 00000000..c7c54cf2
--- /dev/null
+++ b/src/os_vxworks/os_vx_yield.c
@@ -0,0 +1,49 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/* VxWorks API returning the system clock rate in ticks per second. */
+int sysClkRateGet(void);
+
+/*
+ * __os_yield --
+ * Yield the processor, optionally pausing until running again.
+ */
+void
+__os_yield(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ int ticks_delay, ticks_per_second;
+
+ COMPQUIET(env, NULL);
+
+ /* Don't require the values be normalized. */
+ for (; usecs >= US_PER_SEC; usecs -= US_PER_SEC)
+ ++secs;
+
+ /*
+ * Yield the processor so other processes or threads can run.
+ *
+ * As a side effect, taskDelay() moves the calling task to the end of
+ * the ready queue for tasks of the same priority. In particular, you
+ * can yield the CPU to any other tasks of the same priority by
+ * "delaying" for zero clock ticks.
+ *
+ * Never wait less than a tick, if we were supposed to wait at all.
+ */
+ ticks_per_second = sysClkRateGet();
+ ticks_delay =
+ secs * ticks_per_second + (usecs * ticks_per_second) / US_PER_SEC;
+ if (ticks_delay == 0 && (secs != 0 || usecs != 0))
+ ticks_delay = 1;
+ (void)taskDelay(ticks_delay);
+}
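
The conversion above has one edge case that matters: a short but nonzero
pause must never round down to zero ticks, or taskDelay() degenerates into a
pure yield. A small sketch of the arithmetic, assuming an example 60Hz clock
rate:

    /* Sketch of the (secs, usecs) to clock-tick conversion. */
    #include <stdio.h>

    #define US_PER_SEC 1000000L

    static long
    to_ticks(unsigned long secs, unsigned long usecs, long rate)
    {
        long ticks;

        ticks = (long)(secs * rate + (usecs * rate) / US_PER_SEC);
        if (ticks == 0 && (secs != 0 || usecs != 0))
            ticks = 1;          /* wait at least one tick */
        return (ticks);
    }

    int
    main(void)
    {
        /* At 60 ticks/second, a 1ms pause still waits one tick. */
        printf("%ld\n", to_ticks(0, 1000, 60));         /* 1 */
        printf("%ld\n", to_ticks(2, 500000, 60));       /* 150 */
        return (0);
    }
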
diff --git a/src/os_windows/ce_ctime.c b/src/os_windows/ce_ctime.c
new file mode 100644
index 00000000..e8ae76aa
--- /dev/null
+++ b/src/os_windows/ce_ctime.c
@@ -0,0 +1,87 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static void __os_windows_ct_numb __P((char *, int));
+
+/*
+ * __os_ctime --
+ * Format a time-stamp.
+ */
+char *
+__os_ctime(tod, time_buf)
+ const time_t *tod;
+ char *time_buf;
+{
+ char *ncp;
+ __int64 i64_tod;
+ struct _FILETIME file_tod, file_loc;
+ struct _SYSTEMTIME sys_loc;
+	static const __int64 SECS_BETWEEN_EPOCHS = 11644473600;
+	static const __int64 SECS_TO_100NS = 10000000;	/* 10^7 */
+
+ strcpy(time_buf, "Thu Jan 01 00:00:00 1970");
+ time_buf[CTIME_BUFLEN - 1] = '\0';
+
+ /* Convert the tod to a SYSTEM_TIME struct */
+ i64_tod = *tod;
+ i64_tod = (i64_tod + SECS_BETWEEN_EPOCHS)*SECS_TO_100NS;
+ memcpy(&file_tod, &i64_tod, sizeof(file_tod));
+ FileTimeToLocalFileTime(&file_tod, &file_loc);
+ FileTimeToSystemTime(&file_loc, &sys_loc);
+
+ /*
+ * Convert the _SYSTEMTIME to the correct format in time_buf.
+ * Based closely on the os_brew/ctime.c implementation.
+ *
+	 * wDayOfWeek: day of the week, 0-6 (0=Sunday, 6=Saturday).
+	 */
+	ncp = &"SunMonTueWedThuFriSat"[sys_loc.wDayOfWeek * 3];
+ time_buf[0] = *ncp++;
+ time_buf[1] = *ncp++;
+ time_buf[2] = *ncp;
+ ncp = &"JanFebMarAprMayJunJulAugSepOctNovDec"[(sys_loc.wMonth - 1) * 3];
+ time_buf[4] = *ncp++;
+ time_buf[5] = *ncp++;
+ time_buf[6] = *ncp;
+
+ __os_windows_ct_numb(time_buf + 8, sys_loc.wDay);
+ /* Add 100 to keep the leading zero. */
+ __os_windows_ct_numb(time_buf + 11, sys_loc.wHour + 100);
+ __os_windows_ct_numb(time_buf + 14, sys_loc.wMinute + 100);
+ __os_windows_ct_numb(time_buf + 17, sys_loc.wSecond + 100);
+
+ if (sys_loc.wYear < 100) { /* 9 99 */
+ time_buf[20] = ' ';
+ time_buf[21] = ' ';
+ __os_windows_ct_numb(time_buf + 22, sys_loc.wYear);
+ } else { /* 99 1999 */
+ __os_windows_ct_numb(time_buf + 20, sys_loc.wYear / 100);
+ __os_windows_ct_numb(time_buf + 22, sys_loc.wYear % 100 + 100);
+ }
+
+ return (time_buf);
+}
+
+/*
+ * __os_windows_ct_numb --
+ * Append ASCII representations for two digits to a string.
+ */
+static void
+__os_windows_ct_numb(cp, n)
+ char *cp;
+ int n;
+{
+ cp[0] = ' ';
+ if (n >= 10)
+ cp[0] = (n / 10) % 10 + '0';
+ cp[1] = n % 10 + '0';
+}
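
The constants above encode the standard Unix-to-FILETIME conversion: add the
11,644,473,600 seconds between Jan 1, 1601 and Jan 1, 1970, then scale to
100-nanosecond units. A plain C99 sketch of the round trip, without the
Win32 types:

    /* Sketch of Unix time_t <-> FILETIME epoch arithmetic. */
    #include <stdio.h>
    #include <stdint.h>

    #define SECS_BETWEEN_EPOCHS 11644473600LL
    #define SECS_TO_100NS 10000000LL        /* 10^7 */

    int
    main(void)
    {
        int64_t unix_secs, filetime, back;

        unix_secs = 0;                  /* Jan 1, 1970 00:00:00 UTC */
        filetime = (unix_secs + SECS_BETWEEN_EPOCHS) * SECS_TO_100NS;
        back = filetime / SECS_TO_100NS - SECS_BETWEEN_EPOCHS;

        printf("FILETIME at the Unix epoch: %lld\n", (long long)filetime);
        printf("round trip: %lld\n", (long long)back);
        return (0);
    }
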
diff --git a/src/os_windows/os_abs.c b/src/os_windows/os_abs.c
new file mode 100644
index 00000000..e769ab2c
--- /dev/null
+++ b/src/os_windows/os_abs.c
@@ -0,0 +1,33 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abspath --
+ * Return if a path is an absolute path.
+ */
+int
+__os_abspath(path)
+ const char *path;
+{
+ /*
+ * !!!
+ * Check for drive specifications, e.g., "C:". In addition, the path
+ * separator used by the win32 DB (PATH_SEPARATOR) is \; look for both
+ * / and \ since these are user-input paths.
+ */
+ if (strlen(path) == 0)
+ return (0);
+
+ if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
+ path += 2;
+ return (path[0] == '/' || path[0] == '\\');
+}
diff --git a/src/os_windows/os_clock.c b/src/os_windows/os_clock.c
new file mode 100644
index 00000000..e548729b
--- /dev/null
+++ b/src/os_windows/os_clock.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_gettime --
+ * Return the current time-of-day clock in seconds and nanoseconds.
+ */
+void
+__os_gettime(env, tp, monotonic)
+ ENV *env;
+ db_timespec *tp;
+ int monotonic;
+{
+ if (monotonic) {
+ /*
+ * The elapsed time is stored as a DWORD value, so time wraps
+ * around to zero if the system runs for 49.7 days. Initialize
+ * a base value with 50 days worth of seconds, and add 50 more
+ * days every time the counter wraps. That ensures we always
+ * move forward.
+ *
+ * It's possible this code could race, but the danger is we
+ * would increment base_seconds more than once per wrap and
+ * eventually overflow, which is a pretty remote possibility.
+ */
+#define TIMER_WRAP_SECONDS (50 * 24 * 60 * 60)
+ static DWORD last_ticks;
+ static time_t base_seconds;
+ DWORD ticks;
+
+ ticks = GetTickCount();
+ if (ticks < last_ticks)
+ base_seconds += TIMER_WRAP_SECONDS;
+ last_ticks = ticks;
+ tp->tv_sec = base_seconds + (u_int32_t)(ticks / 1000);
+ tp->tv_nsec = (u_int32_t)((ticks % 1000) * NS_PER_MS);
+ } else {
+#ifdef DB_WINCE
+ FILETIME ft;
+ LARGE_INTEGER large_int;
+ LONGLONG ns_since_epoch, utc1970;
+ SYSTEMTIME st;
+
+ (void)GetSystemTime(&st);
+ (void)SystemTimeToFileTime(&st, &ft);
+
+ /*
+ * A FILETIME expresses time as 100 nanosecond chunks from
+ * Jan 1, 1601; convert to a timespec where the time is
+		 * expressed in seconds and nanoseconds from Jan 1, 1970.
+ *
+ * UTC_1970 is the number of 100-nano-second chunks from
+ * 1601 to 1970.
+ */
+#define NS100_PER_SEC (NS_PER_SEC / 100)
+#define UTC_1970 (((LONGLONG)27111902 << 32) + (LONGLONG)3577643008)
+ memcpy(&large_int, &ft, sizeof(large_int));
+ utc1970 = UTC_1970;
+ ns_since_epoch = (large_int.QuadPart - utc1970);
+ tp->tv_sec = (time_t)(ns_since_epoch / NS100_PER_SEC);
+ tp->tv_nsec = (long)(ns_since_epoch % NS100_PER_SEC);
+#else
+ struct _timeb now;
+
+ _ftime(&now);
+ tp->tv_sec = now.time;
+ tp->tv_nsec = now.millitm * NS_PER_MS;
+#endif
+ }
+}
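
The wrap compensation in the monotonic branch can be exercised without
waiting 49.7 days by feeding tick values in by hand. A sketch of the same
logic with GetTickCount() replaced by an argument:

    /* Sketch of the 32-bit tick-counter wrap handling. */
    #include <stdio.h>
    #include <stdint.h>

    #define TIMER_WRAP_SECONDS (50 * 24 * 60 * 60)

    static uint32_t last_ticks;
    static long long base_seconds;

    static long long
    monotonic_seconds(uint32_t ticks)
    {
        if (ticks < last_ticks)         /* the counter wrapped */
            base_seconds += TIMER_WRAP_SECONDS;
        last_ticks = ticks;
        return (base_seconds + ticks / 1000);
    }

    int
    main(void)
    {
        /* Second call is "after" the first despite the smaller ticks. */
        printf("%lld\n", monotonic_seconds(4294967000U));
        printf("%lld\n", monotonic_seconds(5000));
        return (0);
    }
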
diff --git a/src/os_windows/os_config.c b/src/os_windows/os_config.c
new file mode 100644
index 00000000..4250dbd4
--- /dev/null
+++ b/src/os_windows/os_config.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_is_winnt --
+ * Return 1 if Windows/NT, otherwise 0.
+ *
+ * PUBLIC: int __os_is_winnt __P((void));
+ */
+int
+__os_is_winnt()
+{
+#ifdef DB_WINCE
+ return (1);
+#else
+ static int __os_type = -1;
+
+ /*
+ * The value of __os_type is computed only once, and cached to
+ * avoid the overhead of repeated calls to GetVersion().
+ */
+ if (__os_type == -1) {
+ if ((GetVersion() & 0x80000000) == 0)
+ __os_type = 1;
+ else
+ __os_type = 0;
+ }
+ return (__os_type);
+#endif
+}
+
+/*
+ * __os_fs_notzero --
+ * Return 1 if allocated filesystem blocks are not zeroed.
+ */
+int
+__os_fs_notzero()
+{
+#ifdef DB_WINCE
+ return (1);
+#else
+ static int __os_notzero = -1;
+ OSVERSIONINFO osvi;
+
+ /*
+ * Windows/NT zero-fills pages that were never explicitly written to
+ * the file. Note however that this is *NOT* documented. In fact, the
+ * Win32 documentation makes it clear that there are no guarantees that
+ * uninitialized bytes will be zeroed:
+ *
+ * If the file is extended, the contents of the file between the old
+ * EOF position and the new position are not defined.
+ *
+ * Experiments confirm that NT/2K/XP all zero fill for both NTFS and
+ * FAT32. Cygwin also relies on this behavior. This is the relevant
+ * comment from Cygwin:
+ *
+ * Oops, this is the bug case - Win95 uses whatever is on the disk
+ * instead of some known (safe) value, so we must seek back and fill
+ * in the gap with zeros. - DJ
+ * Note: this bug doesn't happen on NT4, even though the
+ * documentation for WriteFile() says that it *may* happen on any OS.
+ *
+ * We're making a bet, here, but we made it a long time ago and haven't
+ * yet seen any evidence that it was wrong.
+ *
+ * Windows 95/98 and On-Time give random garbage, and that breaks
+ * Berkeley DB.
+ *
+ * The value of __os_notzero is computed only once, and cached to
+ * avoid the overhead of repeated calls to GetVersion().
+ */
+ if (__os_notzero == -1) {
+ if (__os_is_winnt()) {
+ osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+ GetVersionEx(&osvi);
+ if (_tcscmp(osvi.szCSDVersion, _T("RTTarget-32")) == 0)
+ __os_notzero = 1; /* On-Time */
+ else
+ __os_notzero = 0; /* Windows/NT */
+ } else
+ __os_notzero = 1; /* Not Windows/NT */
+ }
+ return (__os_notzero);
+#endif
+}
+
+/*
+ * __os_support_direct_io --
+ * Check to see if we support direct I/O.
+ */
+int
+__os_support_direct_io()
+{
+ return (1);
+}
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ */
+int
+__os_support_db_register()
+{
+#ifdef DB_WINCE
+ return (0);
+#else
+ return (__os_is_winnt());
+#endif
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ */
+int
+__os_support_replication()
+{
+#ifdef DB_WINCE
+ return (0);
+#else
+ return (__os_is_winnt());
+#endif
+}
diff --git a/src/os_windows/os_cpu.c b/src/os_windows/os_cpu.c
new file mode 100644
index 00000000..0922071f
--- /dev/null
+++ b/src/os_windows/os_cpu.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_cpu_count --
+ * Return the number of CPUs.
+ *
+ * PUBLIC: u_int32_t __os_cpu_count __P((void));
+ */
+u_int32_t
+__os_cpu_count()
+{
+ SYSTEM_INFO SystemInfo;
+
+ GetSystemInfo(&SystemInfo);
+
+ return ((u_int32_t)SystemInfo.dwNumberOfProcessors);
+}
diff --git a/src/os_windows/os_dir.c b/src/os_windows/os_dir.c
new file mode 100644
index 00000000..31d364d7
--- /dev/null
+++ b/src/os_windows/os_dir.c
@@ -0,0 +1,122 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_dirlist --
+ * Return a list of the files in a directory.
+ */
+int
+__os_dirlist(env, dir, returndir, namesp, cntp)
+ ENV *env;
+ const char *dir;
+ int returndir, *cntp;
+ char ***namesp;
+{
+ HANDLE dirhandle;
+ WIN32_FIND_DATA fdata;
+ int arraysz, cnt, ret;
+ char **names, *onename;
+ _TCHAR tfilespec[DB_MAXPATHLEN + 1];
+ _TCHAR *tdir;
+
+ *namesp = NULL;
+ *cntp = 0;
+
+ TO_TSTRING(env, dir, tdir, ret);
+ if (ret != 0)
+ return (ret);
+
+ (void)_sntprintf(tfilespec, DB_MAXPATHLEN,
+ _T("%s%hc*"), tdir, PATH_SEPARATOR[0]);
+
+ /*
+	 * On WinCE, FindFirstFile returns INVALID_HANDLE_VALUE when the
+	 * searched directory is empty and sets the last error to
+	 * ERROR_NO_MORE_FILES; on Windows, the search returns "." instead.
+ */
+ if ((dirhandle =
+ FindFirstFile(tfilespec, &fdata)) == INVALID_HANDLE_VALUE) {
+ if (GetLastError() == ERROR_NO_MORE_FILES)
+ return (0);
+ return (__os_posix_err(__os_get_syserr()));
+ }
+
+ names = NULL;
+ arraysz = cnt = ret = 0;
+ for (;;) {
+ if (returndir ||
+ (fdata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
+ if (fdata.cFileName[0] == _T('.') &&
+ (fdata.cFileName[1] == _T('\0') ||
+ (fdata.cFileName[1] == _T('.') &&
+ fdata.cFileName[2] == _T('\0'))))
+ goto next;
+ if (cnt >= arraysz) {
+ arraysz += 100;
+ if ((ret = __os_realloc(env,
+ arraysz * sizeof(names[0]), &names)) != 0)
+ goto err;
+ }
+ /*
+ * FROM_TSTRING doesn't necessarily allocate new
+ * memory, so we must do that explicitly.
+ * Unfortunately, when compiled with UNICODE, we'll
+ * copy twice.
+ */
+ FROM_TSTRING(env, fdata.cFileName, onename, ret);
+ if (ret != 0)
+ goto err;
+ ret = __os_strdup(env, onename, &names[cnt]);
+ FREE_STRING(env, onename);
+ if (ret != 0)
+ goto err;
+ cnt++;
+ }
+next:
+ if (!FindNextFile(dirhandle, &fdata)) {
+ if (GetLastError() == ERROR_NO_MORE_FILES)
+ break;
+ else {
+ ret = __os_posix_err(__os_get_syserr());
+ goto err;
+ }
+ }
+ }
+
+err: if (!FindClose(dirhandle) && ret == 0)
+ ret = __os_posix_err(__os_get_syserr());
+
+ if (ret == 0) {
+ *namesp = names;
+ *cntp = cnt;
+ } else if (names != NULL)
+ __os_dirfree(env, names, cnt);
+
+ FREE_STRING(env, tdir);
+
+ return (ret);
+}
+
+/*
+ * __os_dirfree --
+ * Free the list of files.
+ */
+void
+__os_dirfree(env, names, cnt)
+ ENV *env;
+ char **names;
+ int cnt;
+{
+ while (cnt > 0)
+ __os_free(env, names[--cnt]);
+ __os_free(env, names);
+}
diff --git a/src/os_windows/os_errno.c b/src/os_windows/os_errno.c
new file mode 100644
index 00000000..ba8ec359
--- /dev/null
+++ b/src/os_windows/os_errno.c
@@ -0,0 +1,428 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_get_errno_ret_zero --
+ * Return the last system error, including an error of zero.
+ */
+int
+__os_get_errno_ret_zero()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (errno);
+}
+
+/*
+ * We've seen cases where system calls failed but no error value was ever
+ * set. For that reason, __os_get_errno() and __os_get_syserr() force a
+ * non-zero value (EAGAIN and ERROR_RETRY, respectively) if one is not
+ * already set, to work around the problem. For obvious reasons, we can
+ * only call these functions if we know an error has occurred, that is,
+ * we can't test the return for a non-zero value after the get call.
+ *
+ * __os_get_errno --
+ * Return the last ANSI C "errno" value or EAGAIN if the last error
+ * is zero.
+ */
+int
+__os_get_errno()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ if (errno == 0)
+ __os_set_errno(EAGAIN);
+ return (errno);
+}
+
+#ifdef HAVE_REPLICATION_THREADS
+/*
+ * __os_get_neterr --
+ * Return the last networking error or EAGAIN if the last error is zero.
+ *
+ * PUBLIC: #ifdef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __os_get_neterr __P((void));
+ * PUBLIC: #endif
+ */
+int
+__os_get_neterr()
+{
+ int err;
+
+ /* This routine must be able to return the same value repeatedly. */
+ err = WSAGetLastError();
+ if (err == 0)
+ WSASetLastError(err = ERROR_RETRY);
+ return (err);
+}
+#endif
+
+/*
+ * __os_get_syserr --
+ * Return the last system error or EAGAIN if the last error is zero.
+ */
+int
+__os_get_syserr()
+{
+ int err;
+
+ /* This routine must be able to return the same value repeatedly. */
+ err = GetLastError();
+ if (err == 0)
+ SetLastError(err = ERROR_RETRY);
+ return (err);
+}
+
+/*
+ * __os_set_errno --
+ * Set the value of errno.
+ */
+void
+__os_set_errno(evalue)
+ int evalue;
+{
+ /*
+ * This routine is called by the compatibility interfaces (DB 1.85,
+ * dbm and hsearch). Force values > 0, that is, not one of DB 2.X
+ * and later's public error returns. If something bad has happened,
+ * default to EFAULT -- a nasty return. Otherwise, default to EINVAL.
+	 * The compatibility APIs aren't included on Windows, so the Windows
+	 * version of this routine doesn't strictly need this behavior; we
+	 * keep the same mapping for consistency.
+ */
+ errno =
+ evalue >= 0 ? evalue : (evalue == DB_RUNRECOVERY ? EFAULT : EINVAL);
+}
+
+/*
+ * __os_strerror --
+ * Return a string associated with the system error.
+ */
+char *
+__os_strerror(error, buf, len)
+ int error;
+ char *buf;
+ size_t len;
+{
+#ifdef DB_WINCE
+#define MAX_TMPBUF_LEN 512
+ _TCHAR tbuf[MAX_TMPBUF_LEN];
+ size_t maxlen;
+
+ DB_ASSERT(NULL, error != 0);
+
+ memset(tbuf, 0, sizeof(_TCHAR)*MAX_TMPBUF_LEN);
+ maxlen = (len > MAX_TMPBUF_LEN ? MAX_TMPBUF_LEN : len);
+ FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, (DWORD)error,
+ 0, tbuf, maxlen-1, NULL);
+
+	if (WideCharToMultiByte(CP_UTF8, 0, tbuf, -1,
+	    buf, (int)len, 0, NULL) == 0)
+		strncpy(buf, DB_STR("0035",
+		    "Error message translation failed."), len - 1);
+	/* strncpy does not guarantee null-termination. */
+	buf[len - 1] = '\0';
+#else
+ DB_ASSERT(NULL, error != 0);
+ /*
+ * Explicitly call FormatMessageA, since we want to receive a char
+ * string back, not a tchar string.
+ */
+ FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM,
+ 0, (DWORD)error, 0, buf, (DWORD)(len - 1), NULL);
+ buf[len - 1] = '\0';
+#endif
+
+ return (buf);
+}
+
+/*
+ * __os_posix_err --
+ * Convert a system error to a POSIX error.
+ */
+int
+__os_posix_err(error)
+ int error;
+{
+ /* Handle calls on successful returns. */
+ if (error == 0)
+ return (0);
+
+ /*
+ * Translate the Windows error codes we care about.
+ */
+ switch (error) {
+ case ERROR_INVALID_PARAMETER:
+ return (EINVAL);
+
+ case ERROR_FILE_NOT_FOUND:
+ case ERROR_INVALID_DRIVE:
+ case ERROR_PATH_NOT_FOUND:
+ return (ENOENT);
+
+ case ERROR_NO_MORE_FILES:
+ case ERROR_TOO_MANY_OPEN_FILES:
+ return (EMFILE);
+
+ case ERROR_ACCESS_DENIED:
+ return (EPERM);
+
+ case ERROR_INVALID_HANDLE:
+ return (EBADF);
+
+ case ERROR_NOT_ENOUGH_MEMORY:
+ return (ENOMEM);
+
+ case ERROR_DISK_FULL:
+ return (ENOSPC);
+
+ case ERROR_ARENA_TRASHED:
+ case ERROR_BAD_COMMAND:
+ case ERROR_BAD_ENVIRONMENT:
+ case ERROR_BAD_FORMAT:
+ case ERROR_GEN_FAILURE:
+ case ERROR_INVALID_ACCESS:
+ case ERROR_INVALID_BLOCK:
+ case ERROR_INVALID_DATA:
+ case ERROR_READ_FAULT:
+ case ERROR_WRITE_FAULT:
+ return (EFAULT);
+
+ case ERROR_ALREADY_EXISTS:
+ case ERROR_FILE_EXISTS:
+ return (EEXIST);
+
+ case ERROR_NOT_SAME_DEVICE:
+ return (EXDEV);
+
+ case ERROR_WRITE_PROTECT:
+ return (EACCES);
+
+ case ERROR_LOCK_FAILED:
+ case ERROR_LOCK_VIOLATION:
+ case ERROR_NOT_READY:
+ case ERROR_SHARING_VIOLATION:
+ return (EBUSY);
+
+ case ERROR_RETRY:
+ return (EINTR);
+ }
+
+ /*
+ * Translate the Windows socket error codes.
+ */
+ switch (error) {
+ case WSAEADDRINUSE:
+#ifdef EADDRINUSE
+ return (EADDRINUSE);
+#else
+ break;
+#endif
+ case WSAEADDRNOTAVAIL:
+#ifdef EADDRNOTAVAIL
+ return (EADDRNOTAVAIL);
+#else
+ break;
+#endif
+ case WSAEAFNOSUPPORT:
+#ifdef EAFNOSUPPORT
+ return (EAFNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEALREADY:
+#ifdef EALREADY
+ return (EALREADY);
+#else
+ break;
+#endif
+ case WSAEBADF:
+ return (EBADF);
+ case WSAECONNABORTED:
+#ifdef ECONNABORTED
+ return (ECONNABORTED);
+#else
+ break;
+#endif
+ case WSAECONNREFUSED:
+#ifdef ECONNREFUSED
+ return (ECONNREFUSED);
+#else
+ break;
+#endif
+ case WSAECONNRESET:
+#ifdef ECONNRESET
+ return (ECONNRESET);
+#else
+ break;
+#endif
+ case WSAEDESTADDRREQ:
+#ifdef EDESTADDRREQ
+ return (EDESTADDRREQ);
+#else
+ break;
+#endif
+ case WSAEFAULT:
+ return (EFAULT);
+ case WSAEHOSTDOWN:
+#ifdef EHOSTDOWN
+ return (EHOSTDOWN);
+#else
+ break;
+#endif
+ case WSAEHOSTUNREACH:
+#ifdef EHOSTUNREACH
+ return (EHOSTUNREACH);
+#else
+ break;
+#endif
+ case WSAEINPROGRESS:
+#ifdef EINPROGRESS
+ return (EINPROGRESS);
+#else
+ break;
+#endif
+ case WSAEINTR:
+ return (EINTR);
+ case WSAEINVAL:
+ return (EINVAL);
+ case WSAEISCONN:
+#ifdef EISCONN
+ return (EISCONN);
+#else
+ break;
+#endif
+ case WSAELOOP:
+#ifdef ELOOP
+ return (ELOOP);
+#else
+ break;
+#endif
+ case WSAEMFILE:
+ return (EMFILE);
+ case WSAEMSGSIZE:
+#ifdef EMSGSIZE
+ return (EMSGSIZE);
+#else
+ break;
+#endif
+ case WSAENAMETOOLONG:
+ return (ENAMETOOLONG);
+ case WSAENETDOWN:
+#ifdef ENETDOWN
+ return (ENETDOWN);
+#else
+ break;
+#endif
+ case WSAENETRESET:
+#ifdef ENETRESET
+ return (ENETRESET);
+#else
+ break;
+#endif
+ case WSAENETUNREACH:
+#ifdef ENETUNREACH
+ return (ENETUNREACH);
+#else
+ break;
+#endif
+ case WSAENOBUFS:
+#ifdef ENOBUFS
+ return (ENOBUFS);
+#else
+ break;
+#endif
+ case WSAENOPROTOOPT:
+#ifdef ENOPROTOOPT
+ return (ENOPROTOOPT);
+#else
+ break;
+#endif
+ case WSAENOTCONN:
+#ifdef ENOTCONN
+ return (ENOTCONN);
+#else
+ break;
+#endif
+ case WSANOTINITIALISED:
+ return (EAGAIN);
+ case WSAENOTSOCK:
+#ifdef ENOTSOCK
+ return (ENOTSOCK);
+#else
+ break;
+#endif
+ case WSAEOPNOTSUPP:
+ return (DB_OPNOTSUP);
+ case WSAEPFNOSUPPORT:
+#ifdef EPFNOSUPPORT
+ return (EPFNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEPROTONOSUPPORT:
+#ifdef EPROTONOSUPPORT
+ return (EPROTONOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEPROTOTYPE:
+#ifdef EPROTOTYPE
+ return (EPROTOTYPE);
+#else
+ break;
+#endif
+ case WSAESHUTDOWN:
+#ifdef ESHUTDOWN
+ return (ESHUTDOWN);
+#else
+ break;
+#endif
+ case WSAESOCKTNOSUPPORT:
+#ifdef ESOCKTNOSUPPORT
+ return (ESOCKTNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAETIMEDOUT:
+#ifdef ETIMEDOUT
+ return (ETIMEDOUT);
+#else
+ break;
+#endif
+ case WSAETOOMANYREFS:
+#ifdef ETOOMANYREFS
+ return (ETOOMANYREFS);
+#else
+ break;
+#endif
+ case WSAEWOULDBLOCK:
+#ifdef EWOULDBLOCK
+ return (EWOULDBLOCK);
+#else
+ return (EAGAIN);
+#endif
+ case WSAHOST_NOT_FOUND:
+#ifdef EHOSTUNREACH
+ return (EHOSTUNREACH);
+#else
+ break;
+#endif
+ case WSASYSNOTREADY:
+ return (EAGAIN);
+ case WSATRY_AGAIN:
+ return (EAGAIN);
+ case WSAVERNOTSUPPORTED:
+ return (DB_OPNOTSUP);
+ case WSAEACCES:
+ return (EACCES);
+ }
+
+ /*
+ * EFAULT is the default if we don't have a translation.
+ */
+ return (EFAULT);
+}
diff --git a/src/os_windows/os_fid.c b/src/os_windows/os_fid.c
new file mode 100644
index 00000000..f2d190b1
--- /dev/null
+++ b/src/os_windows/os_fid.c
@@ -0,0 +1,129 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fileid --
+ * Return a unique identifier for a file.
+ */
+int
+__os_fileid(env, fname, unique_okay, fidp)
+ ENV *env;
+ const char *fname;
+ int unique_okay;
+ u_int8_t *fidp;
+{
+ pid_t pid;
+ size_t i;
+ u_int32_t tmp;
+ u_int8_t *p;
+ int ret;
+
+ /*
+ * The documentation for GetFileInformationByHandle() states that the
+ * inode-type numbers are not constant between processes. Actually,
+ * they are, they're the NTFS MFT indexes. So, this works on NTFS,
+ * but perhaps not on other platforms, and perhaps not over a network.
+ * Can't think of a better solution right now.
+ */
+ DB_FH *fhp;
+ BY_HANDLE_FILE_INFORMATION fi;
+ BOOL retval = FALSE;
+
+ DB_ASSERT(env, fname != NULL);
+
+ /* Clear the buffer. */
+ memset(fidp, 0, DB_FILE_ID_LEN);
+
+ /*
+ * First we open the file, because we're not given a handle to it.
+ * If we can't open it, we're in trouble.
+ */
+ if ((ret = __os_open(env, fname, 0,
+ DB_OSO_RDONLY, DB_MODE_400, &fhp)) != 0)
+ return (ret);
+
+ /* File open, get its info */
+ if ((retval = GetFileInformationByHandle(fhp->handle, &fi)) == FALSE)
+ ret = __os_get_syserr();
+ (void)__os_closehandle(env, fhp);
+
+ if (retval == FALSE)
+ return (__os_posix_err(ret));
+
+ /*
+ * We want the three 32-bit words which tell us the volume ID and
+ * the file ID. We make a crude attempt to copy the bytes over to
+ * the callers buffer.
+ *
+ * We don't worry about byte sexing or the actual variable sizes.
+ *
+ * When this routine is called from the DB access methods, it's only
+ * called once -- whatever ID is generated when a database is created
+ * is stored in the database file's metadata, and that is what is
+ * saved in the mpool region's information to uniquely identify the
+ * file.
+ *
+ * When called from the mpool layer this routine will be called each
+ * time a new thread of control wants to share the file, which makes
+ * things tougher. As far as byte sexing goes, since the mpool region
+ * lives on a single host, there's no issue of that -- the entire
+ * region is byte sex dependent. As far as variable sizes go, we make
+ * the simplifying assumption that 32-bit and 64-bit processes will
+ * get the same 32-bit values if we truncate any returned 64-bit value
+ * to a 32-bit value.
+ */
+ tmp = (u_int32_t)fi.nFileIndexLow;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+ tmp = (u_int32_t)fi.nFileIndexHigh;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ if (unique_okay) {
+ /* Add in 32-bits of (hopefully) unique number. */
+ __os_unique_id(env, &tmp);
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ /*
+ * Initialize/increment the serial number we use to help
+ * avoid fileid collisions. Note we don't bother with
+ * locking; it's unpleasant to do from down in here, and
+ * if we race on this no real harm will be done, since the
+ * finished fileid has so many other components.
+ *
+ * We use the bottom 32-bits of the process ID, hoping they
+ * are more random than the top 32-bits (should we be on a
+ * machine with 64-bit process IDs).
+ *
+ * We increment by 100000 on each call as a simple way of
+ * randomizing; simply incrementing seems potentially less
+ * useful if pids are also simply incremented, since this
+ * is process-local and we may be one of a set of processes
+ * starting up. 100000 pushes us out of pid space on most
+ * 32-bit platforms, and has few interesting properties in
+ * base 2.
+ */
+ if (DB_GLOBAL(fid_serial) == 0) {
+ __os_id(env->dbenv, &pid, NULL);
+ DB_GLOBAL(fid_serial) = (u_int32_t)pid;
+ } else
+			DB_GLOBAL(fid_serial) += 100000;
+
+		/* Fold the serial number into the fileid. */
+		for (p = (u_int8_t *)&DB_GLOBAL(fid_serial),
+		    i = sizeof(u_int32_t); i > 0; --i)
+			*fidp++ = *p++;
+
+ } else {
+ tmp = (u_int32_t)fi.dwVolumeSerialNumber;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+ }
+
+ return (0);
+}
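
For reference, the fileid assembled above is a 20-byte buffer
(DB_FILE_ID_LEN) filled with 32-bit words copied byte-by-byte in native
order. A sketch of the layout with made-up values:

    /* Sketch of the fileid layout: 32-bit slots, 16 of 20 bytes used. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define FILE_ID_LEN 20

    static void
    put32(uint8_t **fidp, uint32_t v)
    {
        memcpy(*fidp, &v, sizeof(v));   /* native byte order, as in DB */
        *fidp += sizeof(v);
    }

    int
    main(void)
    {
        uint8_t fid[FILE_ID_LEN], *p;
        int i;

        memset(fid, 0, sizeof(fid));
        p = fid;
        put32(&p, 0x00001234);          /* nFileIndexLow (example) */
        put32(&p, 0x00000001);          /* nFileIndexHigh (example) */
        put32(&p, 0xdeadbeef);          /* unique id (example) */
        put32(&p, 100000);              /* fid_serial (example) */

        for (i = 0; i < FILE_ID_LEN; i++)
            printf("%02x%c", fid[i], i == FILE_ID_LEN - 1 ? '\n' : ' ');
        return (0);
    }
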
diff --git a/src/os_windows/os_flock.c b/src/os_windows/os_flock.c
new file mode 100644
index 00000000..cb3e4986
--- /dev/null
+++ b/src/os_windows/os_flock.c
@@ -0,0 +1,90 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fdlock --
+ * Acquire/release a lock on a byte in a file.
+ */
+int
+__os_fdlock(env, fhp, offset, acquire, nowait)
+ ENV *env;
+ DB_FH *fhp;
+ int acquire, nowait;
+ off_t offset;
+{
+#ifdef DB_WINCE
+ /*
+ * This functionality is not supported by WinCE, so just fail.
+ *
+ * Should only happen if an app attempts to open an environment
+ * with the DB_REGISTER flag.
+ */
+ __db_errx(env, DB_STR("0019",
+ "fdlock API not implemented for WinCE, DB_REGISTER "
+ "environment flag not supported."));
+ return (EFAULT);
+#else
+ DWORD low, high;
+ DB_ENV *dbenv;
+ OVERLAPPED over;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env,
+ F_ISSET(fhp, DB_FH_OPENED) && fhp->handle != INVALID_HANDLE_VALUE);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0020",
+ "fileops: flock %s %s offset %lu", "%s %s %lu"), fhp->name,
+ acquire ? DB_STR_P("acquire"): DB_STR_P("release"),
+ (u_long)offset);
+
+ /*
+ * Windows file locking interferes with read/write operations, so we
+ * map the ranges to an area past the end of the file.
+ */
+ DB_ASSERT(env, offset < (u_int64_t)INT64_MAX);
+ offset = UINT64_MAX - offset;
+ low = (DWORD)offset;
+ high = (DWORD)(offset >> 32);
+
+ if (acquire) {
+ if (nowait)
+ RETRY_CHK_EINTR_ONLY(
+ !LockFile(fhp->handle, low, high, 1, 0), ret);
+ else if (__os_is_winnt()) {
+ memset(&over, 0, sizeof(over));
+ over.Offset = low;
+ over.OffsetHigh = high;
+ RETRY_CHK_EINTR_ONLY(
+ !LockFileEx(fhp->handle, LOCKFILE_EXCLUSIVE_LOCK,
+ 0, 1, 0, &over),
+ ret);
+ } else {
+ /* Windows 9x/ME doesn't support a blocking call. */
+ for (;;) {
+ RETRY_CHK_EINTR_ONLY(
+ !LockFile(fhp->handle, low, high, 1, 0),
+ ret);
+ if (__os_posix_err(ret) != EAGAIN)
+ break;
+ __os_yield(env, 1, 0);
+ }
+ }
+ } else
+ RETRY_CHK_EINTR_ONLY(
+ !UnlockFile(fhp->handle, low, high, 1, 0), ret);
+
+ return (__os_posix_err(ret));
+#endif
+}
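
The range mapping above keeps lock bytes away from real file data by
reflecting the offset to the top of the 64-bit space and splitting it into
the two DWORDs that LockFile() expects. A small sketch of the arithmetic:

    /* Sketch of the lock-offset reflection and DWORD split. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t offset;
        uint32_t low, high;

        offset = 42;                    /* logical lock slot */
        offset = UINT64_MAX - offset;   /* map past any file data */
        low = (uint32_t)offset;
        high = (uint32_t)(offset >> 32);

        printf("lock byte: low=%#lx high=%#lx\n",
            (unsigned long)low, (unsigned long)high);
        return (0);
    }
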
diff --git a/src/os_windows/os_fsync.c b/src/os_windows/os_fsync.c
new file mode 100644
index 00000000..8824aac1
--- /dev/null
+++ b/src/os_windows/os_fsync.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fsync --
+ * Flush a file descriptor.
+ */
+int
+__os_fsync(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * Do nothing if the file descriptor has been marked as not requiring
+ * any sync to disk.
+ */
+ if (F_ISSET(fhp, DB_FH_NOSYNC))
+ return (0);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0023",
+ "fileops: flush %s", "%s"), fhp->name);
+
+ RETRY_CHK((!FlushFileBuffers(fhp->handle)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0024", "FlushFileBuffers"));
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
diff --git a/src/os_windows/os_getenv.c b/src/os_windows/os_getenv.c
new file mode 100644
index 00000000..aad59d01
--- /dev/null
+++ b/src/os_windows/os_getenv.c
@@ -0,0 +1,103 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getenv --
+ * Retrieve an environment variable.
+ */
+int
+__os_getenv(env, name, bpp, buflen)
+ ENV *env;
+ const char *name;
+ char **bpp;
+ size_t buflen;
+{
+#ifdef DB_WINCE
+	COMPQUIET(name, NULL);
+	COMPQUIET(buflen, 0);
+	/* WinCE does not have a getenv implementation; report no value. */
+	*bpp = NULL;
+	return (0);
+#else
+ _TCHAR *tname, tbuf[1024];
+ int ret;
+ char *p;
+
+ /*
+ * If there's a value and the buffer is large enough:
+ * copy value into the pointer, return 0
+ * If there's a value and the buffer is too short:
+ * set pointer to NULL, return EINVAL
+ * If there's no value:
+ * set pointer to NULL, return 0
+ */
+ if ((p = getenv(name)) != NULL) {
+ if (strlen(p) < buflen) {
+ (void)strcpy(*bpp, p);
+ return (0);
+ }
+ goto small_buf;
+ }
+
+ TO_TSTRING(env, name, tname, ret);
+ if (ret != 0)
+ return (ret);
+ /*
+ * The declared size of the tbuf buffer limits the maximum environment
+ * variable size in Berkeley DB on Windows. If that's too small, or if
+ * we need to get rid of large allocations on the BDB stack, we should
+ * malloc the tbuf memory.
+ */
+	ret = GetEnvironmentVariable(tname, tbuf,
+	    sizeof(tbuf) / sizeof(tbuf[0]));
+ FREE_STRING(env, tname);
+
+ /*
+ * If GetEnvironmentVariable succeeds, the return value is the number
+ * of characters stored in the buffer pointed to by lpBuffer, not
+ * including the terminating null character. If the buffer is not
+ * large enough to hold the data, the return value is the buffer size,
+ * in characters, required to hold the string and its terminating null
+ * character. If GetEnvironmentVariable fails, the return value is
+ * zero. If the specified environment variable was not found in the
+ * environment block, GetLastError returns ERROR_ENVVAR_NOT_FOUND.
+ */
+ if (ret == 0) {
+ if ((ret = __os_get_syserr()) == ERROR_ENVVAR_NOT_FOUND) {
+ *bpp = NULL;
+ return (0);
+ }
+ __db_syserr(env, ret, DB_STR("0026",
+ "GetEnvironmentVariable"));
+ return (__os_posix_err(ret));
+ }
+	if (ret > (int)(sizeof(tbuf) / sizeof(tbuf[0])))
+ goto small_buf;
+
+ FROM_TSTRING(env, tbuf, p, ret);
+ if (ret != 0)
+ return (ret);
+ if (strlen(p) < buflen)
+ (void)strcpy(*bpp, p);
+ else
+ *bpp = NULL;
+ FREE_STRING(env, p);
+ if (*bpp == NULL)
+ goto small_buf;
+
+ return (0);
+
+small_buf:
+ *bpp = NULL;
+	/*
+	 * Don't report the value: "p" is unset or already freed on two of
+	 * the paths to this label.
+	 */
+	__db_errx(env, DB_STR_A("0027",
+	    "buffer too small to hold environment variable %s", "%s"),
+	    name);
+ return (EINVAL);
+#endif
+}
diff --git a/src/os_windows/os_handle.c b/src/os_windows/os_handle.c
new file mode 100644
index 00000000..e6edc3ef
--- /dev/null
+++ b/src/os_windows/os_handle.c
@@ -0,0 +1,167 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_openhandle --
+ * Open a file, using POSIX 1003.1 open flags.
+ */
+int
+__os_openhandle(env, name, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ int flags, mode;
+ DB_FH **fhpp;
+{
+#ifdef DB_WINCE
+ /*
+ * __os_openhandle API is not implemented on WinCE.
+ * It is not currently called from within the Berkeley DB library,
+ * so don't log the failure via the __db_err mechanism.
+ */
+ return (EFAULT);
+#else
+ DB_FH *fhp;
+ int ret, nrepeat, retries;
+
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ retries = 0;
+ for (nrepeat = 1; nrepeat < 4; ++nrepeat) {
+ fhp->fd = _open(name, flags, mode);
+
+ if (fhp->fd != -1) {
+ ret = 0;
+ break;
+ }
+
+ switch (ret = __os_posix_err(__os_get_syserr())) {
+ case EMFILE:
+ case ENFILE:
+ case ENOSPC:
+ /*
+ * If it's a "temporary" error, we retry up to 3 times,
+ * waiting up to 12 seconds. While it's not a problem
+ * if we can't open a database, an inability to open a
+ * log file is cause for serious dismay.
+ */
+ __os_yield(env, nrepeat * 2, 0);
+ break;
+ case EAGAIN:
+ case EBUSY:
+ case EINTR:
+ /*
+ * If an EAGAIN, EBUSY or EINTR, retry immediately for
+ * DB_RETRY times.
+ */
+ if (++retries < DB_RETRY)
+ --nrepeat;
+ break;
+ default:
+ /* Open is silent on error. */
+ goto err;
+ }
+ }
+
+ if (ret == 0) {
+ F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+ }
+
+err: (void)__os_closehandle(env, fhp);
+ return (ret);
+#endif
+}
+
+/*
+ * __os_closehandle --
+ * Close a file.
+ */
+int
+__os_closehandle(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ ret = 0;
+
+ if (env != NULL) {
+ dbenv = env->dbenv;
+ if (fhp->name != NULL && FLD_ISSET(
+ dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0031",
+ "fileops: %s: close", "%s"), fhp->name);
+
+ if (F_ISSET(fhp, DB_FH_ENVLINK)) {
+ /*
+ * Lock the ENV handle and remove this file
+ * handle from the list.
+ */
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_REMOVE(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ }
+ }
+
+ /* Discard any underlying system file reference. */
+ if (F_ISSET(fhp, DB_FH_OPENED)) {
+ if (fhp->handle != INVALID_HANDLE_VALUE)
+ RETRY_CHK((!CloseHandle(fhp->handle)), ret);
+ else
+#ifdef DB_WINCE
+ ret = EFAULT;
+#else
+ RETRY_CHK((_close(fhp->fd)), ret);
+#endif
+
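+		/*
+		 * Also close the duplicate handle opened for truncating the
+		 * file (see __os_open), keeping the first nonzero error.
+		 */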
+ if (fhp->trunc_handle != INVALID_HANDLE_VALUE) {
+ RETRY_CHK((!CloseHandle(fhp->trunc_handle)), t_ret);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0032",
+ "CloseHandle"));
+ ret = __os_posix_err(ret);
+ }
+ }
+
+ /* Unlink the file if we haven't already done so. */
+ if (F_ISSET(fhp, DB_FH_UNLINK))
+ (void)__os_unlink(env, fhp->name, 0);
+
+ if (fhp->name != NULL)
+ __os_free(env, fhp->name);
+ __os_free(env, fhp);
+
+ return (ret);
+}
diff --git a/src/os_windows/os_map.c b/src/os_windows/os_map.c
new file mode 100644
index 00000000..8f646d68
--- /dev/null
+++ b/src/os_windows/os_map.c
@@ -0,0 +1,397 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __os_map
+ __P((ENV *, char *, REGINFO *, DB_FH *, size_t, int, int, int, void **));
+static int __os_unique_name __P((_TCHAR *, HANDLE, _TCHAR *, size_t));
+
+/*
+ * __os_attach --
+ * Create/join a shared memory region.
+ */
+int
+__os_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ int ret;
+ int is_sparse;
+#ifndef DB_WINCE
+ DWORD dw;
+#endif
+
+ infop->fhp = NULL;
+ /*
+ * On Windows/9X, files that are opened by multiple processes do not
+ * share data correctly. For this reason, we require that DB_PRIVATE
+ * be specified on that platform.
+ */
+ if (!F_ISSET(env, ENV_PRIVATE) && __os_is_winnt() == 0) {
+ __db_err(env, EINVAL, DB_STR("0006",
+ "Windows 9X systems must specify DB_PRIVATE"));
+ return (EINVAL);
+ }
+
+ /*
+ * Try to open/create the file. We DO NOT need to ensure that multiple
+ * threads/processes attempting to simultaneously create the region are
+ * properly ordered, our caller has already taken care of that.
+ */
+ if ((ret = __os_open(env, infop->name, 0, DB_OSO_REGION |
+ (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
+ env->db_mode, &infop->fhp)) != 0) {
+ __db_err(env, ret, "%s", infop->name);
+ return (ret);
+ }
+
+ is_sparse = 0;
+#ifndef DB_WINCE
+ /*
+	 * Sparse files only work on NTFS. If setting the sparse attribute
+	 * fails, just ignore the error and use the normal method.
+ */
+ if (!F_ISSET(env, ENV_SYSTEM_MEM) && (DeviceIoControl(
+ infop->fhp->handle, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
+ &dw, NULL)))
+ is_sparse = 1;
+#endif
+
+ /*
+ * Map the file in. If we're creating an in-system-memory region,
+ * specify a segment ID (which is never used again) so that the
+ * calling code writes out the REGENV_REF structure to the primary
+ * environment file.
+ */
+ ret = __os_map(env, infop->name, infop, infop->fhp, rp->max,
+ 1, F_ISSET(env, ENV_SYSTEM_MEM), 0, &infop->addr);
+ if (ret == 0 && F_ISSET(env, ENV_SYSTEM_MEM))
+ rp->segid = 1;
+
+ if (ret != 0) {
+ (void)__os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ return (ret);
+ }
+
+ /*
+	 * If we are using a sparse file, we don't need to keep the file
+	 * handle open for writing or extending.
+ */
+ if (is_sparse && infop->fhp != NULL) {
+ ret = __os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * __os_detach --
+ * Detach from a shared memory region.
+ */
+int
+__os_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ dbenv = env->dbenv;
+
+ if (infop->wnt_handle != NULL) {
+ (void)CloseHandle(infop->wnt_handle);
+ infop->wnt_handle = NULL;
+ }
+ if (infop->fhp != NULL) {
+ ret = __os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+
+ ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0007", "UnmapViewOfFile"));
+ ret = __os_posix_err(ret);
+ }
+
+ if (!F_ISSET(env, ENV_SYSTEM_MEM) && destroy &&
+ (t_ret = __os_unlink(env, infop->name, 1)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ */
+int
+__os_mapfile(env, path, fhp, len, is_rdonly, addr)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addr;
+{
+#ifdef DB_WINCE
+ /*
+ * Windows CE has special requirements for file mapping to work.
+ * * The input handle needs to be opened using CreateFileForMapping
+	 * * Concurrent access via a non-mapped file is not supported.
+ * So we disable support for memory mapping files on Windows CE. It is
+ * currently only used as an optimization in mpool for small read only
+ * databases.
+ */
+ return (EFAULT);
+#else
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0008", "fileops: mmap %s", "%s"), path);
+ return (__os_map(env, path, NULL, fhp, len, 0, 0, is_rdonly, addr));
+#endif
+}
+
+/*
+ * __os_unmapfile --
+ * Unmap the shared memory file.
+ */
+int
+__os_unmapfile(env, addr, len)
+ ENV *env;
+ void *addr;
+ size_t len;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR("0009", "fileops: munmap"));
+
+ return (!UnmapViewOfFile(addr) ? __os_posix_err(__os_get_syserr()) : 0);
+}
+
+/*
+ * __os_unique_name --
+ * Create a unique identifying name from a pathname (may be absolute or
+ * relative) and/or a file descriptor.
+ *
+ * The name returned must be unique (different files map to different
+ * names), and repeatable (same files map to same names). That's not
+ * easy to do by name alone. We must handle not only:
+ *
+ * foo.bar == ./foo.bar == c:/whatever_path/foo.bar
+ *
+ * but also understand that:
+ *
+ * foo.bar == Foo.Bar (FAT file system)
+ * foo.bar != Foo.Bar (NTFS)
+ *
+ * The best solution is to use the file index, found in the file
+ * information structure (similar to UNIX inode #).
+ *
+ * When a file is deleted, its file index may be reused, but if the
+ * unique name has not yet gone from the namespace, we may get a
+ * conflict. So to ensure some tie to the original pathname, we also
+ * use the creation time and the file basename. This is not a perfect
+ * system, but it should work for all but anomalous test cases.
+ *
+ */
+static int
+__os_unique_name(orig_path, hfile, result_path, result_path_len)
+ _TCHAR *orig_path, *result_path;
+ HANDLE hfile;
+ size_t result_path_len;
+{
+ BY_HANDLE_FILE_INFORMATION fileinfo;
+ _TCHAR *basename, *p;
+
+ /*
+ * In Windows, pathname components are delimited by '/' or '\', and
+ * if neither is present, we need to strip off leading drive letter
+ * (e.g. c:foo.txt).
+ */
+ basename = _tcsrchr(orig_path, '/');
+ p = _tcsrchr(orig_path, '\\');
+ if (basename == NULL || (p != NULL && p > basename))
+ basename = p;
+ if (basename == NULL)
+ basename = _tcsrchr(orig_path, ':');
+
+ if (basename == NULL)
+ basename = orig_path;
+ else
+ basename++;
+
+ if (!GetFileInformationByHandle(hfile, &fileinfo))
+ return (__os_posix_err(__os_get_syserr()));
+
+	/*
+	 * Use both halves of the creation time: passing the high DWORD
+	 * twice would lose half the timestamp and weaken uniqueness.
+	 */
+	(void)_sntprintf(result_path, result_path_len,
+	    _T("__db_shmem.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%s"),
+	    fileinfo.dwVolumeSerialNumber,
+	    fileinfo.nFileIndexHigh,
+	    fileinfo.nFileIndexLow,
+	    fileinfo.ftCreationTime.dwHighDateTime,
+	    fileinfo.ftCreationTime.dwLowDateTime,
+	    basename);
+
+ return (0);
+}
+
+/*
+ * __os_map --
+ * The mmap(2) function for Windows.
+ */
+static int
+__os_map(env, path, infop, fhp, len, is_region, is_system, is_rdonly, addr)
+ ENV *env;
+ REGINFO *infop;
+ char *path;
+ DB_FH *fhp;
+ int is_region, is_system, is_rdonly;
+ size_t len;
+ void **addr;
+{
+ HANDLE hMemory;
+ int ret, use_pagefile;
+ _TCHAR *tpath, shmem_name[DB_MAXPATHLEN];
+ void *pMemory;
+ unsigned __int64 len64;
+
+ ret = 0;
+ if (infop != NULL)
+ infop->wnt_handle = NULL;
+
+ /*
+ * On 64 bit systems, len is already a 64 bit value.
+ * On 32 bit systems len is a 32 bit value.
+ * Always convert to a 64 bit value, so that the high order
+ * DWORD can be simply extracted on 64 bit platforms.
+ */
+ len64 = len;
+
+ use_pagefile = is_region && is_system;
+
+ /*
+ * If creating a region in system space, get a matching name in the
+ * paging file namespace.
+ */
+ if (use_pagefile) {
+#ifdef DB_WINCE
+ __db_errx(env, DB_STR("0010",
+ "Unable to memory map regions using system "
+ "memory on WinCE."));
+ return (EFAULT);
+#endif
+ TO_TSTRING(env, path, tpath, ret);
+ if (ret != 0)
+ return (ret);
+ ret = __os_unique_name(tpath, fhp->handle,
+ shmem_name, sizeof(shmem_name));
+ FREE_STRING(env, tpath);
+ if (ret != 0)
+ return (ret);
+ }
+
+ /*
+ * XXX
+ * DB: We have not implemented copy-on-write here.
+ *
+	 * If this is a region in system memory, we try to open it with
+	 * OpenFileMapping() first, and only call CreateFileMapping() if we're
+ * really creating the section. There are two reasons:
+ *
+ * 1) We only create the mapping if we have newly created the region.
+ * This avoids a long-running problem caused by Windows reference
+ * counting, where regions that are closed by all processes are
+ * deleted. It turns out that just checking for a zeroed region
+ * is not good enough. See [#4882] and [#7127] for the details.
+ *
+	 * 2) CreateFileMapping seems to mess up making the commit charge to
+	 * the process. It thinks, incorrectly, that when we want to join a
+	 * previously existing section, it should make a commit charge for
+	 * the whole section. In fact, there is no new committed memory
+	 * whatsoever. The call can fail if there is insufficient memory
+	 * free to handle the erroneous commit charge. We find that the
+	 * bogus commit is not made if we call OpenFileMapping instead.
+ */
+ hMemory = NULL;
+ if (use_pagefile) {
+#ifndef DB_WINCE
+ hMemory = OpenFileMapping(
+ is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS,
+ 0, shmem_name);
+
+ if (hMemory == NULL && F_ISSET(infop, REGION_CREATE_OK))
+ hMemory = CreateFileMapping((HANDLE)-1, 0,
+ is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
+ (DWORD)(len64 >> 32), (DWORD)len64, shmem_name);
+#endif
+ } else {
+ hMemory = CreateFileMapping(fhp->handle, 0,
+ is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
+ (DWORD)(len64 >> 32), (DWORD)len64, NULL);
+#ifdef DB_WINCE
+ /*
+ * WinCE automatically closes the handle passed in.
+ * Ensure DB does not attempt to close the handle again.
+ */
+ fhp->handle = INVALID_HANDLE_VALUE;
+ F_CLR(fhp, DB_FH_OPENED);
+#endif
+ }
+
+ if (hMemory == NULL) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0011", "OpenFileMapping"));
+ return (__env_panic(env, __os_posix_err(ret)));
+ }
+
+ pMemory = MapViewOfFile(hMemory,
+ (is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS), 0, 0, len);
+ if (pMemory == NULL) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0012", "MapViewOfFile"));
+ return (__env_panic(env, __os_posix_err(ret)));
+ }
+
+ /*
+ * XXX
+ * It turns out that the kernel object underlying the named section
+ * is reference counted, but that the call to MapViewOfFile() above
+ * does NOT increment the reference count! So, if we close the handle
+ * here, the kernel deletes the object from the kernel namespace.
+ * When a second process comes along to join the region, the kernel
+ * happily creates a new object with the same name, but completely
+ * different identity. The two processes then have distinct isolated
+ * mapped sections, not at all what was wanted. Not closing the handle
+ * here fixes this problem. We carry the handle around in the region
+ * structure so we can close it when unmap is called.
+ */
+ if (use_pagefile && infop != NULL)
+ infop->wnt_handle = hMemory;
+ else
+ CloseHandle(hMemory);
+
+ *addr = pMemory;
+ return (ret);
+}
diff --git a/src/os_windows/os_mkdir.c b/src/os_windows/os_mkdir.c
new file mode 100644
index 00000000..b87f3f9d
--- /dev/null
+++ b/src/os_windows/os_mkdir.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_mkdir --
+ * Create a directory.
+ */
+int
+__os_mkdir(env, name, mode)
+ ENV *env;
+ const char *name;
+ int mode;
+{
+ DB_ENV *dbenv;
+ _TCHAR *tname;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0013", "fileops: mkdir %s",
+ "%s"), name);
+
+	/*
+	 * Make the directory. The mode argument is ignored here:
+	 * CreateDirectory is called with default security attributes.
+	 */
+ TO_TSTRING(env, name, tname, ret);
+ if (ret != 0)
+ return (ret);
+ RETRY_CHK(!CreateDirectory(tname, NULL), ret);
+ FREE_STRING(env, tname);
+ if (ret != 0)
+ return (__os_posix_err(ret));
+
+ return (ret);
+}
diff --git a/src/os_windows/os_open.c b/src/os_windows/os_open.c
new file mode 100644
index 00000000..44f2faf3
--- /dev/null
+++ b/src/os_windows/os_open.c
@@ -0,0 +1,258 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_open --
+ * Open a file descriptor (including page size and log size information).
+ */
+int
+__os_open(env, name, page_size, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ u_int32_t page_size, flags;
+ int mode;
+ DB_FH **fhpp;
+{
+ DB_ENV *dbenv;
+ DB_FH *fhp;
+#ifndef DB_WINCE
+ DWORD cluster_size, sector_size, free_clusters, total_clusters;
+ _TCHAR *drive, dbuf[4]; /* <letter><colon><slash><nul> */
+#endif
+ int access, attr, createflag, nrepeat, ret, share;
+ _TCHAR *tname;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+	*fhpp = NULL;
+	fhp = NULL;
+	tname = NULL;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0025", "fileops: open %s",
+ "%s"), name);
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_OSO_ABSMODE | DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_DSYNC |\
+ DB_OSO_EXCL | DB_OSO_RDONLY | DB_OSO_REGION | DB_OSO_SEQ | \
+ DB_OSO_TEMP | DB_OSO_TRUNC)
+ if ((ret = __db_fchk(env, "__os_open", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ TO_TSTRING(env, name, tname, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+	if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+		goto err;
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ /*
+	 * Use the Windows/32 CreateFile interface so that we can
+ * play magic games with files to get data flush effects similar to
+ * the POSIX O_DSYNC flag.
+ *
+ * !!!
+ * We currently ignore the 'mode' argument. It would be possible
+ * to construct a set of security attributes that we could pass to
+	 * CreateFile that would accurately represent the mode. In the worst
+	 * case, this would require looking up user and all group names and
+ * creating an entry for each. Alternatively, we could call the
+ * _chmod (partial emulation) function after file creation, although
+ * this leaves us with an obvious race. However, these efforts are
+ * largely meaningless on FAT, the most common file system, which
+ * only has a "readable" and "writable" flag, applying to all users.
+ */
+ access = GENERIC_READ;
+ if (!LF_ISSET(DB_OSO_RDONLY))
+ access |= GENERIC_WRITE;
+
+#ifdef DB_WINCE
+ /*
+ * WinCE translates these flags into share flags for
+ * CreateFileForMapping.
+ * Also WinCE does not support the FILE_SHARE_DELETE flag.
+ */
+ if (LF_ISSET(DB_OSO_REGION))
+ share = GENERIC_READ | GENERIC_WRITE;
+ else
+ share = FILE_SHARE_READ | FILE_SHARE_WRITE;
+#else
+ share = FILE_SHARE_READ | FILE_SHARE_WRITE;
+ if (__os_is_winnt())
+ share |= FILE_SHARE_DELETE;
+#endif
+ attr = FILE_ATTRIBUTE_NORMAL;
+
+ /*
+	 * Reproduce POSIX 1003.1 semantics: if O_CREAT and O_EXCL are both
+ * specified, fail, returning EEXIST, unless we create the file.
+ */
+ if (LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_EXCL))
+		createflag = CREATE_NEW; /* create only if !exist */
+ else if (!LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_TRUNC))
+ createflag = TRUNCATE_EXISTING; /* truncate, fail if !exist */
+ else if (LF_ISSET(DB_OSO_TRUNC))
+ createflag = CREATE_ALWAYS; /* create and truncate */
+ else if (LF_ISSET(DB_OSO_CREATE))
+ createflag = OPEN_ALWAYS; /* open or create */
+ else
+ createflag = OPEN_EXISTING; /* open only if existing */
+
+ if (LF_ISSET(DB_OSO_DSYNC)) {
+ F_SET(fhp, DB_FH_NOSYNC);
+ attr |= FILE_FLAG_WRITE_THROUGH;
+ }
+
+#ifndef DB_WINCE
+ if (LF_ISSET(DB_OSO_SEQ))
+ attr |= FILE_FLAG_SEQUENTIAL_SCAN;
+ else
+ attr |= FILE_FLAG_RANDOM_ACCESS;
+#endif
+
+ if (LF_ISSET(DB_OSO_TEMP))
+ attr |= FILE_FLAG_DELETE_ON_CLOSE;
+
+ /*
+ * We can turn filesystem buffering off if the page size is a
+ * multiple of the disk's sector size. To find the sector size,
+ * we call GetDiskFreeSpace, which expects a drive name like "d:\\"
+ * or NULL for the current disk (i.e., a relative path).
+ *
+ * WinCE only has GetDiskFreeSpaceEx which does not
+ * return the sector size.
+ */
+#ifndef DB_WINCE
+ if (LF_ISSET(DB_OSO_DIRECT) && page_size != 0 && name[0] != '\0') {
+ if (name[1] == ':') {
+ drive = dbuf;
+ _sntprintf(dbuf, sizeof(dbuf), _T("%c:\\"), tname[0]);
+ } else
+ drive = NULL;
+
+ /*
+		 * We ignore all results except sector_size, but some
+		 * versions of Windows require that the parameters be
+		 * non-NULL.
+ */
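+		/*
+		 * For example, a 4096-byte page on a disk with 512-byte
+		 * sectors divides evenly, so buffering can be turned off;
+		 * a page size that is not a sector multiple leaves it on.
+		 */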
+ if (GetDiskFreeSpace(drive, &cluster_size,
+ &sector_size, &free_clusters, &total_clusters) &&
+ page_size % sector_size == 0)
+ attr |= FILE_FLAG_NO_BUFFERING;
+ }
+#endif
+
+ fhp->handle = fhp->trunc_handle = INVALID_HANDLE_VALUE;
+ for (nrepeat = 1;; ++nrepeat) {
+ if (fhp->handle == INVALID_HANDLE_VALUE) {
+#ifdef DB_WINCE
+ if (LF_ISSET(DB_OSO_REGION))
+ fhp->handle = CreateFileForMapping(tname,
+ access, share, NULL, createflag, attr, 0);
+ else
+#endif
+ fhp->handle = CreateFile(tname,
+ access, share, NULL, createflag, attr, 0);
+ }
+
+#ifdef HAVE_FTRUNCATE
+		/*
+		 * Older versions of WinCE may not support truncate; if so,
+		 * the HAVE_FTRUNCATE macro should be #undef'ed, and we don't
+		 * need to open this second handle.
+		 *
+		 * WinCE does not support opening a second handle on the same
+		 * file via CreateFileForMapping, but that does not matter,
+		 * since we truncate database files, not region files.
+		 *
+		 * Some older versions of WinCE do not allow a second handle
+		 * to be opened even via CreateFile. If that is the case,
+		 * users will need to #undef the HAVE_FTRUNCATE macro in
+		 * build_wince/db_config.h.
+		 */
+
+ /*
+ * Windows does not provide truncate directly. There is no
+ * safe way to use a handle for truncate concurrently with
+ * reads or writes. To deal with this, we open a second handle
+ * used just for truncating.
+ */
+ if (fhp->handle != INVALID_HANDLE_VALUE &&
+ !LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
+ fhp->trunc_handle == INVALID_HANDLE_VALUE
+#ifdef DB_WINCE
+ /* Do not open trunc handle for region files. */
+ && (!LF_ISSET(DB_OSO_REGION))
+#endif
+ )
+ fhp->trunc_handle = CreateFile(
+ tname, access, share, NULL, OPEN_EXISTING, attr, 0);
+#endif
+
+#ifndef HAVE_FTRUNCATE
+ if (fhp->handle == INVALID_HANDLE_VALUE)
+#else
+ if (fhp->handle == INVALID_HANDLE_VALUE ||
+ (!LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
+ fhp->trunc_handle == INVALID_HANDLE_VALUE
+#ifdef DB_WINCE
+ /* Do not open trunc handle for region files. */
+ && (!LF_ISSET(DB_OSO_REGION))
+#endif
+ ))
+#endif
+ {
+ /*
+			 * If it's a "temporary" error, we retry up to 3 times,
+			 * waiting 2, 4, then 6 seconds (12 seconds in all).
+			 * While it's not a problem if we can't open a
+			 * database, an inability to open a log file is cause
+			 * for serious dismay.
+ */
+ ret = __os_posix_err(__os_get_syserr());
+ if ((ret != ENFILE && ret != EMFILE && ret != ENOSPC) ||
+ nrepeat > 3)
+ goto err;
+
+ __os_yield(env, nrepeat * 2, 0);
+ } else
+ break;
+ }
+
+ FREE_STRING(env, tname);
+
+ if (LF_ISSET(DB_OSO_REGION))
+ F_SET(fhp, DB_FH_REGION);
+ F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+
+err: FREE_STRING(env, tname);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ return (ret);
+}
diff --git a/src/os_windows/os_rename.c b/src/os_windows/os_rename.c
new file mode 100644
index 00000000..791f53a5
--- /dev/null
+++ b/src/os_windows/os_rename.c
@@ -0,0 +1,82 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rename --
+ * Rename a file.
+ */
+int
+__os_rename(env, oldname, newname, silent)
+ ENV *env;
+ const char *oldname, *newname;
+ u_int32_t silent;
+{
+ DB_ENV *dbenv;
+ _TCHAR *toldname, *tnewname;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0036", "fileops: rename %s to %s",
+ "%s %s"), oldname, newname);
+
+ TO_TSTRING(env, oldname, toldname, ret);
+ if (ret != 0)
+ return (ret);
+ TO_TSTRING(env, newname, tnewname, ret);
+ if (ret != 0) {
+ FREE_STRING(env, toldname);
+ return (ret);
+ }
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (!MoveFile(toldname, tnewname))
+ ret = __os_get_syserr();
+
+ if (__os_posix_err(ret) == EEXIST) {
+ ret = 0;
+#ifndef DB_WINCE
+ if (__os_is_winnt()) {
+ if (!MoveFileEx(
+ toldname, tnewname, MOVEFILE_REPLACE_EXISTING))
+ ret = __os_get_syserr();
+ } else
+#endif
+ {
+ /*
+ * There is no MoveFileEx for Win9x/Me/CE, so we have to
+ * do the best we can. Note that the MoveFile call
+ * above would have succeeded if oldname and newname
+ * refer to the same file, so we don't need to check
+ * that here.
+ */
+ (void)DeleteFile(tnewname);
+ if (!MoveFile(toldname, tnewname))
+ ret = __os_get_syserr();
+ }
+ }
+
+ FREE_STRING(env, tnewname);
+ FREE_STRING(env, toldname);
+
+ if (ret != 0) {
+ if (silent == 0)
+ __db_syserr(env, ret, DB_STR_A("0037",
+ "MoveFileEx %s %s", "%s %s"), oldname, newname);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_rw.c b/src/os_windows/os_rw.c
new file mode 100644
index 00000000..e64a7d08
--- /dev/null
+++ b/src/os_windows/os_rw.c
@@ -0,0 +1,218 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_io --
+ * Do an I/O.
+ */
+int
+__os_io(env, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
+ ENV *env;
+ int op;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize, relative, io_len;
+ u_int8_t *buf;
+ size_t *niop;
+{
+ int ret;
+
+#ifndef DB_WINCE
+ if (__os_is_winnt()) {
+ DB_ENV *dbenv;
+ DWORD nbytes;
+ OVERLAPPED over;
+ ULONG64 off;
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if ((off = relative) == 0)
+ off = (ULONG64)pgsize * pgno;
+ over.Offset = (DWORD)(off & 0xffffffff);
+ over.OffsetHigh = (DWORD)(off >> 32);
+ over.hEvent = 0; /* we don't want asynchronous notifications */
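+		/*
+		 * Supplying the offset in the OVERLAPPED structure gives us
+		 * positioned (pread/pwrite style) I/O on NT, so no seek is
+		 * needed; the fallback path below must instead serialize a
+		 * seek and the transfer under the handle's mutex.
+		 */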
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0014",
+ "fileops: %s %s: %lu bytes at offset %lu",
+ "%s %s %lu %lu"), op == DB_IO_READ ?
+ DB_STR_P("read") : DB_STR_P("write"),
+ fhp->name, (u_long)io_len, (u_long)off);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ switch (op) {
+ case DB_IO_READ:
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (!ReadFile(fhp->handle,
+ buf, (DWORD)io_len, &nbytes, &over))
+ goto slow;
+ break;
+ case DB_IO_WRITE:
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (__os_fs_notzero())
+ goto slow;
+#endif
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (!WriteFile(fhp->handle,
+ buf, (DWORD)io_len, &nbytes, &over))
+ goto slow;
+ break;
+ }
+ if (nbytes == io_len) {
+ *niop = (size_t)nbytes;
+ return (0);
+ }
+ }
+
+slow:
+#endif
+ MUTEX_LOCK(env, fhp->mtx_fh);
+
+ if ((ret = __os_seek(env, fhp, pgno, pgsize, relative)) != 0)
+ goto err;
+
+ switch (op) {
+ case DB_IO_READ:
+ ret = __os_read(env, fhp, buf, io_len, niop);
+ break;
+ case DB_IO_WRITE:
+ ret = __os_write(env, fhp, buf, io_len, niop);
+ break;
+ }
+
+err: MUTEX_UNLOCK(env, fhp->mtx_fh);
+
+ return (ret);
+}
+
+/*
+ * __os_read --
+ * Read from a file handle.
+ */
+int
+__os_read(env, fhp, addr, len, nrp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nrp;
+{
+ DB_ENV *dbenv;
+ DWORD count;
+ size_t offset, nr;
+ u_int8_t *taddr;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0015", "fileops: read %s: %lu bytes",
+ "%s %lu"), fhp->name, (u_long)len);
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nr, offset += nr) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK((!ReadFile(fhp->handle,
+ taddr, (DWORD)(len - offset), &count, NULL)), ret);
+ if (count == 0 || ret != 0)
+ break;
+ nr = (size_t)count;
+ }
+ *nrp = taddr - (u_int8_t *)addr;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0016",
+ "read: 0x%lx, %lu", "%lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
+
+/*
+ * __os_write --
+ * Write to a file handle.
+ */
+int
+__os_write(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ int ret;
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /* Zero-fill as necessary. */
+ if (__os_fs_notzero() &&
+ (ret = __db_zero_fill(env, fhp)) != 0)
+ return (ret);
+#endif
+ return (__os_physwrite(env, fhp, addr, len, nwp));
+}
+
+/*
+ * __os_physwrite --
+ * Physical write to a file handle.
+ */
+int
+__os_physwrite(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ DB_ENV *dbenv;
+ DWORD count;
+ size_t offset, nw;
+ u_int8_t *taddr;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0017", "fileops: write %s: %lu bytes",
+ "%s %lu"), fhp->name, (u_long)len);
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nw, offset += nw) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK((!WriteFile(fhp->handle,
+ taddr, (DWORD)(len - offset), &count, NULL)), ret);
+ if (ret != 0)
+ break;
+ nw = (size_t)count;
+ }
+ *nwp = len;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0018",
+ "write: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+
+ DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
+ }
+ return (ret);
+}
diff --git a/src/os_windows/os_seek.c b/src/os_windows/os_seek.c
new file mode 100644
index 00000000..7632c15d
--- /dev/null
+++ b/src/os_windows/os_seek.c
@@ -0,0 +1,67 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_seek --
+ * Seek to a page/byte offset in the file.
+ */
+int
+__os_seek(env, fhp, pgno, pgsize, relative)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+ off_t relative;
+{
+ /* Yes, this really is how Microsoft designed their API. */
+ union {
+ __int64 bigint;
+ struct {
+ unsigned long low;
+ long high;
+ };
+ } offbytes;
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->seek_count;
+#endif
+
+ offset = (off_t)pgsize * pgno + relative;
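+	/* For example, pgno 3 with pgsize 4096 and relative 0 is byte 12288. */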
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0038",
+ "fileops: seek %s to %lu", "%s %lu"),
+ fhp->name, (u_long)offset);
+
+ offbytes.bigint = offset;
+	/*
+	 * SetFilePointer returns the low 32 bits of the new offset, so for
+	 * large files (DWORD)-1 can be a legitimate result; it is an error
+	 * only if GetLastError() also reports one (see __os_truncate).
+	 */
+	ret = (SetFilePointer(fhp->handle, offbytes.low,
+	    &offbytes.high, FILE_BEGIN) == INVALID_SET_FILE_POINTER &&
+	    GetLastError() != NO_ERROR) ? __os_get_syserr() : 0;
+
+ if (ret == 0) {
+ fhp->pgsize = pgsize;
+ fhp->pgno = pgno;
+ fhp->offset = relative;
+ } else {
+ __db_syserr(env, ret, DB_STR_A("0039",
+ "seek: %lu: (%lu * %lu) + %lu", "%lu %lu %lu %lu"),
+ (u_long)offset, (u_long)pgno,
+ (u_long)pgsize, (u_long)relative);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_stat.c b/src/os_windows/os_stat.c
new file mode 100644
index 00000000..11248886
--- /dev/null
+++ b/src/os_windows/os_stat.c
@@ -0,0 +1,231 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Raw data reads must be done in multiples of the disk sector size. Currently
+ * the sector size is either 512 bytes or 4096 bytes. So we set the
+ * MAX_SECTOR_SIZE to 4096.
+ */
+#define MAX_SECTOR_SIZE 4096
+
+/*
+ * Find the cluster size of the file system that would contain the given path.
+ * If the value can't be determined, an error is returned.
+ */
+int
+__os_get_cluster_size(path, psize)
+ const char *path;
+ u_int32_t *psize;
+{
+#if (WINVER < 0x500) || defined(DB_WINCE)
+ /*
+	 * WinCE and versions of Windows earlier than Windows 2000 (WINVER
+	 * 0x500) don't have the APIs required to retrieve the cluster size.
+ */
+ *psize = DB_DEF_IOSIZE;
+ return (0);
+#else
+ BYTE clustershift, sectorshift, *pcluster;
+ char buffer[MAX_SECTOR_SIZE];
+ DWORD flags, infolen, length, mcl, name_size;
+ HANDLE vhandle;
+ int ret;
+ NTFS_VOLUME_DATA_BUFFER ntfsinfo;
+ size_t name_len;
+ TCHAR *env_path, name_buffer[MAX_PATH + 1], root_path[MAX_PATH + 1];
+ WORD *psector;
+
+ if (path == NULL || psize == NULL) {
+ return (EINVAL);
+ }
+
+ name_size = MAX_PATH + 1;
+ *psize = 0;
+
+ TO_TSTRING(NULL, path, env_path, ret);
+ if (ret != 0)
+ return (ret);
+ /* Retrieve the volume root path where the input path resides. */
+ if (!GetVolumePathName(env_path, root_path, name_size)) {
+ FREE_STRING(NULL, env_path);
+ return (__os_posix_err(__os_get_syserr()));
+ }
+ FREE_STRING(NULL, env_path);
+
+ /* Get the volume GUID name from the root path. */
+ if (!GetVolumeNameForVolumeMountPoint(
+ root_path, name_buffer, name_size))
+ return (__os_posix_err(__os_get_syserr()));
+
+	/* Strip the trailing "\" from the GUID name. */
+ name_len = _tcsclen(name_buffer);
+ if (name_len > 0)
+ name_buffer[name_len - 1] = _T('\0');
+
+ /* Create a handle to the volume. */
+ vhandle = CreateFile(name_buffer, FILE_READ_ATTRIBUTES | FILE_READ_DATA,
+ FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL, NULL);
+
+ /* If open failed, return error */
+ if (vhandle == INVALID_HANDLE_VALUE)
+ return (__os_posix_err(__os_get_syserr()));
+
+ /* Get the volume information through the root path. */
+ if (!GetVolumeInformation(root_path, NULL, name_size, NULL, &mcl,
+ &flags, name_buffer, name_size)) {
+ ret = __os_posix_err(__os_get_syserr());
+ CloseHandle(vhandle);
+ return (ret);
+ }
+
+ ret = 0;
+ if (_tcscmp(name_buffer, _T("NTFS")) == 0) {
+ /*
+ * If this is NTFS file system, use FSCTL_GET_NTFS_VOLUME_DATA
+ * to get the cluster size.
+ */
+		if (DeviceIoControl(
+		    vhandle,			/* volume handle */
+		    FSCTL_GET_NTFS_VOLUME_DATA,	/* control code */
+		    NULL,			/* input buffer: not used */
+		    0,				/* input size: not used */
+		    &ntfsinfo,			/* output buffer */
+		    sizeof(NTFS_VOLUME_DATA_BUFFER),/* output buffer length */
+		    &infolen,			/* number of returned bytes */
+		    NULL))			/* OVERLAPPED: not used */
+ *psize = ntfsinfo.BytesPerCluster;
+ else
+ ret = __os_posix_err(__os_get_syserr());
+ } else if (_tcscmp(name_buffer, _T("exFAT")) == 0) {
+ /*
+ * If this is exFAT file system, read the information of sector
+ * and cluster from the BPB on sector 0
+ * +6C H: BYTE SectorSizeShift
+ * +6D H: BYTE ClusterShift
+ */
+ if (ReadFile(vhandle, buffer, MAX_SECTOR_SIZE, &length, NULL)) {
+ sectorshift = *(BYTE *)(&buffer[0x6C]);
+ clustershift = *(BYTE *)(&buffer[0x6D]);
+ *psize = 1 << sectorshift;
+ *psize = (*psize) << clustershift;
+		} else
+ ret = __os_posix_err(__os_get_syserr());
+ } else if (_tcscmp(name_buffer, _T("FAT")) == 0 ||
+ _tcscmp(name_buffer, _T("FAT32")) == 0) {
+ /*
+ * If this is FAT or FAT32 file system, read the information of
+ * sector and cluster from the BPB on sector 0.
+ * +0B H: WORD Bytes per Sector.
+ * +0D H: BYTE Sectors Per Cluster.
+ */
+ if (ReadFile(vhandle, buffer, MAX_SECTOR_SIZE, &length, NULL)) {
+ psector = (WORD *)(&buffer[0x0B]);
+ pcluster = (BYTE *)(&buffer[0x0D]);
+ *psize = (*psector) * (*pcluster);
+		} else
+ ret = __os_posix_err(__os_get_syserr());
+ }
+
+ CloseHandle(vhandle);
+ return (ret);
+#endif
+}
+
+/*
+ * __os_exists --
+ * Return if the file exists.
+ */
+int
+__os_exists(env, path, isdirp)
+ ENV *env;
+ const char *path;
+ int *isdirp;
+{
+ DB_ENV *dbenv;
+ DWORD attrs;
+ _TCHAR *tpath;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ TO_TSTRING(env, path, tpath, ret);
+ if (ret != 0)
+ return (ret);
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0033", "fileops: stat %s",
+ "%s"), path);
+
+ RETRY_CHK(
+ ((attrs = GetFileAttributes(tpath)) == (DWORD)-1 ? 1 : 0), ret);
+ if (ret == 0) {
+ if (isdirp != NULL)
+ *isdirp = (attrs & FILE_ATTRIBUTE_DIRECTORY);
+ } else
+ ret = __os_posix_err(ret);
+
+ FREE_STRING(env, tpath);
+ return (ret);
+}
+
+/*
+ * __os_ioinfo --
+ * Return file size and I/O size; abstracted to make it easier
+ * to replace.
+ */
+int
+__os_ioinfo(env, path, fhp, mbytesp, bytesp, iosizep)
+ ENV *env;
+ const char *path;
+ DB_FH *fhp;
+ u_int32_t *mbytesp, *bytesp, *iosizep;
+{
+ int ret;
+ BY_HANDLE_FILE_INFORMATION bhfi;
+ unsigned __int64 filesize;
+ u_int32_t io_sz;
+
+ RETRY_CHK((!GetFileInformationByHandle(fhp->handle, &bhfi)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0034",
+ "GetFileInformationByHandle"));
+ return (__os_posix_err(ret));
+ }
+
+ filesize = ((unsigned __int64)bhfi.nFileSizeHigh << 32) +
+ bhfi.nFileSizeLow;
+
+ /* Return the size of the file. */
+ if (mbytesp != NULL)
+ *mbytesp = (u_int32_t)(filesize / MEGABYTE);
+ if (bytesp != NULL)
+ *bytesp = (u_int32_t)(filesize % MEGABYTE);
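+	/*
+	 * For example, a file of 5 * MEGABYTE + 512 bytes is returned as
+	 * *mbytesp == 5 and *bytesp == 512.
+	 */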
+
+ if (iosizep != NULL) {
+ /*
+		 * Attempt to retrieve the file system cluster size; if the
+		 * call succeeds and the value returned is reasonable (more
+		 * than 1024 bytes), use it as the I/O size. Otherwise use a
+		 * sensible default.
+ */
+ if (__os_get_cluster_size(path, &io_sz) != 0 || io_sz < 1025)
+ *iosizep = DB_DEF_IOSIZE;
+ else
+ *iosizep = io_sz;
+ }
+ return (0);
+}
diff --git a/src/os_windows/os_truncate.c b/src/os_windows/os_truncate.c
new file mode 100644
index 00000000..fcbb37b2
--- /dev/null
+++ b/src/os_windows/os_truncate.c
@@ -0,0 +1,99 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_truncate --
+ * Truncate the file.
+ */
+int
+__os_truncate(env, fhp, pgno, pgsize)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+{
+	/* Yes, this really is how Microsoft designed their API. */
+ union {
+ __int64 bigint;
+ struct {
+ unsigned long low;
+ long high;
+ };
+ } off;
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ offset = (off_t)pgsize * pgno;
+ ret = 0;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0021", "fileops: truncate %s to %lu",
+ "%s %lu"), fhp->name, (u_long)offset);
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /*
+ * If the filesystem doesn't zero fill, it isn't safe to extend the
+ * file, or we end up with junk blocks. Just return in that case.
+ */
+ if (__os_fs_notzero()) {
+ off_t stat_offset;
+ u_int32_t mbytes, bytes;
+
+ /* Stat the file. */
+ if ((ret =
+ __os_ioinfo(env, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ stat_offset = (off_t)mbytes * MEGABYTE + bytes;
+
+ if (offset > stat_offset)
+ return (0);
+ }
+#endif
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ /*
+ * Windows doesn't provide truncate directly. Instead, it has
+ * SetEndOfFile, which truncates to the current position. To
+ * deal with that, we open a duplicate file handle for truncating.
+ *
+ * We want to retry the truncate call, which involves a SetFilePointer
+ * and a SetEndOfFile, but there are several complications:
+ *
+ * 1) since the Windows API deals in 32-bit values, it's possible that
+ * the return from SetFilePointer (the low 32-bits) is
+ * INVALID_SET_FILE_POINTER even when the call has succeeded. So we
+ * have to also check whether GetLastError() returns NO_ERROR.
+ *
+ * 2) when it returns, SetFilePointer overwrites the high bits of the
+ * offset, so if we need to retry, we have to reset the offset each
+ * time.
+ *
+ * We can't switch to SetFilePointerEx, which knows about 64-bit
+ * offsets, because it isn't supported on Win9x/ME.
+ */
+ RETRY_CHK((off.bigint = (__int64)pgsize * pgno,
+ (SetFilePointer(fhp->trunc_handle, off.low, &off.high, FILE_BEGIN)
+ == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) ||
+ !SetEndOfFile(fhp->trunc_handle)), ret);
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0022", "SetFilePointer: %lu",
+ "%lu"), pgno * pgsize);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_unlink.c b/src/os_windows/os_unlink.c
new file mode 100644
index 00000000..6a0a6572
--- /dev/null
+++ b/src/os_windows/os_unlink.c
@@ -0,0 +1,123 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_unlink --
+ * Remove a file.
+ */
+int
+__os_unlink(env, path, overwrite_test)
+ ENV *env;
+ const char *path;
+ int overwrite_test;
+{
+ DB_ENV *dbenv;
+ HANDLE h;
+ _TCHAR *tpath, *orig_tpath, buf[DB_MAXPATHLEN];
+ u_int32_t id;
+ int ret, t_ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0028", "fileops: unlink %s",
+ "%s"), path);
+
+ /* Optionally overwrite the contents of the file to enhance security. */
+ if (dbenv != NULL && overwrite_test && F_ISSET(dbenv, DB_ENV_OVERWRITE))
+ (void)__db_file_multi_write(env, path);
+
+ TO_TSTRING(env, path, tpath, ret);
+ if (ret != 0)
+ return (ret);
+ orig_tpath = tpath;
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ /*
+ * Windows NT and its descendants allow removal of open files, but the
+ * DeleteFile Win32 system call isn't equivalent to a POSIX unlink.
+ * Firstly, it only succeeds if FILE_SHARE_DELETE is set when the file
+ * is opened. Secondly, it leaves the file in a "zombie" state, where
+ * it can't be opened again, but a new file with the same name can't be
+ * created either.
+ *
+ * Since we depend on being able to recreate files (during recovery,
+ * say), we have to first rename the file, and then delete it. It
+ * still hangs around, but with a name we don't care about. The rename
+ * will fail if the file doesn't exist, which isn't a problem, but if
+ * it fails for some other reason, we need to know about it or a
+ * subsequent open may fail for no apparent reason.
+ */
+ if (__os_is_winnt()) {
+ __os_unique_id(env, &id);
+ _sntprintf(buf, DB_MAXPATHLEN, _T("%s.del.%010u"), tpath, id);
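+		/* E.g., "foo.db" is renamed to "foo.db.del.0000001234". */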
+ if (MoveFile(tpath, buf))
+ tpath = buf;
+		else {
+			ret = __os_get_syserr();
+			/*
+			 * The system doesn't always return ENOENT when the
+			 * file is missing, so double check here: report the
+			 * error if the file exists, otherwise set the return
+			 * value to ENOENT.
+			 */
+			if (__os_posix_err(ret) != ENOENT) {
+				if (__os_exists(env, path, NULL) == 0)
+					__db_err(env, ret, DB_STR_A("0029",
+					    "MoveFile: "
+					    "rename %s to temporary file",
+					    "%s"), path);
+				else
+					ret = ENOENT;
+			}
+		}
+
+ /*
+ * Try removing the file using the delete-on-close flag. This
+ * plays nicer with files that are still open than DeleteFile.
+ */
+ h = CreateFile(tpath, 0,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL, OPEN_EXISTING, FILE_FLAG_DELETE_ON_CLOSE, 0);
+ if (h != INVALID_HANDLE_VALUE) {
+			(void)CloseHandle(h);
+ if (GetFileAttributes(tpath) == INVALID_FILE_ATTRIBUTES)
+ goto skipdel;
+ }
+ }
+
+ RETRY_CHK((!DeleteFile(tpath)), ret);
+
+skipdel:
+ FREE_STRING(env, orig_tpath);
+
+ /*
+ * XXX
+ * We shouldn't be testing for an errno of ENOENT here, but ENOENT
+ * signals that a file is missing, and we attempt to unlink things
+ * (such as v. 2.x environment regions, in ENV->remove) that we
+ * are expecting not to be there. Reporting errors in these cases
+ * is annoying.
+ */
+ if ((ret != 0) && (t_ret = __os_posix_err(ret)) != ENOENT) {
+ /* Double check if the file exists. */
+ if (__os_exists(env, path, NULL) == 0) {
+ __db_syserr(env, ret, DB_STR_A("0030",
+ "DeleteFile: %s", "%s"), path);
+ ret = t_ret;
+ } else
+ ret = ENOENT;
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_yield.c b/src/os_windows/os_yield.c
new file mode 100644
index 00000000..0d32ef69
--- /dev/null
+++ b/src/os_windows/os_yield.c
@@ -0,0 +1,35 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_yield --
+ * Yield the processor, optionally pausing until running again.
+ */
+void
+__os_yield(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ COMPQUIET(env, NULL);
+
+	/* Don't require that the values be normalized. */
+ for (; usecs >= US_PER_SEC; usecs -= US_PER_SEC)
+ ++secs;
+
+ /*
+ * Yield the processor so other processes or threads can run.
+ *
+ * Sheer raving paranoia -- don't sleep for 0 time, in case some
+ * implementation doesn't yield the processor in that case.
+ */
+ Sleep(secs * MS_PER_SEC + (usecs / US_PER_MS) + 1);
+}
diff --git a/src/qam/qam.c b/src/qam/qam.c
new file mode 100644
index 00000000..e81d4795
--- /dev/null
+++ b/src/qam/qam.c
@@ -0,0 +1,1760 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+static int __qam_bulk __P((DBC *, DBT *, u_int32_t));
+static int __qamc_close __P((DBC *, db_pgno_t, int *));
+static int __qamc_del __P((DBC *, u_int32_t));
+static int __qamc_destroy __P((DBC *));
+static int __qamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __qamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __qam_consume __P((DBC *, QMETA *, db_recno_t));
+static int __qam_getno __P((DB *, const DBT *, db_recno_t *));
+
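+/*
+ * Record locks can be skipped when the cursor has no transaction, or when
+ * it runs at read-committed or read-uncommitted isolation.
+ */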
+#define DONT_NEED_LOCKS(dbc) ((dbc)->txn == NULL || \
+ F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED))
+
+/*
+ * __qam_position --
+ * Position a queued access method cursor at a record. This returns
+ * the page locked. *exactp will be set if the record is valid.
+ * PUBLIC: int __qam_position
+ * PUBLIC: __P((DBC *, db_recno_t *, u_int32_t, int *));
+ */
+int
+__qam_position(dbc, recnop, get_mode, exactp)
+ DBC *dbc; /* open cursor */
+ db_recno_t *recnop; /* pointer to recno to find */
+ u_int32_t get_mode; /* flags to __memp_fget */
+ int *exactp; /* indicate if it was found */
+{
+ DB *dbp;
+ QAMDATA *qp;
+ QUEUE_CURSOR *cp;
+ db_pgno_t pg;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ /* Fetch the page for this recno. */
+ cp->pgno = pg = QAM_RECNO_PAGE(dbp, *recnop);
+
+ cp->page = NULL;
+ *exactp = 0;
+ if ((ret = __qam_fget(dbc, &pg, get_mode, &cp->page)) != 0) {
+ if (!FLD_ISSET(get_mode, DB_MPOOL_CREATE) &&
+ (ret == DB_PAGE_NOTFOUND || ret == ENOENT))
+ ret = 0;
+ return (ret);
+ }
+ cp->indx = QAM_RECNO_INDEX(dbp, pg, *recnop);
+
+ if (PGNO(cp->page) == 0) {
+ /*
+ * We have read an uninitialized page: set the page number if
+ * we're creating the page. Otherwise, we know that the record
+ * doesn't exist yet.
+ */
+ if (!FLD_ISSET(get_mode, DB_MPOOL_CREATE)) {
+ *exactp = 0;
+ return (0);
+ }
+ DB_ASSERT(dbp->env, FLD_ISSET(get_mode, DB_MPOOL_CREATE));
+ PGNO(cp->page) = pg;
+ TYPE(cp->page) = P_QAMDATA;
+ }
+
+ qp = QAM_GET_RECORD(dbp, cp->page, cp->indx);
+ *exactp = F_ISSET(qp, QAM_VALID) ? 1 : 0;
+
+ return (ret);
+}
+
+/*
+ * __qam_pitem --
+ * Put an item on a queue page. Copy the data to the page and set the
+ * VALID and SET bits. If logging and the record was previously set,
+ * log that data, otherwise just log the new data.
+ *
+ * pagep must be write locked
+ *
+ * PUBLIC: int __qam_pitem
+ * PUBLIC: __P((DBC *, QPAGE *, u_int32_t, db_recno_t, DBT *));
+ */
+int
+__qam_pitem(dbc, pagep, indx, recno, data)
+ DBC *dbc;
+ QPAGE *pagep;
+ u_int32_t indx;
+ db_recno_t recno;
+ DBT *data;
+{
+ DB *dbp;
+ DBT olddata, pdata, *datap;
+ ENV *env;
+ QAMDATA *qp;
+ QUEUE *t;
+ u_int8_t *dest, *p;
+ int allocated, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t = (QUEUE *)dbp->q_internal;
+ allocated = ret = 0;
+
+ if (data->size > t->re_len)
+ return (__db_rec_toobig(env, data->size, t->re_len));
+ qp = QAM_GET_RECORD(dbp, pagep, indx);
+
+ p = qp->data;
+ datap = data;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if (data->doff + data->dlen > t->re_len) {
+ __db_errx(env, DB_STR_A("1142",
+"Record length error: data offset plus length larger than record size of %lu",
+ "%s %lu"), (u_long)t->re_len);
+ return (EINVAL);
+ }
+
+ if (data->size != data->dlen)
+ return (__db_rec_repl(env, data->size, data->dlen));
+
+ if (data->size == t->re_len)
+ goto no_partial;
+
+ /*
+ * If we are logging, then we have to build the record
+ * first, otherwise, we can simply drop the change
+ * directly on the page. After this clause, make
+ * sure that datap and p are set up correctly so that
+ * copying datap into p does the right thing.
+ *
+ * Note, I am changing this so that if the existing
+ * record is not valid, we create a complete record
+	 * to log, so that both this and the recovery code are simpler.
+ */
+
+ if (DBC_LOGGING(dbc) || !F_ISSET(qp, QAM_VALID)) {
+ datap = &pdata;
+ memset(datap, 0, sizeof(*datap));
+
+ if ((ret = __os_malloc(env,
+ t->re_len, &datap->data)) != 0)
+ return (ret);
+ allocated = 1;
+ datap->size = t->re_len;
+
+ /*
+ * Construct the record if it's valid, otherwise set it
+ * all to the pad character.
+ */
+ dest = datap->data;
+ if (F_ISSET(qp, QAM_VALID))
+ memcpy(dest, p, t->re_len);
+ else
+ memset(dest, (int)t->re_pad, t->re_len);
+
+ dest += data->doff;
+ memcpy(dest, data->data, data->size);
+ } else {
+ datap = data;
+ p += data->doff;
+ }
+ }
+
+no_partial:
+ if (DBC_LOGGING(dbc)) {
+ olddata.size = 0;
+ if (F_ISSET(qp, QAM_SET)) {
+ olddata.data = qp->data;
+ olddata.size = t->re_len;
+ }
+ if ((ret = __qam_add_log(dbp, dbc->txn, &LSN(pagep),
+ 0, &LSN(pagep), pagep->pgno,
+ indx, recno, datap, qp->flags,
+ olddata.size == 0 ? NULL : &olddata)) != 0)
+ goto err;
+ } else if (!F_ISSET((dbc), DBC_RECOVER))
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ F_SET(qp, QAM_VALID | QAM_SET);
+ memcpy(p, datap->data, datap->size);
+ if (!F_ISSET(data, DB_DBT_PARTIAL))
+ memset(p + datap->size,
+ (int)t->re_pad, t->re_len - datap->size);
+
+err: if (allocated)
+ __os_free(env, datap->data);
+
+ return (ret);
+}
+/*
+ * __qamc_put
+ * Cursor put for queued access method.
+ * BEFORE and AFTER cannot be specified.
+ */
+static int
+__qamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_pgno_t metapg;
+ db_recno_t new_cur, new_first;
+ u_int32_t opcode;
+ int exact, ret, t_ret, writelock;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ if (pgnop != NULL)
+ *pgnop = PGNO_INVALID;
+
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ switch (flags) {
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ if ((ret = __qam_getno(dbp, key, &cp->recno)) != 0)
+ return (ret);
+ /* FALLTHROUGH */
+ case DB_CURRENT:
+ break;
+ default:
+ /* The interface shouldn't let anything else through. */
+ return (__db_ferr(env, "DBC->put", 0));
+ }
+
+ /* Write lock the record. */
+ if ((ret = __db_lget(dbc, LCK_COUPLE,
+ cp->recno, DB_LOCK_WRITE, DB_LOCK_RECORD, &cp->lock)) != 0)
+ return (ret);
+
+ if ((ret = __qam_position(dbc, &cp->recno,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &exact)) != 0) {
+ /* We could not get the page, we can release the record lock. */
+ (void)__LPUT(dbc, cp->lock);
+ return (ret);
+ }
+
+ if (exact != 0 && flags == DB_NOOVERWRITE)
+ ret = DB_KEYEXIST;
+ else
+ /* Put the item on the page. */
+ ret = __qam_pitem(dbc,
+ (QPAGE *)cp->page, cp->indx, cp->recno, data);
+
+ if ((t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+ cp->lock_mode = DB_LOCK_WRITE;
+ if (ret != 0)
+ return (ret);
+
+ /* Unlock the record if not in a transaction. */
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ return (ret);
+
+ /* We may need to reset the head or tail of the queue. */
+ metapg = ((QUEUE *)dbp->q_internal)->q_meta;
+
+ writelock = 0;
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ return (ret);
+
+ opcode = 0;
+ new_cur = new_first = 0;
+
+ /*
+ * If the put address is outside the queue, adjust the head and
+ * tail of the queue. If the order is inverted we move
+ * the one which is closer. The first case is when the
+ * queue is empty, move first and current to where the new
+ * insert is.
+ */
+
+recheck:
+ if (meta->first_recno == meta->cur_recno) {
+ new_first = cp->recno;
+ new_cur = cp->recno;
+ QAM_INC_RECNO(new_cur);
+ opcode |= QAM_SETFIRST;
+ opcode |= QAM_SETCUR;
+ } else {
+ if (QAM_BEFORE_FIRST(meta, cp->recno)) {
+ new_first = cp->recno;
+ opcode |= QAM_SETFIRST;
+ }
+
+ if (QAM_AFTER_CURRENT(meta, cp->recno)) {
+ new_cur = cp->recno;
+ QAM_INC_RECNO(new_cur);
+ opcode |= QAM_SETCUR;
+ }
+ }
+
+ if (opcode == 0)
+ goto done;
+
+ /* Exclusive latch the metadata page. */
+ if (writelock == 0 && (ret = __memp_dirty(mpf, &meta,
+ dbc->thread_info, dbc->txn, dbc->priority, DB_MPOOL_DIRTY)) != 0)
+ goto done;
+	if (writelock++ == 0) {
+		/*
+		 * The latch upgrade may have let another thread move the
+		 * queue head or tail; recompute the adjustment from scratch.
+		 */
+		opcode = 0;
+		new_cur = new_first = 0;
+		goto recheck;
+	}
+
+ if (DBC_LOGGING(dbc) && (ret = __qam_mvptr_log(dbp, dbc->txn,
+ &meta->dbmeta.lsn, 0, opcode, meta->first_recno,
+ new_first, meta->cur_recno, new_cur,
+ &meta->dbmeta.lsn, PGNO_BASE_MD)) != 0)
+ opcode = 0;
+
+ if (opcode & QAM_SETCUR)
+ meta->cur_recno = new_cur;
+ if (opcode & QAM_SETFIRST)
+ meta->first_recno = new_first;
+
+ QAM_WAKEUP(dbc, ret);
+
+done: if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_append --
+ * Perform a put(DB_APPEND) in queue.
+ *
+ * PUBLIC: int __qam_append __P((DBC *, DBT *, DBT *));
+ */
+int
+__qam_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ QPAGE *page;
+ QUEUE *qp;
+ QUEUE_CURSOR *cp;
+ db_pgno_t pg, metapg;
+ db_recno_t recno;
+ int ret, t_ret, waited;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ LOCK_INIT(lock);
+
+ /* Exclusive latch the meta page. */
+ metapg = ((QUEUE *)dbp->q_internal)->q_meta;
+again: if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+
+ /* Get the next record number. */
+ recno = meta->cur_recno;
+ QAM_INC_RECNO(meta->cur_recno);
+
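+	/*
+	 * If the increment wrapped the record number space all the way
+	 * around to the first record still in the queue, the queue is full.
+	 */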
+ if (meta->cur_recno == meta->first_recno) {
+ QAM_DEC_RECNO(meta->cur_recno);
+ ret = EFBIG;
+ goto err;
+ }
+
+ if (QAM_BEFORE_FIRST(meta, recno))
+ meta->first_recno = recno;
+
+ /* Lock the record. */
+ waited = 0;
+ ret = __db_lget(dbc, 0, recno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT | DB_LOCK_RECORD, &lock);
+
+ /* Release the meta page. */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ /* If we couldn't lock the record try again. */
+ if (t_ret == 0 &&
+ (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK)) {
+ waited = 1;
+ ret = __db_lget(dbc, 0, recno,
+ DB_LOCK_WRITE, DB_LOCK_RECORD, &lock);
+ }
+
+ /*
+ * The application may modify the data based on the selected record
+ * number. We always want to call this even if we ultimately end
+ * up aborting, because we are allocating a record number, regardless.
+ */
+ if (dbc->dbp->db_append_recno != NULL &&
+ (t_ret = dbc->dbp->db_append_recno(dbc->dbp, data, recno)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /*
+ * Capture errors from either the lock couple or the call to
+ * dbp->db_append_recno.
+ */
+ if (ret != 0)
+ goto err;
+
+ pg = QAM_RECNO_PAGE(dbp, recno);
+
+ /* Fetch for write the data page. */
+ if ((ret = __qam_fget(dbc, &pg,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &page)) != 0)
+ goto err;
+
+ /* See if this is a new page. */
+ if (page->pgno == 0) {
+ page->pgno = pg;
+ page->type = P_QAMDATA;
+ } else if (waited && F_ISSET(QAM_GET_RECORD(
+ dbp, page, QAM_RECNO_INDEX(dbp, pg, recno)), QAM_VALID)) {
+ /* The record is in use, try again. */
+ if ((ret = __qam_fput(dbc, pg, page, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ goto again;
+ }
+
+ cp->lock = lock;
+ cp->lock_mode = DB_LOCK_WRITE;
+ LOCK_INIT(lock);
+
+ /* Put the item on the page and log it. */
+ ret = __qam_pitem(dbc, page,
+ QAM_RECNO_INDEX(dbp, pg, recno), recno, data);
+
+ if ((t_ret = __qam_fput(dbc,
+ pg, page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Return the record number to the user. */
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbp->env, key,
+ &recno, sizeof(recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ /* Position the cursor on this record. */
+ cp->recno = recno;
+
+ /* See if we are leaving the extent. */
+ qp = (QUEUE *) dbp->q_internal;
+ if (qp->page_ext != 0 &&
+ (recno % (qp->page_ext * qp->rec_page) == 0 ||
+ recno == UINT32_MAX)) {
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+ if (!QAM_AFTER_CURRENT(meta, recno))
+ if ((ret = __qam_fclose(dbp, pg)) != 0)
+ goto err;
+ }
+
+ QAM_WAKEUP(dbc, ret);
+
+err: /* Release the meta page. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qamc_del --
+ * Qam cursor->am_del function
+ */
+static int
+__qamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT data;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ QAMDATA *qp;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_pgno_t metapg;
+ db_recno_t first;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ metapg = ((QUEUE *)dbp->q_internal)->q_meta;
+
+ /* Read latch the meta page. */
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ return (ret);
+
+ if (QAM_NOT_VALID(meta, cp->recno)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ first = meta->first_recno;
+
+ /* Don't hold the meta page long term. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Get the record. */
+ if ((ret = __db_lget(dbc, LCK_COUPLE,
+ cp->recno, DB_LOCK_WRITE, DB_LOCK_RECORD, &cp->lock)) != 0)
+ goto err;
+ cp->lock_mode = DB_LOCK_WRITE;
+
+ /* Find the record; delete only deletes exact matches. */
+ if ((ret = __qam_position(dbc, &cp->recno,
+ DB_MPOOL_DIRTY, &exact)) != 0)
+ goto err;
+
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ pagep = cp->page;
+ qp = QAM_GET_RECORD(dbp, pagep, cp->indx);
+
+ if (DBC_LOGGING(dbc)) {
+ if (((QUEUE *)dbp->q_internal)->page_ext == 0 ||
+ ((QUEUE *)dbp->q_internal)->re_len == 0) {
+ if ((ret = __qam_del_log(dbp,
+ dbc->txn, &LSN(pagep), 0, &LSN(pagep),
+ pagep->pgno, cp->indx, cp->recno)) != 0)
+ goto err;
+ } else {
+ data.size = ((QUEUE *)dbp->q_internal)->re_len;
+ data.data = qp->data;
+ if ((ret = __qam_delext_log(dbp,
+ dbc->txn, &LSN(pagep), 0, &LSN(pagep),
+ pagep->pgno, cp->indx, cp->recno, &data)) != 0)
+ goto err;
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ F_CLR(qp, QAM_VALID);
+ if ((ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ /*
+ * Other threads cannot move first_recno past
+ * our position while we have the record locked.
+ * If it's pointing at the deleted record then get
+ * the metapage and check again, as a lower-numbered
+ * record may have been inserted.
+ */
+ if (LF_ISSET(DB_CONSUME) || cp->recno == first) {
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (LF_ISSET(DB_CONSUME) || cp->recno == meta->first_recno)
+ ret = __qam_consume(dbc, meta, RECNO_OOB);
+ }
+
+err: if (meta != NULL && (t_ret = __memp_fput(mpf, dbc->thread_info,
+ meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (cp->page != NULL &&
+ (t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+
+ return (ret);
+}
+
+#ifdef DEBUG_WOP
+#define QDEBUG
+#endif
+
+/*
+ * __qamc_get --
+ * Queue DBC->get function.
+ */
+static int
+__qamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBC *dbcdup;
+ DBT tmp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *pg;
+ QAMDATA *qp;
+ QMETA *meta;
+ QUEUE *t;
+ QUEUE_CURSOR *cp;
+ db_lockmode_t lock_mode;
+ db_pgno_t metapno;
+ db_recno_t first;
+ int exact, inorder, is_first, ret, t_ret, wait, with_delete;
+ int retrying;
+ u_int32_t skip, meta_mode;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ LOCK_INIT(lock);
+
+ lock_mode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ meta_mode = 0;
+ meta = NULL;
+ *pgnop = 0;
+ pg = NULL;
+ retrying = t_ret = wait = with_delete = 0;
+
+ if (flags == DB_CONSUME_WAIT) {
+ wait = 1;
+ flags = DB_CONSUME;
+ }
+ if (flags == DB_CONSUME) {
+ with_delete = 1;
+ flags = DB_FIRST;
+ meta_mode = DB_MPOOL_DIRTY;
+ lock_mode = DB_LOCK_WRITE;
+ }
+ inorder = F_ISSET(dbp, DB_AM_INORDER) && with_delete;
+
+ DEBUG_LREAD(dbc, dbc->txn, "qamc_get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+ /* Make lint and friends happy. */
+ is_first = 0;
+ first = 0;
+
+ t = (QUEUE *)dbp->q_internal;
+ metapno = t->q_meta;
+
+ /*
+ * Get the meta page first.
+ */
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ return (ret);
+
+ /* Release any previous lock if not in a transaction. */
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ goto err;
+
+ skip = 0;
+retry: /* Update the record number. */
+ switch (flags) {
+ case DB_CURRENT:
+ break;
+ case DB_NEXT_DUP:
+ case DB_PREV_DUP:
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (cp->recno != RECNO_OOB) {
+ if (with_delete && !inorder &&
+ QAM_BEFORE_FIRST(meta, cp->recno))
+ cp->recno = meta->first_recno;
+ else
+ QAM_INC_RECNO(cp->recno);
+ /*
+ * Check to see if we are out of data.
+ */
+ if (QAM_AFTER_CURRENT(meta, cp->recno)) {
+ pg = NULL;
+ if (!wait) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ /*
+ * If we skipped a locked record, go back and
+ * find it. If we find a locked record again
+ * wait for it.
+ */
+ if (skip == 1 &&
+ !QAM_AFTER_CURRENT(meta, first)) {
+ retrying = 1;
+ cp->recno = first;
+ goto dolock;
+ }
+ flags = DB_FIRST;
+
+ if (CDB_LOCKING(env)) {
+ /* Drop the metapage before we wait. */
+ ret = __memp_fput(mpf, dbc->thread_info,
+ meta, dbc->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+ if ((ret = __lock_get(
+ env, dbc->locker,
+ DB_LOCK_SWITCH, &dbc->lock_dbt,
+ DB_LOCK_WAIT, &dbc->mylock)) != 0)
+ goto err;
+
+ if ((ret = __lock_get(
+ env, dbc->locker,
+ DB_LOCK_UPGRADE, &dbc->lock_dbt,
+ DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info,
+ dbc->txn, meta_mode, &meta)) != 0)
+ goto err;
+ goto retry;
+ }
+
+ /*
+ * Put us in the wait queue; when someone
+ * adds something they will unlock it.
+ */
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_INVALID, DB_LOCK_WAIT,
+ DB_LOCK_NOWAIT, &metalock)) != 0)
+ goto err;
+
+ /* Drop the metapage before we wait. */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Upgrade the lock to wait on it. */
+ if ((ret = __db_lget(dbc, 0,
+ PGNO_INVALID, DB_LOCK_WAIT,
+ DB_LOCK_UPGRADE, &metalock)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK)
+ ret = DB_LOCK_NOTGRANTED;
+ goto err;
+ }
+
+ if ((ret = __memp_fget(mpf,
+ &metapno, dbc->thread_info, dbc->txn,
+ meta_mode, &meta)) != 0)
+ goto err;
+ goto retry;
+ }
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ is_first = 1;
+
+ /* Get the first record number. */
+ cp->recno = first = meta->first_recno;
+
+ break;
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == meta->first_recno ||
+ QAM_BEFORE_FIRST(meta, cp->recno)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ QAM_DEC_RECNO(cp->recno);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ if (meta->first_recno == meta->cur_recno) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ cp->recno = meta->cur_recno;
+ QAM_DEC_RECNO(cp->recno);
+ break;
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ if ((ret = __qam_getno(dbp, key, &cp->recno)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__qamc_get", flags);
+ goto err;
+ }
+
+dolock: if (!with_delete || inorder || retrying) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ }
+
+ /* Lock the record. */
+ if (((ret = __db_lget(dbc, LCK_COUPLE, cp->recno, lock_mode,
+ (with_delete && !inorder && !retrying) ?
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD : DB_LOCK_RECORD,
+ &lock)) == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
+ with_delete) {
+#ifdef QDEBUG
+ if (DBC_LOGGING(dbc))
+ (void)__log_printf(env,
+ dbc->txn, "Queue S: %x %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first);
+#endif
+ skip = 1;
+ goto retry;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * In the DB_FIRST or DB_LAST cases we must wait and then start over
+ * since the first/last may have moved while we slept. If we are
+ * reading in order and the first record was not there, we can skip it,
+ * as it must have been aborted, was skipped by a non-queue insert,
+ * or we could not have gotten its lock. If we have the wrong
+ * record we release our locks and try again.
+ */
+ switch (flags) {
+ default:
+ if (inorder) {
+ if (first != cp->recno)
+ break;
+ } else if (with_delete || !is_first)
+ break;
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ case DB_LAST:
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ goto lerr;
+ if ((is_first && cp->recno != meta->first_recno) ||
+ (flags == DB_LAST && cp->recno != meta->cur_recno - 1)) {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if (is_first)
+ flags = DB_FIRST;
+ goto retry;
+ } else if (!is_first && flags != DB_LAST) {
+ if (QAM_BEFORE_FIRST(meta, cp->recno)) {
+ if (flags == DB_SET_RANGE ||
+ flags == DB_GET_BOTH_RANGE) {
+ cp->lock = lock;
+ LOCK_INIT(lock);
+ goto release_retry;
+ }
+ ret = DB_NOTFOUND;
+ goto lerr;
+ }
+ if (QAM_AFTER_CURRENT(meta, cp->recno)) {
+ ret = DB_NOTFOUND;
+ goto lerr;
+ }
+ }
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ }
+
+ /* Position the cursor on the record. */
+ if ((ret = __qam_position(dbc, &cp->recno, 0, &exact)) != 0) {
+ /* We cannot get the page, release the record lock. */
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+
+ pg = cp->page;
+ cp->lock = lock;
+ cp->lock_mode = lock_mode;
+ LOCK_INIT(lock);
+
+ if (!exact) {
+release_retry: /* Release locks and retry, if possible. */
+#ifdef QDEBUG
+ if (with_delete && DBC_LOGGING(dbc)) {
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue E: %x %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first);
+ }
+#endif
+ if (pg != NULL)
+ (void)__qam_fput(dbc, cp->pgno, pg, dbc->priority);
+ cp->page = pg = NULL;
+ if (with_delete) {
+ if ((ret = __LPUT(dbc, cp->lock)) != 0)
+ goto err1;
+ } else if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ goto err1;
+
+ if (meta == NULL && (ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ goto err1;
+ /*
+ * If we don't need locks and we are out of range
+ * then we can just skip to the FIRST/LAST record;
+ * otherwise we must iterate to lock the records
+ * and get serializability.
+ */
+ switch (flags) {
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (!with_delete)
+ is_first = 0;
+ else if (first == cp->recno)
+ /* We have verified that this record is gone. */
+ QAM_INC_RECNO(first);
+ if (QAM_BEFORE_FIRST(meta, cp->recno) &&
+ DONT_NEED_LOCKS(dbc))
+ flags = DB_FIRST;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (QAM_AFTER_CURRENT(meta, cp->recno) &&
+ DONT_NEED_LOCKS(dbc))
+ flags = DB_LAST;
+ else
+ flags = DB_PREV;
+ break;
+
+ case DB_GET_BOTH_RANGE:
+ case DB_SET_RANGE:
+ if (QAM_BEFORE_FIRST(meta, cp->recno) &&
+ DONT_NEED_LOCKS(dbc))
+ flags = DB_FIRST;
+ else
+ flags = DB_NEXT;
+ break;
+
+ default:
+ /* This is for the SET and GET_BOTH cases. */
+ ret = DB_KEYEMPTY;
+ goto err1;
+ }
+ retrying = 0;
+ goto retry;
+ }
+
+ if (with_delete && cp->recno == first) {
+ if (meta == NULL &&
+ (ret = __memp_fget(mpf, &metapno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY | DB_MPOOL_TRY, &meta)) != 0) {
+ if (ret == DB_LOCK_NOTGRANTED) {
+ first = RECNO_OOB;
+ ret = 0;
+ } else
+ goto err;
+ }
+ if (meta != NULL && cp->recno != meta->cur_recno) {
+ if (DBC_LOGGING(dbc)) {
+#ifdef QDEBUG
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue I: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first, meta->cur_recno);
+#endif
+ if ((ret = __qam_incfirst_log(dbp,
+ dbc->txn, &meta->dbmeta.lsn, 0,
+ cp->recno, PGNO_BASE_MD)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+
+ meta->first_recno = cp->recno;
+ QAM_INC_RECNO(meta->first_recno);
+ }
+ }
+ if (meta != NULL) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ }
+
+ qp = QAM_GET_RECORD(dbp, pg, cp->indx);
+
+ /* Return the data item. */
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTH_RANGE) {
+ /*
+ * Need to compare
+ */
+ tmp.data = qp->data;
+ tmp.size = t->re_len;
+ if ((ret = __bam_defcmp(dbp, data, &tmp)) != 0) {
+ if (flags == DB_GET_BOTH_RANGE)
+ goto release_retry;
+ ret = DB_NOTFOUND;
+ goto err1;
+ }
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (key != NULL && !F_ISSET(key, DB_DBT_ISSET)) {
+ if ((ret = __db_retcopy(dbp->env,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey->data, &dbc->rkey->ulen)) != 0)
+ goto err1;
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ if (data != NULL &&
+ !F_ISSET(dbc, DBC_MULTIPLE|DBC_MULTIPLE_KEY) &&
+ !F_ISSET(data, DB_DBT_ISSET)) {
+ if ((ret = __db_retcopy(dbp->env, data, qp->data, t->re_len,
+ &dbc->rdata->data, &dbc->rdata->ulen)) != 0)
+ goto err1;
+ F_SET(data, DB_DBT_ISSET);
+ }
+
+ /* Finally, if we are doing DB_CONSUME mark the record. */
+ if (with_delete) {
+ /*
+ * Assert that we're not a secondary index. Doing a DB_CONSUME
+ * on a secondary makes very little sense, since one can't
+ * DB_APPEND there; attempting one should be forbidden by
+ * the interface.
+ */
+ DB_ASSERT(env, !F_ISSET(dbp, DB_AM_SECONDARY));
+
+ /*
+ * If we have any secondary indices, call __dbc_del_primary to
+ * delete the references to the item we're about to delete.
+ *
+ * Note that we work on a duplicated cursor, since the
+ * __db_ret work has already been done, so it's not safe
+ * to perform any additional ops on this cursor.
+ */
+ if (DB_IS_PRIMARY(dbp)) {
+ if ((ret = __dbc_idup(dbc,
+ &dbcdup, DB_POSITION)) != 0)
+ goto err1;
+
+ if ((ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err1;
+ cp->page = NULL;
+ if (meta != NULL &&
+ (ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err1;
+ meta = NULL;
+ if ((ret = __dbc_del_primary(dbcdup)) != 0) {
+ /*
+ * The __dbc_del_primary return is more
+ * interesting.
+ */
+ (void)__dbc_close(dbcdup);
+ goto err1;
+ }
+
+ if ((ret = __dbc_close(dbcdup)) != 0)
+ goto err1;
+ if ((ret = __qam_fget(dbc,
+ &cp->pgno, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto err;
+ } else if ((ret = __qam_dirty(dbc,
+ cp->pgno, &cp->page, dbc->priority)) != 0)
+ goto err1;
+
+ pg = cp->page;
+
+ if (DBC_LOGGING(dbc)) {
+ if (t->page_ext == 0 || t->re_len == 0) {
+ if ((ret = __qam_del_log(dbp, dbc->txn,
+ &LSN(pg), 0, &LSN(pg),
+ pg->pgno, cp->indx, cp->recno)) != 0)
+ goto err1;
+ } else {
+ tmp.data = qp->data;
+ tmp.size = t->re_len;
+ if ((ret = __qam_delext_log(dbp,
+ dbc->txn, &LSN(pg), 0, &LSN(pg),
+ pg->pgno, cp->indx, cp->recno, &tmp)) != 0)
+ goto err1;
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+
+ F_CLR(qp, QAM_VALID);
+ if ((ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ /*
+ * Clean up the first pointer; we need to check two things:
+ * Are we leaving a page or an extent?
+ * Is the first pointer beyond the first record we looked at?
+ * If we deleted the first record we checked, then we already
+ * moved the first pointer properly.
+ */
+
+ if (first == cp->recno && (skip = (first % t->rec_page)) != 0)
+ goto done;
+ if (meta == NULL &&
+ (ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+ if (skip && !QAM_BEFORE_FIRST(meta, first))
+ goto done;
+
+#ifdef QDEBUG
+ if (DBC_LOGGING(dbc))
+ (void)__log_printf(env,
+ dbc->txn, "Queue D: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first, meta->first_recno);
+#endif
+ ret = __qam_consume(dbc, meta, first);
+ }
+
+err1: if (cp->page != NULL) {
+ if ((t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+ }
+ if (0) {
+lerr: (void)__LPUT(dbc, lock);
+ }
+
+done:
+err: if (meta) {
+ /* Release the meta page. */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
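+
+/*
+ * Illustrative sketch, not part of the library: a minimal consumer
+ * built on the DB_CONSUME_WAIT path handled above.  Error handling
+ * and the record processing step are hypothetical.
+ *
+ *	DBT key, data;
+ *	db_recno_t recno;
+ *	int ret;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	while ((ret = dbp->get(dbp,
+ *	    NULL, &key, &data, DB_CONSUME_WAIT)) == 0) {
+ *		memcpy(&recno, key.data, sizeof(recno));
+ *		...process data.data (data.size bytes)...
+ *	}
+ */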
+
+/*
+ * __qam_consume -- try to reset the head of the queue.
+ *
+ */
+static int
+__qam_consume(dbc, meta, first)
+ DBC *dbc;
+ QMETA *meta;
+ db_recno_t first;
+{
+ DB *dbp;
+ DB_LOCK lock, save_lock;
+ DB_MPOOLFILE *mpf;
+ QUEUE_CURSOR *cp;
+ db_indx_t save_indx;
+ db_pgno_t save_page;
+ db_recno_t current, save_first, save_recno;
+ u_int32_t rec_extent;
+ int exact, ret, t_ret, wrapped;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ save_page = cp->pgno;
+ save_indx = cp->indx;
+ save_recno = cp->recno;
+ save_lock = cp->lock;
+ save_first = first;
+
+ /*
+ * We call this routine for two reasons:
+ * 1) to toss pages and extents as we leave them.
+ * 2) to update meta->first_recno.
+ * We do not need to update first_recno if we deleted
+ * the first record we tried since we updated it then.
+ * If we are not going to update meta->first_recno we
+ * do not need an exclusive latch.
+ */
+ if (first != cp->recno && (ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ /*
+ * If we skipped some deleted records, we need to
+ * reposition on the first one. Get a lock
+ * in case someone is trying to put it back.
+ */
+ if (first == RECNO_OOB || !QAM_BEFORE_FIRST(meta, first))
+ first = meta->first_recno;
+
+ if (first != cp->recno) {
+ ret = __db_lget(dbc, 0, first, DB_LOCK_READ,
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD, &lock);
+ if (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK) {
+ ret = 0;
+ goto done;
+ }
+ if (ret != 0)
+ goto err;
+ if (cp->page != NULL && (ret =
+ __qam_fput(dbc, cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+ if ((ret = __qam_position(dbc, &first, 0, &exact)) != 0) {
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if (exact != 0)
+ goto done;
+ }
+
+ current = meta->cur_recno;
+ wrapped = 0;
+ if (first > current)
+ wrapped = 1;
+ rec_extent = meta->page_ext * meta->rec_page;
+
+ /* Loop until we find a record or hit current. */
+ for (;;) {
+ /*
+ * Check to see if we are moving off the extent,
+ * and if so remove the extent.
+ * If we are moving off a page we need to
+ * get rid of the buffer.
+ */
+ if (rec_extent != 0 &&
+ ((exact = (first % rec_extent == 0)) ||
+ (first % meta->rec_page == 0) ||
+ first == UINT32_MAX)) {
+#ifdef QDEBUG
+ if (DBC_LOGGING(dbc))
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue R: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->pgno, first, meta->first_recno);
+#endif
+ if (cp->page != NULL && (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, DB_PRIORITY_VERY_LOW)) != 0)
+ break;
+ cp->page = NULL;
+
+ if (exact == 1 &&
+ (ret = __qam_fremove(dbp, cp->pgno)) != 0)
+ break;
+ } else if (cp->page != NULL && (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ break;
+ cp->page = NULL;
+ first++;
+ if (first == RECNO_OOB) {
+ wrapped = 0;
+ first++;
+ }
+
+ /*
+ * LOOP EXIT when we move on to the current
+ * pointer.
+ */
+ if (!wrapped && first >= current)
+ break;
+
+ ret = __db_lget(dbc, 0, first, DB_LOCK_READ,
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD, &lock);
+ if (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK) {
+ ret = 0;
+ break;
+ }
+ if (ret != 0)
+ break;
+
+ if ((ret = __qam_position(dbc, &first, 0, &exact)) != 0) {
+ (void)__LPUT(dbc, lock);
+ break;
+ }
+ if ((ret = __LPUT(dbc, lock)) != 0 || exact) {
+ if ((t_ret = __qam_fput(dbc, cp->pgno,
+ cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+ break;
+ }
+ }
+
+ cp->pgno = save_page;
+ cp->indx = save_indx;
+ cp->recno = save_recno;
+ cp->lock = save_lock;
+
+done:
+ /*
+ * We have advanced as far as we can.
+ * Advance first_recno to this point.
+ */
+ if (ret == 0 && meta->first_recno != first && save_first != cp->recno) {
+ if (DBC_LOGGING(dbc)) {
+#ifdef QDEBUG
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue M: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first, meta->first_recno);
+#endif
+ if ((ret = __qam_incfirst_log(dbp,
+ dbc->txn, &meta->dbmeta.lsn, 0,
+ first, PGNO_BASE_MD)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ meta->first_recno = first;
+ }
+
+err:
+ return (ret);
+}
+
+static int
+__qam_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOCK rlock;
+ DB_MPOOLFILE *mpf;
+ PAGE *pg;
+ QAMDATA *qp;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_indx_t indx;
+ db_lockmode_t lkmode;
+ db_pgno_t metapno;
+ u_int32_t *endp, *offp;
+ u_int32_t pagesize, re_len, recs;
+ u_int8_t *dbuf, *dp, *np;
+ int exact, ret, t_ret, valid;
+ int is_key, need_pg, size, space;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ lkmode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+
+ pagesize = dbp->pgsize;
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ recs = ((QUEUE *)dbp->q_internal)->rec_page;
+ metapno = ((QUEUE *)dbp->q_internal)->q_meta;
+
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ size = 0;
+
+ dbuf = data->data;
+ np = dp = dbuf;
+
+ /* Keep track of the space that is left. There is a termination entry. */
+ space = (int)data->ulen;
+ space -= (int)sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (u_int32_t *)((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+ /* Save the lock on the current position of the cursor. */
+ rlock = cp->lock;
+ LOCK_INIT(cp->lock);
+
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ return (ret);
+
+next_pg:
+ /* Wrap around, skipping zero. */
+ if (cp->recno == RECNO_OOB)
+ cp->recno++;
+ if ((ret = __qam_position(dbc, &cp->recno, 0, &exact)) != 0)
+ goto done;
+
+ pg = cp->page;
+ indx = cp->indx;
+ need_pg = 1;
+
+ do {
+ /*
+ * If this page is a nonexistent page at the end of an
+ * extent, pg may be NULL. A NULL page has no valid records,
+ * so just keep looping as though qp exists and isn't QAM_VALID;
+ * calling QAM_GET_RECORD is unsafe.
+ */
+ valid = 0;
+
+ if (pg != NULL) {
+ if ((ret = __db_lget(dbc, LCK_COUPLE, cp->recno, lkmode,
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD, &rlock)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK)
+ goto done;
+ /* If we put anything in the buffer, return. */
+ if (offp != endp)
+ break;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ meta, dbc->priority)) != 0)
+ goto done;
+ meta = NULL;
+ if ((ret = __db_lget(dbc, LCK_COUPLE, cp->recno,
+ lkmode, DB_LOCK_RECORD, &rlock)) != 0)
+ goto done;
+ if ((ret = __memp_fget(mpf,
+ &metapno, dbc->thread_info,
+ dbc->txn, 0, &meta)) != 0)
+ goto done;
+ }
+ qp = QAM_GET_RECORD(dbp, pg, indx);
+ if (F_ISSET(qp, QAM_VALID)) {
+ valid = 1;
+ space -= (int)
+ ((is_key ? 3 : 2) * sizeof(*offp));
+ if (space < 0)
+ goto get_space;
+ if (need_pg) {
+ dp = np;
+ size = (int)pagesize - QPAGE_SZ(dbp);
+ if (space < size) {
+get_space:
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN((u_int32_t)
+ size + pagesize,
+ sizeof(u_int32_t));
+ ret = DB_BUFFER_SMALL;
+ break;
+ }
+ if (indx != 0)
+ indx--;
+ cp->recno--;
+ space = 0;
+ break;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + QPAGE_SZ(dbp),
+ (u_int)size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ if (is_key)
+ *offp-- = cp->recno;
+ *offp-- = (u_int32_t)((((u_int8_t *)qp -
+ (u_int8_t *)pg) - QPAGE_SZ(dbp)) +
+ (dp - dbuf) + SSZA(QAMDATA, data));
+ *offp-- = re_len;
+ }
+ }
+ if (!valid && is_key == 0) {
+ *offp-- = 0;
+ *offp-- = 0;
+ }
+ cp->recno++;
+ } while (++indx < recs && cp->recno != RECNO_OOB &&
+ !QAM_AFTER_CURRENT(meta, cp->recno));
+
+ if (cp->page != NULL) {
+ if ((t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+ }
+
+ if (ret == 0 && space > 0 &&
+ (indx >= recs || cp->recno == RECNO_OOB) &&
+ !QAM_AFTER_CURRENT(meta, cp->recno))
+ goto next_pg;
+
+ /*
+ * Correct recno in two cases:
+ * 1) If we just wrapped, the fetch must start at record 1, not at FIRST.
+ * 2) We ran out of space exactly at the end of a page.
+ */
+ if (cp->recno == RECNO_OOB || (space == 0 && indx == recs))
+ cp->recno--;
+
+ if (is_key == 1)
+ *offp = RECNO_OOB;
+ else
+ *offp = (u_int32_t)-1;
+
+done: /* Release the meta page. */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->lock = rlock;
+
+ return (ret);
+}
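+
+/*
+ * Illustrative sketch, not part of the library: walking a buffer
+ * filled by the bulk routine above with the DB_MULTIPLE_* macros.
+ * The 64KB buffer size is a hypothetical choice; it must hold at
+ * least one page.
+ *
+ *	DBT key, data;
+ *	db_recno_t recno;
+ *	void *p, *dptr;
+ *	u_int32_t dlen;
+ *	u_int8_t buf[65536];
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	data.data = buf;
+ *	data.ulen = sizeof(buf);
+ *	data.flags = DB_DBT_USERMEM;
+ *	if (dbc->get(dbc, &key, &data, DB_MULTIPLE_KEY | DB_NEXT) == 0) {
+ *		DB_MULTIPLE_INIT(p, &data);
+ *		for (;;) {
+ *			DB_MULTIPLE_RECNO_NEXT(p, &data, recno, dptr, dlen);
+ *			if (p == NULL)
+ *				break;
+ *			...recno, dptr and dlen describe one record...
+ *		}
+ *	}
+ */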
+
+/*
+ * __qamc_close --
+ * Close down the cursor from a single use.
+ */
+static int
+__qamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ QUEUE_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, NULL);
+
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ /* Discard any locks not acquired inside of a transaction. */
+ ret = __TLPUT(dbc, cp->lock);
+
+ LOCK_INIT(cp->lock);
+ cp->page = NULL;
+ cp->pgno = PGNO_INVALID;
+ cp->indx = 0;
+ cp->lock_mode = DB_LOCK_NG;
+ cp->recno = RECNO_OOB;
+ cp->flags = 0;
+
+ return (ret);
+}
+
+/*
+ * __qamc_dup --
+ * Duplicate a queue cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __qamc_dup __P((DBC *, DBC *));
+ */
+int
+__qamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ QUEUE_CURSOR *orig, *new;
+
+ orig = (QUEUE_CURSOR *)orig_dbc->internal;
+ new = (QUEUE_CURSOR *)new_dbc->internal;
+
+ new->recno = orig->recno;
+
+ return (0);
+}
+
+/*
+ * __qamc_init
+ *
+ * PUBLIC: int __qamc_init __P((DBC *));
+ */
+int
+__qamc_init(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ QUEUE_CURSOR *cp;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ /* Allocate the internal structure. */
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if (cp == NULL) {
+ if ((ret =
+ __os_calloc(dbp->env, 1, sizeof(QUEUE_CURSOR), &cp)) != 0)
+ return (ret);
+ dbc->internal = (DBC_INTERNAL *)cp;
+ }
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __qam_bulk;
+ dbc->am_close = __qamc_close;
+ dbc->am_del = __qamc_del;
+ dbc->am_destroy = __qamc_destroy;
+ dbc->am_get = __qamc_get;
+ dbc->am_put = __qamc_put;
+ dbc->am_writelock = NULL;
+
+ return (0);
+}
+
+/*
+ * __qamc_destroy --
+ * Close a single cursor -- internal version.
+ */
+static int
+__qamc_destroy(dbc)
+ DBC *dbc;
+{
+ /* Discard the structures. */
+ __os_free(dbc->env, dbc->internal);
+
+ return (0);
+}
+
+/*
+ * __qam_getno --
+ * Check the user's record number.
+ */
+static int
+__qam_getno(dbp, key, rep)
+ DB *dbp;
+ const DBT *key;
+ db_recno_t *rep;
+{
+ /* If passed an empty DBT from Java, key->data may be NULL. */
+ if (key->size != sizeof(db_recno_t)) {
+ __db_errx(dbp->env, DB_STR("1143",
+ "illegal record number size"));
+ return (EINVAL);
+ }
+
+ if ((*rep = *(db_recno_t *)key->data) == 0) {
+ __db_errx(dbp->env, DB_STR("1144",
+ "illegal record number of 0"));
+ return (EINVAL);
+ }
+ return (0);
+}
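+
+/*
+ * For reference, a sketch of the key DBT layout __qam_getno expects
+ * from its callers (the record number 42 is arbitrary):
+ *
+ *	DBT key;
+ *	db_recno_t recno = 42;
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &recno;
+ *	key.size = sizeof(recno);
+ */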
+
+/*
+ * __qam_truncate --
+ * Truncate a queue database
+ *
+ * PUBLIC: int __qam_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__qam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ db_pgno_t metapno;
+ u_int32_t count;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ /* Walk the queue, counting rows. */
+ for (count = 0;
+ (ret = __qamc_get(dbc, NULL, NULL, DB_CONSUME, &metapno)) == 0;)
+ count++;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ mpf = dbp->mpf;
+ /* Update the meta page. */
+ metapno = ((QUEUE *)dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+
+ /* Remove the last extent file. */
+ if (meta->cur_recno > 1 && ((QUEUE *)dbp->q_internal)->page_ext != 0) {
+ if ((ret = __qam_fremove(dbp,
+ QAM_RECNO_PAGE(dbp, meta->cur_recno - 1))) != 0)
+ goto err;
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ ret = __qam_mvptr_log(dbp, dbc->txn, &meta->dbmeta.lsn, 0,
+ QAM_SETCUR | QAM_SETFIRST | QAM_TRUNCATE, meta->first_recno,
+ 1, meta->cur_recno, 1, &meta->dbmeta.lsn, PGNO_BASE_MD);
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ if (ret == 0)
+ meta->first_recno = meta->cur_recno = 1;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (countp != NULL)
+ *countp = count;
+
+ return (ret);
+}
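+
+/*
+ * Illustrative sketch, not part of the library: this routine backs
+ * the public DB->truncate method, which an application calls roughly
+ * as follows.
+ *
+ *	u_int32_t count;
+ *	if (dbp->truncate(dbp, NULL, &count, 0) == 0)
+ *		printf("discarded %lu records\n", (u_long)count);
+ */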
+
+/*
+ * __qam_delete --
+ * Queue fast delete function.
+ *
+ * PUBLIC: int __qam_delete __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__qam_delete(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ QUEUE_CURSOR *cp;
+ int ret;
+
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if ((ret = __qam_getno(dbc->dbp, key, &cp->recno)) != 0)
+ goto err;
+
+ ret = __qamc_del(dbc, flags);
+
+err: return (ret);
+}
diff --git a/src/qam/qam.src b/src/qam/qam.src
new file mode 100644
index 00000000..a8e2e4e0
--- /dev/null
+++ b/src/qam/qam.src
@@ -0,0 +1,89 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __qam
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/qam.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * incfirst
+ * Used when we increment first_recno.
+ */
+BEGIN incfirst 42 84
+DB fileid int32_t ld
+ARG recno db_recno_t lu
+ARG meta_pgno db_pgno_t lu
+END
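+
+/*
+ * For reference, gen_rec.awk turns the record description above into
+ * a logging function invoked elsewhere in this change as, e.g.:
+ *
+ *	__qam_incfirst_log(dbp, dbc->txn, &meta->dbmeta.lsn, 0,
+ *	    cp->recno, PGNO_BASE_MD);
+ */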
+
+/*
+ * mvptr
+ * Used when we change one or both of cur_recno and first_recno.
+ */
+BEGIN mvptr 42 85
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG old_first db_recno_t lu
+ARG new_first db_recno_t lu
+ARG old_cur db_recno_t lu
+ARG new_cur db_recno_t lu
+POINTER metalsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+END
+
+
+/*
+ * del
+ * Used when we delete a record.
+ * recno is the record that is being deleted.
+ */
+BEGIN del 42 79
+DB fileid int32_t ld
+POINTER lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG recno db_recno_t lu
+END
+
+/*
+ * add
+ * Used when we put a record on a page.
+ * recno is the record being added.
+ * data is the record itself.
+ */
+BEGIN add 42 80
+DB fileid int32_t ld
+POINTER lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG recno db_recno_t lu
+DBT data DBT s
+ARG vflag u_int32_t lu
+DBT olddata DBT s
+END
+
+/*
+ * delext
+ * Used when we delete a record in extent based queue.
+ * recno is the record that is being deleted.
+ */
+BEGIN delext 42 83
+DB fileid int32_t ld
+POINTER lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG recno db_recno_t lu
+DBT data DBT s
+END
diff --git a/src/qam/qam_auto.c b/src/qam/qam_auto.c
new file mode 100644
index 00000000..604ad3f4
--- /dev/null
+++ b/src/qam/qam_auto.c
@@ -0,0 +1,83 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __qam_incfirst_desc[] = {
+ {LOGREC_DB, SSZ(__qam_incfirst_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__qam_incfirst_args, recno), "recno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_incfirst_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_mvptr_desc[] = {
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__qam_mvptr_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, old_first), "old_first", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, new_first), "new_first", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, old_cur), "old_cur", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, new_cur), "new_cur", "%lu"},
+ {LOGREC_POINTER, SSZ(__qam_mvptr_args, metalsn), "metalsn", ""},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_del_desc[] = {
+ {LOGREC_DB, SSZ(__qam_del_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__qam_del_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__qam_del_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_del_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_del_args, recno), "recno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_add_desc[] = {
+ {LOGREC_DB, SSZ(__qam_add_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__qam_add_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__qam_add_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_add_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_add_args, recno), "recno", "%lu"},
+ {LOGREC_DBT, SSZ(__qam_add_args, data), "data", ""},
+ {LOGREC_ARG, SSZ(__qam_add_args, vflag), "vflag", "%lu"},
+ {LOGREC_DBT, SSZ(__qam_add_args, olddata), "olddata", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_delext_desc[] = {
+ {LOGREC_DB, SSZ(__qam_delext_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__qam_delext_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__qam_delext_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_delext_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_delext_args, recno), "recno", "%lu"},
+ {LOGREC_DBT, SSZ(__qam_delext_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __qam_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__qam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_incfirst_recover, DB___qam_incfirst)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_mvptr_recover, DB___qam_mvptr)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_del_recover, DB___qam_del)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_add_recover, DB___qam_add)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_delext_recover, DB___qam_delext)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/qam/qam_autop.c b/src/qam/qam_autop.c
new file mode 100644
index 00000000..123a0a37
--- /dev/null
+++ b/src/qam/qam_autop.c
@@ -0,0 +1,126 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_QUEUE
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __qam_incfirst_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_incfirst_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_incfirst", __qam_incfirst_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_mvptr_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_mvptr_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_mvptr", __qam_mvptr_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_del_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_del_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_del", __qam_del_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_add_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_add_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_add", __qam_add_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_delext_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_delext_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_delext", __qam_delext_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__qam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_incfirst_print, DB___qam_incfirst)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_mvptr_print, DB___qam_mvptr)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_del_print, DB___qam_del)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_add_print, DB___qam_add)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_delext_print, DB___qam_delext)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_QUEUE */
diff --git a/src/qam/qam_conv.c b/src/qam/qam_conv.c
new file mode 100644
index 00000000..beb7c973
--- /dev/null
+++ b/src/qam/qam_conv.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/qam.h"
+
+/*
+ * __qam_mswap --
+ * Swap the bytes on the queue metadata page.
+ *
+ * PUBLIC: int __qam_mswap __P((ENV *, PAGE *));
+ */
+int
+__qam_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* first_recno */
+ SWAP32(p); /* cur_recno */
+ SWAP32(p); /* re_len */
+ SWAP32(p); /* re_pad */
+ SWAP32(p); /* rec_page */
+ SWAP32(p); /* page_ext */
+ p += 91 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+
+ return (0);
+}
+
+/*
+ * __qam_pgin_out --
+ * Convert host-specific page layout to/from the host-independent format
+ * stored on disk.
+ * We only need to fix up a few fields in the header.
+ *
+ * PUBLIC: int __qam_pgin_out __P((ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__qam_pgin_out(env, pg, pp, cookie)
+ ENV *env;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ QPAGE *h;
+
+ COMPQUIET(pg, 0);
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ if (h->type == P_QAMMETA)
+ return (__qam_mswap(env, pp));
+
+ M_32_SWAP(h->lsn.file);
+ M_32_SWAP(h->lsn.offset);
+ M_32_SWAP(h->pgno);
+
+ return (0);
+}
diff --git a/src/qam/qam_files.c b/src/qam/qam_files.c
new file mode 100644
index 00000000..e9a9ff07
--- /dev/null
+++ b/src/qam/qam_files.c
@@ -0,0 +1,939 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+#define QAM_EXNAME(Q, I, B, L) \
+ snprintf((B), (L), \
+ QUEUE_EXTENT, (Q)->dir, PATH_SEPARATOR[0], (Q)->name, (I))
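+
+/*
+ * For example (illustrative): with (Q)->dir "data" and (Q)->name
+ * "q.db", QAM_EXNAME(qp, 3, buf, sizeof(buf)) formats an extent name
+ * of the form "data/__dbq.q.db.3", per the QUEUE_EXTENT template.
+ */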
+
+/*
+ * __qam_fprobe -- calculate and open extent
+ *
+ * Calculate which extent the page is in, open and create if necessary.
+ *
+ * PUBLIC: int __qam_fprobe __P((DBC *, db_pgno_t,
+ * PUBLIC: void *, qam_probe_mode, DB_CACHE_PRIORITY, u_int32_t));
+ */
+int
+__qam_fprobe(dbc, pgno, addrp, mode, priority, flags)
+ DBC *dbc;
+ db_pgno_t pgno;
+ void *addrp;
+ qam_probe_mode mode;
+ DB_CACHE_PRIORITY priority;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ MPFARRAY *array;
+ QUEUE *qp;
+ u_int8_t fid[DB_FILE_ID_LEN];
+ u_int32_t i, extid, maxext, numext, lflags, offset, oldext, openflags;
+ char buf[DB_MAXPATHLEN];
+ int ftype, less, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+ ret = 0;
+
+ if (qp->page_ext == 0) {
+ mpf = dbp->mpf;
+ switch (mode) {
+ case QAM_PROBE_GET:
+ return (__memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, flags, addrp));
+ case QAM_PROBE_PUT:
+ return (__memp_fput(mpf,
+ dbc->thread_info, addrp, priority));
+ case QAM_PROBE_DIRTY:
+ return (__memp_dirty(mpf, addrp,
+ dbc->thread_info, dbc->txn, priority, flags));
+ case QAM_PROBE_MPF:
+ *(DB_MPOOLFILE **)addrp = mpf;
+ return (0);
+ }
+ }
+
+ mpf = NULL;
+
+ /*
+ * Need to lock long enough to find the mpf or create the file.
+ * The file cannot go away because we must have a record locked
+ * in that file.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+ extid = QAM_PAGE_EXTENT(dbp, pgno);
+
+ /* Array1 will always be in use if array2 is in use. */
+ array = &qp->array1;
+ if (array->n_extent == 0) {
+ /* Start with 4 extents */
+ array->n_extent = 4;
+ array->low_extent = extid;
+ numext = offset = oldext = 0;
+ less = 0;
+ goto alloc;
+ }
+
+retry:
+ if (extid < array->low_extent) {
+ less = 1;
+ offset = array->low_extent - extid;
+ } else {
+ less = 0;
+ offset = extid - array->low_extent;
+ }
+ if (qp->array2.n_extent != 0 &&
+ (extid >= qp->array2.low_extent ?
+ offset > extid - qp->array2.low_extent :
+ offset > qp->array2.low_extent - extid)) {
+ array = &qp->array2;
+ if (extid < array->low_extent) {
+ less = 1;
+ offset = array->low_extent - extid;
+ } else {
+ less = 0;
+ offset = extid - array->low_extent;
+ }
+ }
+
+ /*
+ * Check to see if the requested extent is outside the range of
+ * extents in the array. This is true by default if there are
+ * no extents here yet.
+ */
+ if (less == 1 || offset >= array->n_extent) {
+ oldext = array->n_extent;
+ numext = (array->hi_extent - array->low_extent) + 1;
+ if (less == 1 && offset + numext <= array->n_extent) {
+ /*
+ * If we can fit this one into the existing array by
+ * shifting the existing entries then we do not have
+ * to allocate.
+ */
+ memmove(&array->mpfarray[offset],
+ array->mpfarray, numext
+ * sizeof(array->mpfarray[0]));
+ memset(array->mpfarray, 0, offset
+ * sizeof(array->mpfarray[0]));
+ offset = 0;
+ } else if (less == 0 && offset == array->n_extent &&
+ (mode == QAM_PROBE_GET || mode == QAM_PROBE_PUT) &&
+ array->mpfarray[0].pinref == 0) {
+ /*
+ * If this is at the end of the array and the file at
+ * the beginning has a zero pin count we can close
+ * the bottom extent and put this one at the end.
+ */
+ mpf = array->mpfarray[0].mpf;
+ if (mpf != NULL && (ret = __memp_fclose(mpf, 0)) != 0)
+ goto err;
+ memmove(&array->mpfarray[0], &array->mpfarray[1],
+ (array->n_extent - 1) * sizeof(array->mpfarray[0]));
+ array->low_extent++;
+ array->hi_extent++;
+ offset--;
+ array->mpfarray[offset].mpf = NULL;
+ array->mpfarray[offset].pinref = 0;
+ } else {
+ /*
+ * See if we have wrapped around the queue.
+ * If it has then allocate the second array.
+ * Otherwise just expand the one we are using.
+ */
+ maxext = (u_int32_t) UINT32_MAX
+ / (qp->page_ext * qp->rec_page);
+ if (offset >= maxext/2) {
+ array = &qp->array2;
+ DB_ASSERT(env, array->n_extent == 0);
+ oldext = 0;
+ array->n_extent = 4;
+ array->low_extent = extid;
+ offset = 0;
+ numext = 0;
+ } else if (array->mpfarray[0].pinref == 0) {
+ /*
+ * Check to see if there are extents marked
+ * for deletion at the beginning of the cache.
+ * If so close them so they will go away.
+ */
+ for (i = 0; i < array->n_extent; i++) {
+ if (array->mpfarray[i].pinref != 0)
+ break;
+ mpf = array->mpfarray[i].mpf;
+ if (mpf == NULL)
+ continue;
+ (void)__memp_get_flags(mpf, &lflags);
+ if (!FLD_ISSET(lflags, DB_MPOOL_UNLINK))
+ break;
+
+ array->mpfarray[i].mpf = NULL;
+ if ((ret = __memp_fclose(mpf, 0)) != 0)
+ goto err;
+ }
+ if (i == 0)
+ goto increase;
+ memmove(&array->mpfarray[0],
+ &array->mpfarray[i],
+ (array->n_extent - i) *
+ sizeof(array->mpfarray[0]));
+ memset(&array->mpfarray[array->n_extent - i],
+ '\0', i * sizeof(array->mpfarray[0]));
+ array->low_extent += i;
+ array->hi_extent += i;
+ goto retry;
+ } else {
+ /*
+ * Increase the size to at least include
+ * the new one and double it.
+ */
+increase: array->n_extent += offset;
+ array->n_extent <<= 2;
+ }
+alloc: if ((ret = __os_realloc(env,
+ array->n_extent * sizeof(struct __qmpf),
+ &array->mpfarray)) != 0)
+ goto err;
+
+ if (less == 1) {
+ /*
+ * Move the array up and put the new one
+ * in the first slot.
+ */
+ memmove(&array->mpfarray[offset],
+ array->mpfarray,
+ numext * sizeof(array->mpfarray[0]));
+ memset(array->mpfarray, 0,
+ offset * sizeof(array->mpfarray[0]));
+ memset(&array->mpfarray[numext + offset], 0,
+ (array->n_extent - (numext + offset))
+ * sizeof(array->mpfarray[0]));
+ offset = 0;
+ }
+ else
+ /* Clear the new part of the array. */
+ memset(&array->mpfarray[oldext], 0,
+ (array->n_extent - oldext) *
+ sizeof(array->mpfarray[0]));
+ }
+ }
+
+ /* Update the low and hi range of saved extents. */
+ if (extid < array->low_extent)
+ array->low_extent = extid;
+ if (extid > array->hi_extent)
+ array->hi_extent = extid;
+
+ /* If the extent file is not yet open, open it. */
+ if (array->mpfarray[offset].mpf == NULL) {
+ QAM_EXNAME(qp, extid, buf, sizeof(buf));
+ if ((ret = __memp_fcreate(
+ env, &array->mpfarray[offset].mpf)) != 0)
+ goto err;
+ mpf = array->mpfarray[offset].mpf;
+ (void)__memp_set_lsn_offset(mpf, 0);
+ (void)__memp_set_pgcookie(mpf, &qp->pgcookie);
+ (void)__memp_get_ftype(dbp->mpf, &ftype);
+ (void)__memp_set_ftype(mpf, ftype);
+ (void)__memp_set_clear_len(mpf, dbp->pgsize);
+
+ /* Set up the fileid for this extent. */
+ __qam_exid(dbp, fid, extid);
+ (void)__memp_set_fileid(mpf, fid);
+ openflags = DB_EXTENT;
+ if (LF_ISSET(DB_MPOOL_CREATE))
+ openflags |= DB_CREATE;
+ if (F_ISSET(dbp, DB_AM_RDONLY))
+ openflags |= DB_RDONLY;
+ if (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB))
+ openflags |= DB_DIRECT;
+ if ((ret = __memp_fopen(mpf, NULL,
+ buf, NULL, openflags, qp->mode, dbp->pgsize)) != 0) {
+ array->mpfarray[offset].mpf = NULL;
+ (void)__memp_fclose(mpf, 0);
+ goto err;
+ }
+ }
+
+ /*
+ * We have found the right file. Update its ref count
+ * before dropping the dbp mutex so it does not go away.
+ */
+ mpf = array->mpfarray[offset].mpf;
+ if (mode == QAM_PROBE_GET)
+ array->mpfarray[offset].pinref++;
+
+ /*
+ * If we may create the page, then we are writing and
+ * the file may no longer be empty after this operation,
+ * so we clear the UNLINK flag.
+ */
+ if (LF_ISSET(DB_MPOOL_CREATE))
+ (void)__memp_set_flags(mpf, DB_MPOOL_UNLINK, 0);
+
+err:
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (ret == 0) {
+ pgno--;
+ pgno %= qp->page_ext;
+ switch (mode) {
+ case QAM_PROBE_GET:
+ ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, flags, addrp);
+ if (ret == 0)
+ return (0);
+ break;
+ case QAM_PROBE_PUT:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, addrp, dbp->priority);
+ break;
+ case QAM_PROBE_DIRTY:
+ return (__memp_dirty(mpf, addrp,
+ dbc->thread_info, dbc->txn, dbp->priority, flags));
+ case QAM_PROBE_MPF:
+ *(DB_MPOOLFILE **)addrp = mpf;
+ return (0);
+ }
+
+ MUTEX_LOCK(env, dbp->mutex);
+ /* Recalculate because we dropped the lock. */
+ offset = extid - array->low_extent;
+ DB_ASSERT(env, array->mpfarray[offset].pinref > 0);
+ if (--array->mpfarray[offset].pinref == 0 &&
+ (mode == QAM_PROBE_GET || ret == 0)) {
+ /* Check to see if this file will be unlinked. */
+ (void)__memp_get_flags(mpf, &flags);
+ if (LF_ISSET(DB_MPOOL_UNLINK)) {
+ array->mpfarray[offset].mpf = NULL;
+ if ((t_ret =
+ __memp_fclose(mpf, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ return (ret);
+}
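+
+/*
+ * Illustrative arithmetic for the probe above: with page_ext pages
+ * per extent, page pgno lives at in-extent offset (pgno - 1) %
+ * page_ext, the adjustment applied just before the __memp_fget call.
+ * For example, with page_ext == 4, pages 1..4 map to offsets 0..3 of
+ * the first extent and page 6 maps to offset 1 of the next.
+ */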
+
+/*
+ * __qam_fclose -- close an extent.
+ *
+ * Calculate which extent the page is in and close it.
+ * We assume the mpf entry is present.
+ *
+ * PUBLIC: int __qam_fclose __P((DB *, db_pgno_t));
+ */
+int
+__qam_fclose(dbp, pgnoaddr)
+ DB *dbp;
+ db_pgno_t pgnoaddr;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ MPFARRAY *array;
+ QUEUE *qp;
+ u_int32_t extid, offset;
+ int ret;
+
+ ret = 0;
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+
+ MUTEX_LOCK(env, dbp->mutex);
+
+ extid = QAM_PAGE_EXTENT(dbp, pgnoaddr);
+ array = &qp->array1;
+ if (array->low_extent > extid || array->hi_extent < extid)
+ array = &qp->array2;
+ offset = extid - array->low_extent;
+
+ DB_ASSERT(env,
+ extid >= array->low_extent && offset < array->n_extent);
+
+ /* If other threads are still using this file, leave it. */
+ if (array->mpfarray[offset].pinref != 0)
+ goto done;
+
+ mpf = array->mpfarray[offset].mpf;
+ array->mpfarray[offset].mpf = NULL;
+ ret = __memp_fclose(mpf, 0);
+
+done:
+ MUTEX_UNLOCK(env, dbp->mutex);
+ return (ret);
+}
+
+/*
+ * __qam_fremove -- remove an extent.
+ *
+ * Calculate which extent the page is in and remove it. There is no way
+ * to remove an extent without probing it first and seeing that it is empty,
+ * so we assume the mpf entry is present.
+ *
+ * PUBLIC: int __qam_fremove __P((DB *, db_pgno_t));
+ */
+int
+__qam_fremove(dbp, pgnoaddr)
+ DB *dbp;
+ db_pgno_t pgnoaddr;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ MPFARRAY *array;
+ QUEUE *qp;
+ u_int32_t extid, offset;
+ int ret;
+
+ qp = (QUEUE *)dbp->q_internal;
+ env = dbp->env;
+ ret = 0;
+
+ MUTEX_LOCK(env, dbp->mutex);
+
+ extid = QAM_PAGE_EXTENT(dbp, pgnoaddr);
+ array = &qp->array1;
+ if (array->low_extent > extid || array->hi_extent < extid)
+ array = &qp->array2;
+ offset = extid - array->low_extent;
+
+ DB_ASSERT(env,
+ extid >= array->low_extent && offset < array->n_extent);
+
+ mpf = array->mpfarray[offset].mpf;
+ /* This extent may already be marked for delete and closed. */
+ if (mpf == NULL)
+ goto err;
+
+ /*
+ * The log must be flushed before the file is deleted. We depend on
+ * the log record of the last delete to recreate the file if we crash.
+ */
+ if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
+ goto err;
+
+ (void)__memp_set_flags(mpf, DB_MPOOL_UNLINK, 1);
+ /* Someone could be real slow, let them close it down. */
+ if (array->mpfarray[offset].pinref != 0)
+ goto err;
+ array->mpfarray[offset].mpf = NULL;
+ if ((ret = __memp_fclose(mpf, 0)) != 0)
+ goto err;
+
+ /*
+ * If the file is at the bottom of the array
+ * shift things down and adjust the end points.
+ */
+ if (offset == 0) {
+ memmove(array->mpfarray, &array->mpfarray[1],
+ (array->hi_extent - array->low_extent)
+ * sizeof(array->mpfarray[0]));
+ array->mpfarray[
+ array->hi_extent - array->low_extent].mpf = NULL;
+ if (array->low_extent != array->hi_extent)
+ array->low_extent++;
+ } else {
+ if (extid == array->hi_extent)
+ array->hi_extent--;
+ }
+
+err: MUTEX_UNLOCK(env, dbp->mutex);
+
+ return (ret);
+}
+
+/*
+ * __qam_sync --
+ * Flush the database cache.
+ *
+ * PUBLIC: int __qam_sync __P((DB *));
+ */
+int
+__qam_sync(dbp)
+ DB *dbp;
+{
+ int ret;
+ /*
+ * We can't easily identify the extent files associated with a specific
+ * Queue file, so flush all Queue extent files.
+ */
+ if ((ret = __memp_fsync(dbp->mpf)) != 0)
+ return (ret);
+ if (((QUEUE *)dbp->q_internal)->page_ext != 0)
+ return (__memp_sync_int(
+ dbp->env, NULL, 0, DB_SYNC_QUEUE_EXTENT, NULL, NULL));
+ return (0);
+}
+
+/*
+ * __qam_gen_filelist -- generate a list of extent files.
+ * Another thread may close the handle so this should only
+ * be used single-threaded or with care.
+ *
+ * PUBLIC: int __qam_gen_filelist __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, QUEUE_FILELIST **));
+ */
+int
+__qam_gen_filelist(dbp, ip, filelistp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST **filelistp;
+{
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ QMETA *meta;
+ QUEUE *qp;
+ size_t extent_cnt;
+ db_recno_t i, current, first, stop, rec_extent;
+ QUEUE_FILELIST *fp;
+ int ret;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ qp = (QUEUE *)dbp->q_internal;
+ *filelistp = NULL;
+
+ if (qp->page_ext == 0)
+ return (0);
+
+ /* This may happen during metapage recovery. */
+ if (qp->name == NULL)
+ return (0);
+
+ /* Find out the first and last record numbers in the database. */
+ i = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &i, ip, NULL, 0, &meta)) != 0)
+ return (ret);
+
+ current = meta->cur_recno;
+ first = meta->first_recno;
+
+ if ((ret = __memp_fput(mpf, ip, meta, dbp->priority)) != 0)
+ return (ret);
+
+ /*
+ * Allocate the extent array. Calculate the worst case number of
+ * pages and convert that to a count of extents. The count of
+ * extents has 3 or 4 extra slots:
+ * roundoff at first (e.g., current record in extent);
+ * roundoff at current (e.g., first record in extent);
+ * NULL termination; and
+ * UINT32_MAX wraparound (the last extent can be small).
+ */
+ rec_extent = qp->rec_page * qp->page_ext;
+ if (current >= first)
+ extent_cnt = (current - first) / rec_extent + 3;
+ else
+ extent_cnt =
+ (current + (UINT32_MAX - first)) / rec_extent + 4;
+
+ if (extent_cnt == 0)
+ return (0);
+ if ((ret = __os_calloc(env,
+ extent_cnt, sizeof(QUEUE_FILELIST), filelistp)) != 0)
+ return (ret);
+ fp = *filelistp;
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+again:
+ if (current >= first)
+ stop = current;
+ else
+ stop = UINT32_MAX;
+
+ /*
+ * Make sure that first is at the same offset in the extent as stop.
+ * This guarantees that the stop will be reached in the loop below,
+ * even if it is the only record in its extent. This calculation is
+ * safe because first won't move out of its extent.
+ */
+ first -= first % rec_extent;
+ first += stop % rec_extent;
+
+ for (i = first; i >= first && i <= stop; i += rec_extent) {
+ if ((ret = __qam_fprobe(dbc, QAM_RECNO_PAGE(dbp, i),
+ &fp->mpf, QAM_PROBE_MPF, dbp->priority, 0)) != 0) {
+ if (ret == ENOENT)
+ continue;
+ goto err;
+ }
+ fp->id = QAM_RECNO_EXTENT(dbp, i);
+ fp++;
+ DB_ASSERT(env, (size_t)(fp - *filelistp) < extent_cnt);
+ }
+
+ if (current < first) {
+ first = 1;
+ goto again;
+ }
+
+err: (void)__dbc_close(dbc);
+ return (ret);
+}
+
+/*
+ * __qam_extent_names -- generate a list of extent files names.
+ *
+ * PUBLIC: int __qam_extent_names __P((ENV *, char *, char ***));
+ */
+int
+__qam_extent_names(env, name, namelistp)
+ ENV *env;
+ char *name;
+ char ***namelistp;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ QUEUE *qp;
+ QUEUE_FILELIST *filelist, *fp;
+ size_t len;
+ int cnt, ret, t_ret;
+ char buf[DB_MAXPATHLEN], **cp, *freep;
+
+ *namelistp = NULL;
+ filelist = NULL;
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ if ((ret = __db_open(dbp, ip,
+ NULL, name, NULL, DB_QUEUE, DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto done;
+ qp = dbp->q_internal;
+ if (qp->page_ext == 0)
+ goto done;
+
+ if ((ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ goto done;
+
+ if (filelist == NULL)
+ goto done;
+
+ cnt = 0;
+ for (fp = filelist; fp->mpf != NULL; fp++)
+ cnt++;
+
+ /* QUEUE_EXTENT contains extra chars, but add 6 anyway for the int. */
+ len = (size_t)cnt * (sizeof(**namelistp) +
+ strlen(QUEUE_EXTENT) + strlen(qp->dir) + strlen(qp->name) + 6);
+
+ if ((ret = __os_malloc(dbp->env, len, namelistp)) != 0)
+ goto done;
+ cp = *namelistp;
+ freep = (char *)(cp + cnt + 1);
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ QAM_EXNAME(qp, fp->id, buf, sizeof(buf));
+ len = strlen(buf);
+ *cp++ = freep;
+ (void)strcpy(freep, buf);
+ freep += len + 1;
+ }
+ *cp = NULL;
+
+done:
+ if (filelist != NULL)
+ __os_free(dbp->env, filelist);
+ if ((t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_exid --
+ * Generate a fileid for an extent based on the fileid of the main
+ * file. Since we do not log schema creates/deletes explicitly, the log
+ * never captures the fileid of an extent file. In order that masters and
+ * replicas have the same fileids (so they can explicitly delete them), we
+ * use computed fileids for the extent files of Queue files.
+ *
+ * An extent file id retains the low order 12 bytes of the file id and
+ * overwrites the dev/inode fields, placing a 0 in the inode field, and
+ * the extent number in the dev field.
+ *
+ * PUBLIC: void __qam_exid __P((DB *, u_int8_t *, u_int32_t));
+ */
+void
+__qam_exid(dbp, fidp, exnum)
+ DB *dbp;
+ u_int8_t *fidp;
+ u_int32_t exnum;
+{
+ int i;
+ u_int8_t *p;
+
+ /* Copy the fileid from the master. */
+ memcpy(fidp, dbp->fileid, DB_FILE_ID_LEN);
+
+ /* The first four bytes are the inode or the FileIndexLow; 0 it. */
+ for (i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = 0;
+
+ /* The next four bytes are the dev/FileIndexHigh; insert the exnum. */
+ for (p = (u_int8_t *)&exnum, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+}
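+
+/*
+ * For reference, the resulting 20-byte fileid layout:
+ *
+ *	bytes 0..3	0 (inode/FileIndexLow, cleared)
+ *	bytes 4..7	exnum (dev/FileIndexHigh)
+ *	bytes 8..19	copied unchanged from the master fileid
+ */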
+
+/*
+ * __qam_nameop --
+ * Remove or rename extent files associated with a particular file.
+ * This is to remove or rename (both in mpool and the file system) any
+ * extent files associated with the given dbp.
+ * This is either called from the QUEUE remove or rename methods or
+ * when undoing a transaction that created the database.
+ *
+ * PUBLIC: int __qam_nameop __P((DB *, DB_TXN *, const char *, qam_name_op));
+ */
+int
+__qam_nameop(dbp, txn, newname, op)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *newname;
+ qam_name_op op;
+{
+ ENV *env;
+ QUEUE *qp;
+ size_t exlen, fulllen, len;
+ u_int8_t fid[DB_FILE_ID_LEN];
+ u_int32_t exid;
+ int cnt, i, ret, t_ret;
+ char buf[DB_MAXPATHLEN], nbuf[DB_MAXPATHLEN], sepsave;
+ char *endname, *endpath, *exname, *fullname, **names;
+ char *ndir, *namep, *new, *cp;
+
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+ cnt = ret = t_ret = 0;
+ namep = exname = fullname = NULL;
+ names = NULL;
+
+ /* If this isn't a queue with extents, we're done. */
+ if (qp->page_ext == 0)
+ return (0);
+
+ /*
+ * Generate the list of all queue extents for this file (from the
+ * file system) and then cycle through removing them and evicting
+ * from mpool. We have two modes of operation here. If we are
+ * undoing log operations, then do not write log records and try
+ * to keep going even if we encounter failures in nameop. If we
+ * are in mainline code, then return as soon as we have a problem.
+ * Memory allocation errors (__db_appname, __os_malloc) are always
+	 * considered failures.
+ *
+	 * Set buf to: dir/__dbq.NAME.0 and fullname to HOME/dir/__dbq.NAME.0
+ * or, in the case of an absolute path: /dir/__dbq.NAME.0
+ */
+ QAM_EXNAME(qp, 0, buf, sizeof(buf));
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, buf, &dbp->dirname, &fullname)) != 0)
+ return (ret);
+
+ /* We should always have a path separator here. */
+ if ((endpath = __db_rpath(fullname)) == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
+ sepsave = *endpath;
+ *endpath = '\0';
+
+ /*
+ * Get the list of all names in the directory and restore the
+ * path separator.
+ */
+ if ((ret = __os_dirlist(env, fullname, 0, &names, &cnt)) != 0)
+ goto err;
+ *endpath = sepsave;
+
+ /* If there aren't any names, don't allocate any space. */
+ if (cnt == 0)
+ goto err;
+
+ /*
+	 * Now make endpath point at the file-name component so we can
+	 * match queue extent names.  Then terminate the string at the
+	 * beginning of the extent number, so the bytes between endpath
+	 * and endname (__dbq.NAME.) form the prefix we compare against.
+ */
+ endpath++;
+ endname = strrchr(endpath, '.');
+ if (endname == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
+ ++endname;
+ *endname = '\0';
+ len = strlen(endpath);
+ fulllen = strlen(fullname);
+
+ /* Allocate space for a full extent name. */
+ exlen = fulllen + 20;
+ if ((ret = __os_malloc(env, exlen, &exname)) != 0)
+ goto err;
+
+ ndir = new = NULL;
+ if (newname != NULL) {
+ if ((ret = __os_strdup(env, newname, &namep)) != 0)
+ goto err;
+ ndir = namep;
+ if ((new = __db_rpath(namep)) != NULL)
+ *new++ = '\0';
+ else {
+ new = namep;
+ ndir = PATH_DOT;
+ }
+ }
+ for (i = 0; i < cnt; i++) {
+ /* Check if this is a queue extent file. */
+ if (strncmp(names[i], endpath, len) != 0)
+ continue;
+ /* Make sure we have all numbers. foo.db vs. foo.db.0. */
+ for (cp = &names[i][len]; *cp != '\0'; cp++)
+ if (!isdigit((int)*cp))
+ break;
+ if (*cp != '\0')
+ continue;
+
+ /*
+ * We have a queue extent file. We need to generate its
+ * name and its fileid.
+ */
+ exid = (u_int32_t)strtoul(names[i] + len, NULL, 10);
+ __qam_exid(dbp, fid, exid);
+
+ switch (op) {
+ case QAM_NAME_DISCARD:
+ snprintf(exname, exlen,
+ "%s%s", fullname, names[i] + len);
+ if ((t_ret = __memp_nameop(dbp->env,
+ fid, NULL, exname, NULL,
+ F_ISSET(dbp, DB_AM_INMEM))) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+
+ case QAM_NAME_RENAME:
+ snprintf(nbuf, sizeof(nbuf), QUEUE_EXTENT,
+ ndir, PATH_SEPARATOR[0], new, exid);
+ QAM_EXNAME(qp, exid, buf, sizeof(buf));
+ if ((ret = __fop_rename(env,
+ txn, buf, nbuf, &dbp->dirname, fid, DB_APP_DATA, 1,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ?
+ DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ break;
+
+ case QAM_NAME_REMOVE:
+ QAM_EXNAME(qp, exid, buf, sizeof(buf));
+ if ((ret = __fop_remove(env, txn, fid,
+ buf, &dbp->dirname,
+ DB_APP_DATA, F_ISSET(dbp, DB_AM_NOT_DURABLE) ?
+ DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ break;
+ }
+ }
+
+err: if (fullname != NULL)
+ __os_free(env, fullname);
+ if (exname != NULL)
+ __os_free(env, exname);
+ if (namep != NULL)
+ __os_free(env, namep);
+ if (names != NULL)
+ __os_dirfree(env, names, cnt);
+ return (ret);
+}
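+
+/*
+ * Editorial worked example (assumed names): for a queue "q.db" in data
+ * directory "d", QAM_EXNAME sets buf to "d/__dbq.q.db.0" and __db_appname
+ * expands fullname to "HOME/d/__dbq.q.db.0".  After the strrchr() trim,
+ * endpath is "__dbq.q.db.", so directory entries "__dbq.q.db.0" and
+ * "__dbq.q.db.12" match, while "q.db" fails the prefix compare and
+ * "__dbq.q.db.tmp" fails the all-digits check.
+ */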
+
+/*
+ * __qam_lsn_reset -- reset the lsns for extents.
+ *
+ * PUBLIC: int __qam_lsn_reset __P((DB *, DB_THREAD_INFO *));
+ */
+int
+__qam_lsn_reset(dbp, ip)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+{
+ QUEUE *qp;
+ QUEUE_FILELIST *filelist, *fp;
+ int ret;
+
+ qp = dbp->q_internal;
+ if (qp->page_ext == 0)
+ return (0);
+
+ if ((ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ return (ret);
+
+ if (filelist == NULL)
+ return (ret);
+
+ for (fp = filelist; fp->mpf != NULL; fp++)
+ if ((ret = __db_lsn_reset(fp->mpf, ip)) != 0)
+ break;
+
+ __os_free(dbp->env, filelist);
+ return (ret);
+}
+
+/*
+ * __qam_backup_extents --
+ * Routine to safely copy the active queue extents of a database.
+ * PUBLIC: int __qam_backup_extents __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, const char *, u_int32_t));
+ */
+int
+__qam_backup_extents(dbp, ip, target, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ const char *target;
+ u_int32_t flags;
+{
+ DB_FH *filep;
+ QUEUE *qp;
+ QUEUE_FILELIST *fp, *filelist;
+ int ret, t_ret;
+ char buf[DB_MAXPATHLEN];
+ void *handle;
+
+ if ((ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ return (ret);
+
+ if (filelist == NULL)
+ return (0);
+
+ qp = dbp->q_internal;
+
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ QAM_EXNAME(qp, fp->id, buf, sizeof(buf));
+ if ((ret = __memp_backup_open(dbp->dbenv->env,
+ fp->mpf, buf, target, flags, &filep, &handle)) == 0)
+ ret = __memp_backup_mpf(dbp->dbenv->env, fp->mpf, ip,
+ 0, fp->mpf->mfp->last_pgno, filep, handle, flags);
+ if ((t_ret = __memp_backup_close(dbp->dbenv->env,
+ fp->mpf, buf, filep, handle)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ break;
+ }
+
+ __os_free(dbp->env, filelist);
+
+ return (ret);
+}
diff --git a/src/qam/qam_method.c b/src/qam/qam_method.c
new file mode 100644
index 00000000..0867e5dd
--- /dev/null
+++ b/src/qam/qam_method.c
@@ -0,0 +1,399 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __qam_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ const char *, const char *, const char *, qam_name_op));
+static int __qam_set_extentsize __P((DB *, u_int32_t));
+
+/*
+ * __qam_db_create --
+ * Queue specific initialization of the DB structure.
+ *
+ * PUBLIC: int __qam_db_create __P((DB *));
+ */
+int
+__qam_db_create(dbp)
+ DB *dbp;
+{
+ QUEUE *t;
+ int ret;
+
+ /* Allocate and initialize the private queue structure. */
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(QUEUE), &t)) != 0)
+ return (ret);
+ dbp->q_internal = t;
+ dbp->get_q_extentsize = __qam_get_extentsize;
+ dbp->set_q_extentsize = __qam_set_extentsize;
+
+ t->re_pad = ' ';
+
+ return (0);
+}
+
+/*
+ * __qam_db_close --
+ * Queue specific discard of the DB structure.
+ *
+ * PUBLIC: int __qam_db_close __P((DB *, u_int32_t));
+ */
+int
+__qam_db_close(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ MPFARRAY *array;
+ QUEUE *t;
+ struct __qmpf *mpfp;
+ u_int32_t i;
+ int ret, t_ret;
+
+ ret = 0;
+ if ((t = dbp->q_internal) == NULL)
+ return (0);
+
+ array = &t->array1;
+again:
+ mpfp = array->mpfarray;
+ if (mpfp != NULL) {
+ for (i = array->low_extent;
+ i <= array->hi_extent; i++, mpfp++) {
+ mpf = mpfp->mpf;
+ mpfp->mpf = NULL;
+ if (mpf != NULL && (t_ret = __memp_fclose(mpf,
+ LF_ISSET(DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0))
+ != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(dbp->env, array->mpfarray);
+ }
+ if (t->array2.n_extent != 0) {
+ array = &t->array2;
+ array->n_extent = 0;
+ goto again;
+ }
+
+ if (LF_ISSET(DB_AM_DISCARD) &&
+ (t_ret = __qam_nameop(dbp, NULL,
+ NULL, QAM_NAME_DISCARD)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (t->path != NULL)
+ __os_free(dbp->env, t->path);
+ __os_free(dbp->env, t);
+ dbp->q_internal = NULL;
+
+ return (ret);
+}
+
+/*
+ * __qam_get_extentsize --
+ * The DB->q_get_extentsize method.
+ *
+ * PUBLIC: int __qam_get_extentsize __P((DB *, u_int32_t *));
+ */
+int
+__qam_get_extentsize(dbp, q_extentsizep)
+ DB *dbp;
+ u_int32_t *q_extentsizep;
+{
+ *q_extentsizep = ((QUEUE*)dbp->q_internal)->page_ext;
+ return (0);
+}
+
+static int
+__qam_set_extentsize(dbp, extentsize)
+ DB *dbp;
+ u_int32_t extentsize;
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_extentsize");
+
+ if (extentsize < 1) {
+ __db_errx(dbp->env, DB_STR("1140",
+ "Extent size must be at least 1"));
+ return (EINVAL);
+ }
+
+ ((QUEUE*)dbp->q_internal)->page_ext = extentsize;
+
+ return (0);
+}
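+
+/*
+ * Editorial caller sketch (hypothetical handle and names): the extent
+ * size must be configured before the database is opened, e.g.
+ *
+ *	DB *dbp;
+ *	int ret;
+ *	if ((ret = db_create(&dbp, dbenv, 0)) == 0 &&
+ *	    (ret = dbp->set_q_extentsize(dbp, 4)) == 0)
+ *		ret = dbp->open(dbp,
+ *		    NULL, "q.db", NULL, DB_QUEUE, DB_CREATE, 0);
+ */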
+
+/*
+ * __queue_pageinfo -
+ * Given a dbp, get first/last page information about a queue.
+ *
+ * PUBLIC: int __queue_pageinfo __P((DB *, db_pgno_t *, db_pgno_t *,
+ * PUBLIC: int *, int, u_int32_t));
+ */
+int
+__queue_pageinfo(dbp, firstp, lastp, emptyp, prpage, flags)
+ DB *dbp;
+ db_pgno_t *firstp, *lastp;
+ int *emptyp;
+ int prpage;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ QMETA *meta;
+ db_pgno_t first, i, last;
+ int empty, ret, t_ret;
+
+ mpf = dbp->mpf;
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+
+ /* Find out the page number of the last page in the database. */
+ i = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &i, ip, NULL, 0, &meta)) != 0)
+ return (ret);
+
+ first = QAM_RECNO_PAGE(dbp, meta->first_recno);
+ last = QAM_RECNO_PAGE(
+ dbp, meta->cur_recno == 1 ? 1 : meta->cur_recno - 1);
+
+ empty = meta->cur_recno == meta->first_recno;
+ if (firstp != NULL)
+ *firstp = first;
+ if (lastp != NULL)
+ *lastp = last;
+ if (emptyp != NULL)
+ *emptyp = empty;
+#ifdef HAVE_STATISTICS
+ if (prpage)
+ ret = __db_prpage(dbp, (PAGE *)meta, flags);
+#else
+ COMPQUIET(prpage, 0);
+ COMPQUIET(flags, 0);
+#endif
+
+ if ((t_ret = __memp_fput(mpf,
+ ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
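+
+/*
+ * Editorial note: first > last means the record numbers have wrapped past
+ * UINT32_MAX, so callers such as __db_prqueue() below walk the pages in
+ * two passes, first..QAM_RECNO_PAGE(dbp, UINT32_MAX) and then 1..last.
+ */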
+
+#ifdef HAVE_STATISTICS
+/*
+ * __db_prqueue --
+ * Print out a queue
+ *
+ * PUBLIC: int __db_prqueue __P((DB *, u_int32_t));
+ */
+int
+__db_prqueue(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t first, i, last, pg_ext, stop;
+ int empty, ret, t_ret;
+
+ if ((ret = __queue_pageinfo(dbp, &first, &last, &empty, 1, flags)) != 0)
+ return (ret);
+
+ if (empty || ret != 0)
+ return (ret);
+
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+ i = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+
+ /* Dump each page. */
+ pg_ext = ((QUEUE *)dbp->q_internal)->page_ext;
+begin:
+ for (; i <= stop; ++i) {
+ if ((ret = __qam_fget(dbc, &i, 0, &h)) != 0) {
+ if (pg_ext == 0) {
+ if (ret == DB_PAGE_NOTFOUND && first == last)
+ ret = 0;
+ goto err;
+ }
+ if (ret == ENOENT || ret == DB_PAGE_NOTFOUND) {
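+				/*
+				 * Land on the last page of the missing
+				 * extent; the loop increment then moves
+				 * to the first page of the next extent
+				 * (e.g., pg_ext 4, i 6: i becomes 8,
+				 * then ++i yields 9).
+				 */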
+ i += (pg_ext - ((i - 1) % pg_ext)) - 1;
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ (void)__db_prpage(dbp, h, flags);
+ if ((ret = __qam_fput(dbc, i, h, dbp->priority)) != 0)
+ goto err;
+ }
+
+ if (first > last) {
+ i = 1;
+ stop = last;
+ first = last;
+ goto begin;
+ }
+
+err:
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+#endif
+
+/*
+ * __qam_remove --
+ * Remove method for a Queue.
+ *
+ * PUBLIC: int __qam_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__qam_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__qam_rr(dbp, ip, txn, name, subdb, NULL, QAM_NAME_REMOVE));
+}
+
+/*
+ * __qam_rename --
+ * Rename method for a Queue.
+ *
+ * PUBLIC: int __qam_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *));
+ */
+int
+__qam_rename(dbp, ip, txn, name, subdb, newname)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ return (__qam_rr(dbp, ip, txn, name, subdb, newname, QAM_NAME_RENAME));
+}
+
+/*
+ * __qam_rr --
+ * Remove/Rename method for a Queue.
+ */
+static int
+__qam_rr(dbp, ip, txn, name, subdb, newname, op)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ qam_name_op op;
+{
+ DB *tmpdbp;
+ ENV *env;
+ QUEUE *qp;
+ int ret, t_ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ if (subdb != NULL && name != NULL) {
+ __db_errx(env, DB_STR("1141",
+ "Queue does not support multiple databases per file"));
+ return (EINVAL);
+ }
+
+ /*
+ * Since regular rename no longer opens the database, we may have
+ * to do it here.
+ */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ tmpdbp = dbp;
+ else {
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to make sure we don't self-deadlock, so give
+ * this dbp the same locker as the incoming one.
+ */
+ tmpdbp->locker = dbp->locker;
+ if ((ret = __db_open(tmpdbp, ip, txn,
+ name, NULL, DB_QUEUE, DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+ }
+
+ qp = (QUEUE *)tmpdbp->q_internal;
+ if (qp->page_ext != 0)
+ ret = __qam_nameop(tmpdbp, txn, newname, op);
+
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+err: /*
+ * Since we copied the locker ID from the dbp, we'd better not
+ * free it here.
+ */
+ tmpdbp->locker = NULL;
+
+ /* We need to remove the lock event we associated with this. */
+ if (txn != NULL)
+ __txn_remlock(env,
+ txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ if ((t_ret = __db_close(tmpdbp,
+ txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
+/*
+ * __qam_map_flags --
+ * Map queue-specific flags from public to the internal values.
+ *
+ * PUBLIC: void __qam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__qam_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_INORDER)) {
+ FLD_SET(*outflagsp, DB_AM_INORDER);
+ FLD_CLR(*inflagsp, DB_INORDER);
+ }
+}
+
+/*
+ * __qam_set_flags --
+ * Set queue-specific flags.
+ *
+ * PUBLIC: int __qam_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__qam_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+
+ __qam_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+}
diff --git a/src/qam/qam_open.c b/src/qam/qam_open.c
new file mode 100644
index 00000000..69f6cb75
--- /dev/null
+++ b/src/qam/qam_open.c
@@ -0,0 +1,346 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/fop.h"
+
+static int __qam_init_meta __P((DB *, QMETA *));
+
+/*
+ * __qam_open --
+ *	Open a queue database and set up the private QUEUE information.
+ *
+ * PUBLIC: int __qam_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, int, u_int32_t));
+ */
+int
+__qam_open(dbp, ip, txn, name, base_pgno, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ int mode;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ QMETA *qmeta;
+ QUEUE *t;
+ int ret, t_ret;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ t = dbp->q_internal;
+ ret = 0;
+ qmeta = NULL;
+
+ if (name == NULL && t->page_ext != 0) {
+ __db_errx(env, DB_STR("1134",
+ "Extent size may not be specified for in-memory queue database"));
+ return (EINVAL);
+ }
+
+ if (MULTIVERSION(dbp)) {
+ __db_errx(env, DB_STR("1135",
+ "Multiversion queue databases are not supported"));
+ return (EINVAL);
+ }
+
+ /* Initialize the remaining fields/methods of the DB. */
+ dbp->db_am_remove = __qam_remove;
+ dbp->db_am_rename = __qam_rename;
+
+ /*
+ * Get a cursor. If DB_CREATE is specified, we may be creating
+ * pages, and to do that safely in CDB we need a write cursor.
+ * In STD_LOCKING mode, we'll synchronize using the meta page
+ * lock instead.
+ */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc,
+ LF_ISSET(DB_CREATE) && CDB_LOCKING(env) ?
+ DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /*
+ * Get the meta data page. It must exist, because creates of
+ * files/databases come in through the __qam_new_file interface
+ * and queue doesn't support subdatabases.
+ */
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, txn, 0, &qmeta)) != 0)
+ goto err;
+
+ /* If the magic number is incorrect, that's a fatal error. */
+ if (qmeta->dbmeta.magic != DB_QAMMAGIC) {
+ __db_errx(env, DB_STR_A("1136",
+ "__qam_open: %s: unexpected file type or format", "%s"),
+ name);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Setup information needed to open extents. */
+ t->page_ext = qmeta->page_ext;
+
+ if (t->page_ext != 0 && (ret = __qam_set_ext_data(dbp, name)) != 0)
+ goto err;
+
+ if (mode == 0)
+ mode = DB_MODE_660;
+ t->mode = mode;
+ t->re_pad = (int)qmeta->re_pad;
+ t->re_len = qmeta->re_len;
+ t->rec_page = qmeta->rec_page;
+
+ t->q_meta = base_pgno;
+ t->q_root = base_pgno + 1;
+
+err: if (qmeta != NULL && (t_ret =
+ __memp_fput(mpf, ip, qmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_set_ext_data --
+ * Setup DBP data for opening queue extents.
+ *
+ * PUBLIC: int __qam_set_ext_data __P((DB*, const char *));
+ */
+int
+__qam_set_ext_data(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ QUEUE *t;
+ int ret;
+
+ t = dbp->q_internal;
+ t->pginfo.db_pagesize = dbp->pgsize;
+ t->pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ t->pginfo.type = dbp->type;
+ t->pgcookie.data = &t->pginfo;
+ t->pgcookie.size = sizeof(DB_PGINFO);
+
+ if ((ret = __os_strdup(dbp->env, name, &t->path)) != 0)
+ return (ret);
+ t->dir = t->path;
+ if ((t->name = __db_rpath(t->path)) == NULL) {
+ t->name = t->path;
+ t->dir = PATH_DOT;
+ } else
+ *t->name++ = '\0';
+
+ return (0);
+}
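+
+/*
+ * Editorial example (assumed path): for name "data/q.db", __db_rpath()
+ * finds the last path separator, so t->dir becomes "data" and t->name
+ * becomes "q.db"; for a bare "q.db" there is no separator, so t->dir
+ * falls back to PATH_DOT (".").
+ */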
+
+/*
+ * __qam_metachk --
+ *	Check a queue meta-data page and initialize the DB handle from it.
+ *
+ * PUBLIC: int __qam_metachk __P((DB *, const char *, QMETA *));
+ */
+int
+__qam_metachk(dbp, name, qmeta)
+ DB *dbp;
+ const char *name;
+ QMETA *qmeta;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * At this point, all we know is that the magic number is for a Queue.
+ * Check the version, the database may be out of date.
+ */
+ vers = qmeta->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 1:
+ case 2:
+ __db_errx(env, DB_STR_A("1137",
+ "%s: queue version %lu requires a version upgrade",
+ "%s %lu"), name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 3:
+ case 4:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1138",
+ "%s: unsupported qam version: %lu", "%s %lu"),
+ name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __qam_mswap(env, (PAGE *)qmeta)) != 0)
+ return (ret);
+
+ /* Check the type. */
+ if (dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
+ return (EINVAL);
+ dbp->type = DB_QUEUE;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE);
+
+ /* Set the page size. */
+ dbp->pgsize = qmeta->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, qmeta->dbmeta.uid, DB_FILE_ID_LEN);
+
+ /* Set up AM-specific methods that do not require an open. */
+ dbp->db_am_rename = __qam_rename;
+ dbp->db_am_remove = __qam_remove;
+
+ return (ret);
+}
+
+/*
+ * __qam_init_meta --
+ * Initialize the meta-data for a Queue database.
+ */
+static int
+__qam_init_meta(dbp, meta)
+ DB *dbp;
+ QMETA *meta;
+{
+ ENV *env;
+ QUEUE *t;
+
+ env = dbp->env;
+ t = dbp->q_internal;
+
+ memset(meta, 0, sizeof(QMETA));
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ meta->dbmeta.pgno = PGNO_BASE_MD;
+ meta->dbmeta.last_pgno = 0;
+ meta->dbmeta.magic = DB_QAMMAGIC;
+ meta->dbmeta.version = DB_QAMVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_QAMMETA;
+ meta->re_pad = (u_int32_t)t->re_pad;
+ meta->re_len = t->re_len;
+ meta->rec_page = CALC_QAM_RECNO_PER_PAGE(dbp);
+ meta->cur_recno = 1;
+ meta->first_recno = 1;
+ meta->page_ext = t->page_ext;
+ t->rec_page = meta->rec_page;
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ /* Verify that we can fit at least one record per page. */
+ if (QAM_RECNO_PER_PAGE(dbp) < 1) {
+ __db_errx(env, DB_STR_A("1139",
+ "Record size of %lu too large for page size of %lu",
+ "%lu %lu"), (u_long)t->re_len, (u_long)dbp->pgsize);
+ return (EINVAL);
+ }
+
+ return (0);
+}
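+
+/*
+ * Editorial note on the records-per-page check (approximate arithmetic,
+ * assuming a few dozen bytes of page header and 4-byte alignment of the
+ * per-record QAMDATA header): a 4KB page with re_len 100 holds roughly
+ * (4096 - 32) / 104, about 39 records, so the check only fails when
+ * re_len approaches the page size itself.
+ */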
+
+/*
+ * __qam_new_file --
+ * Create the necessary pages to begin a new queue database file.
+ *
+ * PUBLIC: int __qam_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__qam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ DBT pdbt;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ QMETA *meta;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ /*
+ * Build meta-data page.
+ *
+ * This code appears more complex than it is because of the two cases
+ * (named and unnamed).
+ *
+ * For each page being created, there are three parts: 1) a "get page"
+ * chunk (which either uses malloc'd memory or calls __memp_fget), 2)
+ * the initialization, and 3) the "put page" chunk which either does a
+ * fop write or an __memp_fput.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ mpf = dbp->mpf;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+
+ if ((ret = __qam_init_meta(dbp, meta)) != 0)
+ goto err1;
+
+ if ((ret = __db_log_page(dbp,
+ txn, &meta->dbmeta.lsn, pgno, (PAGE *)meta)) != 0)
+ goto err1;
+err1: if ((t_ret =
+ __memp_fput(mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ env = dbp->env;
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &meta)) != 0)
+ return (ret);
+
+ if ((ret = __qam_init_meta(dbp, meta)) != 0)
+ goto err2;
+
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = DB_QUEUE;
+ DB_SET_DBT(pdbt, &pginfo, sizeof(pginfo));
+ if ((ret =
+ __db_pgout(env->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err2;
+ ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp, dbp->pgsize, 0, 0, meta, dbp->pgsize, 1,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+err2: __os_free(env, meta);
+ }
+
+ return (ret);
+}
diff --git a/src/qam/qam_rec.c b/src/qam/qam_rec.c
new file mode 100644
index 00000000..c9ff6c83
--- /dev/null
+++ b/src/qam/qam_rec.c
@@ -0,0 +1,687 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __qam_adjust_first __P((DB *, DBC *, QMETA *, db_recno_t));
+
+/*
+ * LSNs in queue data pages are advisory. They do not have to be accurate
+ * as all operations are idempotent on records. They should not be rolled
+ * forward during recovery as a committed transaction may obscure updates from
+ * an incomplete transaction that updates the same page. The incomplete
+ * transaction may be completed during a later hot backup cycle.
+ */
+
+/* Queue version of REC_DIRTY -- needs to probe the correct file. */
+#define QAM_DIRTY(dbc, pgno, pagep) \
+ if ((ret = __qam_dirty((dbc), \
+ pgno, pagep, (dbc)->priority)) != 0) { \
+ ret = __db_pgerr((dbc)->dbp, (pgno), ret); \
+ goto out; \
+ }
+
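+/*
+ * __qam_adjust_first --
+ *	Advance meta->first_recno past deleted records, removing any
+ *	fully-consumed extent files along the way.
+ */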
+static int
+__qam_adjust_first(file_dbp, dbc, meta, recno)
+ DB *file_dbp;
+ DBC *dbc;
+ QMETA *meta;
+ db_recno_t recno;
+{
+ QUEUE_CURSOR *cp;
+ u_int32_t rec_ext;
+ int exact, ret;
+
+ ret = 0;
+ if (meta->page_ext == 0)
+ rec_ext = 0;
+ else
+ rec_ext = meta->page_ext * meta->rec_page;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if (meta->first_recno == RECNO_OOB)
+ meta->first_recno++;
+ while (meta->first_recno != meta->cur_recno &&
+ !QAM_BEFORE_FIRST(meta, recno)) {
+ if ((ret = __qam_position(dbc,
+ &meta->first_recno, 0, &exact)) != 0)
+ return (ret);
+ if (cp->page != NULL && (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ return (ret);
+
+ if (exact == 1)
+ break;
+ if (cp->page != NULL &&
+ rec_ext != 0 && meta->first_recno % rec_ext == 0)
+ if ((ret =
+ __qam_fremove(file_dbp, cp->pgno)) != 0)
+ return (ret);
+ REC_DIRTY(file_dbp->mpf,
+ dbc->thread_info, dbc->priority, &meta);
+ QAM_INC_RECNO(meta->first_recno);
+ }
+out: return (ret);
+}
+
+/*
+ * __qam_incfirst_recover --
+ * Recovery function for incfirst.
+ *
+ * PUBLIC: int __qam_incfirst_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_incfirst_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_incfirst_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN trunc_lsn;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ db_pgno_t metapg;
+ int ret;
+
+ COMPQUIET(meta, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_incfirst_print);
+ REC_INTRO(__qam_incfirst_read, ip, 0);
+
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ 0, &meta)) != 0) {
+ if (DB_REDO(op)) {
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ DB_MPOOL_CREATE, &meta)) != 0)
+ goto out;
+ meta->dbmeta.pgno = metapg;
+ meta->dbmeta.type = P_QAMMETA;
+ } else {
+ *lsnp = argp->prev_lsn;
+ goto out;
+ }
+ }
+
+ /*
+ * Only move first_recno backwards so we pick up the aborted delete.
+ * When going forward we need to be careful since
+ * we may have bumped over a locked record.
+ */
+ if (DB_UNDO(op)) {
+ if (QAM_BEFORE_FIRST(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
+
+ trunc_lsn = ((DB_TXNHEAD *)info)->trunc_lsn;
+ /* if we are truncating, update the LSN */
+ if (!IS_ZERO_LSN(trunc_lsn) &&
+ LOG_COMPARE(&LSN(meta), &trunc_lsn) > 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ LSN(meta) = trunc_lsn;
+ }
+ } else {
+ if (LOG_COMPARE(&LSN(meta), lsnp) < 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ LSN(meta) = *lsnp;
+ }
+ if ((ret = __qam_adjust_first(file_dbp,
+ dbc, meta, argp->recno + 1)) != 0)
+ goto err;
+ }
+
+ ret = __memp_fput(mpf, ip, meta, dbc->priority);
+ if (ret != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)__memp_fput(mpf, ip, meta, dbc->priority);
+ }
+
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_mvptr_recover --
+ * Recovery function for mvptr.
+ *
+ * PUBLIC: int __qam_mvptr_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_mvptr_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_mvptr_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN trunc_lsn;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_pgno_t metapg;
+ int cmp_n, cmp_p, exact, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_mvptr_print);
+ REC_INTRO(__qam_mvptr_read, ip, 0);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL, 0, &meta)) != 0) {
+ if (DB_REDO(op)) {
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ DB_MPOOL_CREATE, &meta)) != 0) {
+ goto out;
+ }
+ meta->dbmeta.pgno = metapg;
+ meta->dbmeta.type = P_QAMMETA;
+ } else {
+ *lsnp = argp->prev_lsn;
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->metalsn);
+
+ /*
+ * Under normal circumstances, we never undo a movement of one of
+ * the pointers. Just move them along regardless of abort/commit.
+ * When going forward we need to verify that this is really where
+ * the pointer belongs. A transaction may roll back and reinsert
+ * a record that was missing at the time of this action.
+ *
+ * If we're undoing a truncate, we need to reset the pointers to
+ * their state before the truncate.
+ */
+ if (DB_UNDO(op)) {
+ if ((argp->opcode & QAM_TRUNCATE) && cmp_n <= 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->old_first;
+ meta->cur_recno = argp->old_cur;
+ LSN(meta) = argp->metalsn;
+ }
+ /* If the page lsn is beyond the truncate point, move it back */
+ trunc_lsn = ((DB_TXNHEAD *)info)->trunc_lsn;
+ if (!IS_ZERO_LSN(trunc_lsn) &&
+ LOG_COMPARE(&trunc_lsn, &LSN(meta)) < 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ LSN(meta) = argp->metalsn;
+ }
+ } else if (op == DB_TXN_APPLY || cmp_p == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if ((argp->opcode & QAM_SETFIRST) &&
+ meta->first_recno == argp->old_first) {
+ if (argp->old_first > argp->new_first)
+ meta->first_recno = argp->new_first;
+ else {
+ if ((ret = __qam_position(dbc,
+ &meta->first_recno, 0, &exact)) != 0)
+ goto err;
+ if (!exact)
+ meta->first_recno = argp->new_first;
+ if (cp->page != NULL &&
+ (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ }
+ }
+
+ if ((argp->opcode & QAM_SETCUR) &&
+ meta->cur_recno == argp->old_cur) {
+ if (argp->old_cur < argp->new_cur)
+ meta->cur_recno = argp->new_cur;
+ else {
+ if ((ret = __qam_position(dbc,
+ &meta->cur_recno, 0, &exact)) != 0)
+ goto err;
+ if (!exact)
+ meta->cur_recno = argp->new_cur;
+ if (cp->page != NULL &&
+ (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ }
+ }
+
+ meta->dbmeta.lsn = *lsnp;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)__memp_fput(mpf, ip, meta, dbc->priority);
+ }
+
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_del_recover --
+ * Recovery function for del.
+ * Non-extent version or if there is no data (zero len).
+ *
+ * PUBLIC: int __qam_del_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_del_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_del_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ QAMDATA *qp;
+ QMETA *meta;
+ QPAGE *pagep;
+ db_pgno_t metapg;
+ int cmp_n, ret, t_ret;
+
+ COMPQUIET(pagep, NULL);
+ meta = NULL;
+ pagep = NULL;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_del_print);
+ REC_INTRO(__qam_del_read, ip, 0);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ /* Get the meta page before latching the page. */
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapg,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+ goto err;
+
+ if ((ret = __qam_fget(dbc, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto err;
+
+ if (pagep->pgno == PGNO_INVALID) {
+ QAM_DIRTY(dbc, argp->pgno, &pagep);
+ pagep->pgno = argp->pgno;
+ pagep->type = P_QAMDATA;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+
+ if (DB_UNDO(op)) {
+ /* make sure first is behind us */
+ if (meta->first_recno == RECNO_OOB ||
+ (QAM_BEFORE_FIRST(meta, argp->recno) &&
+ (meta->first_recno <= meta->cur_recno ||
+ meta->first_recno -
+ argp->recno < argp->recno - meta->cur_recno))) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
+
+ /* Need to undo delete - mark the record as present */
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ F_SET(qp, QAM_VALID);
+
+ /*
+ * Move the LSN back to this point; do not move it forward.
+ * If we're in an abort, because we don't hold a page lock,
+ * we could foul up a concurrent put. Having too late an
+	 * LSN is harmless in queue except when we're determining
+ * what we need to roll forward during recovery. [#2588]
+ */
+ if (cmp_n <= 0 && op == DB_TXN_BACKWARD_ROLL)
+ LSN(pagep) = argp->lsn;
+
+ if (op == DB_TXN_ABORT)
+ QAM_WAKEUP(dbc, ret);
+
+ } else if (op == DB_TXN_APPLY || (cmp_n > 0 && DB_REDO(op))) {
+ /* Need to redo delete - clear the valid bit */
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ F_CLR(qp, QAM_VALID);
+
+ /*
+ * We only move the LSN forward during replication.
+ * During recovery we could obscure an update from
+ * a partially completed transaction while processing
+ * a hot backup. [#13823]
+ */
+ if (op == DB_TXN_APPLY)
+ LSN(pagep) = *lsnp;
+ if ((ret = __qam_fput(dbc,
+ argp->pgno, pagep, dbc->priority)) != 0)
+ goto err;
+ pagep = NULL;
+ if ((ret = __qam_adjust_first(file_dbp,
+ dbc, meta, argp->recno)) != 0)
+ goto err;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+err: if (pagep != NULL && (t_ret =
+ __qam_fput(dbc, argp->pgno, pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret =
+ __memp_fput(mpf, ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_delext_recover --
+ * Recovery function for del in an extent based queue.
+ *
+ * PUBLIC: int __qam_delext_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_delext_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_delext_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ QAMDATA *qp;
+ QMETA *meta;
+ QPAGE *pagep;
+ db_pgno_t metapg;
+ int cmp_n, ret, t_ret;
+
+ COMPQUIET(pagep, NULL);
+ meta = NULL;
+ pagep = NULL;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_delext_print);
+ REC_INTRO(__qam_delext_read, ip, 0);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ DB_MPOOL_EDIT, &meta)) != 0)
+ goto err;
+
+ if ((ret = __qam_fget(dbc, &argp->pgno,
+ DB_REDO(op) ? 0 : DB_MPOOL_CREATE, &pagep)) != 0) {
+ /*
+ * If we are redoing a delete and the page is not there
+ * we are done.
+ */
+ if (DB_REDO(op) && (ret == DB_PAGE_NOTFOUND || ret == ENOENT))
+ goto done;
+ goto out;
+ }
+
+ if (pagep->pgno == PGNO_INVALID) {
+ QAM_DIRTY(dbc, argp->pgno, &pagep);
+ pagep->pgno = argp->pgno;
+ pagep->type = P_QAMDATA;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+
+ if (DB_UNDO(op)) {
+ /* make sure first is behind us */
+ if (meta->first_recno == RECNO_OOB ||
+ (QAM_BEFORE_FIRST(meta, argp->recno) &&
+ (meta->first_recno <= meta->cur_recno ||
+ meta->first_recno -
+ argp->recno < argp->recno - meta->cur_recno))) {
+ meta->first_recno = argp->recno;
+ }
+
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ if ((ret = __qam_pitem(dbc, pagep,
+ argp->indx, argp->recno, &argp->data)) != 0)
+ goto err;
+
+ /*
+ * Move the LSN back to this point; do not move it forward.
+ * If we're in an abort, because we don't hold a page lock,
+ * we could foul up a concurrent put. Having too late an
+ * LSN is harmless in queue except when we're determining
+ * what we need to roll forward during recovery. [#2588]
+ */
+ if (cmp_n <= 0 && op == DB_TXN_BACKWARD_ROLL)
+ LSN(pagep) = argp->lsn;
+
+ if (op == DB_TXN_ABORT)
+ QAM_WAKEUP(dbc, ret);
+
+ } else if (op == DB_TXN_APPLY || (cmp_n > 0 && DB_REDO(op))) {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ /* Need to redo delete - clear the valid bit */
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ F_CLR(qp, QAM_VALID);
+ /*
+ * We only move the LSN forward during replication.
+ * During recovery we could obscure an update from
+ * a partially completed transaction while processing
+ * a hot backup. [#13823]
+ */
+ if (op == DB_TXN_APPLY)
+ LSN(pagep) = *lsnp;
+ if ((ret = __qam_fput(dbc,
+ argp->pgno, pagep, dbc->priority)) != 0)
+ goto err;
+ pagep = NULL;
+ if ((ret = __qam_adjust_first(file_dbp,
+ dbc, meta, argp->recno)) != 0)
+ goto err;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+err: if (pagep != NULL && (t_ret =
+ __qam_fput(dbc, argp->pgno, pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret =
+ __memp_fput(mpf, ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_add_recover --
+ * Recovery function for add.
+ *
+ * PUBLIC: int __qam_add_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_add_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_add_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ QAMDATA *qp;
+ QMETA *meta;
+ QPAGE *pagep;
+ db_pgno_t metapg;
+ int cmp_n, ret;
+
+ COMPQUIET(pagep, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_add_print);
+ REC_INTRO(__qam_add_read, ip, 1);
+
+ if ((ret = __qam_fget(dbc, &argp->pgno,
+ DB_UNDO(op) ? 0 : DB_MPOOL_CREATE, &pagep)) != 0) {
+ /*
+ * If we are undoing an append and the page is not there
+ * we are done.
+ */
+ if (DB_UNDO(op) && (ret == DB_PAGE_NOTFOUND || ret == ENOENT))
+ goto done;
+ goto out;
+ }
+
+ if (pagep->pgno == PGNO_INVALID) {
+ QAM_DIRTY(dbc, argp->pgno, &pagep);
+ pagep->pgno = argp->pgno;
+ pagep->type = P_QAMDATA;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+
+ if (DB_REDO(op)) {
+ /* Fix meta-data page. */
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ 0, &meta)) != 0)
+ goto err;
+ if (QAM_BEFORE_FIRST(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
+ if (argp->recno == meta->cur_recno ||
+ QAM_AFTER_CURRENT(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->cur_recno = argp->recno + 1;
+ }
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+
+ /* Now update the actual page if necessary. */
+ if (op == DB_TXN_APPLY || cmp_n > 0) {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ /* Need to redo add - put the record on page */
+ if ((ret = __qam_pitem(dbc,
+ pagep, argp->indx, argp->recno, &argp->data)) != 0)
+ goto err;
+ /*
+ * We only move the LSN forward during replication.
+ * During recovery we could obscure an update from
+ * a partially completed transaction while processing
+ * a hot backup. [#13823]
+ */
+ if (op == DB_TXN_APPLY) {
+ LSN(pagep) = *lsnp;
+ QAM_WAKEUP(dbc, ret);
+ }
+ }
+ } else if (DB_UNDO(op)) {
+ /*
+		 * Need to undo the add.  If this was an overwrite, put the
+		 * old record back; otherwise just clear the valid bit.
+ */
+ if (argp->olddata.size != 0) {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ if ((ret = __qam_pitem(dbc, pagep,
+ argp->indx, argp->recno, &argp->olddata)) != 0)
+ goto err;
+
+ if (!(argp->vflag & QAM_VALID)) {
+ qp = QAM_GET_RECORD(
+ file_dbp, pagep, argp->indx);
+ F_CLR(qp, QAM_VALID);
+ }
+ } else {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ qp->flags = 0;
+ }
+
+ /*
+ * Move the LSN back to this point; do not move it forward.
+ * If we're in an abort, because we don't hold a page lock,
+ * we could foul up a concurrent put. Having too late an
+ * LSN is harmless in queue except when we're determining
+ * what we need to roll forward during recovery. [#2588]
+ */
+ if (cmp_n <= 0 && op == DB_TXN_BACKWARD_ROLL)
+ LSN(pagep) = argp->lsn;
+ }
+
+ if ((ret = __qam_fput(dbc, argp->pgno, pagep, dbc->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)__qam_fput(dbc, argp->pgno, pagep, dbc->priority);
+ }
+
+out: REC_CLOSE;
+}
diff --git a/src/qam/qam_stat.c b/src/qam/qam_stat.c
new file mode 100644
index 00000000..15c41bb5
--- /dev/null
+++ b/src/qam/qam_stat.c
@@ -0,0 +1,255 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __qam_stat --
+ * Gather/print the qam statistics
+ *
+ * PUBLIC: int __qam_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__qam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ DB_QUEUE_STAT *sp;
+ PAGE *h;
+ QAMDATA *qp, *ep;
+ QMETA *meta;
+ QUEUE *t;
+ db_indx_t indx;
+ db_pgno_t first, last, pgno, pg_ext, stop;
+ u_int32_t re_len;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ sp = NULL;
+ t = dbp->q_internal;
+
+ if (spp == NULL)
+ return (0);
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(dbp->env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+
+ /* Determine the last page of the database. */
+ if ((ret = __db_lget(dbc, 0, t->q_meta, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->q_meta,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (flags == DB_FAST_STAT) {
+ sp->qs_nkeys = meta->dbmeta.key_count;
+ sp->qs_ndata = meta->dbmeta.record_count;
+ goto meta_only;
+ }
+
+ first = QAM_RECNO_PAGE(dbp, meta->first_recno);
+ last = QAM_RECNO_PAGE(dbp, meta->cur_recno);
+
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ pgno = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+
+ /* Dump each page. */
+ pg_ext = ((QUEUE *)dbp->q_internal)->page_ext;
+begin:
+ /* Walk through the pages and count. */
+ for (; pgno <= stop; ++pgno) {
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ ret = __qam_fget(dbc, &pgno, 0, &h);
+ if (ret == ENOENT) {
+ pgno += pg_ext - 1;
+ continue;
+ }
+ if (ret == DB_PAGE_NOTFOUND) {
+ if (pg_ext == 0) {
+ if (pgno != stop && first != last)
+ goto err;
+ ret = 0;
+ break;
+ }
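+			/*
+			 * Same extent-skip arithmetic as in __db_prqueue():
+			 * land on the last page of the missing extent so
+			 * the loop increment starts the next one.
+			 */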
+ pgno += (pg_ext - ((pgno - 1) % pg_ext)) - 1;
+ continue;
+ }
+ if (ret != 0)
+ goto err;
+
+ ++sp->qs_pages;
+
+ ep = (QAMDATA *)((u_int8_t *)h + dbp->pgsize - re_len);
+ for (indx = 0, qp = QAM_GET_RECORD(dbp, h, indx);
+ qp <= ep;
+ ++indx, qp = QAM_GET_RECORD(dbp, h, indx)) {
+ if (F_ISSET(qp, QAM_VALID))
+ sp->qs_ndata++;
+ else
+ sp->qs_pgfree += re_len;
+ }
+
+ ret = __qam_fput(dbc, pgno, h, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if (first > last) {
+ pgno = 1;
+ stop = last;
+ first = last;
+ goto begin;
+ }
+
+ /* Get the meta-data page. */
+ if ((ret = __db_lget(dbc,
+ 0, t->q_meta, F_ISSET(dbp, DB_AM_RDONLY) ?
+ DB_LOCK_READ : DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->q_meta, dbc->thread_info, dbc->txn,
+ F_ISSET(dbp, DB_AM_RDONLY) ? 0 : DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ if (!F_ISSET(dbp, DB_AM_RDONLY))
+ meta->dbmeta.key_count =
+ meta->dbmeta.record_count = sp->qs_ndata;
+ sp->qs_nkeys = sp->qs_ndata;
+
+meta_only:
+ /* Get the metadata fields. */
+ sp->qs_magic = meta->dbmeta.magic;
+ sp->qs_version = meta->dbmeta.version;
+ sp->qs_metaflags = meta->dbmeta.flags;
+ sp->qs_pagesize = meta->dbmeta.pagesize;
+ sp->qs_extentsize = meta->page_ext;
+ sp->qs_re_len = meta->re_len;
+ sp->qs_re_pad = meta->re_pad;
+ sp->qs_first_recno = meta->first_recno;
+ sp->qs_cur_recno = meta->cur_recno;
+
+ /* Discard the meta-data page. */
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ *(DB_QUEUE_STAT **)spp = sp;
+
+ if (0) {
+err: if (sp != NULL)
+ __os_ufree(dbp->env, sp);
+ }
+
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_stat_print --
+ * Display queue statistics.
+ *
+ * PUBLIC: int __qam_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__qam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_QUEUE_STAT *sp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __qam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Queue database information:");
+ }
+ __db_msg(env, "%lx\tQueue magic number", (u_long)sp->qs_magic);
+ __db_msg(env, "%lu\tQueue version number", (u_long)sp->qs_version);
+ __db_dl(env, "Fixed-length record size", (u_long)sp->qs_re_len);
+ __db_msg(env, "%#x\tFixed-length record pad", (int)sp->qs_re_pad);
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->qs_pagesize);
+ __db_dl(env,
+ "Underlying database extent size", (u_long)sp->qs_extentsize);
+ __db_dl(env,
+ "Number of records in the database", (u_long)sp->qs_nkeys);
+ __db_dl(env,
+ "Number of data items in the database", (u_long)sp->qs_ndata);
+ __db_dl(env, "Number of database pages", (u_long)sp->qs_pages);
+ __db_dl_pct(env,
+ "Number of bytes free in database pages",
+ (u_long)sp->qs_pgfree,
+ DB_PCT_PG(sp->qs_pgfree, sp->qs_pages, sp->qs_pagesize), "ff");
+ __db_msg(env,
+ "%lu\tFirst undeleted record", (u_long)sp->qs_first_recno);
+ __db_msg(env,
+ "%lu\tNext available record number", (u_long)sp->qs_cur_recno);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__qam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
diff --git a/src/qam/qam_stub.c b/src/qam/qam_stub.c
new file mode 100644
index 00000000..f5140079
--- /dev/null
+++ b/src/qam/qam_stub.c
@@ -0,0 +1,339 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_QUEUE
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/qam.h"
+
+/*
+ * If the library wasn't compiled with the Queue access method, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+/*
+ * __db_no_queue_am --
+ * Error when a Berkeley DB build doesn't include the access method.
+ *
+ * PUBLIC: int __db_no_queue_am __P((ENV *));
+ */
+int
+__db_no_queue_am(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("1145",
+ "library build did not include support for the Queue access method"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__db_prqueue(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_31_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(buf, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_32_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(buf, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ COMPQUIET(new_dbc, NULL);
+ return (__db_no_queue_am(orig_dbc->env));
+}
+
+int
+__qamc_init(dbc)
+ DBC *dbc;
+{
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_db_close(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__qam_db_create(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__qam_extent_names(env, name, namelistp)
+ ENV *env;
+ char *name;
+ char ***namelistp;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(namelistp, NULL);
+ return (__db_no_queue_am(env));
+}
+
+int
+__qam_gen_filelist(dbp, ip, filelistp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST **filelistp;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(filelistp, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__qam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__qam_metachk(dbp, name, qmeta)
+ DB *dbp;
+ const char *name;
+ QMETA *qmeta;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(qmeta, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ COMPQUIET(pg, NULL);
+ return (__db_no_queue_am(env));
+}
+
+int
+__qam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(name, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_open(dbp, ip, txn, name, base_pgno, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ int mode;
+ u_int32_t flags;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(base_pgno, 0);
+ COMPQUIET(mode, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_pgin_out(env, pg, pp, cookie)
+ ENV *env;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_queue_am(env));
+}
+
+int
+__qam_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(h, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_set_ext_data(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ COMPQUIET(name, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_sync(dbp)
+ DB *dbp;
+{
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ COMPQUIET(countp, NULL);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_vrfy_data(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QPAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(meta, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+#endif /* !HAVE_QUEUE */
diff --git a/src/qam/qam_upgrade.c b/src/qam/qam_upgrade.c
new file mode 100644
index 00000000..ac96c889
--- /dev/null
+++ b/src/qam/qam_upgrade.c
@@ -0,0 +1,101 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_upgrade.h"
+#include "dbinc/db_page.h"
+#include "dbinc/qam.h"
+
+/*
+ * __qam_31_qammeta --
+ * Upgrade the database from version 1 to version 2.
+ *
+ * PUBLIC: int __qam_31_qammeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__qam_31_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ QMETA30 *oldmeta;
+ QMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+
+ newmeta = (QMETA31 *)buf;
+ oldmeta = (QMETA30 *)buf;
+
+ /*
+ * Copy the fields to their new locations.
+	 * They may overlap, so start at the bottom and use memmove().
+ */
+ newmeta->rec_page = oldmeta->rec_page;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->cur_recno = oldmeta->cur_recno;
+ newmeta->first_recno = oldmeta->first_recno;
+ newmeta->start = oldmeta->start;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Update the version. */
+ newmeta->dbmeta.version = 2;
+
+ return (0);
+}
+
+/*
+ * __qam_32_qammeta --
+ * Upgrade the database from version 2 to version 3.
+ *
+ * PUBLIC: int __qam_32_qammeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__qam_32_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ QMETA31 *oldmeta;
+ QMETA32 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+
+ newmeta = (QMETA32 *)buf;
+ oldmeta = (QMETA31 *)buf;
+
+ /*
+ * Copy the fields to their new locations.
+	 * We are dropping the first field, so move from the top.
+ */
+ newmeta->first_recno = oldmeta->first_recno;
+ newmeta->cur_recno = oldmeta->cur_recno;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->rec_page = oldmeta->rec_page;
+ newmeta->page_ext = 0;
+ /* cur_recno now points to the first free slot. */
+ newmeta->cur_recno++;
+ if (newmeta->first_recno == 0)
+ newmeta->first_recno = 1;
+
+ /* Update the version. */
+ newmeta->dbmeta.version = 3;
+
+ return (0);
+}
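+
+/*
+ * Editorial example (follows from the adjustments above): a version-2
+ * queue whose last record was 10 and whose first_recno was 0 upgrades to
+ * cur_recno 11 (the first free slot) and first_recno 1, matching the
+ * version-3 convention that record numbers start at 1.
+ */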
diff --git a/src/qam/qam_verify.c b/src/qam/qam_verify.c
new file mode 100644
index 00000000..af5ab5db
--- /dev/null
+++ b/src/qam/qam_verify.c
@@ -0,0 +1,653 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+/*
+ * __qam_vrfy_meta --
+ * Verify the queue-specific part of a metadata page.
+ *
+ * PUBLIC: int __qam_vrfy_meta __P((DB *, VRFY_DBINFO *, QMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__qam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ QUEUE *qp;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t *extents, extid, first, last;
+ size_t len;
+ int count, i, isbad, nextents, ret, t_ret;
+ char *buf, **names;
+
+ COMPQUIET(count, 0);
+
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+ extents = NULL;
+ first = last = 0;
+ isbad = 0;
+ buf = NULL;
+ names = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /*
+ * Queue can't be used in subdatabases, so if this isn't set
+ * something very odd is going on.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE))
+ EPRINT((env, DB_STR_A("1146",
+ "Page %lu: queue databases must be one-per-file",
+ "%lu"), (u_long)pgno));
+
+ /*
+ * We have already checked the common fields in __db_vrfy_pagezero.
+	 * However, we used the on-disk metadata page, which may have been stale.
+ * We now have the page from mpool, so check that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Because the metapage pointers are rolled forward by
+ * aborting transactions, the extent of the queue may
+ * extend beyond the allocated pages, so we do
+ * not check that meta_current is within the allocated
+ * pages.
+ */
+
+ /*
+ * re_len: If this is bad, we can't safely verify queue data pages, so
+ * return DB_VERIFY_FATAL
+ */
+ if (DB_ALIGN(meta->re_len + sizeof(QAMDATA) - 1, sizeof(u_int32_t)) *
+ meta->rec_page + QPAGE_SZ(dbp) > dbp->pgsize) {
+ EPRINT((env, DB_STR_A("1147",
+ "Page %lu: queue record length %lu too high for page size and recs/page",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->re_len));
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ } else {
+ /*
+ * We initialize the Queue internal pointer; we may need
+ * it when handling extents. It would get set up in open,
+ * if we called open normally, but we don't.
+ */
+ vdp->re_pad = meta->re_pad;
+ qp->re_pad = (int)meta->re_pad;
+ qp->re_len = vdp->re_len = meta->re_len;
+ qp->rec_page = vdp->rec_page = meta->rec_page;
+ qp->page_ext = vdp->page_ext = meta->page_ext;
+ }
+
+ /*
+ * There's no formal maximum extentsize, and a 0 value represents
+ * no extents, so there's nothing to verify.
+ *
+ * Note that since QUEUE databases can't have subdatabases, it's an
+ * error to see more than one QUEUE metadata page in a single
+ * verifier run. Theoretically, this should really be a structure
+ * rather than a per-page check, but since we're setting qp fields
+ * here (and have only one qp to set) we raise the alarm now if
+ * this assumption fails. (We need the qp info to be reasonable
+ * before we do per-page verification of queue extents.)
+ */
+ if (F_ISSET(vdp, VRFY_QMETA_SET)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1148",
+ "Page %lu: database contains multiple Queue metadata pages",
+ "%lu"), (u_long)pgno));
+ goto err;
+ }
+ F_SET(vdp, VRFY_QMETA_SET);
+ qp->page_ext = meta->page_ext;
+ dbp->pgsize = meta->dbmeta.pagesize;
+ qp->q_meta = pgno;
+ qp->q_root = pgno + 1;
+ vdp->first_recno = meta->first_recno;
+ vdp->last_recno = meta->cur_recno;
+ if (qp->page_ext != 0) {
+ first = QAM_RECNO_EXTENT(dbp, vdp->first_recno);
+ last = QAM_RECNO_EXTENT(dbp, vdp->last_recno);
+ }
+
+ /*
+ * Look in the data directory to see if there are any extents
+ * around that are not in the range of the queue. If so,
+ * then report that and look there if we are salvaging.
+ */
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, qp->dir, NULL, &buf)) != 0)
+ goto err;
+ if ((ret = __os_dirlist(env, buf, 0, &names, &count)) != 0)
+ goto err;
+ __os_free(env, buf);
+ buf = NULL;
+
+ /* In-memory dbs cannot have extents. */
+ nextents = 0;
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ len = strlen(QUEUE_EXTENT_HEAD) + strlen(qp->name) + 1;
+ if ((ret = __os_malloc(env, len, &buf)) != 0)
+ goto err;
+ len = (size_t)snprintf(buf, len, QUEUE_EXTENT_HEAD, qp->name);
+ for (i = 0; i < count; i++) {
+ if (strncmp(names[i], buf, len) == 0) {
+ /* Only save extents out of bounds. */
+ extid = (db_pgno_t)strtoul(
+ &names[i][len], NULL, 10);
+ if (qp->page_ext != 0 &&
+ (last > first ?
+ (extid >= first && extid <= last) :
+ (extid >= first || extid <= last)))
+ continue;
+ if (extents == NULL && (ret = __os_malloc(
+ env, (size_t)(count - i) * sizeof(extid),
+ &extents)) != 0)
+ goto err;
+ extents[nextents] = extid;
+ nextents++;
+ }
+ }
+ }
+ if (nextents > 0)
+ __db_errx(env, DB_STR_A("1149",
+ "Warning: %d extra extent files found", "%d"), nextents);
+ vdp->nextents = nextents;
+ vdp->extents = extents;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (names != NULL)
+ __os_dirfree(env, names, count);
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (ret != 0 && extents != NULL)
+ __os_free(env, extents);
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __qam_meta2pgset --
+ * For a given Queue meta page, add all of the db's pages to the pgset. Dealing
+ * with extents complicates things, as it is possible for there to be gaps in
+ * the page number sequence (the user could have re-inserted record numbers that
+ * had been on deleted extents) so we test the existence of each extent before
+ * adding its pages to the pgset. If there are no extents, just loop from
+ * first_recno to last_recno.
+ *
+ * PUBLIC: int __qam_meta2pgset __P((DB *, VRFY_DBINFO *, DB *));
+ */
+int
+__qam_meta2pgset(dbp, vdp, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB *pgset;
+{
+ DBC *dbc;
+ PAGE *h;
+ db_pgno_t first, last, pgno, pg_ext, stop;
+ int ret, t_ret;
+ u_int32_t i;
+
+ ret = 0;
+ h = NULL;
+ if (vdp->last_recno <= vdp->first_recno)
+ return (0);
+
+ pg_ext = vdp->page_ext;
+
+ first = QAM_RECNO_PAGE(dbp, vdp->first_recno);
+
+ /*
+ * last_recno gives the next recno to be allocated, we want the last
+ * allocated recno.
+ */
+ last = QAM_RECNO_PAGE(dbp, vdp->last_recno - 1);
+
+ if (first == PGNO_INVALID || last == PGNO_INVALID)
+ return (DB_VERIFY_BAD);
+
+ pgno = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+
+ /*
+ * If this db doesn't have extents, just add all page numbers from first
+ * to last.
+ */
+ if (pg_ext == 0) {
+ for (pgno = first; pgno <= stop; pgno++)
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ break;
+ if (first > last)
+ for (pgno = 1; pgno <= last; pgno++)
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0)
+ break;
+
+ return (ret);
+ }
+
+ if ((ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+ return (ret);
+ /*
+ * Check if we can get the first page of each extent. If we can, then
+ * add all of that extent's pages to the pgset. If we can't, assume the
+ * extent doesn't exist and don't add any pages, if we're wrong we'll
+ * find the pages in __db_vrfy_walkpages.
+ */
+begin: for (; pgno <= stop; pgno += pg_ext) {
+ if ((ret = __qam_fget(dbc, &pgno, 0, &h)) != 0) {
+ if (ret == ENOENT || ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if ((ret = __qam_fput(dbc, pgno, h, dbp->priority)) != 0)
+ goto err;
+
+ for (i = 0; i < pg_ext && pgno + i <= last; i++)
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno + i)) != 0)
+ goto err;
+
+ /*
+ * The first recno won't always occur on the first page of the
+ * extent. Back up to the beginning of the extent before the
+ * end of the loop so that the increment works correctly.
+ */
+ if (pgno == first)
+ pgno = pgno % pg_ext + 1;
+ }
+
+ if (first > last) {
+ pgno = 1;
+ first = last;
+ stop = last;
+ goto begin;
+ }
+
+err:
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_vrfy_data --
+ * Verify a queue data page.
+ *
+ * PUBLIC: int __qam_vrfy_data __P((DB *, VRFY_DBINFO *, QPAGE *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__qam_vrfy_data(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QPAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ DB fakedb;
+ struct __queue fakeq;
+ QAMDATA *qp;
+ db_recno_t i;
+
+ /*
+ * Not much to do here, except make sure that flags are reasonable.
+ *
+ * QAM_GET_RECORD assumes a properly initialized q_internal
+ * structure, however, and we don't have one, so we play
+ * some gross games to fake it out.
+ */
+ fakedb.q_internal = &fakeq;
+ fakedb.flags = dbp->flags;
+ fakeq.re_len = vdp->re_len;
+
+ for (i = 0; i < vdp->rec_page; i++) {
+ qp = QAM_GET_RECORD(&fakedb, h, i);
+ if ((u_int8_t *)qp >= (u_int8_t *)h + dbp->pgsize) {
+ EPRINT((dbp->env, DB_STR_A("1150",
+ "Page %lu: queue record %lu extends past end of page",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ if (qp->flags & ~(QAM_VALID | QAM_SET)) {
+ EPRINT((dbp->env, DB_STR_A("1151",
+ "Page %lu: queue record %lu has bad flags (%#lx)",
+ "%lu %lu %#lx"), (u_long)pgno, (u_long)i,
+ (u_long)qp->flags));
+ return (DB_VERIFY_BAD);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __qam_vrfy_structure --
+ * Verify a queue database structure, such as it is.
+ *
+ * PUBLIC: int __qam_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+ */
+int
+__qam_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i;
+ int ret, isbad;
+
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+ return (ret);
+
+ if (pip->type != P_QAMMETA) {
+ EPRINT((dbp->env, DB_STR_A("1152",
+ "Page %lu: queue database has no meta page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_pgset_inc(
+ vdp->pgset, vdp->thread_info, vdp->txn, 0)) != 0)
+ goto err;
+
+ for (i = 1; i <= vdp->last_pgno; i++) {
+ /* Send feedback to the application about our progress. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 ||
+ (ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ return (ret);
+ if (!F_ISSET(pip, VRFY_IS_ALLZEROES) &&
+ pip->type != P_QAMDATA && !F_ISSET(pip, VRFY_NONEXISTENT)) {
+ EPRINT((dbp->env, DB_STR_A("1153",
+ "Page %lu: queue database page of incorrect type %lu",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ goto err;
+ } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, i)) != 0)
+ goto err;
+ }
+
+err: if ((ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0)
+ return (ret);
+ return (isbad == 1 ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __qam_vrfy_walkqueue --
+ * Do a "walkpages" per-page verification pass over the set of Queue
+ * extent pages.
+ *
+ * PUBLIC: int __qam_vrfy_walkqueue __P((DB *, VRFY_DBINFO *, void *,
+ * PUBLIC: int (*)(void *, const void *), u_int32_t));
+ */
+int
+__qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ PAGE *h;
+ QUEUE *qp;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t first, i, last, pg_ext, stop;
+ int isbad, nextents, ret, t_ret;
+
+ COMPQUIET(h, NULL);
+
+ env = dbp->env;
+ qp = dbp->q_internal;
+ pip = NULL;
+ pg_ext = qp->page_ext;
+ isbad = ret = t_ret = 0;
+ h = NULL;
+
+ /* If this database has no extents, we've seen all the pages already. */
+ if (pg_ext == 0)
+ return (0);
+
+ first = QAM_RECNO_PAGE(dbp, vdp->first_recno);
+ last = QAM_RECNO_PAGE(dbp, vdp->last_recno);
+
+ i = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+ nextents = vdp->nextents;
+
+ /* Verify/salvage each page. */
+ if ((ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+ return (ret);
+begin: for (; i <= stop; i++) {
+ /*
+ * If DB_SALVAGE is set, we inspect our database of completed
+ * pages, and skip any we've already printed in the subdb pass.
+ */
+ if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
+ continue;
+ if ((t_ret = __qam_fget(dbc, &i, 0, &h)) != 0) {
+ if (t_ret == ENOENT || t_ret == DB_PAGE_NOTFOUND) {
+ i += (pg_ext - ((i - 1) % pg_ext)) - 1;
+ continue;
+ }
+
+ /*
+ * If an individual page get fails, keep going iff
+ * we're salvaging.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+ h = NULL;
+ ret = t_ret;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_SALVAGE)) {
+ /*
+ * We pretty much don't want to quit unless a
+ * bomb hits. May as well return that something
+ * was screwy, however.
+ */
+ if ((t_ret = __db_salvage_pg(dbp,
+ vdp, i, h, handle, callback, flags)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ isbad = 1;
+ }
+ } else {
+ /*
+ * If we are not salvaging, and we get any error
+ * other than DB_VERIFY_BAD, return immediately;
+ * it may not be safe to proceed. If we get
+ * DB_VERIFY_BAD, keep going; listing more errors
+ * may make it easier to diagnose problems and
+ * determine the magnitude of the corruption.
+ */
+ if ((ret = __db_vrfy_common(dbp,
+ vdp, h, i, flags)) == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ goto put;
+ if (pip->type != P_QAMDATA) {
+ EPRINT((env, DB_STR_A("1154",
+ "Page %lu: queue database page of incorrect type %lu",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, i)) != 0)
+ goto err;
+ if ((ret = __qam_vrfy_data(dbp, vdp,
+ (QPAGE *)h, i, flags)) == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+
+put: if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err1;
+ pip = NULL;
+ }
+
+ /* Again, keep going iff we're salvaging. */
+ if ((t_ret = __qam_fput(dbc, i, h, dbp->priority)) != 0) {
+ if (LF_ISSET(DB_SALVAGE)) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+ ret = t_ret;
+ goto err1;
+ }
+ }
+
+ if (first > last) {
+ i = 1;
+ stop = last;
+ first = last;
+ goto begin;
+ }
+
+ /*
+ * Now check to see if there were any lingering
+ * extents and dump their data.
+ */
+ if (LF_ISSET(DB_SALVAGE) && nextents != 0) {
+ nextents--;
+ i = 1 +
+ vdp->extents[nextents] * vdp->page_ext;
+ stop = i + vdp->page_ext;
+ goto begin;
+ }
+
+ if (0) {
+err: if (h != NULL && (t_ret =
+ __qam_fput(dbc, i, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pip != NULL && (t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+err1: if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __qam_salvage --
+ * Safely dump out all recnos and data on a queue page.
+ *
+ * PUBLIC: int __qam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
+ * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__qam_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBT dbt, key;
+ QAMDATA *qp, *qep;
+ db_recno_t recno;
+ int ret, err_ret, t_ret;
+ u_int32_t pagesize, qlen;
+ u_int32_t i;
+
+ memset(&dbt, 0, sizeof(DBT));
+ memset(&key, 0, sizeof(DBT));
+
+ err_ret = ret = 0;
+
+ pagesize = (u_int32_t)dbp->mpf->mfp->pagesize;
+ qlen = ((QUEUE *)dbp->q_internal)->re_len;
+ dbt.size = qlen;
+ key.data = &recno;
+ key.size = sizeof(recno);
+ recno = (pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
+ i = 0;
+ qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
+ for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
+ recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
+ if (F_ISSET(qp, ~(QAM_VALID|QAM_SET)))
+ continue;
+ if (!F_ISSET(qp, QAM_SET))
+ continue;
+
+ if (!LF_ISSET(DB_AGGRESSIVE) && !F_ISSET(qp, QAM_VALID))
+ continue;
+
+ dbt.data = qp->data;
+ if ((ret = __db_vrfy_prdbt(&key,
+ 0, " ", handle, callback, 1, 0, vdp)) != 0)
+ err_ret = ret;
+
+ if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ }
+
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+ return ((ret == 0 && err_ret != 0) ? err_ret : ret);
+}
diff --git a/src/rep/mlease.html b/src/rep/mlease.html
new file mode 100644
index 00000000..7d44b465
--- /dev/null
+++ b/src/rep/mlease.html
@@ -0,0 +1,1198 @@
+<!DOCTYPE doctype PUBLIC "-//w3c//dtd html 4.0 transitional//en">
+<!--Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.-->
+<html>
+<head>
+ <meta http-equiv="Content-Type"
+ content="text/html; charset=iso-8859-1">
+ <meta name="GENERATOR"
+ content="Mozilla/4.76 [en] (X11; U; FreeBSD 4.3-RELEASE i386) [Netscape]">
+ <title>Master Lease</title>
+</head>
+<body>
+<center>
+<h1>Master Leases for Berkeley DB</h1>
+</center>
+<center><i>Susan LoVerso</i> <br>
+<i>sue@sleepycat.com</i> <br>
+<i>Rev 1.1</i><br>
+<i>2007 Feb 2</i><br>
+</center>
+<p><br>
+</p>
+<h2>What are Master Leases?</h2>
+A master lease is a mechanism whereby clients grant mastership rights
+to a site and that master, by holding those lease rights, can provide a
+guarantee of durability to a replication group for a given period of
+time.&nbsp; By granting a lease to a master,
+a client will not participate in an election to elect a new
+master until that granted master lease has expired.&nbsp; By holding a
+collection of granted leases, a master is able to satisfy
+authoritative read requests from applications.&nbsp; By holding leases, a
+read operation on a master can guarantee several things to the
+application:<br>
+<ol>
+ <li>Authoritative reads: a guarantee that the data being read by the
+application is durable and can never be rolled back.</li>
+ <li>Freshness: a guarantee that the data being read by the
+application <b>at the master</b> is
+not stale.</li>
+ <li>Master viability: a guarantee that a current master with valid
+leases will not encounter a duplicate master situation.<br>
+ </li>
+</ol>
+<h2>Requirements</h2>
+The requirements of DB to support this include:<br>
+<ul>
+ <li>After turning leases on, users can choose whether or not
+individual read operations ignore them.</li>
+ <li>We are providing read authority on the master only.&nbsp; A
+read on a client is equivalent to a read while ignoring leases.</li>
+ <li>We guarantee that data committed on a master <b>that has been
+read by an application on the
+master</b> will not be rolled back.&nbsp; Data read on a client or
+while ignoring leases <i>or data
+successfully updated/committed but not read,</i>
+may be rolled back.<br>
+ </li>
+ <li>A master will not return successfully from a read operation
+unless it holds a
+majority of leases, or unless the read ignores leases.</li>
+ <li>Master leases will remove the possibility of a current/correct
+master being "shot down" by DUPMASTER.&nbsp; <b>NOTE: Old/Expired
+masters may discover a
+later master and return DUPMASTER to the application however.</b><br>
+ </li>
+ <li>Any send callback failure must result in premature lease
+expiration on the master.<br>
+ </li>
+ <li>Users who change the system clock while master leases are in use
+void the guarantee and may get undefined behavior.&nbsp; We assume time
+always runs forward. <br>
+ </li>
+ <li>Clients are forbidden from participating in elections while they
+have an outstanding lease granted to another site.</li>
+ <li>Clients are forbidden from accepting a new master while they have
+an outstanding lease granted to another site.</li>
+ <li>Clients are forbidden from upgrading themselves to master while
+they have an outstanding lease granted to another site.</li>
+ <li>When asked for a lease grant explicitly by the master, the client
+cannot grant the lease to the master unless the LSN in the master's
+request has been processed by this client.<br>
+ </li>
+</ul>
+The requirements of the
+application using leases include:<br>
+<ul>
+ <li>Users must implement (Base API users on their own, RepMgr users
+via configuration) a majority (or larger) ACK policy. <br>
+ </li>
+ <li>The application must use the election mechanism to decide a master.
+It may not simply declare a site master.</li>
+ <li>The send callback must return an error if the majority ACK policy
+is not met for PERM records (see the sketch following this list).</li>
+ <li>Users must set the number of sites in the group.</li>
+ <li>Using leases in a replication group is all-or-none.&nbsp;
+Therefore, if a site knows it is using leases, it can assume other
+sites are also.<br>
+ </li>
+ <li>All applications that care about read guarantees must forward or
+perform all reads on the master.&nbsp; Reading on the client means a
+read ignoring leases. </li>
+</ul>
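+<p>A minimal sketch of such a transport callback for a Base API
+application (the <i>broadcast_and_count_acks</i> helper and the
+<i>NSITES</i> constant are hypothetical application code, not DB
+interfaces):</p>
+<pre>static int<br>send_cb(DB_ENV *dbenv, const DBT *control, const DBT *rec,<br>    const DB_LSN *lsnp, int eid, u_int32_t flags)<br>{<br> int nacks;<br><br> /* Application-defined: send to all sites and count PERM acks. */<br> nacks = broadcast_and_count_acks(dbenv, control, rec, lsnp, eid);<br><br> /* Fail the send if a PERM record did not reach a majority. */<br> if ((flags &amp; DB_REP_PERMANENT) &amp;&amp; nacks &lt; NSITES / 2)<br>  return (1); /* Nonzero lets DB expire its leases prematurely. */<br> return (0);<br>}<br></pre>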
+<p>There are some open questions
+remaining.</p>
+<ul>
+ <li>There is one major showstopper issue, see Crashing - Potential
+problem near the end of the document.&nbsp; We need a better solution
+than the one shown there (writing to disk every time a lease is
+granted). Perhaps just documenting that durability means it must be
+flushed to disk before success to avoid that situation?<br>
+ </li>
+ <li>What about db-&gt;join?&nbsp; Users can call join, but the calls
+on the join cursor to get the data would be subject to leases and
+therefore protected.&nbsp; Ok, this is not an open question.</li>
+ <li>What about other read-like operations?&nbsp; Clearly <i>
+DB-&gt;get, DB-&gt;pget, DBC-&gt;get,
+DBC-&gt;pget</i> need lease checks.&nbsp; However, other APIs use
+keys.&nbsp; <i>DB-&gt;key_range</i>
+provides an estimate only so it shouldn't need lease checks. <i>
+DB-&gt;stat</i> provides exact counts
+to <i>bt_nkeys</i> and <i>bt_ndata</i> fields.&nbsp; Are those
+fields considered authoritative that providing those values implies a
+durability guarantee and therefore <i>DB-&gt;stat</i>
+should be subject to lease verification?&nbsp; <i>DBC-&gt;count</i>
+provides a count for
+the number of data items associated with a key.&nbsp; Is this
+authoritative information? This is similar to stat - should it be
+subject to lease verification?<br>
+ </li>
+ <li>Do we require master lease checks on write operations?&nbsp; I
+think lease checks are not needed on write operations.&nbsp; It doesn't
+add correctness and adds a lot of complexity (checking leases in put,
+del, and cursors, then what about rename, remove, etc).<br>
+ </li>
+ <li>Do master leases give an iron-clad guarantee of never rolling
+back a transaction? No, but it should mean that a committed transaction
+can never be <b>read</b> on a master
+unless the lease is valid.&nbsp; A committed transaction on a master
+that has never been presented to the application may get rolled back.<br>
+ </li>
+ <li>Do we need to quarantine or prevent reads on an ex-master until
+sync-up is done?&nbsp; No.&nbsp; A master that is simply downgraded to
+client or crashes and reboots is now a client.&nbsp; Reading from that
+client is the same as saying Ignore Leases.</li>
+ <li>What about adding and removing sites while leases are
+active?&nbsp; This is SR 14778.&nbsp; A consistent <i>nsites</i> value
+is required by master
+leases.&nbsp; &nbsp; It isn't
+clear to me what a master is
+supposed to do if the value of nsites gets smaller while leases are
+active.&nbsp; Perhaps it leaves its larger table intact and simply
+checks for a smaller number of granted leases?<br>
+ </li>
+ <li>Can users turn leases off?&nbsp; No.&nbsp; There is no planned <i>turn
+leases off</i> API.</li>
+ <li>Clock skew will be a percentage.&nbsp; However, the smallest, 1%,
+is probably rather large for clock skew.&nbsp; Percentage was chosen
+for simplicity and similarity to other APIs.&nbsp; What granularity is
+appropriate here?</li>
+</ul>
+<h2>API Changes</h2>
+The API changes that are visible
+to the user are fairly minimal.&nbsp;
+There are a few API calls they need to make to configure master leases
+and then there is the API call to turn them on.&nbsp; There is also a
+new flag to existing APIs to allow read operations to ignore leases and
+return data that
+may be non-durable.<br>
+<h3>Lease Timeout<br>
+</h3>
+There is a new timeout the user
+must configure for leases called <b>DB_REP_LEASE_TIMEOUT</b>.&nbsp;
+This timeout will be new to
+the <i>dbenv-&gt;rep_set_timeout</i> method. The <b>DB_REP_LEASE_TIMEOUT</b>
+has no default and it is required that the user configure a timeout
+before they turn on leases (obviously, this timeout need not be set if
+leases will not be used).&nbsp; That timeout is the amount of time
+the lease is valid on the master and how long it is granted
+on the client.&nbsp; This timeout must be the same
+value on all sites (like log file size).&nbsp; The timeout used when
+refreshing leases is the <b>DB_REP_ACK_TIMEOUT</b>
+for RepMgr applications.&nbsp; For Base API applications, lease
+refreshes will use the same mechanism as <b>PERM</b> messages and they
+should
+have no additional burden.&nbsp; This timeout is used for lease
+refreshment and is the amount of time a reader will wait to refresh
+leases before returning failure to the application from a read
+operation.<br>
+<br>
+This timeout will be both stored
+with its original value, and also
+converted to a <i>db_timespec</i>
+using the <b>DB_TIMEOUT_TO_TIMESPEC</b>
+macro and have the clock skew accounted for and stored in the shared
+rep structure:<br>
+<pre>db_timeout_t lease_timeout;<br>db_timespec lease_duration;<br></pre>
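+As a usage sketch (the five-second value is purely illustrative),
+configuring the lease timeout might look like:<br>
+<pre>/* db_timeout_t values are expressed in microseconds. */<br>if ((ret = dbenv-&gt;rep_set_timeout(dbenv,<br>    DB_REP_LEASE_TIMEOUT, 5000000)) != 0)<br> goto err;<br></pre>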
+NOTE:&nbsp; By sending the lease refresh during DB operations, we are
+forcing/assuming that the operation's process has a replication
+transport function set.&nbsp; That is obviously the case for write
+operations, but would it be a burden for read processes (on a
+master)?&nbsp; I think mostly not, but if we need leases for <i>
+DB-&gt;stat</i> then we need to
+document it as it is certainly possible for an application to have a
+separate or dedicated <i>stat</i>
+application or attempt to use <i>db_stat</i>
+(which will not work if leases must be checked).<br>
+<br>
+Leases should be checked after the local operation so that we don't
+have a window where we check leases first, get
+descheduled, then lose our lease, and then perform the operation.&nbsp;
+Do the operation, then check leases before returning to the user.<br>
+<h3>Using Leases</h3>
+There is a new API that the user must call to tell the system to use
+the lease mechanism.&nbsp; The method must be called before the
+application calls <i>dbenv-&gt;rep_start</i>
+or <i>dbenv-&gt;repmgr_start</i>.
+This new
+method is:<br>
+<br>
+<pre>&nbsp;&nbsp;&nbsp; dbenv-&gt;rep_set_lease(DB_ENV *dbenv, u_int32_t clock_scale_factor, u_int32_t flags)<br>
+</pre>
+The <i>clock_scale_factor</i>
+parameter is interpreted as a percentage, greater than 100 (to transmit
+a floating point number as an integer to the API) that represents the
+maximum skew between any two sites' clocks.&nbsp; That is, a <span
+ style="font-style: italic;">clock_scale_factor</span> of 150 suggests
+that the greatest discrepancy between clocks is that one runs 50%
+faster than the others.&nbsp; Both the
+master and client sides
+compensate for possible clock skew.&nbsp; The master uses the value to
+compensate in case the replica has a slow clock and replicas compensate
+in case they have a fast clock.&nbsp; This scaling factor will need to
+be divided by 100 on all sites to truly represent the percentage for
+adjustments made to time values.<br>
+<br>
+Assume the slowest replica's clock is a factor of <i>clock_scale_factor</i>
+slower than the
+fastest clock.&nbsp; Using that assumption, if the fastest clock goes
+from time t1 to t2 in X
+seconds, the slowest clock does it in (<i>clock_scale_factor</i> / 100)
+* X seconds.<br>
+<br>
+The <i>flags</i> parameter is not
+currently used.<br>
+<br>
+When the <i>dbenv-&gt;rep_set_lease</i>
+method is called, we will set a configuration flag indicating that
+leases are turned on:<br>
+<b>#define REP_C_LEASE &lt;value&gt;</b>.&nbsp;
+We will also record the <b>u_int32_t
+clock_skew</b> value passed in.&nbsp; The <i>rep_set_lease</i> method
+will not allow
+calls after <i>rep_start</i>.&nbsp; If
+multiple calls are made prior to calling <i>rep_start</i> then later
+calls will
+overwrite the earlier clock skew value.&nbsp; <br>
+<br>
+We need a new flag to prevent calling <i>rep_set_lease</i>
+after <i>rep_start</i>.&nbsp; The
+simplest solution would be to reject the call to
+<i>rep_set_lease</i> if <b>REP_F_CLIENT</b>
+or <b>REP_F_MASTER</b> is set.&nbsp;
+However that does not work in the cases where a site cleanly closes its
+environment and then opens without running recovery.&nbsp; The
+replication state will still be set.&nbsp; The prevention will be
+implemented as:<br>
+<pre>#define REP_F_START_CALLED &lt;some bit value&gt;<br></pre>
+In __rep_start, at the end:<br>
+<pre>if (ret == 0) {<br> REP_SYSTEM_LOCK<br> F_SET(rep, REP_F_START_CALLED);<br> REP_SYSTEM_UNLOCK<br>}</pre>
+In <i>__rep_env_refresh</i>, if we
+are the last reference closing the env (we already check for that):<br>
+<pre>F_CLR(rep, REP_F_START_CALLED);</pre>
+In order to avoid run-time floating point operations
+on <i>db_timespec</i> structures,
+when a site is declared as a client or master in <i>rep_start</i> we
+will pre-compute the
+lease duration based on the integer-based clock skew and the
+integer-based lease timeout.&nbsp; A master should set a replica's
+lease expiration to the <b>start time of
+the sent message +
+(lease_timeout / clock_scale_factor)</b> in case the replica has a
+slow clock.&nbsp; Replicas extend their leases to <b>received message
+time + (lease_timeout *
+clock_scale_factor)</b> in case this replica has a fast clock.&nbsp;
+Therefore, the computation will be as follows if the site is becoming a
+master:<br>
+<pre>db_timeout_t tmp;<br>tmp = (db_timeout_t)((double)rep-&gt;lease_timeout / ((double)rep-&gt;clock_skew / (double)100));<br>rep-&gt;lease_duration = DB_TIMEOUT_TO_TIMESPEC(&amp;tmp);<br></pre>
+Similarly, on a client the computation is:<br>
+<pre>tmp = (db_timeout_t)((double)rep-&gt;lease_timeout * ((double)rep-&gt;clock_skew / (double)100));<br></pre>
+When a site changes state, its lease duration will change based on
+whether it is becoming a master or client and it will be recomputed
+from the original values.&nbsp; Note that these computations, coupled
+with the fact that the lease on the master is computed based on the
+master's time that it sent the message means that leases on the master
+are more conservatively computed than on the clients.<br>
+<br>
+The <i>dbenv-&gt;rep_set_lease</i>
+method must be called after <i>dbenv-&gt;open</i>,
+similar to <i>dbenv-&gt;rep_set_config</i>.&nbsp;
+The reason is so that we can check that this is a replication
+environment and we have access to the replication shared memory region.<br>
+<h3>Read Operations<br>
+</h3>
+Authoritative read operations on the master with leases enabled will
+abide by leases by default.&nbsp; We will provide a flag that allows an
+operation on a master to ignore leases.&nbsp; <b>All read operations
+on a client imply
+ignoring leases.</b> If an application wants authoritative reads
+they must forward the read requests to the master and it is the
+application's responsibility to provide the forwarding.
+The consensus was that forcing <span style="font-weight: bold;">DB_IGNORE_LEASE</span>
+on client read operations (with leases enabled, obviously) was too
+heavy handed.&nbsp; Read operations on the client will ignore leases,
+but do no special flag checking.<br>
+<br>
+The flag will be called <b>DB_IGNORE_LEASE</b>
+and it will be a flag that can be OR'd into the DB access method and
+cursor operation values.&nbsp; It will be similar to the <b>DB_READ_UNCOMMITTED</b>
+flag.
+<br>
+The methods that will
+adhere to leases are:<br>
+<ul>
+ <li><i>Db-&gt;get</i></li>
+ <li><i>Db-&gt;pget</i></li>
+ <li><i>Dbc-&gt;get</i></li>
+ <li><i>Dbc-&gt;pget</i></li>
+</ul>
+The code that will check leases for a client reading would look
+something
+like this, if we decide to become heavy-handed:<br>
+<pre>if (IS_REP_CLIENT(dbenv)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep-&gt;config, REP_C_LEASE) &amp;&amp; !LF_ISSET(DB_IGNORE_LEASE)) {<br> db_err("Read operations must ignore leases or go to master");<br> ret = EINVAL;<br> goto err;<br> }<br>}<br></pre>
+On the master, the new code to abide by leases is more complex.&nbsp;
+After the call to perform the operation we will check the lease.&nbsp;
+In that checking code, the master will see if it has a valid
+lease.&nbsp; If so, then all is well.&nbsp; If not, it will try to
+refresh the leases.&nbsp; If that refresh attempt results in leases,
+all is well.&nbsp; If the refresh attempt does not get leases, then the
+master cannot respond to the read as an authority and we return an
+error.&nbsp; The new error is called <b>DB_REP_LEASE_EXPIRED</b>.&nbsp;
+The location of the master lease check is down after the internal call
+to read the data is successful:<br>
+<pre>if (IS_REP_MASTER(dbenv) &amp;&amp; !LF_ISSET(DB_IGNORE_LEASE)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep-&gt;config, REP_C_LEASE) &amp;&amp;<br> (ret = __rep_lease_check(dbenv)) != 0) {<br> /*<br> * We don't hold the lease.<br> */<br> goto err;<br> }<br>}<br></pre>
+See below for the details of <i>__rep_lease_check</i>.<br>
+<br>
+Also note that if leases (or replication) are not configured, then <span
+ style="font-weight: bold;">DB_IGNORE_LEASE</span> is a no-op.&nbsp; It
+is ignored (and won't error) if used when leases are not in
+effect.&nbsp; The reason is so that we can generically set that flag in
+utility programs like <span style="font-style: italic;">db_dump</span>
+that walk the database with a cursor.&nbsp; Note that <span
+ style="font-style: italic;">db_dump</span> is the only utility that
+reads with a cursor.<br>
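+<br>
+A sketch of the two kinds of read from the application's point of view
+(error handling elided; the key and data DBTs are assumed to be set up
+as usual):<br>
+<pre>/* Authoritative read on the master: subject to lease checks. */<br>ret = dbp-&gt;get(dbp, NULL, &amp;key, &amp;data, 0);<br><br>/* Possibly-stale read: skip the lease check explicitly. */<br>ret = dbp-&gt;get(dbp, NULL, &amp;key, &amp;data, DB_IGNORE_LEASE);<br></pre>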
+<h3><b>Nsites
+and Elections</b></h3>
+The call to <i>dbenv-&gt;rep_set_nsites</i>
+must be performed before the call to <i>dbenv-&gt;rep_start</i>
+or <i>dbenv-&gt;repmgr_start</i>.&nbsp;
+This document assumes either that <b>SR
+14778</b> gets resolved, or that the value of <i>nsites</i> is
+immutable.&nbsp; The
+master and all clients need to know how many sites and leases are in
+the group.&nbsp; Clients need to know for elections.&nbsp; The master
+needs to know for the size of the lease table and to know what value a
+majority of the group is. <b>[Until
+14778 is resolved, the master lease work must assume <i>nsites</i> is
+immutable and will
+therefore enforce that this is called before <i>rep_start</i> using
+the same mechanism
+as <i>rep_set_lease</i>.]</b><br>
+<br>
+Elections and leases need to agree on the number of sites in the
+group.&nbsp; Therefore, when leases are in effect on clients, all calls
+to <i>dbenv-&gt;rep_elect</i> must
+set the <i>nsites</i> parameter to
+0.&nbsp; The <i>rep_elect</i> code
+path will return <b>EINVAL</b> if <b>REP_C_LEASE</b> is set and <i>nsites</i>
+is non-0.
+<h2>Lease Management</h2>
+<h3>Message Changes</h3>
+In order for clients to grant leases to the master a new message type
+must be added for that purpose.&nbsp; This will be the <b>REP_LEASE_GRANT</b>
+message.&nbsp;
+Granting leases will be a result of applying a <b>DB_REP_PERMANENT</b>
+record and therefore we
+do not need any additional message in order for a master to request a
+lease grant.&nbsp; The <b>REP_LEASE_GRANT</b>
+message will pass a structure as its message DBT:<br>
+<pre>typedef struct __rep_lease_grant {<br> db_timespec msg_time;<br>#ifdef DIAGNOSTIC<br> db_timespec expire_time;<br>#endif<br>} REP_GRANT_INFO;<br></pre>
+In the <b>REP_LEASE_GRANT</b>
+message, the client is actually giving the master several pieces of
+information.&nbsp; We only need the echoed <i>msg_time</i> in this
+structure because
+everything else is already sent.&nbsp; The client is really sending the
+master:<br>
+<ul>
+ <li>Its EID (parameter to <span style="font-style: italic;">rep_send_message</span>
+and <span style="font-style: italic;">rep_process_message</span>)<br>
+ </li>
+ <li>The PERM LSN this message acknowledged (sent in the control
+message)</li>
+ <li>Unique identifier echoed back to master (<i>msg_time</i> sent in
+message as above)</li>
+</ul>
+On the client, we always maintain the maximum PERM LSN already in <i>lp-&gt;max_perm_lsn</i>.&nbsp;
+<h3>Local State Management</h3>
+Each client must maintain a <i>db_timespec</i>
+timestamp containing the expiration of its granted lease.&nbsp; This
+field will be in the replication shared memory structure:<br>
+<pre>db_timespec grant_expire;<br></pre>
+This timestamp already takes into account the clock skew.&nbsp; All
+new fields must be initialized when the region is created. Whenever we
+grant our master lease and want to send the <b>REP_LEASE_GRANT</b>
+message, this value
+will be updated.&nbsp; It will be used in the following way:
+<pre>db_timespec mytime;<br>DB_LSN perm_lsn;<br>DBT lease_dbt;<br>REP_GRANT_INFO gi;<br><br>timespecclear(&amp;mytime);<br>memset(&amp;lease_dbt, 0, sizeof(lease_dbt));<br>memset(&amp;gi, 0, sizeof(gi));<br>__os_gettime(dbenv, &amp;mytime);<br>timespecadd(&amp;mytime, &amp;rep-&gt;lease_duration);<br>MUTEX_LOCK(rep-&gt;clientdb_mutex);<br>perm_lsn = lp-&gt;max_perm_lsn;<br>MUTEX_UNLOCK(rep-&gt;clientdb_mutex);<br>REP_SYSTEM_LOCK(dbenv);<br>if (timespeccmp(&amp;mytime, &amp;rep-&gt;grant_expire, &gt;))<br> rep-&gt;grant_expire = mytime;<br>gi.msg_time = msg-&gt;msg_time;<br>#ifdef DIAGNOSTIC<br>gi.expire_time = rep-&gt;grant_expire;<br>#endif<br>lease_dbt.data = &amp;gi;<br>lease_dbt.size = sizeof(gi);<br>REP_SYSTEM_UNLOCK(dbenv);<br>__rep_send_message(dbenv, eid, REP_LEASE_GRANT, &amp;perm_lsn, &amp;lease_dbt, 0, 0);<br></pre>
+This updating of the lease grant will occur in the <b>PERM</b> code
+path when we have
+successfully applied the permanent record.<br>
+<h3>Maintaining Leases on the
+Master/Rep_start</h3>
+The master maintains a lease table that it checks when fulfilling a
+read request that is subject to leases.&nbsp; This table is initialized
+when a site calls<i>
+dbenv-&gt;rep_start(DB_MASTER)</i> and the site is undergoing a role
+change (i.e. a master making additional calls to <i>dbenv-&gt;rep_start(DB_MASTER)</i>
+does
+not affect an already existing table).<br>
+<br>
+When a non-master site becomes master, it must do two things related to
+leases on a role change.&nbsp; First, a client cannot upgrade to master
+while it has an outstanding lease granted to another site.&nbsp; If a
+client attempts to do so, an error, <b>EINVAL</b>,
+will be returned.&nbsp; The only way this should happen is if the
+application simply declares a site master, instead of using
+elections.&nbsp; Elections will already wait for leases to expire
+before proceeding. (See below.)
+<br>
+<br>
+Second, once we are proceeding with becoming a master, the site must
+allocate the table it will use to maintain lease information.&nbsp;
+This table will be sized based on <i>nsites</i>
+and it will be an array of the following structure:<br>
+<pre>typedef struct {<br> int eid; /* EID of client site. */<br> db_timespec start_time; /* Unique time ID client echoes back on grants. */<br> db_timespec end_time; /* Master's lease expiration time. */<br> DB_LSN lease_lsn; /* Durable LSN this lease applies to. */<br> u_int32_t flags; /* Unused for now?? */<br>} REP_LEASE_ENTRY;<br></pre>
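+A minimal allocation sketch for that role change (assuming unused slots
+are marked with <b>DB_EID_INVALID</b>; the <i>table</i> name is a
+placeholder):<br>
+<pre>REP_LEASE_ENTRY *table;<br>u_int32_t i;<br><br>if ((ret = __os_calloc(dbenv,<br>    (size_t)rep-&gt;nsites, sizeof(REP_LEASE_ENTRY), &amp;table)) != 0)<br> return (ret);<br>for (i = 0; i &lt; rep-&gt;nsites; i++)<br> table[i].eid = DB_EID_INVALID;<br></pre>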
+<h3>Granting Leases</h3>
+It is the burden of the application to make sure that all sites in the
+group
+are using leases, or none are.&nbsp; Therefore, when a client processes
+a <b>PERM</b>
+log record that arrived from the master, it will grant its lease
+automatically if that record is permanent (i.e. <b>DB_REP_ISPERM</b>
+is being returned),
+and leases are configured.&nbsp; A client will not send a
+lease grant when it is processing log records (even <b>PERM</b>
+ones) it receives from other clients that use client-to-client
+synchronization.&nbsp; The reason is that the master requires a unique
+time-of-msg ID (see below) that the client echoes back in its lease
+grant and it will not have such an ID from another client.<br>
+<br>
+The master stores a time-of-msg ID in each message and the client
+simply echoes it back to the master.&nbsp; In its lease table, the
+master keeps the base
+time-of-msg for each valid lease.&nbsp; When a <b>REP_LEASE_GRANT</b>
+message comes in,
+the master does a number of things:
+<ol>
+ <li>Pulls the echoed timespec from the client message, into <i>msg_time</i>.<br>
+ </li>
+ <li>Finds the entry in its lease table for the client's EID.&nbsp; It
+walks the table searching for the ID.&nbsp; EIDs of <span
+ style="font-weight: bold;">DB_EID_INVALID</span> are
+illegal.&nbsp; Either the master will find the entry, or it will find
+an empty slot in the table (i.e. it is still populating the table with
+leases).</li>
+ <li>If this is a previously unknown site lease, the master
+initializes the entry by filling in the <i>eid</i>, <i>start_time</i>, and
+ <i>lease_lsn</i> fields.&nbsp; The master
+also computes the <i>end_time</i>
+based on the adjusted <i>rep-&gt;lease_duration</i>.</li>
+ <li>If this is a lease from a previously known site, the master must
+perform <i>timespeccmp(&amp;msg_time,
+&amp;table[i].start_time, &gt;)</i> and only update the <i>end_time</i>
+of the lease when this is
+a more recent message.&nbsp; If it is a more recent message, then we
+should update
+the <i>lease_lsn</i> to the LSN in
+the message.</li>
+ <li>Lease durations are computed taking the clock skew into
+account: clients compute them based on the current time and the master
+computes them based on the original sending time.&nbsp; For diagnostic
+purposes only, I also plan to send the client's expiration time.&nbsp; The
+client errs on the side of computing a larger lease expiration time and
+the master errs on the side of computing a smaller duration.&nbsp;
+Since both are taking the clock skew
+into account, the client's ending expiration time should never be
+smaller than
+the master's computed expiration time, or their value for clock skew may
+not be correct.<br>
+</ol>
+Any log records (new or resent) that originate from the master and
+result in <b>DB_REP_ISPERM</b> get an
+ack.<br>
+<br>
+<h3>Refreshing Leases</h3>
+Leases get refreshed when a master receives a <b>REP_LEASE_GRANT</b>
+message from a client. There are three pieces to lease
+refreshment.&nbsp; <br>
+<h4>Lazy Lease Refreshing on Read<br>
+</h4>
+If the master discovers that leases are
+expired during the read operation, it attempts to refresh its
+collection of lease grants.&nbsp; It does this by calling a new
+function <i>__rep_lease_refresh</i>.&nbsp;
+This function is very similar to the already-existing function <i>__rep_flush</i>.&nbsp;
+Basically, to
+refresh the lease, the master simply needs to resend the last PERM
+record to the clients.&nbsp; The requirements state that when the
+application send function returns successfully from sending a PERM
+record, the majority of clients have that PERM LSN durable.&nbsp; We
+will have a new public DB error return called <b>DB_REP_LEASE_EXPIRED</b>
+that will be
+returned back to the caller if the master cannot assert its
+authority.&nbsp; The code will look something like this:<br>
+<pre>/*<br> * Use lp-&gt;max_perm_lsn on the master (currently not used on the master)<br> * to keep track of the last PERM record written through the logging system.<br> * need to initialize lp-&gt;max_perm_lsn in rep_start on role_chg.<br> */<br>call __rep_send_message on the last PERM record the master wrote, with DB_REP_PERMANENT<br>if failure<br> expire leases<br> return lease expired error to caller<br>else /* success */<br> recheck lease table<br> /*<br> * We need to recheck the lease table because the client<br> * lease grant messages may not be processed yet, or got<br> * lost, or racing with the application's ACK messages or<br> * whatever. <br> */<br> if we have a majority of valid leases<br> return success<br> else<br> return lease expired error to caller <br></pre>
+<h4>Ongoing Update Refreshment<br>
+</h4>
+Second is having the master indicate to
+the client it needs to send a lease grant in response to the current
+PERM log message.&nbsp; The problem is
+that acknowledgements must contain a master-supplied message timestamp
+that the client sends back to the master.&nbsp; We need to modify the
+structure of the&nbsp; log record messages when leases are configured
+so
+that when a PERM message is sent, the master sends, and the client
+expects, the message timestamp.&nbsp; There are three fairly
+straightforward and different implementations to consider.<br>
+<ol>
+ <li>Adding the timestamp to the <b>REP_CONTROL</b>
+structure.&nbsp; If this option is chosen, then the code trivially
+sends back the timestamp in the client's reply.&nbsp; There is no
+special processing done by either side with the message contents.&nbsp;
+So, on a PERM log record, the master will send a non-zero
+timestamp.&nbsp; On a normal log record the timestamp will be zero or
+some known invalid value.&nbsp; If the client sees a non-zero
+timestamp, it sends a <b>REP_LEASE_GRANT</b>
+with the <i>lp-&gt;max_perm_lsn</i>
+after applying that log record.&nbsp; If it is zero, then the client
+does nothing different.&nbsp; The advantage is ease of code.&nbsp; The
+disadvantage is that for mixed version systems, the client is now
+dealing with different sized control structures.&nbsp; We would have to
+retain the old control structure so that during a mixed version group
+the (upgraded) clients can use, expect and send old control structures
+to the master.&nbsp; This is unfortunate, so let's consider additional
+implementations that don't require modifying the control structure.<br>
+ </li>
+ <li>Adding a new <b>REPCTL_LEASE</b>
+flag to the list of flags for the control structure, but do not change
+the control structure fields.&nbsp; When a master wants to send a
+message that needs a lease ack, it sets the flag.&nbsp; Additionally,
+instead of simply sending a log record DBT as the <i>rec</i> parameter
+for replication, we
+would send a new structure that had the timestamp first and then the
+record (similar to the bulk transfer buffer).&nbsp; The advantage of
+this is that the control structure does not change.&nbsp; Disadvantages
+include more special-cased code in the normal code path where we have
+to check the flag.&nbsp; If the flag is set we have to extract the
+timestamp value and massage the incoming data to pass on the real log
+record to <i>rep_apply</i>.&nbsp; On
+bulk transfer, we would just add the timestamp into the buffer.&nbsp;
+On normal transfers, it would incur an additional data copy on the
+master side.&nbsp; That is unfortunate.&nbsp; Additionally, if this
+record needs to be stored in the temp db, we need some way to get it
+back again later or <span style="font-style: italic;">rep_apply</span>
+would have to extract the timestamp out when it processed the record
+(either live or from the temp db).<br>
+ </li>
+ <li>Adding a different message type, such as <b>REP_LOG_ACK</b>.&nbsp;
+Similarly to <b>REP_LOG_MORE</b> this message would be a
+special-case version of a log record.&nbsp; We would extract out the
+timestamp and then handle as a normal log record.&nbsp; This
+implementation is rejected because it actually would require three new
+message types: <b>REP_LOG_ACK,
+REP_LOG_ACK_MORE, REP_BULK_LOG_ACK</b>.&nbsp; That is just too ugly
+to contemplate.</li>
+</ol>
+<b>[Slight digression:</b> it occurs
+to me while writing about #2 and #3 above, that our implementation of
+all of the *_MORE messages could really be implemented with a <b>REPCTL_MORE</b>
+flag instead of a
+separate message type.&nbsp; We should clean that up and simplify the
+messages but not part of master leases. Hmm, taking that thought
+process further, we really could get rid of the <b>REP_BULK_*</b>
+messages as well if we
+added a <b>REPCTL_BULK</b>
+flag.&nbsp; I think we should definitely do it for the *_MORE
+messages.&nbsp; I am not sure we should do it for bulk because the
+structure of the incoming data record is vastly different.]<br>
+<br>
+Of these options, I believe that modifying the control structure is the
+best alternative.&nbsp; The handling of the old structure will be very
+isolated to code dealing with old versions and is far less complicated
+than injecting the timestamp into the log record DBT and doing a data
+copy.&nbsp; Actually, I will likely combine #1 and the flag from #2
+above.&nbsp; I will have the <b>REPCTL_LEASE</b>
+flag that indicates a lease grant reply is expected and have the
+timestamp in the control structure.&nbsp;
+Also I will probably add in a spare field or two for future use in the <b>REP_CONTROL</b>
+structure.<br>
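+A hypothetical sketch of the resulting control structure (the field
+names and layout are placeholders, not a final definition):<br>
+<pre>typedef struct __rep_control {<br> u_int32_t rep_version; /* Replication version number. */<br> u_int32_t log_version; /* Log version of message. */<br> DB_LSN lsn; /* Log sequence number. */<br> u_int32_t rectype; /* Message type. */<br> u_int32_t gen; /* Generation number. */<br> db_timespec msg_time; /* Master timestamp, echoed in lease grants. */<br> u_int32_t spare; /* Reserved for future use. */<br> u_int32_t flags; /* log_put flag value, REPCTL_LEASE, etc. */<br>} REP_CONTROL;<br></pre>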
+<h4>Gap processing</h4>
+No matter which implementation we choose for ongoing lease refreshment,
+gap processing must be considered.&nbsp; The code above assumes the
+timestamps will be placed on PERM records only.&nbsp; Normal log
+records will not have a timestamp, a flag, or anything else like
+that.&nbsp; However, any log message can fill a gap on a client, and
+processing that normal log record can then return <b>DB_REP_ISPERM</b>
+because later records
+were also processed.
+<br>
+The current implementation should work fine in that case because when
+we store the message in the client temp db we store both the control
+DBT and the record DBT.&nbsp; Therefore, when a normal record fills a
+gap, the later PERM record, when retrieved will look just like it did
+when it arrived.&nbsp; The client will have access to the LSN, and the
+timestamp, etc.&nbsp; However, it does mean that sending the <b>REP_LEASE_GRANT</b>
+message must take
+place down in <i>__rep_apply</i>
+because that is the only place we have access to the contents of those
+stored records with the timestamps.<br>
+<br>
+There are two logical choices to consider for granting the lease when
+processing an update.&nbsp; As we process (either a live record or one
+read from the temp db after filling a gap) a PERM message, we send the <b>REP_LEASE_GRANT</b>
+message for each
+PERM record we successfully apply.&nbsp; Or, second, we keep track of
+the largest timestamp of all PERM records we've processed and at the
+end of the function after we've applied all records, we send back a
+single lease grant with the <i>max_perm_lsn</i>
+and a new <i>max_lease_timestamp</i>
+value to the master.&nbsp; The first is easier to implement, the second
+results in possibly slightly fewer messages at the expense of more
+bookkeeping on the client.<br>
+<br>
+A third, more complicated option would be to have the message timestamp
+on all records, but grants are only sent on the PERM messages.&nbsp; A
+reason to do this is that the later timestamp of a normal log record
+would be used as the timestamp sent in the reply and the master would
+get a more up to date timestamp value and a longer lease.&nbsp; <br>
+<br>
+If we change the <span style="font-weight: bold;">REP_CONTROL</span>
+structure to include the timestamp, we potentially break or at least
+need to revisit the gap processing algorithm.&nbsp; That code assumes
+that the control and record elements for the same LSN look the same
+each and every time.&nbsp; The code stores the <span
+ style="font-style: italic;">control</span> DBT as the key and the <span
+ style="font-style: italic;">rec</span> DBT as the data.&nbsp; We use a
+specialized compare function to sort based on the LSN in the control
+DBT.&nbsp; With master leases, the same record transmitted by a master
+multiple times or client for the same LSN will be different because the
+timestamp field will not be the same.&nbsp; Therefore, the client will
+end up with duplicate entries in the temp database for the same
+LSN.&nbsp; Both solutions (adding the timestamp to <span
+ style="font-weight: bold;">REP_CONTROL</span> and adding a <span
+ style="font-weight: bold;">REPCTL_LEASE</span> flag) can yield
+duplicate entries.&nbsp; The flag would cause the same record from the
+master and client to be different as well.<br>
+<h4>Handling Incoming Lease Grants<br>
+</h4>
+The third piece of lease management is handling the incoming <b>REP_LEASE_GRANT</b>
+message on the
+master.&nbsp; When this message is received, the master must do the
+following:<br>
+<pre>REP_SYSTEM_LOCK<br>msg_timestamp = cntrl-&gt;timestamp;<br>client_lease = __rep_lease_entry(dbenv, client eid);<br>if (client_lease == NULL) {<br> initial lease for this site, DB_ASSERT there is space in the table<br> add this to the table if there is space<br>} else {<br> compare msg_timestamp with client_lease-&gt;start_time<br> if (msg_timestamp is more recent &amp;&amp; msg_lsn &gt;= lease LSN)<br> update entry in table<br>}<br>REP_SYSTEM_UNLOCK<br></pre>
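+Fleshed out slightly in C (a sketch only; the <i>gi</i>, <i>msg_lsn</i>,
+and <i>eid</i> names are assumed from the message-handling context):<br>
+<pre>REP_SYSTEM_LOCK(dbenv);<br>entry = __rep_lease_entry(dbenv, eid);<br>if (entry != NULL &amp;&amp;<br>    timespeccmp(&amp;gi-&gt;msg_time, &amp;entry-&gt;start_time, &gt;) &amp;&amp;<br>    log_compare(&amp;msg_lsn, &amp;entry-&gt;lease_lsn) &gt;= 0) {<br> entry-&gt;start_time = gi-&gt;msg_time;<br> /* The master computes expiration from its own send time. */<br> entry-&gt;end_time = gi-&gt;msg_time;<br> timespecadd(&amp;entry-&gt;end_time, &amp;rep-&gt;lease_duration);<br> entry-&gt;lease_lsn = msg_lsn;<br>}<br>REP_SYSTEM_UNLOCK(dbenv);<br></pre>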
+<h3>Expiring Leases</h3>
+Leases can expire in two ways.&nbsp; First they can expire naturally
+due to the passage of time.&nbsp; When checking leases, if the current
+time is later than the lease entry's <i>end_time</i>
+then the lease is expired.&nbsp; Second, they can be forced with a
+premature expiration when the application's transport function returns
+an error.&nbsp; In the first case, there is nothing to do, in the
+second case we need to manipulate the <i>end_time</i>
+so that all future lease checks fail.&nbsp; Since the lease <i>start_time</i>
+is guaranteed to not be in the future we will have a function <i>__rep_lease_expire</i>
+that will:<br>
+<pre>REP_SYSTEM_LOCK<br>for each entry in the lease table<br> entry-&gt;end_time = entry-&gt;start_time;<br>REP_SYSTEM_UNLOCK<br></pre>
+Is there a potential race or problem with prematurely expiring
+leases?&nbsp; Consider an application that enforces an ALL
+acknowledgement policy for PERM records in its transport
+callback.&nbsp; There are four clients and three send the PERM ack to
+the application.&nbsp; The callback returns an error to the master DB
+code.&nbsp; The DB code will now prematurely expire its leases.&nbsp;
+However, at approximately the same time the three clients are also
+sending their <span style="font-weight: bold;">REP_LEASE_GRANT</span>
+messages to the master.&nbsp; There is a race between the master
+processing those messages and the thread handling the callback failure
+expiring the table.&nbsp; This is only an issue if the messages arrive
+after the table has been expired.<br>
+<br>
+Let's assume all three clients send their grants after the master
+expires the table.&nbsp; If we accept those grants and then a read
+occurs the read will succeed since the master has a majority of leases
+even though the callback failed earlier.&nbsp; Is that a problem?&nbsp;
+The lease code is using a majority and the application policy is using
+some other value.&nbsp; It feels like this should be okay since
+the data is held by leases on a majority.&nbsp; Should we consider
+having the lease checking threshold be the same as the permanent ack
+policy?&nbsp; That is difficult because Base API users implement
+whatever they want and DB does not know what it is.<br>
+<h3>Checking Leases</h3>
+When a read operation on the master completes, the last thing we need
+to do is verify the master leases.&nbsp; We've already discussed
+refreshing them when they are expired above.&nbsp; We need two things
+for a lease to be valid.&nbsp; It must be within the timeframe of the
+lease grant and the lease must be valid for the last PERM record
+LSN.&nbsp; Here is the logic
+for checking the validity of leases in <i>__rep_lease_check</i>:<br>
+<pre>#define MAX_REFRESH_TRIES 3<br>DB_LSN lease_lsn;<br>REP_LEASE_ENTRY *entry;<br>u_int32_t min_leases, valid_leases;<br>db_timespec cur_time;<br>int ret, tries;<br><br> tries = 0;<br>retry:<br> ret = 0;<br> LOG_SYSTEM_LOCK<br> lease_lsn = lp-&gt;lsn;<br> LOG_SYSTEM_UNLOCK<br> REP_SYSTEM_LOCK<br> min_leases = rep-&gt;nsites / 2;<br> __os_gettime(dbenv, &amp;cur_time);<br> for (entry = head of table, valid_leases = 0; entry != NULL &amp;&amp; valid_leases &lt; min_leases; entry++)<br> if (timespeccmp(&amp;entry-&gt;end_time, &amp;cur_time, &gt;=) &amp;&amp; log_compare(&amp;entry-&gt;lease_lsn, &amp;lease_lsn) == 0)<br> valid_leases++;<br> REP_SYSTEM_UNLOCK<br> if (valid_leases &lt; min_leases) {<br> ret = __rep_lease_refresh(dbenv, ...);<br> /*<br> * If we are successful, we need to recheck the leases because<br> * the lease grant messages may have raced with the PERM<br> * acknowledgement. Give those messages a chance to arrive.<br> */<br> if (ret == 0) {<br> if (tries &lt;= MAX_REFRESH_TRIES) {<br> /*<br> * If we were successful sending, but not successful in racing the<br> * message thread, yield the processor so that message<br> * threads may have a chance to run.<br> */<br> if (tries &gt; 0)<br> /* __os_sleep instead?? */<br> __os_yield();<br> tries++;<br> goto retry;<br> } else<br> ret = DB_REP_LEASE_EXPIRED;<br> }<br> }<br> return (ret);</pre>
+If the master has enough valid leases it returns success.&nbsp; If it
+does not have enough, it attempts to refresh them.&nbsp; This attempt
+may fail if sending the PERM record does not receive sufficient
+acks.&nbsp; If we do receive sufficient acknowledgements we may still
+find that scheduling of message threads means the master hasn't yet
+processed the incoming <b>REP_LEASE_GRANT</b>
+messages.&nbsp; We will retry a couple of times (possibly
+parameterized) if the master discovers that situation.&nbsp; <br>
+<h2>Elections</h2>
+When a client grants a lease to a master, it gives up the right to
+participate in an election until that grant expires.&nbsp; If we are
+the master and <i>dbenv-&gt;rep_elect</i>
+is called, it should return, no matter what, like it does today.&nbsp;
+If we are a client and <i>rep_elect</i>
+is called, special processing takes place when leases are in
+effect.&nbsp; First, the easy case is if the lease granted by this
+client has already expired, then the client goes directly into the
+election as normal.&nbsp; If a valid lease grant is outstanding to a
+master, this site cannot participate in an election until that grant
+expires.&nbsp; We have at least two options when a site calls the <i>dbenv-&gt;rep_elect</i>
+API while
+leases are in effect.<br>
+<ol>
+ <li>The simplest coding solution for DB would be simply to refuse to
+participate in the election if this site has a current lease granted to
+a master.&nbsp; We would detect this situation and return EINVAL.&nbsp;
+This is correct behavior and trivial to implement.&nbsp; The
+disadvantage of this solution is that the application would then be
+responsible for repeatedly attempting an election until the lease grant
+expired.<br>
+ </li>
+ <li>The more satisfying solution is for DB to wait the remaining time
+for the grant.&nbsp; If this client hears from the master during that
+time the election does not take place and the call to <i>rep_elect</i>
+returns with the
+information for the current/old master.</li>
+</ol>
+<h3>Election Code Changes</h3>
+The code changes to support leases in the election code are fairly
+isolated.&nbsp; First, if leases are configured, we must verify that the <i>nsites</i>
+parameter passed to the election call is 0.&nbsp;
+Second, in <i>__rep_elect_init</i>
+we must not overwrite the value of <i>rep-&gt;nsites</i>
+for leases because it is controlled by the <i>dbenv-&gt;rep_set_nsites</i>
+API.&nbsp;
+These changes are small and easy to understand.<br>
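+A minimal sketch of those two checks (assuming a predicate such as
+<i>IS_USING_LEASES()</i> to test the configuration):<br>
+<pre>/* In __rep_elect: with leases, nsites must come from rep_set_nsites. */<br>if (IS_USING_LEASES(dbenv) &amp;&amp; nsites != 0)<br> return (EINVAL);<br>/* In __rep_elect_init: do not overwrite the configured value. */<br>if (!IS_USING_LEASES(dbenv))<br> rep-&gt;nsites = nsites;</pre>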
+<br>
+The more complicated code will be the client code when it has an
+outstanding lease granted.&nbsp; The client will wait for the current
+lease grant to expire before proceeding with the election.&nbsp; The
+client will only do so if it does not hear from the master for the
+remainder of the lease grant time.&nbsp; If the client hears from the
+master, it returns and does not begin participating in the
+election.&nbsp; A new election phase, <b>REP_EPHASE0</b>
+will exist so that the call to <i>__rep_wait</i>
+can detect if a master responds.&nbsp; The client, while waiting for
+the lease grant to expire, will send a <b>REP_MASTER_REQ</b>
+message so that the master will respond with a <b>REP_NEWMASTER</b>
+message and thus,
+allow the client to know the master exists.&nbsp; However, it is also
+desirable that the master's
+reply prompt the client to update its lease
+grant.<br>
+<br>
+Recall that the <b>REP_NEWMASTER</b>
+message does not result in a lease grant from the client.&nbsp; The
+client responds with its lease grant, up to the given LSN, when it
+processes a PERM record that has the <b>REPCTL_LEASE</b>
+flag set in the message.&nbsp; Therefore, we want the
+client's <b>REP_MASTER_REQ</b> to
+yield both the discovery of the existing master and have the master
+refresh its leases.&nbsp; The client will also use the <b>REPCTL_LEASE</b>
+flag in its <b>REP_MASTER_REQ</b> message to the
+master.&nbsp; This flag will serve as the indicator to the master that
+it needs to deal with leases and both send the <b>REP_NEWMASTER</b>
+message and refresh
+the lease.<br>
+The code will work as follows:<br>
+<pre>if (leases_configured &amp;&amp; (my_grant_still_valid || lease_never_granted)) {<br> if (lease_never_granted)<br> wait_time = lease_timeout;<br> else<br> wait_time = grant_expiration - current_time;<br> F_SET(REP_F_EPHASE0);<br> __rep_send_message(..., REP_MASTER_REQ, ... REPCTL_LEASE);<br> ret = __rep_wait(..., REP_F_EPHASE0);<br> if (we found a master)<br> return;<br>} /* if we don't return, fall out and proceed with election */<br></pre>
+On the master side, the code handling the <b>REP_MASTER_REQ</b> will
+do:<br>
+<pre>if (I am master) {<br> ...<br> __rep_send_message(REP_NEWMASTER...);<br> if (F_ISSET(rp, REPCTL_LEASE))<br> __rep_lease_refresh(...);<br>}<br></pre>
+Other minor implementation details are that<i> __rep_elect_done</i>
+must also clear
+the <b>REP_F_EPHASE0</b> flag.&nbsp;
+We also, obviously, need to define <b>REP_F_EPHASE0</b>
+in the list of replication flags.&nbsp; Note that the client's call to <i>__rep_wait</i>
+will return upon
+receiving the <b>REP_NEWMASTER</b>
+message.&nbsp; The client will independently refresh its lease when it
+receives the log record from the master's call to refresh the lease.<br>
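+A one-line sketch of that cleanup in <i>__rep_elect_done</i> (clearing
+the new phase alongside the existing election phase flags):<br>
+<pre>F_CLR(rep, REP_F_EPHASE0 | REP_F_EPHASE1 | REP_F_EPHASE2);</pre>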
+<br>
+Again, similar to what I suggested above, the code could simply assume
+global leases are configured, and instead of having the <b>REPCTL_LEASE</b>
+flag at all, the master
+assumes that it needs to refresh leases because it has them configured,
+not because it is specified in the <b>REP_MASTER_REQ</b>
+message it is processing. Right now I don't think every possible
+<b>REP_MASTER_REQ</b> message should result in a lease grant request.<br>
+<h4>Elections and Quiescent Systems</h4>
+It is possible that a master is slow or the client is close to its
+expiration time, or that the master is quiescent and all leases are
+currently expired, but nothing much is going on anyway, yet some client
+calls <i>__rep_elect</i> at that
+time.&nbsp; In the code above, we will not send the <b>REP_MASTER_REQ</b>
+because the lease is
+not valid.&nbsp; The client will simply proceed directly to sending the
+<b>REP_VOTE1</b> message, throwing all
+other clients into an election.&nbsp; The master is still master and
+should stay that way.&nbsp; Currently in response to a vote message, a
+master will broadcast out a <b>REP_NEWMASTER</b>
+to assert its mastership.&nbsp; That causes the election to
+complete.&nbsp; However, the master may also want to proactively
+refresh its leases.&nbsp; This situation indicates to me that the
+master should choose to refresh leases based on configuration, not a
+flag sent from the client.&nbsp; I believe that anytime the master asserts
+its mastership by sending a <b>REP_NEWMASTER</b>
+message, I need to add code to proactively refresh leases at that
+time.<br>
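+A sketch of that addition at each place the master asserts itself:<br>
+<pre>if (I am master) {<br> __rep_send_message(..., REP_NEWMASTER, ...);<br> /* New: proactively refresh when leases are configured. */<br> if (leases_configured)<br> __rep_lease_refresh(dbenv, ...);<br>}</pre>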
+<h2>Other Implementation Details</h2>
+<h3>Role Changes<br>
+</h3>
+When a site changes its role via a call to <i>rep_start</i> in either
+direction, we
+must take action when leases are configured.&nbsp; There are three
+types of role changes that all need changes to deal with leases (a
+sketch of the checks follows the list):<br>
+<ol>
+ <li><i>A master downgrading to a
+client.</i> When a master downgrades to a client, it can do so
+immediately after it has proactively expired all existing leases it
+holds.&nbsp; This situation is similar to an error from the send
+callback, and it effectively cancels all outstanding leases held on
+this site.&nbsp; Note that if this master expires its leases, it does
+not have any effect on when the clients' lease grants expire on the
+client side.&nbsp; The clients must still wait their full expected
+grant time.<br>
+ </li>
+ <li><i>A client upgrading to master.</i>
+If a client is upgrading to a master but it has an outstanding lease
+granted to another site, the code will return an <b>EINVAL</b>
+error.&nbsp; This situation
+only arises if the application simply declares this site master.&nbsp;
+If a site wins an election then the election itself should have waited
+long enough for the granted lease to expire and this state should not
+arise then.</li>
+ <li><i>A client finding a new master.</i>
+When a client discovers a new and different master via a <b>REP_NEWMASTER</b>
+message, the
+client cannot accept that new master until its current lease grant
+expires.&nbsp; This situation should only occur when a site declares
+itself master without an election and that site's lease grant expires
+before this client's grant expires.&nbsp; However, it is <b>possible</b>
+for this situation to arise
+with elections also.&nbsp; If we have 5 sites holding an election and 4
+of those sites have leases expire at about the same time T, and this
+site's lease expires at time T+N and the election timeout is &lt; N,
+then those 4 sites may hold an election and elect a master without this
+site's participation.&nbsp; A client in this situation must call <i>__rep_wait</i>
+with the time remaining
+on its lease.&nbsp; If the lease is expired after waiting the remaining
+time, then the client can accept this new master.&nbsp; If the lease
+was refreshed during the waiting period then the client does not accept
+this new master and returns.<br>
+ </li>
+</ol>
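+A sketch of the lease checks in <i>rep_start</i> for the first two
+cases (helper names illustrative):<br>
+<pre>if (becoming_master &amp;&amp; my_grant_still_valid)<br> return (EINVAL); /* Case 2: cannot upgrade under a live grant. */<br>if (becoming_client &amp;&amp; was_master)<br> __rep_lease_expire(dbenv, ...); /* Case 1: cancel leases held here. */</pre>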
+<h3>DUPMASTER</h3>
+A duplicate master situation can occur if an old master becomes
+disconnected from the rest of the group, that group elects a new master
+and then the partition is resolved.&nbsp; The requirement for master
+leases is that this situation will not cause the newly elected,
+rightful master to receive the <b>DB_REP_DUPMASTER</b>
+return.&nbsp; It is okay for the old master to get that return
+value.&nbsp; When a dual master situation exists, the following will
+happen:<br>
+<ul>
+ <li><i>On the current master and all
+current clients</i> - If the current master receives an update
+message or other conflicting message from the old master then that
+message will be ignored because the generation number is out of date.</li>
+ <li><i>On the old master</i> - If
+the old master receives an update message from the current master, or
+any other message with a later generation from any site, the new
+generation number will trigger this site to return <b>DB_REP_DUPMASTER</b>.&nbsp;
+However,
+instead of broadcasting out the <b>REP_DUPMASTER</b>
+message to shoot down others as well, this site, if leases are
+configured, will call <i>__rep_lease_check</i>
+and if they are expired, return the error.&nbsp; It should be
+impossible for us to receive a later generation message and still hold
+a majority of master leases.&nbsp; Something is seriously wrong in that
+case, so we will <b>DB_ASSERT</b> that this situation
+cannot happen.&nbsp; A sketch follows the list.<br>
+ </li>
+</ul>
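+A sketch of the old master's handling (assuming <i>__rep_lease_check</i>
+returns <b>DB_REP_LEASE_EXPIRED</b> when too few leases are valid):<br>
+<pre>/* Old master sees a message with a later generation. */<br>if (leases_configured) {<br> ret = __rep_lease_check(dbenv, ...);<br> /* Holding a lease majority here should be impossible. */<br> DB_ASSERT(dbenv, ret == DB_REP_LEASE_EXPIRED);<br> ret = DB_REP_DUPMASTER;<br>} else<br> (void)__rep_send_message(..., REP_DUPMASTER, ...);</pre>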
+<h3>Client to Client Synchronization</h3>
+One question to ask is how lease grants interact with client-to-client
+synchronization. The short answer is that they do not.&nbsp; A client
+that is sending log records to another client cannot request the
+receiving client refresh its lease with the master.&nbsp; That client
+does not have a timestamp it can use for the master and clock skew
+makes it meaningless between machines.&nbsp; Therefore, sites that use
+client-to-client synchronization will likely see more lease refreshment
+during the read path and leases will be refreshed during live updates
+only.&nbsp; Of course, if a client supplies log records that fill a
+gap, and the later log records stored came from the master in a live
+update then the client will respond as per the discussion on Gap
+Processing above.<br>
+<h2>Interaction Matrix</h2>
+If leases are granted (by a client) or held (by a master) what should
+the following APIs and messages do?<br>
+<br>
+Other:<br>
+log_archive: Leases do not affect log_archive.&nbsp; OK.<br>
+dbenv-&gt;close: OK.<br>
+crash during lease grant and restart: <b>Potential
+problem here.&nbsp; See discussion below</b>.<br>
+<br>
+Rep Base API method:<br>
+rep_elect: Already discussed above.&nbsp; Must wait for lease to expire.<br>
+rep_flush: Master only, OK - this will be the basis for refreshing
+leases.<br>
+rep_get_*: Not affected by leases.<br>
+rep_process_message: Generally OK.&nbsp; We'll discuss each message
+below.<br>
+rep_set_config: OK.<br>
+rep_set_limit: OK<br>
+rep_set_nsites: Must be called before <i>rep_start</i>
+and <i>nsites</i> is immutable until
+14778 is resolved.<br>
+rep_set_priority: OK<br>
+rep_set_timeout: OK.&nbsp; Used to set lease timeout.<br>
+rep_set_transport: OK.<br>
+rep_start(MASTER): Role changes are discussed above.&nbsp; Make sure
+duplicate rep_start calls are no-ops for leases.<br>
+rep_start(CLIENT): Role changes are discussed above.&nbsp; Make sure
+duplicate calls are no-ops for leases.<br>
+rep_stat: OK.<br>
+rep_sync: Should not be able to happen.&nbsp; Client cannot accept new
+master with outstanding lease grant.&nbsp; Add DB_ASSERT here.<br>
+<br>
+REP_ALIVE: OK.<br>
+REP_ALIVE_REQ: OK.<br>
+REP_ALL_REQ: OK.<br>
+REP_BULK_LOG: OK.&nbsp; Clients check to send ACK.<br>
+REP_BULK_PAGE: Should never process one with lease granted.&nbsp; Add
+DB_ASSERT.<br>
+REP_DUPMASTER: Should never happen, this is what leases are supposed to
+prevent.&nbsp; See above.<br>
+REP_LOG: OK.&nbsp; Clients check to send ACK.<br>
+REP_LOG_MORE: OK.&nbsp; Clients check to send ACK.<br>
+REP_LOG_REQ: OK.<br>
+REP_MASTER_REQ: OK.<br>
+REP_NEWCLIENT: OK.<br>
+REP_NEWFILE: OK.&nbsp; Clients check to send ACK.<br>
+REP_NEWMASTER: See above.<br>
+REP_NEWSITE: OK.<br>
+REP_PAGE: OK.&nbsp; Should never process one with lease granted.&nbsp;
+Add DB_ASSERT.<br>
+REP_PAGE_FAIL:&nbsp; OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_PAGE_MORE:&nbsp; OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_PAGE_REQ: OK.<br>
+REP_REREQUEST: OK.<br>
+REP_UPDATE: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_UPDATE_REQ: OK.&nbsp; This is a master-only message.<br>
+REP_VERIFY: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_VERIFY_FAIL: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_VERIFY_REQ: OK.<br>
+REP_VOTE1: OK.&nbsp; See Election discussion above.&nbsp; It is
+possible to receive one with a lease granted.&nbsp; Client cannot send
+one with an outstanding lease however.<br>
+REP_VOTE2: OK.&nbsp; See Election discussion above.&nbsp; It is
+possible to receive one with a lease granted.<br>
+<br>
+If the following method or message processing is in progress and a
+client wants to grant a lease, what should it do?&nbsp; Let's examine
+what this means.&nbsp; The client wanting to grant a lease simply means
+it is responding to the receipt of a <b>REP_LOG</b>
+(or its variants) message and applying a log record.&nbsp; Therefore,
+we need to consider a thread processing a log message racing with these
+other actions.<br>
+<br>
+Other:<br>
+log_archive: OK.&nbsp; <br>
+dbenv-&gt;close: User error.&nbsp; User should not be closing the env
+while other threads are using that handle.&nbsp; Should have no effect
+if a 2nd dbenv handle to same env is closed.<br>
+<br>
+Rep Base API method:<br>
+rep_elect: See Election discussion above.&nbsp; <i>rep_elect</i>
+should wait and may grant
+lease while election is in progress.<br>
+rep_flush: Should not be called on client.<br>
+rep_get_*: OK.<br>
+rep_process_message: Generally OK.&nbsp; See handling each message
+below.<br>
+rep_set_config: OK.<br>
+rep_set_limit: OK.<br>
+rep_set_nsites: Must be called before <i>rep_start</i>
+until 14778 is resolved.<br>
+rep_set_priority: OK.<br>
+rep_set_timeout: OK.<br>
+rep_set_transport: OK.<br>
+rep_start(MASTER): OK, can't happen - already protect racing <i>rep_start</i>
+and <i>rep_process_message</i>.<br>
+rep_start(CLIENT): OK, can't happen - already protect racing <i>rep_start</i>
+and <i>rep_process_message</i>.<br>
+rep_stat: OK.<br>
+rep_sync: Shouldn't happen because client cannot grant leases during
+sync-up.&nbsp; Incoming log message ignored.<br>
+<br>
+REP_ALIVE: OK.<br>
+REP_ALIVE_REQ: OK.<br>
+REP_ALL_REQ: OK.<br>
+REP_BULK_LOG: OK.<br>
+REP_BULK_PAGE: OK.&nbsp; Incoming log message ignored during internal
+init.<br>
+REP_DUPMASTER: Shouldn't happen.&nbsp; See DUPMASTER discussion above.<br>
+REP_LOG: OK.<br>
+REP_LOG_MORE: OK.<br>
+REP_LOG_REQ: OK.<br>
+REP_MASTER_REQ: OK.<br>
+REP_NEWCLIENT: OK.<br>
+REP_NEWFILE: OK.<br>
+REP_NEWMASTER: See above.&nbsp; If a client accepts a new master
+because its lease grant expired, and that master then sends a message
+requesting a lease grant, this client will not process the log record
+if it is in sync-up recovery, but it may do so after the master switch is
+complete and the client no longer needs sync-up recovery.&nbsp; Basically,
+this just uses the existing log record processing/newmaster infrastructure.<br>
+REP_NEWSITE: OK.<br>
+REP_PAGE: OK.&nbsp; Receiving a log record during internal init PAGE
+phase should ignore log record.<br>
+REP_PAGE_FAIL: OK.<br>
+REP_PAGE_MORE: OK.<br>
+REP_PAGE_REQ: OK.<br>
+REP_REREQUEST: OK.<br>
+REP_UPDATE: OK.&nbsp; Receiving a log record during internal init
+should ignore log record.<br>
+REP_UPDATE_REQ: OK - master-only message.<br>
+REP_VERIFY: OK.&nbsp; Receiving a log record during verify phase
+ignores log record.<br>
+REP_VERIFY_FAIL: OK.<br>
+REP_VERIFY_REQ: OK.<br>
+REP_VOTE1: OK.&nbsp; This client is processing someone else's vote when
+the lease request comes in.&nbsp; That is fine.&nbsp; We protect our
+own election and lease interaction in <i>__rep_elect</i>.<br>
+REP_VOTE2: OK.<br>
+<h4>Crashing - Potential Problem<br>
+</h4>
+It appears there is one area where we could have a problem.&nbsp; I
+believe that crashes can cause us to break our guarantees of durability,
+authoritative reads and the inability to elect duplicate masters.&nbsp;
+Consider this scenario:<br>
+<ol>
+ <li>A master and 4 clients are all up and running.</li>
+ <li>The master commits a txn and all 4 clients refresh their lease
+grants at time T.</li>
+ <li>All 4 clients have the txn and log records in the cache.&nbsp;
+None are flushing to disk.</li>
+ <li>All 4 clients have responded to the PERM messages as well as
+refreshed their lease with the master.</li>
+ <li>All 4 clients hit the same application coding error and crash
+(machine/OS stays up).</li>
+ <li>Master authoritatively reads data in txn from step 2.</li>
+ <li>All 4 clients restart the application and run recovery, thus the
+txn from step 2 is lost on all clients because it isn't in any log.<br>
+ </li>
+ <li>A network partition happens and the master is alone on its side.</li>
+ <li>All 4 clients are on the other side and elect a new master.</li>
+ <li>Partition resolves itself and we have duplicate masters, where
+the former master still holds all valid lease grants.<br>
+ </li>
+</ol>
+Therefore, we have broken both guarantees.&nbsp; In step 6 the data is
+really not durable and we've given it to the user.&nbsp; One can argue
+that if this is an issue the application had better be syncing somewhere if
+it really wants durability.&nbsp; However, worse than that is that we
+have a legitimate DUPMASTER situation in step 10 where both masters
+hold valid leases.&nbsp; The reason is that all lease knowledge is in
+the shared memory and that is lost when the app restarts and runs
+recovery.<br>
+<br>
+How can we solve this?&nbsp; The obvious solution is (ugh, yet another)
+durable BDB-owned file with some information in it, such as the current
+lease expiration time so that rebooting after a crash leaves the
+knowledge that the lease was granted.&nbsp; However, writing and
+syncing every lease grant on every client out to disk is far too
+expensive.<br>
+<br>
+A second possible solution is to have clients wait a full lease timeout
+before entering an election the first time. This solution solves the
+DUPMASTER issue, but not the non-authoritative read.&nbsp; This
+solution falls naturally out of elections and leases.&nbsp; If a
+client has never granted a lease, it should be considered as having to
+wait a full lease timeout before entering an election.&nbsp;
+Applications already know that leases impact elections and this does
+not seem so bad as it is only on the first election.<br>
+<br>
+Is it sufficient to document that the authoritative read is only as
+authoritative as the durability guarantees the application makes on the
+sites that indicate it is permanent? Yes, I believe this is sufficient.&nbsp; If
+the application says it is permanent and it really isn't, then the
+application is at fault.&nbsp; Believing the application when it
+indicates with the PERM response that it is permanent avoids the
+authoritative problem.&nbsp; <br>
+<h2>Upgrade/Mixed Versions</h2>
+Clearly leases cannot be used with mixed version sites since masters
+running older releases will not have any knowledge of lease
+support.&nbsp; What considerations are needed in the lease code for
+mixed versions?<br>
+<br>
+First, if the <b>REP_CONTROL</b>
+structure changes, we need to maintain and use an old version of the
+structure for talking to older clients and masters.&nbsp; The
+implementation of this would be similar to the way we manage for old <b>REP_VOTE_INFO</b>
+structures.&nbsp;
+Second, any new messages need translation table entries added.&nbsp;
+Third, if we are assuming global leases then clearly any mixed versions
+cannot have leases configured, and leases cannot be used in mixed
+version groups.&nbsp; Maintaining two versions of the control structure
+is not necessary if we choose a different style of implementation and
+don't change the control structure.<br>
+<br>
+However, then how could an old application both run continuously,
+upgrade to the new release and take advantage of leases without taking
+down the entire application?&nbsp; I believe it is possible for clients
+to be configured for leases but defer to the master regarding
+leases, yet the master code can assume that if it has leases
+configured, all client sites do as well.&nbsp; In several places above
+I suggested that a client could make a choice based on either a new <b>REPCTL_LEASE</b>
+flag or simply having
+leases turned on locally.&nbsp; If we choose to use the flag, then we
+can support leases with mixed versions.&nbsp; The upgraded clients can
+configure leases and they simply will not be granted until the old
+master is upgraded and sends PERM messages with the flag indicating it
+wants a lease grant.&nbsp; The clients, while having leases configured,
+will not grant a lease until told to do so and will simply have an
+expired lease.&nbsp; Then, when the old master finally upgrades, it too can
+configure leases and suddenly all sites are using them.&nbsp; I believe
+this should work just fine and I will need to make sure a client's
+granting of leases is only in response to the master asking for a
+grant.&nbsp; If the master never asks, then the client has them
+configured, but doesn't grant them.<br>
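+A sketch of the client-side gating (assuming the flag-based variant
+described above):<br>
+<pre>/* Client applying a PERM record: grant only when the master asks. */<br>if (leases_configured &amp;&amp; F_ISSET(rp, REPCTL_LEASE))<br> __rep_send_message(..., REP_LEASE_GRANT, ...);<br>/* Otherwise leases stay configured locally but simply expired. */</pre>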
+<h2>Testing</h2>
+Clearly any user-facing API changes will need the equivalent reflection
+in the Tcl API for testing, under CONFIG_TEST.<br>
+<br>
+I am sure the list of tests will grow but off the top of my head:<br>
+Basic test: have N sites all configure leases, run some workload, read on
+master, etc.<br>
+Refresh test: Perform update on master, sleep until past expiration,
+read on master and make sure leases are refreshed and the read succeeds.<br>
+Error test: Test error conditions (reading on client with leases but no
+ignore flag, calling after rep_start, etc.)<br>
+Read test: Test reading on both client and master both with and without
+the IGNORE flag.&nbsp; Test that data read with the ignore flag can be
+rolled back.<br>
+Dupmaster test: Force a DUPMASTER situation and verify that the newer
+master cannot get the DUPMASTER error.<br>
+Election test: Call election while grant is outstanding and master
+exists.<br>
+Call election while grant is outstanding and master does not exist.<br>
+Call election after expiration on a quiescent system with the master
+existing.<br>
+Run with a group where some members have leases configured and others do
+not to make sure we get errors instead of dumping core.<br>
+<br>
+</body>
+</html>
diff --git a/src/rep/rep.msg b/src/rep/rep.msg
new file mode 100644
index 00000000..b751a64d
--- /dev/null
+++ b/src/rep/rep.msg
@@ -0,0 +1,160 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __rep
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/mp.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * bulk - message for bulk log records or pages
+ */
+BEGIN_MSG bulk check_length
+ARG len u_int32_t
+ARG lsn DB_LSN
+ARG bulkdata DBT
+END
+
+/*
+ * control - replication control message
+ */
+BEGIN_MSG control check_length
+ARG rep_version u_int32_t
+ARG log_version u_int32_t
+ARG lsn DB_LSN
+ARG rectype u_int32_t
+ARG gen u_int32_t
+ARG msg_sec u_int32_t
+ARG msg_nsec u_int32_t
+ARG flags u_int32_t
+END
+
+/*
+ * egen data
+ */
+BEGIN_MSG egen check_length
+ARG egen u_int32_t
+END
+
+/*
+ * file info
+ *
+ * NOTE: The order of the DBTs is important and relevant in the
+ * GET_CURINFO macro.
+ */
+BEGIN_MSG fileinfo alloc check_length version
+ARG pgsize u_int32_t
+ARG pgno db_pgno_t
+ARG max_pgno db_pgno_t
+ARG filenum u_int32_t
+ARG finfo_flags u_int32_t
+ARG type u_int32_t
+ARG db_flags u_int32_t
+ARG uid DBT
+ARG info DBT
+ARG dir DBT
+END
+
+BEGIN_MSG fileinfo_v6 alloc check_length version
+ARG pgsize u_int32_t
+ARG pgno db_pgno_t
+ARG max_pgno db_pgno_t
+ARG filenum u_int32_t
+ARG finfo_flags u_int32_t
+ARG type u_int32_t
+ARG db_flags u_int32_t
+ARG uid DBT
+ARG info DBT
+END
+
+/*
+ * grant info - clients send to masters granting a lease.
+ */
+BEGIN_MSG grant_info check_length
+ARG msg_sec u_int32_t
+ARG msg_nsec u_int32_t
+END
+
+/*
+ * We do not need to do anything with LOG record data.
+ * It is opaque data to us.
+ */
+
+/*
+ * log request
+ */
+BEGIN_MSG logreq check_length
+ARG endlsn DB_LSN
+END
+
+/*
+ * We do not need to do anything with NEWCLIENT/NEWSITE cdata dbt.
+ * It is user data and the app has to do whatever transformation
+ * it needs to with its own data.
+ */
+/*
+ * newfile version
+ */
+BEGIN_MSG newfile check_length
+ARG version u_int32_t
+END
+
+/*
+ * update - send update information
+ */
+BEGIN_MSG update alloc check_length version
+ARG first_lsn DB_LSN
+ARG first_vers u_int32_t
+ARG num_files u_int32_t
+END
+
+/*
+ * vote info. Current version.
+ */
+BEGIN_MSG vote_info check_length
+ARG egen u_int32_t
+ARG nsites u_int32_t
+ARG nvotes u_int32_t
+ARG priority u_int32_t
+ARG spare_pri u_int32_t
+ARG tiebreaker u_int32_t
+ARG data_gen u_int32_t
+END
+/*
+ * vote info old version from REPVERSION 5 and earlier.
+ */
+BEGIN_MSG vote_info_v5 check_length
+ARG egen u_int32_t
+ARG nsites u_int32_t
+ARG nvotes u_int32_t
+ARG priority u_int32_t
+ARG tiebreaker u_int32_t
+END
+
+/*
+ * LSN history database - key
+ */
+BEGIN_MSG lsn_hist_key
+ARG version u_int32_t
+ARG gen u_int32_t
+END
+
+/*
+ * LSN history database - data
+ */
+BEGIN_MSG lsn_hist_data
+ARG envid u_int32_t
+ARG lsn DB_LSN
+ARG hist_sec u_int32_t
+ARG hist_nsec u_int32_t
+END
diff --git a/src/rep/rep_automsg.c b/src/rep/rep_automsg.c
new file mode 100644
index 00000000..5d8155fb
--- /dev/null
+++ b/src/rep/rep_automsg.c
@@ -0,0 +1,1041 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __rep_bulk_marshal __P((ENV *, __rep_bulk_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_bulk_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_bulk_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_BULK_SIZE
+ + (size_t)argp->bulkdata.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->len);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->bulkdata.size);
+ if (argp->bulkdata.size > 0) {
+ memcpy(bp, argp->bulkdata.data, argp->bulkdata.size);
+ bp += argp->bulkdata.size;
+ }
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_bulk_unmarshal __P((ENV *, __rep_bulk_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_bulk_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_bulk_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REP_BULK_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->len, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->bulkdata.size, bp);
+ if (argp->bulkdata.size == 0)
+ argp->bulkdata.data = NULL;
+ else
+ argp->bulkdata.data = bp;
+ needed += (size_t)argp->bulkdata.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->bulkdata.size;
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_bulk message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_control_marshal __P((ENV *, __rep_control_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_control_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_control_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_CONTROL_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->rep_version);
+ DB_HTONL_COPYOUT(env, bp, argp->log_version);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->rectype);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+ DB_HTONL_COPYOUT(env, bp, argp->msg_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->msg_nsec);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_control_unmarshal __P((ENV *,
+ * PUBLIC: __rep_control_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_control_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_control_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_CONTROL_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->rep_version, bp);
+ DB_NTOHL_COPYIN(env, argp->log_version, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->rectype, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+ DB_NTOHL_COPYIN(env, argp->msg_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->msg_nsec, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_control message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_egen_marshal __P((ENV *, __rep_egen_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_egen_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_egen_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_EGEN_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->egen);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_egen_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_egen_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_EGEN_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->egen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_egen message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_fileinfo_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ if (max < __REP_FILEINFO_SIZE
+ + (size_t)argp->uid.size
+ + (size_t)argp->info.size
+ + (size_t)argp->dir.size)
+ return (ENOMEM);
+ start = bp;
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->pgsize, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgsize);
+ if (copy_only) {
+ memcpy(bp, &argp->pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->max_pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->max_pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->filenum, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->filenum);
+ if (copy_only) {
+ memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->finfo_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->type, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->type);
+ if (copy_only) {
+ memcpy(bp, &argp->db_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->db_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->uid.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->uid.size);
+ if (argp->uid.size > 0) {
+ memcpy(bp, argp->uid.data, argp->uid.size);
+ bp += argp->uid.size;
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->info.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->info.size);
+ if (argp->info.size > 0) {
+ memcpy(bp, argp->info.data, argp->info.size);
+ bp += argp->info.size;
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->dir.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->dir.size);
+ if (argp->dir.size > 0) {
+ memcpy(bp, argp->dir.data, argp->dir.size);
+ bp += argp->dir.size;
+ }
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_fileinfo_unmarshal(env, version, argpp, bp, max, nextp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_args **argpp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+ __rep_fileinfo_args *argp;
+ int ret;
+ int copy_only;
+
+ needed = __REP_FILEINFO_SIZE;
+ if (max < needed)
+ goto too_few;
+ if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+ return (ret);
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(&argp->pgsize, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgsize, bp);
+ if (copy_only) {
+ memcpy(&argp->pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->max_pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->max_pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->filenum, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->filenum, bp);
+ if (copy_only) {
+ memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->finfo_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->type, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->type, bp);
+ if (copy_only) {
+ memcpy(&argp->db_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->db_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->uid.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->uid.size, bp);
+ if (argp->uid.size == 0)
+ argp->uid.data = NULL;
+ else
+ argp->uid.data = bp;
+ needed += (size_t)argp->uid.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->uid.size;
+ if (copy_only) {
+ memcpy(&argp->info.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->info.size, bp);
+ if (argp->info.size == 0)
+ argp->info.data = NULL;
+ else
+ argp->info.data = bp;
+ needed += (size_t)argp->info.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->info.size;
+ if (copy_only) {
+ memcpy(&argp->dir.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->dir.size, bp);
+ if (argp->dir.size == 0)
+ argp->dir.data = NULL;
+ else
+ argp->dir.data = bp;
+ needed += (size_t)argp->dir.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->dir.size;
+
+ if (nextp != NULL)
+ *nextp = bp;
+ *argpp = argp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_fileinfo message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_fileinfo_v6_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_v6_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ if (max < __REP_FILEINFO_V6_SIZE
+ + (size_t)argp->uid.size
+ + (size_t)argp->info.size)
+ return (ENOMEM);
+ start = bp;
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->pgsize, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgsize);
+ if (copy_only) {
+ memcpy(bp, &argp->pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->max_pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->max_pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->filenum, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->filenum);
+ if (copy_only) {
+ memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->finfo_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->type, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->type);
+ if (copy_only) {
+ memcpy(bp, &argp->db_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->db_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->uid.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->uid.size);
+ if (argp->uid.size > 0) {
+ memcpy(bp, argp->uid.data, argp->uid.size);
+ bp += argp->uid.size;
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->info.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->info.size);
+ if (argp->info.size > 0) {
+ memcpy(bp, argp->info.data, argp->info.size);
+ bp += argp->info.size;
+ }
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_v6_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_v6_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_fileinfo_v6_unmarshal(env, version, argpp, bp, max, nextp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_v6_args **argpp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+ __rep_fileinfo_v6_args *argp;
+ int ret;
+ int copy_only;
+
+ needed = __REP_FILEINFO_V6_SIZE;
+ if (max < needed)
+ goto too_few;
+ if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+ return (ret);
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(&argp->pgsize, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgsize, bp);
+ if (copy_only) {
+ memcpy(&argp->pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->max_pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->max_pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->filenum, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->filenum, bp);
+ if (copy_only) {
+ memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->finfo_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->type, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->type, bp);
+ if (copy_only) {
+ memcpy(&argp->db_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->db_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->uid.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->uid.size, bp);
+ if (argp->uid.size == 0)
+ argp->uid.data = NULL;
+ else
+ argp->uid.data = bp;
+ needed += (size_t)argp->uid.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->uid.size;
+ if (copy_only) {
+ memcpy(&argp->info.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->info.size, bp);
+ if (argp->info.size == 0)
+ argp->info.data = NULL;
+ else
+ argp->info.data = bp;
+ needed += (size_t)argp->info.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->info.size;
+
+ if (nextp != NULL)
+ *nextp = bp;
+ *argpp = argp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_fileinfo_v6 message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_grant_info_marshal __P((ENV *,
+ * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_grant_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_grant_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_GRANT_INFO_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->msg_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->msg_nsec);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_grant_info_unmarshal __P((ENV *,
+ * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_grant_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_grant_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_GRANT_INFO_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->msg_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->msg_nsec, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_grant_info message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_logreq_marshal __P((ENV *, __rep_logreq_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_logreq_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_logreq_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_LOGREQ_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->endlsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->endlsn.offset);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_logreq_unmarshal __P((ENV *, __rep_logreq_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_logreq_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_logreq_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_LOGREQ_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->endlsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->endlsn.offset, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_logreq message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_newfile_marshal __P((ENV *, __rep_newfile_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_newfile_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_newfile_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_NEWFILE_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_newfile_unmarshal __P((ENV *,
+ * PUBLIC: __rep_newfile_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_newfile_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_newfile_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_NEWFILE_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_newfile message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_update_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_update_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_update_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_update_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ if (max < __REP_UPDATE_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->first_lsn.file, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ memcpy(bp, &argp->first_lsn.offset, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ DB_HTONL_COPYOUT(env, bp, argp->first_lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->first_lsn.offset);
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->first_vers, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->first_vers);
+ if (copy_only) {
+ memcpy(bp, &argp->num_files, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->num_files);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_update_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_update_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_update_unmarshal(env, version, argpp, bp, max, nextp)
+ ENV *env;
+ u_int32_t version;
+ __rep_update_args **argpp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ __rep_update_args *argp;
+ int ret;
+ int copy_only;
+
+ if (max < __REP_UPDATE_SIZE)
+ goto too_few;
+ if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+ return (ret);
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(&argp->first_lsn.file, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ memcpy(&argp->first_lsn.offset, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ DB_NTOHL_COPYIN(env, argp->first_lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->first_lsn.offset, bp);
+ }
+ if (copy_only) {
+ memcpy(&argp->first_vers, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->first_vers, bp);
+ if (copy_only) {
+ memcpy(&argp->num_files, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->num_files, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ *argpp = argp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_update message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_marshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_vote_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_vote_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_VOTE_INFO_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->egen);
+ DB_HTONL_COPYOUT(env, bp, argp->nsites);
+ DB_HTONL_COPYOUT(env, bp, argp->nvotes);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+ DB_HTONL_COPYOUT(env, bp, argp->spare_pri);
+ DB_HTONL_COPYOUT(env, bp, argp->tiebreaker);
+ DB_HTONL_COPYOUT(env, bp, argp->data_gen);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_unmarshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_vote_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_vote_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_VOTE_INFO_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->egen, bp);
+ DB_NTOHL_COPYIN(env, argp->nsites, bp);
+ DB_NTOHL_COPYIN(env, argp->nvotes, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+ DB_NTOHL_COPYIN(env, argp->spare_pri, bp);
+ DB_NTOHL_COPYIN(env, argp->tiebreaker, bp);
+ DB_NTOHL_COPYIN(env, argp->data_gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_vote_info message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_v5_marshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_v5_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_vote_info_v5_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_vote_info_v5_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_VOTE_INFO_V5_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->egen);
+ DB_HTONL_COPYOUT(env, bp, argp->nsites);
+ DB_HTONL_COPYOUT(env, bp, argp->nvotes);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+ DB_HTONL_COPYOUT(env, bp, argp->tiebreaker);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_v5_unmarshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_v5_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_vote_info_v5_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_vote_info_v5_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_VOTE_INFO_V5_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->egen, bp);
+ DB_NTOHL_COPYIN(env, argp->nsites, bp);
+ DB_NTOHL_COPYIN(env, argp->nvotes, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+ DB_NTOHL_COPYIN(env, argp->tiebreaker, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_vote_info_v5 message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __rep_lsn_hist_key_marshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_key_args *, u_int8_t *));
+ */
+void
+__rep_lsn_hist_key_marshal(env, argp, bp)
+ ENV *env;
+ __rep_lsn_hist_key_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __rep_lsn_hist_key_unmarshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_key_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_lsn_hist_key_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_lsn_hist_key_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_LSN_HIST_KEY_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_lsn_hist_key message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __rep_lsn_hist_data_marshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_data_args *, u_int8_t *));
+ */
+void
+__rep_lsn_hist_data_marshal(env, argp, bp)
+ ENV *env;
+ __rep_lsn_hist_data_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->envid);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->hist_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->hist_nsec);
+}
+
+/*
+ * PUBLIC: int __rep_lsn_hist_data_unmarshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_data_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_lsn_hist_data_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_lsn_hist_data_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_LSN_HIST_DATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->envid, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->hist_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->hist_nsec, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_lsn_hist_data message"));
+ return (EINVAL);
+}
+
diff --git a/src/rep/rep_backup.c b/src/rep/rep_backup.c
new file mode 100644
index 00000000..cfde7622
--- /dev/null
+++ b/src/rep/rep_backup.c
@@ -0,0 +1,3568 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * Context information needed for buffer management during the building of a
+ * list of database files present in the environment. When fully built, the
+ * buffer is in the form of an UPDATE message: a (marshaled) update_args,
+ * followed by some number of (marshaled) fileinfo_args.
+ *
+ * Note that the fileinfo for the first file in the list always appears at
+ * (constant) offset __REP_UPDATE_SIZE in the buffer.
+ */
+typedef struct {
+ u_int8_t *buf; /* Buffer base address. */
+ u_int32_t size; /* Total allocated buffer size. */
+ u_int8_t *fillptr; /* Pointer to first unused space. */
+ u_int32_t count; /* Number of entries currently in list. */
+ u_int32_t version; /* Rep version of marshaled format. */
+} FILE_LIST_CTX;
+#define FIRST_FILE_PTR(buf) ((buf) + __REP_UPDATE_SIZE)
+
+/*
+ * Function that performs any desired processing on a single file, as part of
+ * the traversal of a list of database files, such as with internal init.
+ */
+typedef int (FILE_WALK_FN) __P((ENV *, __rep_fileinfo_args *, void *));
+
+static FILE_WALK_FN __rep_check_uid;
+static int __rep_clean_interrupted __P((ENV *));
+static FILE_WALK_FN __rep_cleanup_nimdbs;
+static int __rep_filedone __P((ENV *, DB_THREAD_INFO *ip, int,
+ REP *, __rep_fileinfo_args *, u_int32_t));
+static int __rep_find_dbs __P((ENV *, FILE_LIST_CTX *));
+static FILE_WALK_FN __rep_find_inmem;
+static int __rep_get_fileinfo __P((ENV *, const char *,
+ const char *, __rep_fileinfo_args *, u_int8_t *));
+static int __rep_get_file_list __P((ENV *,
+ DB_FH *, u_int32_t, u_int32_t *, DBT *));
+static int __rep_is_replicated_db __P((const char *, const char *));
+static int __rep_log_setup __P((ENV *,
+ REP *, u_int32_t, u_int32_t, DB_LSN *));
+static int __rep_mpf_open __P((ENV *, DB_MPOOLFILE **,
+ __rep_fileinfo_args *, u_int32_t));
+static int __rep_nextfile __P((ENV *, int, REP *));
+static int __rep_page_gap __P((ENV *,
+ REP *, __rep_fileinfo_args *, u_int32_t));
+static int __rep_page_sendpages __P((ENV *, DB_THREAD_INFO *, int,
+ __rep_control_args *, __rep_fileinfo_args *, DB_MPOOLFILE *, DB *));
+static int __rep_queue_filedone __P((ENV *,
+ DB_THREAD_INFO *, REP *, __rep_fileinfo_args *));
+static int __rep_remove_all __P((ENV *, u_int32_t, DBT *));
+static FILE_WALK_FN __rep_remove_by_list;
+static int __rep_remove_by_prefix __P((ENV *, const char *, const char *,
+ size_t, APPNAME));
+static FILE_WALK_FN __rep_remove_file;
+static int __rep_remove_logs __P((ENV *));
+static int __rep_remove_nimdbs __P((ENV *));
+static int __rep_rollback __P((ENV *, DB_LSN *));
+static int __rep_unlink_by_list __P((ENV *, u_int32_t,
+ u_int8_t *, u_int32_t, u_int32_t));
+static FILE_WALK_FN __rep_unlink_file;
+static int __rep_walk_filelist __P((ENV *, u_int32_t, u_int8_t *,
+ u_int32_t, u_int32_t, FILE_WALK_FN *, void *));
+static int __rep_walk_dir __P((ENV *, const char *, const char *,
+ FILE_LIST_CTX*));
+static int __rep_write_page __P((ENV *,
+ DB_THREAD_INFO *, REP *, __rep_fileinfo_args *));
+
+/*
+ * __rep_update_req -
+ * Process an update_req and send the file information to clients.
+ *
+ * PUBLIC: int __rep_update_req __P((ENV *, __rep_control_args *));
+ */
+int
+__rep_update_req(env, rp)
+ ENV *env;
+ __rep_control_args *rp;
+{
+ DBT updbt, vdbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_update_args u_args;
+ FILE_LIST_CTX context;
+ size_t updlen;
+ u_int32_t flag, version;
+ int ret, t_ret;
+
+ /*
+ * Start by allocating 1Meg, which ought to be plenty to describe
+ * all databases in the environment. (If it's not, __rep_walk_dir can
+ * grow the size.)
+ *
+ * The data we send looks like this:
+ * __rep_update_args
+ * __rep_fileinfo_args
+ * __rep_fileinfo_args
+ * ...
+ */
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_INUPDREQ)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ F_SET(rep, REP_F_INUPDREQ);
+ REP_SYSTEM_UNLOCK(env);
+
+ dblp = env->lg_handle;
+ logc = NULL;
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ goto err_noalloc;
+ context.size = MEGABYTE;
+ context.count = 0;
+ context.version = rp->rep_version;
+
+ /* Reserve space for the update_args, and fill in file info. */
+ context.fillptr = FIRST_FILE_PTR(context.buf);
+ if ((ret = __rep_find_dbs(env, &context)) != 0)
+ goto err;
+
+ /*
+ * Now get our first LSN. We send the lsn of the first
+ * non-archivable log file.
+ */
+ flag = DB_SET;
+ if ((ret = __log_get_stable_lsn(env, &lsn, 0)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ /*
+ * If ret is DB_NOTFOUND, there is no checkpoint in
+ * this log; that is okay, just start at the beginning.
+ */
+ ret = 0;
+ flag = DB_FIRST;
+ }
+
+ /*
+ * Now get the version number of the log file of that LSN.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ memset(&vdbt, 0, sizeof(vdbt));
+ /*
+ * Set our log cursor on the LSN we are sending, or on
+ * the first LSN if we have no stable LSN.
+ */
+ if ((ret = __logc_get(logc, &lsn, &vdbt, flag)) != 0) {
+ /*
+ * We could be racing a fresh master starting up. If we
+ * have no log records, assume an initial LSN and current
+ * log version.
+ */
+ if (ret != DB_NOTFOUND)
+ goto err;
+ INIT_LSN(lsn);
+ version = DB_LOGVERSION;
+ } else {
+ if ((ret = __logc_version(logc, &version)) != 0)
+ goto err;
+ }
+ /*
+ * Package up the update information.
+ */
+ u_args.first_lsn = lsn;
+ u_args.first_vers = version;
+ u_args.num_files = context.count;
+ if ((ret = __rep_update_marshal(env, rp->rep_version,
+ &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0)
+ goto err;
+ DB_ASSERT(env, updlen == __REP_UPDATE_SIZE);
+
+ /*
+ * We have all the file information now. Send it.
+ */
+ DB_INIT_DBT(updbt, context.buf, context.fillptr - context.buf);
+
+ LOG_SYSTEM_LOCK(env);
+ lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(
+ env, DB_EID_BROADCAST, REP_UPDATE, &lsn, &updbt, 0, 0);
+
+err: __os_free(env, context.buf);
+err_noalloc:
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_INUPDREQ);
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __rep_find_dbs -
+ * Walk through all the named files/databases including those in the
+ * environment or data_dirs and those that in named and in-memory. We
+ * need to open them, gather the necessary information and then close
+ * them.
+ *
+ * May be called either while holding REP_SYSTEM_LOCK or without.
+ */
+static int
+__rep_find_dbs(env, context)
+ ENV *env;
+ FILE_LIST_CTX *context;
+{
+ DB_ENV *dbenv;
+ int ret;
+ char **ddir, *real_dir;
+
+ dbenv = env->dbenv;
+ ret = 0;
+ real_dir = NULL;
+
+ /*
+ * If we have a data directory, walk it to get a list of the
+ * replicated user databases. If the application has a metadata_dir,
+ * this will also find any persistent internal system databases.
+ */
+ if (dbenv->db_data_dir != NULL) {
+ for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, *ddir, NULL, &real_dir)) != 0)
+ break;
+ if ((ret = __rep_walk_dir(env,
+ real_dir, *ddir, context)) != 0)
+ break;
+ __os_free(env, real_dir);
+ real_dir = NULL;
+ }
+ }
+ /*
+ * Walk the environment directory. If the application doesn't
+ * have a metadata_dir, this will return persistent internal system
+ * databases. If the application doesn't have a separate data
+ * directory, this will also return all user databases.
+ */
+ if (ret == 0)
+ ret = __rep_walk_dir(env, env->db_home, NULL, context);
+
+ /* Now, collect any in-memory named databases. */
+ if (ret == 0)
+ ret = __rep_walk_dir(env, NULL, NULL, context);
+
+ if (real_dir != NULL)
+ __os_free(env, real_dir);
+ return (ret);
+}
+
+/*
+ * __rep_walk_dir --
+ *
+ * This is the routine that walks a directory and fills in the structures
+ * that we use to generate messages to the client telling it what
+ * files are available. If the directory name is NULL, then we should
+ * walk the list of in-memory named files.
+ */
+static int
+__rep_walk_dir(env, dir, datadir, context)
+ ENV *env;
+ const char *dir, *datadir;
+ FILE_LIST_CTX *context;
+{
+ __rep_fileinfo_args tmpfp;
+ size_t avail, len;
+ int cnt, first_file, i, ret;
+ u_int8_t uid[DB_FILE_ID_LEN];
+ char *file, **names, *subdb;
+
+ if (dir == NULL) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: Getting info for in-memory named files"));
+ if ((ret = __memp_inmemlist(env, &names, &cnt)) != 0)
+ return (ret);
+ } else {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: Getting info for datadir %s, dir: %s",
+ datadir == NULL ? "NULL" : datadir, dir));
+ if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0)
+ return (ret);
+ }
+ VPRINT(env, (env, DB_VERB_REP_SYNC, "Walk_dir: Dir %s has %d files",
+ (dir == NULL) ? "INMEM" : dir, cnt));
+ first_file = 1;
+ for (i = 0; i < cnt; i++) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: File %d name: %s", i, names[i]));
+ if (!__rep_is_replicated_db(names[i], dir))
+ continue;
+
+ /* We found a file to process. */
+ if (dir == NULL) {
+ file = NULL;
+ subdb = names[i];
+ } else {
+ file = names[i];
+ subdb = NULL;
+ }
+ if ((ret = __rep_get_fileinfo(env,
+ file, subdb, &tmpfp, uid)) != 0) {
+ /*
+ * If we find a file that isn't a database, skip it.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: File %d %s: returned error %s",
+ i, names[i], db_strerror(ret)));
+ ret = 0;
+ continue;
+ }
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: File %s at 0x%lx: pgsize %lu, max_pgno %lu",
+ names[i], P_TO_ULONG(context->fillptr),
+ (u_long)tmpfp.pgsize, (u_long)tmpfp.max_pgno));
+
+ /*
+ * On the first time through the loop, check to see if the file
+ * we're about to add is already on the list. If it is, it must
+ * have been added in a previous call, and that means the
+ * directory we're currently scanning has already been scanned
+ * before. (This can happen if the user called
+ * env->set_data_dir() more than once for the same directory.)
+ * If that's the case, we're done: not only is it a waste of
+ * time to scan the same directory again, but doing so would
+ * result in the same files appearing in the list more than
+ * once.
+ */
+ if (first_file && dir != NULL &&
+ (ret = __rep_walk_filelist(env, context->version,
+ FIRST_FILE_PTR(context->buf), context->size,
+ context->count, __rep_check_uid, uid)) != 0) {
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ goto err;
+ }
+ first_file = 0;
+
+ /*
+ * Finally we know that this file is a suitable database file
+ * that we haven't yet included on our list.
+ */
+ tmpfp.filenum = context->count++;
+
+ if (datadir != NULL)
+ DB_SET_DBT(tmpfp.dir, datadir, strlen(datadir) + 1);
+ else
+ DB_SET_DBT(tmpfp.dir, NULL, 0);
+ DB_SET_DBT(tmpfp.info, names[i], strlen(names[i]) + 1);
+ DB_SET_DBT(tmpfp.uid, uid, DB_FILE_ID_LEN);
+retry: avail = (size_t)(&context->buf[context->size] -
+ context->fillptr);
+ if (context->version < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env, context->version,
+ (__rep_fileinfo_v6_args *)&tmpfp,
+ context->fillptr, avail, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, context->version,
+ &tmpfp, context->fillptr, avail, &len);
+ if (ret == ENOMEM) {
+ /*
+ * Here, 'len' is the total space in use in the buffer.
+ */
+ len = (size_t)(context->fillptr - context->buf);
+ context->size *= 2;
+
+ if ((ret = __os_realloc(env,
+ context->size, &context->buf)) != 0)
+ goto err;
+ context->fillptr = context->buf + len;
+
+ /*
+ * Now that we've reallocated the space, try to
+ * store it again.
+ */
+ goto retry;
+ }
+ /*
+ * Here, 'len' (still) holds the length of the marshaled
+ * information about the current file (as filled in by the last
+ * call to __rep_fileinfo_marshal()).
+ */
+ context->fillptr += len;
+ }
+err:
+ __os_dirfree(env, names, cnt);
+ return (ret);
+}
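+
+/*
+ * Illustrative only: the buffer-doubling pattern used by __rep_walk_dir,
+ * reduced to its essentials. A marshal call failing with ENOMEM triggers
+ * a doubling realloc, after which the append is retried at the preserved
+ * fill offset. The helper name is hypothetical.
+ */
+#if 0
+static int
+example_append_fileinfo(env, context, fp)
+ ENV *env;
+ FILE_LIST_CTX *context;
+ __rep_fileinfo_args *fp;
+{
+ size_t avail, len;
+ int ret;
+
+retry: avail = (size_t)(&context->buf[context->size] - context->fillptr);
+ ret = __rep_fileinfo_marshal(env, context->version,
+ fp, context->fillptr, avail, &len);
+ if (ret == ENOMEM) {
+ /* Remember how much of the buffer is in use. */
+ len = (size_t)(context->fillptr - context->buf);
+ context->size *= 2;
+ if ((ret = __os_realloc(env,
+ context->size, &context->buf)) != 0)
+ return (ret);
+ context->fillptr = context->buf + len;
+ goto retry;
+ }
+ if (ret == 0)
+ context->fillptr += len;
+ return (ret);
+}
+#endif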
+
+/*
+ * Returns a boolean to indicate whether a file/database with the given name
+ * should be included in internal init.
+ */
+static int
+__rep_is_replicated_db(name, dir)
+ const char *name, *dir;
+{
+ if (strcmp(name, "DB_CONFIG") == 0 || strcmp(name, "pragma") == 0)
+ return (0);
+ if (IS_LOG_FILE(name))
+ return (0);
+
+ /*
+ * Remaining things that don't have a "__db" prefix are eligible.
+ */
+ if (!IS_DB_FILE(name))
+ return (1);
+
+ /* Here, we know we have a "__db" name. */
+ if (name[sizeof(DB_REGION_PREFIX) - 1] == 'p')
+ return (1); /* Partition files are eligible. */
+
+ /*
+ * Replicated system databases are eligible. When on disk, both DBs are
+ * sub-databases of a single database file.
+ */
+ if (dir == NULL) {
+ if (strcmp(name, REPMEMBERSHIP) == 0 ||
+ strcmp(name, REPLSNHIST) == 0)
+ return (1);
+ } else {
+ if (IS_REP_FILE(name))
+ return (1);
+ }
+
+ /* Some other "__db" named file. */
+ return (0);
+}
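+
+/*
+ * Illustrative expectations for __rep_is_replicated_db, using hypothetical
+ * names (partition files carry the "__dbp" prefix):
+ *
+ * "customers.db" -> 1 (ordinary user database)
+ * "DB_CONFIG" -> 0 (configuration file)
+ * "log.0000000001" -> 0 (log file)
+ * "__dbp.custs.db.000" -> 1 (partition file)
+ */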
+
+/*
+ * Check whether the given uid is already present in the list of files being
+ * built in the context buffer. A return of DB_KEYEXIST means it is.
+ */
+static int
+__rep_check_uid(env, rfp, uid)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *uid;
+{
+ int ret;
+
+ ret = 0;
+ if (memcmp(rfp->uid.data, uid, DB_FILE_ID_LEN) == 0) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Check_uid: Found matching file."));
+ ret = DB_KEYEXIST;
+ }
+ return (ret);
+}
+
+static int
+__rep_get_fileinfo(env, file, subdb, rfp, uid)
+ ENV *env;
+ const char *file, *subdb;
+ __rep_fileinfo_args *rfp;
+ u_int8_t *uid;
+{
+ DB *dbp;
+ DBC *dbc;
+ DBMETA *dbmeta;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ int lorder, ret, t_ret;
+
+ dbp = NULL;
+ dbc = NULL;
+ pagep = NULL;
+
+ ENV_GET_THREAD_INFO(env, ip);
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ /*
+ * Use DB_AM_RECOVER to prevent getting locks, otherwise exclusive
+ * database handles would block the master from handling UPDATE_REQ.
+ */
+ F_SET(dbp, DB_AM_RECOVER);
+ if ((ret = __db_open(dbp, ip, NULL, file, subdb, DB_UNKNOWN,
+ DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
+ 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &dbp->meta_pgno, ip, dbc->txn,
+ 0, &pagep)) != 0)
+ goto err;
+ /*
+ * We have the meta page. Set up our information.
+ */
+ dbmeta = (DBMETA *)pagep;
+ rfp->pgno = 0;
+ /*
+ * Queue is a special-case. We need to set max_pgno to 0 so that
+ * the client can compute the pages from the meta-data.
+ */
+ if (dbp->type == DB_QUEUE)
+ rfp->max_pgno = 0;
+ else
+ rfp->max_pgno = dbmeta->last_pgno;
+ rfp->pgsize = dbp->pgsize;
+ memcpy(uid, dbp->fileid, DB_FILE_ID_LEN);
+ rfp->type = (u_int32_t)dbp->type;
+ rfp->db_flags = dbp->flags;
+ rfp->finfo_flags = 0;
+ /*
+ * Send the lorder of this database.
+ */
+ (void)__db_get_lorder(dbp, &lorder);
+ if (lorder == 1234)
+ FLD_SET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN);
+ else
+ FLD_CLR(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN);
+
+ ret = __memp_fput(dbp->mpf, ip, pagep, dbc->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto err;
+err:
+ /*
+ * Check status of pagep in case any new error paths out leave
+ * a valid page. All current paths out have pagep NULL.
+ */
+ DB_ASSERT(env, pagep == NULL);
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
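+
+/*
+ * Aside (illustrative, not part of the original change): Berkeley DB
+ * encodes byte order as the integers 1234 (little-endian) and 4321
+ * (big-endian), the same values DB->set_lorder() accepts, which is why
+ * the lorder == 1234 test above maps directly onto
+ * REPINFO_DB_LITTLEENDIAN.
+ */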
+
+/*
+ * __rep_page_req
+ * Process a page_req and send the page information to the client.
+ *
+ * PUBLIC: int __rep_page_req __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page_req(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ __rep_fileinfo_args *msgfp, msgf;
+ __rep_fileinfo_v6_args *msgfpv6;
+ DB_MPOOLFILE *mpf;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret, t_ret;
+ u_int8_t *next;
+ void *msgfree;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rp->rep_version < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version,
+ &msgfpv6, rec->data, rec->size, &next)) != 0)
+ return (ret);
+ memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args));
+ msgf.dir.data = NULL;
+ msgf.dir.size = 0;
+ msgfp = &msgf;
+ msgfree = msgfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+ &msgfp, rec->data, rec->size, &next)) != 0)
+ return (ret);
+ msgfree = msgfp;
+ }
+
+ DB_TEST_SET(env->test_abort, DB_TEST_NO_PAGES);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "page_req: file %d page %lu to %lu",
+ msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno));
+
+ /*
+ * We need to open the file and then send its pages.
+ * If we cannot open the file, we send REP_FILE_FAIL.
+ */
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "page_req: Open %d via mpf_open", msgfp->filenum));
+ if ((ret = __rep_mpf_open(env, &mpf, msgfp, 0)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "page_req: Open %d failed", msgfp->filenum));
+ if (F_ISSET(rep, REP_F_MASTER))
+ (void)__rep_send_message(env, eid, REP_FILE_FAIL,
+ NULL, rec, 0, 0);
+ else
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ ret = __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, NULL);
+ t_ret = __memp_fclose(mpf, 0);
+ if (ret == 0 && t_ret != 0)
+ ret = t_ret;
+err:
+DB_TEST_RECOVERY_LABEL
+ __os_free(env, msgfree);
+ return (ret);
+}
+
+static int
+__rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ __rep_fileinfo_args *msgfp;
+ DB_MPOOLFILE *mpf;
+ DB *dbp;
+{
+ DB *qdbp;
+ DBC *qdbc;
+ DBT msgdbt;
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ PAGE *pagep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ db_pgno_t p;
+ uintptr_t bulkoff;
+ size_t len, msgsz;
+ u_int32_t bulkflags, use_bulk;
+ int opened, ret, t_ret;
+ u_int8_t *buf;
+
+ dblp = env->lg_handle;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ opened = 0;
+ t_ret = 0;
+ qdbp = NULL;
+ qdbc = NULL;
+ buf = NULL;
+ bulk.addr = NULL;
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ if (msgfp->type == (u_int32_t)DB_QUEUE) {
+ if (dbp == NULL) {
+ if ((ret = __db_create_internal(&qdbp, env, 0)) != 0)
+ goto err;
+ /*
+ * We need to check whether this is in-memory so that
+ * we pass the name correctly as either the file or
+ * the database name.
+ */
+ if ((ret = __db_open(qdbp, ip, NULL,
+ FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ?
+ NULL : msgfp->info.data,
+ FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ?
+ msgfp->info.data : NULL,
+ DB_UNKNOWN,
+ DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
+ 0, PGNO_BASE_MD)) != 0)
+ goto err;
+ opened = 1;
+ } else
+ qdbp = dbp;
+ if ((ret = __db_cursor(qdbp, ip, NULL, &qdbc, 0)) != 0)
+ goto err;
+ }
+ msgsz = __REP_FILEINFO_SIZE + DB_FILE_ID_LEN + msgfp->pgsize +
+ msgfp->dir.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ goto err;
+ memset(&msgdbt, 0, sizeof(msgdbt));
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: file %d page %lu to %lu",
+ msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno));
+ memset(&repth, 0, sizeof(repth));
+ /*
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * pages in. We still need to initialize the throttle info
+ * because if we encounter a page larger than our entire bulk
+ * buffer, we need to send it as a singleton.
+ *
+ * Use a local var so that we don't need to worry if someone else
+ * turns on/off bulk in the middle of our call here.
+ */
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_PAGE)) != 0)
+ goto err;
+ REP_SYSTEM_LOCK(env);
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ repth.type = REP_PAGE;
+ repth.data_dbt = &msgdbt;
+ REP_SYSTEM_UNLOCK(env);
+
+ for (p = msgfp->pgno; p <= msgfp->max_pgno; p++) {
+ if (msgfp->type == (u_int32_t)DB_QUEUE && p != 0) {
+ /*
+ * If queue returns ENOENT, or if queue support is not configured,
+ * convert it into DB_PAGE_NOTFOUND. Queue might return
+ * ENOENT if an entire extent file does not exist in the
+ * "middle" of the database.
+ */
+#ifdef HAVE_QUEUE
+ if ((ret = __qam_fget(qdbc, &p, 0, &pagep)) == ENOENT)
+#endif
+ ret = DB_PAGE_NOTFOUND;
+ } else
+ ret = __memp_fget(mpf, &p, ip, NULL, 0, &pagep);
+ msgfp->pgno = p;
+ if (ret == DB_PAGE_NOTFOUND) {
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: PAGE_FAIL on page %lu",
+ (u_long)p));
+ if (rp->rep_version < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env,
+ rp->rep_version,
+ (__rep_fileinfo_v6_args *)msgfp,
+ buf, msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env,
+ rp->rep_version, msgfp, buf,
+ msgsz, &len);
+ if (ret != 0)
+ goto err;
+ LOG_SYSTEM_LOCK(env);
+ lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ DB_SET_DBT(msgdbt, buf, len);
+ (void)__rep_send_message(env, eid,
+ REP_PAGE_FAIL, &lsn, &msgdbt, 0, 0);
+ continue;
+ } else
+ ret = DB_NOTFOUND;
+ goto err;
+ } else if (ret != 0)
+ goto err;
+ else
+ DB_SET_DBT(msgfp->info, pagep, msgfp->pgsize);
+ len = 0;
+ /*
+ * Send along an indication of the byte order of this mpool
+ * page. Since mpool always keeps pages in the native byte
+ * order of the local environment, this is simply my
+ * environment's byte order.
+ *
+ * Since pages can be served from a variety of sites when using
+ * client-to-client synchronization, the receiving client needs
+ * to know the byte order of each page independently.
+ */
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ FLD_SET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN);
+ else
+ FLD_CLR(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: %lu, page lsn [%lu][%lu]", (u_long)p,
+ (u_long)pagep->lsn.file, (u_long)pagep->lsn.offset));
+ if (rp->rep_version < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env,
+ rp->rep_version,
+ (__rep_fileinfo_v6_args *)msgfp,
+ buf, msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, rp->rep_version,
+ msgfp, buf, msgsz, &len);
+ if (msgfp->type != (u_int32_t)DB_QUEUE || p == 0)
+ t_ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_UNCHANGED);
+#ifdef HAVE_QUEUE
+ else
+ /*
+ * We don't need an #else for HAVE_QUEUE here because if
+ * we're not compiled with queue, then we're guaranteed
+ * to have set REP_PAGE_FAIL above.
+ */
+ t_ret = __qam_fput(qdbc, p, pagep, qdbp->priority);
+#endif
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ DB_ASSERT(env, len <= msgsz);
+ DB_SET_DBT(msgdbt, buf, len);
+
+ LOG_SYSTEM_LOCK(env);
+ repth.lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk,
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &msgdbt, 0);
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env, eid, &repth, 0, 0);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: %lu, lsn [%lu][%lu]", (u_long)p,
+ (u_long)repth.lsn.file, (u_long)repth.lsn.offset));
+ /*
+ * If we have REP_PAGE_MORE we need to break this loop.
+ * Otherwise, with REP_PAGE, we keep going.
+ */
+ if (repth.type == REP_PAGE_MORE || ret != 0) {
+ /* Ignore send failure, except to break the loop. */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+ break;
+ }
+ }
+
+err:
+ /*
+ * We're done; force out whatever remains in the bulk buffer and
+ * free it.
+ */
+ if (use_bulk && bulk.addr != NULL &&
+ (t_ret = __rep_bulk_free(env, &bulk, 0)) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+ if (qdbc != NULL && (t_ret = __dbc_close(qdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (opened && (t_ret = __db_close(qdbp, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
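+
+/*
+ * Illustrative note on the send decision in the page loop above: when bulk
+ * transfer is configured, each marshaled page is first offered to the bulk
+ * buffer via __rep_bulk_message; a DB_REP_BULKOVF return means the page
+ * cannot fit even in an empty bulk buffer, so it falls back to a throttled
+ * singleton send via __rep_send_throttle.
+ */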
+
+/*
+ * __rep_update_setup
+ * Process an UPDATE message and set up internal init with its file information.
+ *
+ * PUBLIC: int __rep_update_setup __P((ENV *, int, __rep_control_args *,
+ * PUBLIC: DBT *, time_t, DB_LSN *));
+ */
+int
+__rep_update_setup(env, eid, rp, rec, savetime, lsn)
+ ENV *env;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+ time_t savetime;
+ DB_LSN *lsn;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ __rep_update_args *rup;
+ DB_LSN verify_lsn;
+ int clientdb_locked, *origbuf, ret;
+ u_int32_t count, size;
+ u_int8_t *end, *next;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ clientdb_locked = 0;
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ verify_lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ if (rep->sync_state != SYNC_UPDATE || IN_ELECTION(rep)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ rep->sync_state = SYNC_OFF;
+
+ if ((ret = __rep_update_unmarshal(env, rp->rep_version,
+ &rup, rec->data, rec->size, &next)) != 0) {
+ /* Do not return with REP_SYSTEM_LOCK still held. */
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+ DB_ASSERT(env, next == FIRST_FILE_PTR((u_int8_t*)rec->data));
+
+ /*
+ * If we're doing an abbreviated internal init, it's because we found a
+ * sync point but we needed to materialize any NIMDBs. However, if we
+ * now see that there are no NIMDBs we can just skip to verify_match,
+ * just as we would have done if we had already loaded the NIMDBs. In
+ * other words, if there are no NIMDBs, then I can trivially say that
+ * I've already loaded all of them! The whole abbreviated internal init
+ * turns out not to have been necessary after all.
+ */
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ count = rup->num_files;
+ end = &((u_int8_t*)rec->data)[rec->size];
+ size = (u_int32_t)(end - next);
+ if ((ret = __rep_walk_filelist(env, rp->rep_version,
+ next, size, count, __rep_find_inmem, NULL)) == 0) {
+ /*
+ * Not found: there are no NIMDBs on the list. Revert
+ * to VERIFY state, so that we can pick up where we left
+ * off, except that from now on (i.e., future master
+ * changes) we can skip checking for NIMDBs if we find a
+ * sync point.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "UPDATE msg reveals no NIMDBs"));
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ rep->sync_state = SYNC_VERIFY;
+ F_CLR(rep, REP_F_ABBREVIATED);
+ ret = __rep_notify_threads(env, AWAIT_NIMDB);
+
+ REP_SYSTEM_UNLOCK(env);
+ if (ret == 0 && (ret = __rep_verify_match(env,
+ &verify_lsn, savetime)) == DB_REP_WOULDROLLBACK)
+ *lsn = verify_lsn;
+ __os_free(env, rup);
+ return (ret);
+ } else if (ret != DB_KEYEXIST)
+ goto err;
+ }
+
+ /*
+ * We know we're the first to come in here due to the
+ * SYNC_UPDATE state.
+ */
+ rep->sync_state = SYNC_PAGE;
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * We do not clear REP_LOCKOUT_* in this code.
+ * We'll eventually call the normal __rep_verify_match recovery
+ * code and that will clear all the flags and allow others to
+ * proceed. We lockout both the messages and API here.
+ * We lockout messages briefly because we are about to reset
+ * all our LSNs and we do not want another thread possibly
+ * using/needing those. We have to lockout the API for
+ * the duration of internal init.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto err;
+
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto err;
+ /*
+ * We need to update the timestamp and kill any open handles
+ * on this client. The files are changing completely.
+ */
+ (void)time(&renv->rep_timestamp);
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->ready_lsn);
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ ZERO_LSN(lp->max_perm_lsn);
+ if (db_rep->rep_db == NULL)
+ ret = __rep_client_dbinit(env, 0, REP_DB);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0)
+ goto err_nolock;
+
+ /*
+ * We need to empty out any old log records that might be in the
+ * temp database.
+ */
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &count)) != 0)
+ goto err_nolock;
+ STAT_SET(env,
+ rep, log_queued, rep->stat.st_log_queued, 0, &lp->ready_lsn);
+
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ /*
+ * For an abbreviated internal init, the place from which we'll
+ * want to request master's logs after (NIMDB) pages are loaded
+ * is precisely the sync point we found during VERIFY. We'll
+ * roll back to there in a moment.
+ *
+ * We don't need first_vers, because it's only used with
+ * __log_newfile, which only happens with non-ABBREVIATED
+ * internal init.
+ */
+ rep->first_lsn = verify_lsn;
+ } else {
+ /*
+ * We will remove all logs we have so we need to request
+ * from the master's beginning.
+ */
+ rep->first_lsn = rup->first_lsn;
+ rep->first_vers = rup->first_vers;
+ }
+ rep->last_lsn = rp->lsn;
+ rep->nfiles = rup->num_files;
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update setup for %d files.", rep->nfiles));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update setup: First LSN [%lu][%lu].",
+ (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update setup: Last LSN [%lu][%lu]",
+ (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset));
+
+ if (rep->nfiles > 0) {
+ rep->infoversion = rp->rep_version;
+ rep->originfolen = rep->infolen =
+ rec->size - __REP_UPDATE_SIZE;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ ret = __env_alloc(infop, (size_t)rep->infolen, &origbuf);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0)
+ goto err;
+ else
+ rep->originfo_off = R_OFFSET(infop, origbuf);
+ memcpy(R_ADDR(infop, rep->originfo_off),
+ FIRST_FILE_PTR((u_int8_t*)rec->data), rep->infolen);
+ }
+
+ /*
+ * Clear the decks to make room for the logs and databases that we will
+ * request as part of this internal init. For a normal, full internal
+ * init, that means all logs and databases. For an abbreviated internal
+ * init, it means only the NIMDBs, and only that portion of the log
+ * after the sync point.
+ */
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ /*
+ * Note that in order to pare the log back to the sync point, we
+ * can't just crudely hack it off there. We need to make sure
+ * that pages in regular databases get rolled back to a state
+ * consistent with that sync point. So we have to do a real
+ * recovery step.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Will roll back for abbreviated internal init"));
+ if ((ret = __rep_rollback(env, &rep->first_lsn)) != 0) {
+ if (ret == DB_REP_WOULDROLLBACK) {
+ DB_ASSERT(env, LOG_COMPARE(&rep->first_lsn,
+ &verify_lsn) == 0);
+ *lsn = verify_lsn;
+ }
+ goto err;
+ }
+ ret = __rep_remove_nimdbs(env);
+ } else
+ ret = __rep_remove_all(env, rp->rep_version, rec);
+ if (ret != 0)
+ goto err;
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ clientdb_locked = 1;
+ REP_SYSTEM_LOCK(env);
+ rep->curfile = 0;
+ ret = __rep_nextfile(env, eid, rep);
+ if (ret != 0)
+ goto err;
+
+ if (0) {
+err_nolock: REP_SYSTEM_LOCK(env);
+ }
+
+err: /*
+ * If we get an error, we cannot leave ourselves in the SYNC_PAGE
+ * state because we have no file information. That also means undoing
+ * the rep_lockout. We need to move back to the SYNC_UPDATE stage.
+ * In the non-error path, we will have already cleared LOCKOUT_MSG,
+ * but it doesn't hurt to clear it again.
+ */
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ if (ret != 0) {
+ if (rep->originfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->originfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->originfo_off = INVALID_ROFF;
+ }
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update_setup: Error: Clear PAGE, set UPDATE again. %s",
+ db_strerror(ret)));
+ rep->sync_state = SYNC_UPDATE;
+ CLR_LOCKOUT_BDB(rep);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (clientdb_locked)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ __os_free(env, rup);
+ return (ret);
+}
+
+static int
+__rep_find_inmem(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(unused, NULL);
+
+ return (FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? DB_KEYEXIST : 0);
+}
+
+/*
+ * Removes any currently existing NIMDBs. We do this at the beginning of
+ * abbreviated internal init, when any existing NIMDBs should be intact, so
+ * walk_dir should produce reliable results.
+ */
+static int
+__rep_remove_nimdbs(env)
+ ENV *env;
+{
+ FILE_LIST_CTX context;
+ int ret;
+
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ return (ret);
+ context.size = MEGABYTE;
+ context.count = 0;
+ context.fillptr = context.buf;
+ context.version = DB_REPVERSION;
+
+ /* NB: "NULL" asks walk_dir to consider only in-memory DBs */
+ if ((ret = __rep_walk_dir(env, NULL, NULL, &context)) != 0)
+ goto out;
+
+ if ((ret = __rep_closefiles(env)) != 0)
+ goto out;
+
+ ret = __rep_walk_filelist(env, context.version, context.buf,
+ context.size, context.count, __rep_remove_file, NULL);
+
+out:
+ __os_free(env, context.buf);
+ return (ret);
+}
+
+/*
+ * Removes all existing logs and databases, at the start of internal init. But
+ * before we do, write a list of the databases onto the init file, so that in
+ * case we crash in the middle, we'll know how to resume when we restart.
+ * Finally, also write into the init file the UPDATE message from the master (in
+ * the "rec" DBT), which includes the (new) list of databases we intend to
+ * request copies of (again, so that we know what to do if we crash in the
+ * middle).
+ *
+ * For the sake of simplicity, these database lists are in the form of an UPDATE
+ * message (since we already have the mechanisms in place), even though strictly
+ * speaking that contains more information than we really need to store.
+ *
+ * !!! Must be called with the REP_SYSTEM_LOCK held.
+ */
+static int
+__rep_remove_all(env, msg_version, rec)
+ ENV *env;
+ u_int32_t msg_version;
+ DBT *rec;
+{
+ FILE_LIST_CTX context;
+ __rep_update_args u_args;
+ DB_FH *fhp;
+ DB_REP *db_rep;
+#ifdef HAVE_REPLICATION_THREADS
+ DBT dbt;
+#endif
+ REP *rep;
+ size_t cnt, updlen;
+ u_int32_t bufsz, fvers, mvers, zero;
+ int ret, t_ret;
+ char *fname;
+
+ fname = NULL;
+ fhp = NULL;
+#ifdef HAVE_REPLICATION_THREADS
+ dbt.data = NULL;
+#endif
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * 1. Get list of databases currently present at this client, which we
+ * intend to remove.
+ */
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ return (ret);
+ context.size = MEGABYTE;
+ context.count = 0;
+ context.version = DB_REPVERSION;
+
+ /* Reserve space for the marshaled update_args. */
+ context.fillptr = FIRST_FILE_PTR(context.buf);
+
+ if ((ret = __rep_find_dbs(env, &context)) != 0)
+ goto out;
+ ZERO_LSN(u_args.first_lsn);
+ u_args.first_vers = 0;
+ u_args.num_files = context.count;
+ if ((ret = __rep_update_marshal(env, DB_REPVERSION,
+ &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0)
+ goto out;
+ DB_ASSERT(env, updlen == __REP_UPDATE_SIZE);
+
+ /*
+ * 2. Before removing anything, safe-store the database list, so that in
+ * case we crash before we've removed them all, when we restart we
+ * can clean up what we were doing. Only write the database list to
+ * a file if not running in-memory replication.
+ *
+ * The original version of the file contains:
+ * data1 size (4 bytes)
+ * data1
+ * data2 size (possibly) (4 bytes)
+ * data2 (possibly)
+ *
+ * As of 4.7 the file has the following form:
+ * 0 (4 bytes - to indicate a new style file)
+ * file version (4 bytes)
+ * data1 version (4 bytes)
+ * data1 size (4 bytes)
+ * data1
+ * data2 version (possibly) (4 bytes)
+ * data2 size (possibly) (4 bytes)
+ * data2 (possibly)
+ */
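+ /*
+ * Illustrative only: the 4.7-and-later layout above written out as
+ * annotated fields (the file is a sequence of raw __os_write calls,
+ * not a packed struct):
+ *
+ * u_int32_t zero; 0, marks a new-style file
+ * u_int32_t fvers; file version (REP_INITVERSION)
+ * u_int32_t mvers1; message version of data1
+ * u_int32_t size1; byte count of data1
+ * u_int8_t data1[size1]; current database list
+ * u_int32_t mvers2; message version of data2 (if any)
+ * u_int32_t size2; byte count of data2 (if any)
+ * u_int8_t data2[size2]; master's UPDATE message (if any)
+ */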
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_INITNAME, NULL, &fname)) != 0)
+ goto out;
+ /* Sanity check that the write size fits into 32 bits. */
+ DB_ASSERT(env, (size_t)(context.fillptr - context.buf) ==
+ (u_int32_t)(context.fillptr - context.buf));
+ bufsz = (u_int32_t)(context.fillptr - context.buf);
+
+ /*
+ * (Short writes aren't possible, so we don't have to verify
+ * 'cnt'.) This first list is generated internally, so it is
+ * always in the form of the current message version.
+ */
+ zero = 0;
+ fvers = REP_INITVERSION;
+ mvers = DB_REPVERSION;
+ if ((ret = __os_open(env, fname, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &zero, sizeof(zero), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &fvers, sizeof(fvers), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &bufsz, sizeof(bufsz), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, context.buf, bufsz, &cnt)) != 0 ||
+ (ret = __os_fsync(env, fhp)) != 0) {
+ __db_err(env, ret, "%s", fname);
+ goto out;
+ }
+ }
+
+ /*
+ * 3. Go ahead and remove logs and databases. The databases get removed
+ * according to the list we just finished safe-storing.
+ *
+ * Clearing NIMDBS_LOADED might not really be necessary, since once
+ * we've committed to removing all there's no chance of doing an
+ * abbreviated internal init. This just keeps us honest.
+ */
+ if ((ret = __rep_remove_logs(env)) != 0)
+ goto out;
+ if ((ret = __rep_closefiles(env)) != 0)
+ goto out;
+ F_CLR(rep, REP_F_NIMDBS_LOADED);
+ if ((ret = __rep_walk_filelist(env, context.version,
+ FIRST_FILE_PTR(context.buf), context.size,
+ context.count, __rep_remove_file, NULL)) != 0)
+ goto out;
+
+ /*
+ * 4. Safe-store the (new) list of database files we intend to copy from
+ * the master (again, so that in case we crash before we're finished
+ * doing so, we'll have enough information to clean up and start over
+ * again). This list is the list from the master, so it uses
+ * the message version. Only write to the file if not running
+ * in-memory replication.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ mvers = msg_version;
+ if ((ret =
+ __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &rec->size, sizeof(rec->size), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, rec->data, rec->size, &cnt)) != 0 ||
+ (ret = __os_fsync(env, fhp)) != 0) {
+ __db_err(env, ret, "%s", fname);
+ goto out;
+ }
+#ifdef HAVE_REPLICATION_THREADS
+ /* Invite repmgr to save any info it needs. */
+ if ((ret = __repmgr_init_save(env, &dbt)) != 0)
+ goto out;
+ if (dbt.size > 0 &&
+ ((ret = __os_write(env, fhp,
+ &dbt.size, sizeof(dbt.size), &cnt)) != 0 ||
+ (ret = __os_write(env, fhp,
+ dbt.data, dbt.size, &cnt)) != 0))
+ goto out;
+#endif
+ }
+
+out:
+#ifdef HAVE_REPLICATION_THREADS
+ if (dbt.data != NULL)
+ __os_free(env, dbt.data);
+#endif
+ if (fhp != NULL && (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fname != NULL)
+ __os_free(env, fname);
+ __os_free(env, context.buf);
+ return (ret);
+}
+
+/*
+ * __rep_remove_logs -
+ * Remove our logs to prepare for internal init.
+ */
+static int
+__rep_remove_logs(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ LOG *lp;
+ u_int32_t fnum, lastfile;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+ /*
+ * Call memp_sync to flush any pages that might be in the log buffers
+ * and not on disk before we remove files on disk. If there were no
+ * dirty pages, the log isn't flushed. Yet the log buffers could still
+ * be dirty: __log_flush should take care of this rare situation.
+ */
+ if ((ret = __memp_sync_int(env,
+ NULL, 0, DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ return (ret);
+ if ((ret = __log_flush(env, NULL)) != 0)
+ return (ret);
+ /*
+ * Forcibly remove existing log files or reset
+ * the in-memory log space.
+ */
+ if (lp->db_log_inmemory) {
+ ZERO_LSN(lsn);
+ if ((ret = __log_zero(env, &lsn)) != 0)
+ return (ret);
+ } else {
+ lastfile = lp->lsn.file;
+ for (fnum = 1; fnum <= lastfile; fnum++) {
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0)
+ return (ret);
+ (void)time(&lp->timestamp);
+ (void)__os_unlink(env, name, 0);
+ __os_free(env, name);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Removes a file during internal init. Assumes underlying subsystems are
+ * active; therefore, this can't be used for internal init crash recovery.
+ */
+static int
+__rep_remove_file(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ DB *dbp;
+#ifdef HAVE_QUEUE
+ DB_THREAD_INFO *ip;
+#endif
+ char *name;
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+ dbp = NULL;
+ name = rfp->info.data;
+
+ /*
+ * Calling __fop_remove will both purge any matching
+ * fileid from mpool and unlink it on disk.
+ */
+#ifdef HAVE_QUEUE
+ /*
+ * Handle queue separately. __fop_remove will not
+ * remove extent files. Use __qam_remove to remove
+ * extent files that might exist under this name. Note that
+ * in-memory queue databases can't have extent files.
+ */
+ if (rfp->type == (u_int32_t)DB_QUEUE &&
+ !FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * At present, qam_remove expects the passed-in dbp to have a
+ * locker allocated, and if not, db_open allocates a locker
+ * which qam_remove then leaks.
+ *
+ * TODO: it would be better to avoid cobbling together this
+ * sequence of low-level operations, if fileops provided some
+ * API to allow us to remove a database without write-locking
+ * its handle.
+ */
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto out;
+
+ ENV_GET_THREAD_INFO(env, ip);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "QAM: Unlink %s via __qam_remove", name));
+ if ((ret = __qam_remove(dbp, ip, NULL, name, NULL, 0)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "qam_remove returned %d", ret));
+ goto out;
+ }
+ }
+#endif
+ /*
+ * We call fop_remove even if we've called qam_remove.
+ * That will only have removed extent files. Now
+ * we need to deal with the actual file itself.
+ */
+ if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ MAKE_INMEM(dbp);
+ F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
+ ret = __db_inmem_remove(dbp, NULL, name);
+ } else if ((ret = __fop_remove(env,
+ NULL, rfp->uid.data, name, (const char **)&rfp->dir.data,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA, 0)) != 0)
+ /*
+ * If fop_remove fails, it could be because
+ * the client has a different data_dir
+ * structure than the master. Retry with the
+ * local, default settings.
+ */
+ ret = __fop_remove(env,
+ NULL, rfp->uid.data, name, NULL,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA, 0);
+#ifdef HAVE_QUEUE
+out:
+#endif
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_bulk_page
+ * Process a bulk page message.
+ *
+ * PUBLIC: int __rep_bulk_page __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_bulk_page(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ __rep_control_args tmprp;
+ __rep_bulk_args b_args;
+ int ret;
+ u_int8_t *p, *ep;
+
+ /*
+ * We're going to be modifying the rp LSN contents so make
+ * our own private copy to play with. We need to set the
+ * rectype to REP_PAGE because we're calling through __rep_page
+ * to process each page, and lower functions make decisions
+ * based on the rectypes (for throttling/gap processing).
+ */
+ memcpy(&tmprp, rp, sizeof(tmprp));
+ tmprp.rectype = REP_PAGE;
+ ret = 0;
+ for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data;
+ p < ep;) {
+ /*
+ * First thing in the buffer is the length. Then the LSN
+ * of this page, then the page info itself.
+ */
+ if ((ret = __rep_bulk_unmarshal(env,
+ &b_args, p, rec->size, &p)) != 0)
+ return (ret);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_bulk_page: Processing LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_bulk_page: p %#lx ep %#lx pgrec data %#lx, size %lu (%#lx)",
+ P_TO_ULONG(p), P_TO_ULONG(ep),
+ P_TO_ULONG(b_args.bulkdata.data),
+ (u_long)b_args.bulkdata.size,
+ (u_long)b_args.bulkdata.size));
+ /*
+ * Now send the page info DBT to the page processing function.
+ */
+ ret = __rep_page(env, ip, eid, &tmprp, &b_args.bulkdata);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_bulk_page: rep_page ret %d", ret));
+
+ /*
+ * If this set of pages is already done, just return.
+ */
+ if (ret != 0) {
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __rep_page
+ * Process a page message. This handles any page-related
+ * message: REP_PAGE, REP_PAGE_FAIL and REP_PAGE_MORE.
+ *
+ * PUBLIC: int __rep_page __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ DB_REP *db_rep;
+ DBT key, data;
+ REP *rep;
+ __rep_fileinfo_args *msgfp, msgf;
+ __rep_fileinfo_v6_args *msgfpv6;
+ db_recno_t recno;
+ int ret;
+ char *msg;
+ void *msgfree;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->sync_state != SYNC_PAGE)
+ return (DB_REP_PAGEDONE);
+
+ if (rp->rectype == REP_PAGE_FAIL)
+ msg = "PAGE_FAIL";
+ else if (rp->rectype == REP_PAGE_MORE)
+ msg = "PAGE_MORE";
+ else
+ msg = "PAGE";
+ /*
+ * If we restarted internal init, it is possible to receive
+ * an old REP_PAGE message, while we're in the current
+ * stage of recovering pages. Until we have some sort of
+ * an init generation number, ignore any message that has
+ * a message LSN that is before this internal init's first_lsn.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->first_lsn) < 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Old page: msg LSN [%lu][%lu] first_lsn [%lu][%lu]",
+ msg, (u_long)rp->lsn.file, (u_long)rp->lsn.offset,
+ (u_long)rep->first_lsn.file,
+ (u_long)rep->first_lsn.offset));
+ return (DB_REP_PAGEDONE);
+ }
+ if (rp->rep_version < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version,
+ &msgfpv6, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args));
+ msgf.dir.data = NULL;
+ msgf.dir.size = 0;
+ msgfp = &msgf;
+ msgfree = msgfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+ &msgfp, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ msgfree = msgfp;
+ }
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check if the world changed.
+ */
+ if (rep->sync_state != SYNC_PAGE) {
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Received page %lu from file %d",
+ msg, (u_long)msgfp->pgno, msgfp->filenum));
+ /*
+ * Check if this page is from the file we're expecting.
+ * This may be an old or delayed page message.
+ */
+ /*
+ * !!!
+ * If we allow dbrename/dbremove on the master while a client
+ * is updating, then we'd have to verify the file's uid here too.
+ */
+ if (msgfp->filenum != rep->curfile) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Msg file %d != curfile %d",
+ msgfp->filenum, rep->curfile));
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+ /*
+ * We want to create/open our dbp to the database
+ * where we'll keep our page information.
+ */
+ if ((ret = __rep_client_dbinit(env, 1, REP_PG)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Client_dbinit %s", msg, db_strerror(ret)));
+ goto err;
+ }
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ recno = (db_recno_t)(msgfp->pgno + 1);
+ key.data = &recno;
+ key.ulen = key.size = sizeof(db_recno_t);
+ key.flags = DB_DBT_USERMEM;
+
+ /*
+ * If we already have this page, then we don't want to bother
+ * rewriting it into the file. Otherwise, we want to return
+ * any other error.
+ */
+ ret = __db_put(db_rep->file_dbp, ip, NULL, &key, &data, DB_NOOVERWRITE);
+ if (ret == DB_KEYEXIST) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Received duplicate page %lu from file %d",
+ msg, (u_long)msgfp->pgno, msgfp->filenum));
+ STAT(rep->stat.st_pg_duplicated++);
+ PERFMON4(env, rep, pg_duplicated, eid,
+ msgfp->pgno, msgfp->filenum, rep->stat.st_pg_duplicated);
+ ret = 0;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+ * We put the page in the database file itself.
+ */
+ if (rp->rectype != REP_PAGE_FAIL) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Write page %lu into mpool", msg, (u_long)msgfp->pgno));
+ if ((ret = __rep_write_page(env, ip, rep, msgfp)) != 0) {
+ /*
+ * We got an error storing the page; therefore, we need
+ * to remove this page marker from the page database too.
+ * !!!
+ * I'm ignoring errors from the delete because we want
+ * to return the original error. If we cannot write the
+ * page and we cannot delete the item we just put,
+ * what should we do? Panic the env and return
+ * DB_RUNRECOVERY?
+ */
+ (void)__db_del(db_rep->file_dbp, NULL, NULL, &key, 0);
+ goto err;
+ }
+ }
+ STAT_INC(env, rep, pg_record, rep->stat.st_pg_records, eid);
+ rep->npages++;
+
+ /*
+ * Now check the LSN on the page and save it if it is later
+ * than the one we have.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->last_lsn) > 0)
+ rep->last_lsn = rp->lsn;
+
+ /*
+ * We've successfully written the page. Now we need to see if
+ * we're done with this file. __rep_filedone will check if we
+ * have all the pages expected and if so, set up for the next
+ * file and send out a page request for the next file's pages.
+ */
+ ret = __rep_filedone(env, ip, eid, rep, msgfp, rp->rectype);
+
+err: REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ __os_free(env, msgfree);
+ return (ret);
+}
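+
+/*
+ * Illustrative only: the gap-tracking database used above stores one
+ * record per page received, keyed by record number. Recnos are 1-based
+ * while page numbers are 0-based, so pgno N is stored under recno N + 1
+ * (see the recno computation above), and __rep_page_gap subtracts 1 when
+ * converting a key back into a page number.
+ */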
+
+/*
+ * __rep_write_page -
+ * Write this page into a database.
+ */
+static int
+__rep_write_page(env, ip, rep, msgfp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+{
+ DB db;
+ DBT pgcookie;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO *pginfo;
+ DB_REP *db_rep;
+ REGINFO *infop;
+ __rep_fileinfo_args *rfp;
+ int ret;
+ void *dst;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ rfp = NULL;
+
+ /*
+ * If this is the first page we're putting in this database, we need
+ * to create the mpool file. Otherwise call memp_fget to create the
+ * page in mpool. Then copy the data to the page, and memp_fput the
+ * page to give it back to mpool.
+ *
+ * We need to create the file, remove any existing file and associate
+ * the correct file ID with the new one.
+ */
+ GET_CURINFO(rep, infop, rfp);
+ if (db_rep->file_mpf == NULL) {
+ if (!FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ /*
+ * Recreate the file on disk. We'll be putting
+ * the data into the file via mpool. System
+ * databases should go into the environment
+ * directory, not the data directory.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_write_page: Calling fop_create for %s",
+ (char *)rfp->info.data));
+ if ((ret = __fop_create(env, NULL, NULL,
+ rfp->info.data, (const char **)&rfp->dir.data,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA, env->db_mode, 0)) != 0) {
+ /*
+ * If fop_create fails, it could be because
+ * the client has a different data_dir
+ * structure than the master. Retry with the
+ * local, default settings.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_write_page: fop_create ret %d. Retry for %s, master datadir %s",
+ ret, (char *)rfp->info.data,
+ rfp->dir.data == NULL ? "NULL" :
+ (char *)rfp->dir.data));
+ if ((ret = __fop_create(env, NULL, NULL,
+ rfp->info.data, NULL,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA,
+ env->db_mode, 0)) != 0)
+ goto err;
+ }
+ }
+
+ if ((ret =
+ __rep_mpf_open(env, &db_rep->file_mpf, rfp,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ?
+ DB_CREATE : 0)) != 0)
+ goto err;
+ }
+ /*
+ * Handle queue specially. If we're a QUEUE database, we need to
+ * use the __qam_fget/put calls. We need to use db_rep->queue_dbc for
+ * that. That dbp is opened after getting the metapage for the
+ * queue database. Since the meta-page is always in the queue file,
+ * we'll use the normal path for that first page. After that we
+ * can assume the dbp is opened.
+ */
+ if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0) {
+#ifdef HAVE_QUEUE
+ ret = __qam_fget(db_rep->queue_dbc, &msgfp->pgno,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst);
+#else
+ /*
+ * This always returns an error.
+ */
+ ret = __db_no_queue_am(env);
+#endif
+ } else
+ ret = __memp_fget(db_rep->file_mpf, &msgfp->pgno, ip, NULL,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst);
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Before writing this page into our local mpool, see if its byte order
+ * needs to be swapped. When in mpool the page should be in the native
+ * byte order of our local environment. But the page image we've
+ * received may be in the opposite order (as indicated in finfo_flags).
+ */
+ if ((F_ISSET(env, ENV_LITTLEENDIAN) &&
+ !FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN)) ||
+ (!F_ISSET(env, ENV_LITTLEENDIAN) &&
+ FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN))) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "write_page: Page %d needs to be swapped", msgfp->pgno));
+ /*
+ * Set up a dbp to pass into the swap functions. We need
+ * only a few things: The environment and any special
+ * dbp flags and some obvious basics like db type and
+ * pagesize. Those flags were set back in rep_mpf_open
+ * and are available in the pgcookie set up with the
+ * mpoolfile associated with this database.
+ */
+ memset(&db, 0, sizeof(db));
+ db.env = env;
+ db.type = (DBTYPE)msgfp->type;
+ db.pgsize = msgfp->pgsize;
+ mpf = db_rep->file_mpf;
+ if ((ret = __memp_get_pgcookie(mpf, &pgcookie)) != 0)
+ goto err;
+ pginfo = (DB_PGINFO *)pgcookie.data;
+ db.flags = pginfo->flags;
+ if ((ret = __db_pageswap(env,
+ &db, msgfp->info.data, msgfp->pgsize, NULL, 1)) != 0)
+ goto err;
+ }
+
+ memcpy(dst, msgfp->info.data, msgfp->pgsize);
+#ifdef HAVE_QUEUE
+ if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0)
+ ret = __qam_fput(db_rep->queue_dbc,
+ msgfp->pgno, dst, db_rep->queue_dbc->priority);
+ else
+#endif
+ ret = __memp_fput(db_rep->file_mpf,
+ ip, dst, db_rep->file_dbp->priority);
+
+err: return (ret);
+}
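+
+/*
+ * Illustrative only: the swap test in __rep_write_page, restated as a
+ * predicate. A received page image needs swapping exactly when the
+ * sender's byte order (recorded in finfo_flags) differs from this
+ * environment's; the helper name is hypothetical.
+ */
+#if 0
+static int
+example_page_needs_swap(env, msgfp)
+ ENV *env;
+ __rep_fileinfo_args *msgfp;
+{
+ int local_le, page_le;
+
+ local_le = F_ISSET(env, ENV_LITTLEENDIAN) ? 1 : 0;
+ page_le =
+ FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN) ? 1 : 0;
+ return (local_le != page_le);
+}
+#endif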
+
+/*
+ * __rep_page_gap -
+ * After we've put the page into the database, we need to check if
+ * we have a page gap and whether we need to request pages.
+ */
+static int
+__rep_page_gap(env, rep, msgfp, type)
+ ENV *env;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+ u_int32_t type;
+{
+ DBC *dbc;
+ DBT data, key;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGINFO *infop;
+ __rep_fileinfo_args *rfp;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ ret = 0;
+ dbc = NULL;
+
+ /*
+ * We've successfully put this page into our file.
+ * Now we need to account for it and re-request new pages
+ * if necessary.
+ */
+ /*
+ * We already hold both the db mutex and rep mutex.
+ */
+ GET_CURINFO(rep, infop, rfp);
+
+ /*
+ * Make sure we're still talking about the same file.
+ * If not, we're done here.
+ */
+ if (rfp->filenum != msgfp->filenum) {
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+
+ /*
+ * We have 3 possible states:
+ * 1. We receive a page we already have accounted for.
+ * msg pgno < ready pgno
+ * 2. We receive a page that is beyond a gap.
+ * msg pgno > ready pgno
+ * 3. We receive the page we're expecting next.
+ * msg pgno == ready pgno
+ */
+ /*
+ * State 1. This can happen once we put our page record into the
+ * database, but by the time we acquire the mutex other
+ * threads have already accounted for this page and moved on.
+ * We just want to return.
+ */
+ if (msgfp->pgno < rep->ready_pg) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: pgno %lu < ready %lu, waiting %lu",
+ (u_long)msgfp->pgno, (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg));
+ goto err;
+ }
+
+ /*
+ * State 2. This page is beyond the page we're expecting.
+ * We need to update waiting_pg if this page is earlier
+ * (less) than the current waiting_pg. There is nothing
+ * to do but see if we need to request.
+ */
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: pgno %lu, max_pg %lu ready %lu, waiting %lu max_wait %lu",
+ (u_long)msgfp->pgno, (u_long)rfp->max_pgno, (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg, (u_long)rep->max_wait_pg));
+ if (msgfp->pgno > rep->ready_pg) {
+ /*
+ * We received a page beyond the one we're expecting.
+ */
+ __os_gettime(env, &rep->last_pg_ts, 1);
+ if (rep->waiting_pg == PGNO_INVALID ||
+ msgfp->pgno < rep->waiting_pg)
+ rep->waiting_pg = msgfp->pgno;
+ } else {
+ /*
+ * We received the page we're expecting.
+ */
+ rep->ready_pg++;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ if (rep->ready_pg == rep->waiting_pg) {
+ /*
+ * If we get here we know we just filled a gap.
+ * Move the cursor to that place and then walk
+ * forward looking for the next gap, if it exists.
+ * Similar to log gaps, if we fill a gap we want to
+ * request the next gap right away if it has been
+ * a while since we last received a later page.
+ */
+ lp->rcvd_ts = rep->last_pg_ts;
+ lp->wait_ts = rep->request_gap;
+ rep->max_wait_pg = PGNO_INVALID;
+ /*
+ * We need to walk the recno database looking for the
+ * next page we need or expect.
+ */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_cursor(db_rep->file_dbp, ip, NULL,
+ &dbc, 0)) != 0)
+ goto err;
+ /*
+ * Set cursor to the first waiting page.
+ * Page numbers/record numbers are offset by 1.
+ */
+ recno = (db_recno_t)rep->waiting_pg + 1;
+ key.data = &recno;
+ key.ulen = key.size = sizeof(db_recno_t);
+ key.flags = DB_DBT_USERMEM;
+ /*
+ * We know that page is there; this should
+ * find the record.
+ */
+ ret = __dbc_get(dbc, &key, &data, DB_SET);
+ if (ret != 0)
+ goto err;
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: Set cursor for ready %lu, waiting %lu",
+ (u_long)rep->ready_pg, (u_long)rep->waiting_pg));
+ }
+ while (ret == 0 && rep->ready_pg == rep->waiting_pg) {
+ rep->ready_pg++;
+ ret = __dbc_get(dbc, &key, &data, DB_NEXT);
+ /*
+ * If we get to the end of the list, there are no
+ * more gaps. Reset waiting_pg.
+ */
+ if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) {
+ rep->waiting_pg = PGNO_INVALID;
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: Next cursor No next - ready %lu, waiting %lu",
+ (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg));
+ break;
+ }
+ /*
+ * Subtract 1 from waiting_pg because record numbers
+ * are 1-based and pages are 0-based, and we added 1
+ * to the page number when we put it into the db.
+ */
+ rep->waiting_pg = *(db_pgno_t *)key.data;
+ rep->waiting_pg--;
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: Next cursor ready %lu, waiting %lu",
+ (u_long)rep->ready_pg, (u_long)rep->waiting_pg));
+ }
+ }
+
+ /*
+ * If we filled a gap and now have the entire file, there's
+ * nothing to do. We're done when ready_pg is > max_pgno
+ * because ready_pg is larger than the last page we received.
+ */
+ if (rep->ready_pg > rfp->max_pgno)
+ goto err;
+
+ /*
+ * Check if we need to ask for more pages.
+ */
+ if ((rep->waiting_pg != PGNO_INVALID &&
+ rep->ready_pg != rep->waiting_pg) || type == REP_PAGE_MORE) {
+ /*
+ * We got a page but we may still be waiting for more.
+ * If we got REP_PAGE_MORE we always want to ask for more.
+ * We need to set rfp->pgno to the current page number,
+ * which we will use to ask for more pages.
+ */
+ if (type == REP_PAGE_MORE)
+ rfp->pgno = msgfp->pgno;
+ if ((__rep_check_doreq(env, rep) || type == REP_PAGE_MORE) &&
+ ((ret = __rep_pggap_req(env, rep, rfp,
+ (type == REP_PAGE_MORE) ? REP_GAP_FORCE : 0)) != 0))
+ goto err;
+ } else {
+ lp->wait_ts = rep->request_gap;
+ rep->max_wait_pg = PGNO_INVALID;
+ }
+
+err:
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_init_cleanup -
+ * Clean up internal initialization pieces.
+ *
+ * !!!
+ * Caller must hold client database mutex (mtx_clientdb) and REP_SYSTEM_LOCK.
+ *
+ * PUBLIC: int __rep_init_cleanup __P((ENV *, REP *, int));
+ */
+int
+__rep_init_cleanup(env, rep, force)
+ ENV *env;
+ REP *rep;
+ int force;
+{
+ DB *queue_dbp;
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+ /*
+ * 1. Close up the file data pointer we used.
+ * 2. Close/reset the page database.
+ * 3. Close/reset the queue database if we're forcing a cleanup.
+ * 4. Free current file info.
+ * 5. If we have all files or need to force, free original file info.
+ */
+ if (db_rep->file_mpf != NULL) {
+ ret = __memp_fclose(db_rep->file_mpf, 0);
+ db_rep->file_mpf = NULL;
+ }
+ if (db_rep->file_dbp != NULL) {
+ t_ret = __db_close(db_rep->file_dbp, NULL, DB_NOSYNC);
+ db_rep->file_dbp = NULL;
+ if (ret == 0)
+ ret = t_ret;
+ }
+ if (force && db_rep->queue_dbc != NULL) {
+ queue_dbp = db_rep->queue_dbc->dbp;
+ if ((t_ret = __dbc_close(db_rep->queue_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->queue_dbc = NULL;
+ if ((t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (rep->curinfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop, R_ADDR(infop, rep->curinfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->curinfo_off = INVALID_ROFF;
+ }
+ if (IN_INTERNAL_INIT(rep) && force) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "clean up interrupted internal init"));
+ t_ret = F_ISSET(rep, REP_F_ABBREVIATED) ?
+ __rep_walk_filelist(env, rep->infoversion,
+ R_ADDR(infop, rep->originfo_off), rep->originfolen,
+ rep->nfiles, __rep_cleanup_nimdbs, NULL) :
+ __rep_clean_interrupted(env);
+ if (ret == 0)
+ ret = t_ret;
+
+ if (rep->originfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->originfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->originfo_off = INVALID_ROFF;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Remove NIMDBs that may have been fully or partially loaded during an
+ * abbreviated internal init, when the init gets interrupted. At this point,
+ * we know that any databases we have processed are listed in originfo.
+ */
+static int
+__rep_cleanup_nimdbs(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ DB *dbp;
+ char *namep;
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+
+ ret = 0;
+ dbp = NULL;
+
+ if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ namep = rfp->info.data;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto out;
+ MAKE_INMEM(dbp);
+ F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
+
+ /*
+ * Some of these "files" (actually NIMDBs) may not exist
+ * yet, simply because the interrupted abbreviated
+ * internal init had not yet progressed far enough to
+ * retrieve them. So ENOENT is an acceptable outcome.
+ */
+ if ((ret = __db_inmem_remove(dbp, NULL, namep)) == ENOENT)
+ ret = 0;
+ if ((t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+out:
+ return (ret);
+}
+
+/*
+ * Clean up files involved in an interrupted internal init.
+ */
+static int
+__rep_clean_interrupted(env)
+ ENV *env;
+{
+ REP *rep;
+ DB_LOG *dblp;
+ LOG *lp;
+ REGINFO *infop;
+ int ret, t_ret;
+
+ rep = env->rep_handle->region;
+ infop = env->reginfo;
+
+ /*
+ * 1. logs
+ * a) remove old log files
+ * b) set up initial log file #1
+ * 2. database files
+ * 3. the "init file"
+ *
+ * Steps 1 and 2 can be attempted independently. Step 1b is
+ * dependent on successful completion of 1a.
+ */
+
+ /* Step 1a. */
+ if ((ret = __rep_remove_logs(env)) == 0) {
+ /*
+ * Since we have no logs, recover by making it look like
+ * the case when a new client first starts up, namely we
+ * have nothing but a fresh log file #1. This is a
+ * little wasteful, since we may soon remove this log
+ * file again. But it's insignificant in the context of
+ * interrupted internal init.
+ */
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Step 1b. */
+ ret = __rep_log_setup(env,
+ rep, 1, DB_LOGVERSION, &lp->ready_lsn);
+ }
+
+ /* Step 2. */
+ if ((t_ret = __rep_walk_filelist(env, rep->infoversion,
+ R_ADDR(infop, rep->originfo_off), rep->originfolen,
+ rep->nfiles, __rep_remove_by_list, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Step 3 must not be done if anything fails along the way, because the
+ * init file's raison d'etre is to show that some files remain to be
+ * cleaned up.
+ */
+ if (ret == 0)
+ ret = __rep_remove_init_file(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_filedone -
+ * We need to check if we're done with the current file after
+ * processing the current page. Stat the database to see if
+ * we have all the pages. If so, we need to clean up/close
+ * this one, set up for the next one, and ask for its pages,
+ * or if this is the last file, request the log records and
+ * move to the REP_RECOVER_LOG state.
+ */
+static int
+__rep_filedone(env, ip, eid, rep, msgfp, type)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+ u_int32_t type;
+{
+ REGINFO *infop;
+ __rep_fileinfo_args *rfp;
+ int ret;
+
+ /*
+ * We've put our page; now we need to do any gap processing
+ * that might be needed to re-request pages.
+ */
+ ret = __rep_page_gap(env, rep, msgfp, type);
+ /*
+ * The world changed while we were doing gap processing.
+ * We're done here.
+ */
+ if (ret == DB_REP_PAGEDONE)
+ return (0);
+
+ infop = env->reginfo;
+ GET_CURINFO(rep, infop, rfp);
+ /*
+ * max_pgno is 0-based and npages is 1-based, so we don't have
+ * all the pages until npages is > max_pgno.
+ */
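+ /*
+ * For example, max_pgno == 9 describes a 10-page file, so we
+ * are done only once npages reaches 10.
+ */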
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "FILEDONE: have %lu pages. Need %lu.",
+ (u_long)rep->npages, (u_long)rfp->max_pgno + 1));
+ if (rep->npages <= rfp->max_pgno)
+ return (0);
+
+ /*
+ * If this is a queue database and we think we have all the pages
+ * for this file, we need to do special queue processing. Queue is
+ * handled in several stages.
+ */
+ if (rfp->type == (u_int32_t)DB_QUEUE &&
+ ((ret = __rep_queue_filedone(env, ip, rep, rfp)) !=
+ DB_REP_PAGEDONE))
+ return (ret);
+ /*
+ * We have all the pages for this file. Clean up.
+ */
+ if ((ret = __rep_init_cleanup(env, rep, 0)) != 0)
+ goto err;
+
+ rep->curfile++;
+ ret = __rep_nextfile(env, eid, rep);
+err:
+ return (ret);
+}
+
+/*
+ * Starts requesting pages for the next file in the list (if any), or if not,
+ * proceeds to the next stage: requesting logs.
+ *
+ * !!!
+ * Must be called with both mtx_clientdb and REP_SYSTEM_LOCK held, though we
+ * may drop REP_SYSTEM_LOCK momentarily in order to send a LOG_REQ (but not a
+ * PAGE_REQ).
+ */
+static int
+__rep_nextfile(env, eid, rep)
+ ENV *env;
+ int eid;
+ REP *rep;
+{
+ DBT dbt;
+ __rep_logreq_args lr_args;
+ DB_LOG *dblp;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ __rep_fileinfo_args *curinfo, *rfp, rf;
+ __rep_fileinfo_v6_args *rfpv6;
+ int *curbuf, ret;
+ u_int8_t *buf, *info_ptr, lrbuf[__REP_LOGREQ_SIZE], *nextinfo;
+ size_t len, msgsz;
+ void *rffree;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ rfp = NULL;
+
+ /*
+ * Always direct the next request to the master (at least nominally),
+ * regardless of where the current response came from. The application
+ * can always still redirect it to another client.
+ */
+ if (rep->master_id != DB_EID_INVALID)
+ eid = rep->master_id;
+
+ while (rep->curfile < rep->nfiles) {
+ /* Set curinfo to next file and examine it. */
+ info_ptr = R_ADDR(infop,
+ rep->originfo_off + (rep->originfolen - rep->infolen));
+ if (rep->infoversion < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env,
+ rep->infoversion, &rfpv6,
+ info_ptr, rep->infolen, &nextinfo)) != 0)
+ return (ret);
+ memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args));
+ rf.dir.data = NULL;
+ rf.dir.size = 0;
+ rfp = &rf;
+ rffree = rfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env,
+ rep->infoversion, &rfp, info_ptr,
+ rep->infolen, &nextinfo)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "NEXTINFO: Fileinfo read: %s",
+ db_strerror(ret)));
+ return (ret);
+ }
+ rffree = rfp;
+ }
+ rep->infolen -= (u_int32_t)(nextinfo - info_ptr);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ ret = __env_alloc(infop, sizeof(__rep_fileinfo_args) +
+ rfp->uid.size + rfp->info.size + rfp->dir.size, &curbuf);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0) {
+ __os_free(env, rffree);
+ return (ret);
+ } else
+ rep->curinfo_off = R_OFFSET(infop, curbuf);
+ /* Copy fileinfo basic structure into curinfo. */
+ memcpy(R_ADDR(infop, rep->curinfo_off),
+ (u_int8_t*)rfp, sizeof(__rep_fileinfo_args));
+ /* Set up curinfo pointers to the various DBT data fields. */
+ GET_CURINFO(rep, infop, curinfo);
+ /* Copy uid and info DBT data from originfo buffer. */
+ if (rfp->uid.size > 0)
+ memcpy(curinfo->uid.data,
+ rfp->uid.data, rfp->uid.size);
+ if (rfp->info.size > 0)
+ memcpy(curinfo->info.data,
+ rfp->info.data, rfp->info.size);
+ if (rfp->dir.size > 0)
+ memcpy(curinfo->dir.data,
+ rfp->dir.data, rfp->dir.size);
+ __os_free(env, rffree);
+
+ /* Skip over regular DB's in "abbreviated" internal inits. */
+ if (F_ISSET(rep, REP_F_ABBREVIATED) &&
+ !FLD_ISSET(curinfo->db_flags, DB_AM_INMEM)) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Skipping file %d in abbreviated internal init",
+ curinfo->filenum));
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->curinfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->curinfo_off = INVALID_ROFF;
+ rep->curfile++;
+ continue;
+ }
+
+ /* Request this file's pages. */
+ DB_ASSERT(env, curinfo->pgno == 0);
+ rep->ready_pg = 0;
+ rep->npages = 0;
+ rep->waiting_pg = PGNO_INVALID;
+ rep->max_wait_pg = PGNO_INVALID;
+ memset(&dbt, 0, sizeof(dbt));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Next file %d: pgsize %lu, maxpg %lu",
+ curinfo->filenum, (u_long)curinfo->pgsize,
+ (u_long)curinfo->max_pgno));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "name %s dir %s",
+ curinfo->info.size > 0 ? (char *) curinfo->info.data :
+ "NULL", curinfo->dir.size > 0 ?
+ (char *)curinfo->dir.data : "NULL"));
+ msgsz = __REP_FILEINFO_SIZE + curinfo->dir.size +
+ curinfo->uid.size + curinfo->info.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ return (ret);
+ if (rep->infoversion < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env, rep->infoversion,
+ (__rep_fileinfo_v6_args *)curinfo, buf,
+ msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, rep->infoversion,
+ curinfo, buf, msgsz, &len);
+ if (ret != 0) {
+ __os_free(env, buf);
+ return (ret);
+ }
+ DB_INIT_DBT(dbt, buf, len);
+ (void)__rep_send_message(env, eid, REP_PAGE_REQ,
+ NULL, &dbt, 0, DB_REP_ANYWHERE);
+ __os_free(env, buf);
+
+ return (0);
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "NEXTFILE: have %d files. RECOVER_LOG now", rep->nfiles));
+ /*
+ * Move to REP_RECOVER_LOG state.
+ * Request logs.
+ */
+ /*
+ * We need to do a sync here so that any later opens
+ * can find the file and file id. We need to do it
+ * before we clear SYNC_PAGE so that we do not
+ * try to flush the log.
+ */
+ if ((ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ return (ret);
+ rep->sync_state = SYNC_LOG;
+ memset(&dbt, 0, sizeof(dbt));
+ lr_args.endlsn = rep->last_lsn;
+ if ((ret = __rep_logreq_marshal(env, &lr_args, lrbuf,
+ __REP_LOGREQ_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(dbt, lrbuf, len);
+
+ /*
+ * Get the logging subsystem ready to receive the first log record we
+ * are going to ask for. In the case of a normal internal init, this is
+ * pretty simple, since we only deal in whole log files. In the
+ * ABBREVIATED case we've already taken care of this, back when we
+ * processed the UPDATE message, because we had to do it by rolling back
+ * to a sync point at an arbitrary LSN.
+ */
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ /*
+ * Update ready_lsn so that future rerequests and VERIFY_FAILs know
+ * where to start.
+ */
+ if (!F_ISSET(rep, REP_F_ABBREVIATED) &&
+ (ret = __rep_log_setup(env, rep,
+ rep->first_lsn.file, rep->first_vers, &lp->ready_lsn)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "NEXTFILE: LOG_REQ from LSN [%lu][%lu] to [%lu][%lu]",
+ (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset,
+ (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset));
+ REP_SYSTEM_UNLOCK(env);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ (void)__rep_send_message(env, eid,
+ REP_LOG_REQ, &rep->first_lsn, &dbt, REPCTL_INIT, DB_REP_ANYWHERE);
+ REP_SYSTEM_LOCK(env);
+ return (0);
+}
+
+/*
+ * Run a recovery, for the purpose of rolling back the client environment to a
+ * specific sync point, in preparation for doing an abbreviated internal init
+ * (materializing only NIMDBs, when we already have the on-disk DBs).
+ *
+ * REP_SYSTEM_LOCK should be held on entry, and will be held on exit, but we
+ * drop it momentarily during the call.
+ */
+static int
+__rep_rollback(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ DB_THREAD_INFO *ip;
+ DB_LSN trunclsn;
+ int ret;
+ u_int32_t unused;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ DB_ASSERT(env, FLD_ISSET(rep->lockout_flags,
+ REP_LOCKOUT_API | REP_LOCKOUT_MSG | REP_LOCKOUT_OP));
+
+ REP_SYSTEM_UNLOCK(env);
+
+ if ((ret = __rep_dorecovery(env, lsnp, &trunclsn)) != 0)
+ goto errlock;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->ready_lsn = trunclsn;
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ lp->max_perm_lsn = *lsnp;
+ lp->wait_ts = rep->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->verify_lsn);
+
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto errlock;
+ }
+
+ F_SET(db_rep->rep_db, DB_AM_RECOVER);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+ STAT_SET(env, rep, log_queued, rep->stat.st_log_queued, 0, lsnp);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+errlock:
+ REP_SYSTEM_LOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_mpf_open -
+ * Create and open the mpool file for a database.
+ * Used by both master and client to bring files into mpool.
+ */
+static int
+__rep_mpf_open(env, mpfp, rfp, flags)
+ ENV *env;
+ DB_MPOOLFILE **mpfp;
+ __rep_fileinfo_args *rfp;
+ u_int32_t flags;
+{
+ DB db;
+ int ret;
+
+ if ((ret = __memp_fcreate(env, mpfp)) != 0)
+ return (ret);
+
+ /*
+ * We need a dbp to pass in to __env_mpool. Set up
+ * only the parts that it needs.
+ */
+ memset(&db, 0, sizeof(db));
+ db.env = env;
+ db.type = (DBTYPE)rfp->type;
+ db.pgsize = rfp->pgsize;
+ memcpy(db.fileid, rfp->uid.data, DB_FILE_ID_LEN);
+ db.flags = rfp->db_flags;
+ /* We need to make sure the dbp isn't marked open. */
+ F_CLR(&db, DB_AM_OPEN_CALLED);
+ /*
+ * The byte order of this database may differ from the local native
+ * byte order. If so, set the swap bit so that the necessary swapping
+ * will be done during file I/O.
+ */
+ if ((F_ISSET(env, ENV_LITTLEENDIAN) &&
+ !FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN)) ||
+ (!F_ISSET(env, ENV_LITTLEENDIAN) &&
+ FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN))) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_mpf_open: Different endian database. Set swap bit."));
+ F_SET(&db, DB_AM_SWAP);
+ } else
+ F_CLR(&db, DB_AM_SWAP);
+
+ db.mpf = *mpfp;
+ if (F_ISSET(&db, DB_AM_INMEM))
+ (void)__memp_set_flags(db.mpf, DB_MPOOL_NOFILE, 1);
+ if ((ret = __env_mpool(&db, rfp->info.data, flags)) != 0) {
+ (void)__memp_fclose(db.mpf, 0);
+ *mpfp = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_pggap_req -
+ * Request a page gap. Assumes the caller holds the rep_mutex.
+ *
+ * PUBLIC: int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__rep_pggap_req(env, rep, reqfp, gapflags)
+ ENV *env;
+ REP *rep;
+ __rep_fileinfo_args *reqfp;
+ u_int32_t gapflags;
+{
+ DBT max_pg_dbt;
+ REGINFO *infop;
+ __rep_fileinfo_args *curinfo, *tmpfp, t;
+ size_t len, msgsz;
+ u_int32_t flags;
+ int alloc, master, ret;
+ u_int8_t *buf;
+
+ infop = env->reginfo;
+ ret = 0;
+ alloc = 0;
+ /*
+ * There is a window where we have to set REP_RECOVER_PAGE when
+ * we receive the update information to transition from getting
+ * file information to getting page information. However, that
+ * thread does release and then reacquire mutexes. So, we might
+ * try re-requesting before the original thread can get curinfo
+ * set up. If curinfo isn't set up there is nothing to do.
+ */
+ if (rep->curinfo_off == INVALID_ROFF)
+ return (0);
+ GET_CURINFO(rep, infop, curinfo);
+ if (reqfp == NULL) {
+ if ((ret = __rep_finfo_alloc(env, curinfo, &tmpfp)) != 0)
+ return (ret);
+ alloc = 1;
+ } else {
+ t = *reqfp;
+ tmpfp = &t;
+ }
+
+ /*
+ * If we've never requested this page, then
+ * request everything between it and the first
+ * page we have. If we have requested this page
+ * then only request this page, not the entire gap.
+ */
+ flags = 0;
+ memset(&max_pg_dbt, 0, sizeof(max_pg_dbt));
+ /*
+ * If this is a PAGE_MORE and we're forcing then we want to
+ * force the request to ask for the next page after this one.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+ tmpfp->pgno++;
+ else
+ tmpfp->pgno = rep->ready_pg;
+ msgsz = __REP_FILEINFO_SIZE + tmpfp->dir.size +
+ tmpfp->uid.size + tmpfp->info.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ goto err;
+ if (rep->max_wait_pg == PGNO_INVALID ||
+ FLD_ISSET(gapflags, REP_GAP_FORCE | REP_GAP_REREQUEST)) {
+ /*
+ * Request the gap - set max to waiting_pg - 1 or if
+ * there is no waiting_pg, just ask for one.
+ */
+ if (rep->waiting_pg == PGNO_INVALID) {
+ if (FLD_ISSET(gapflags,
+ REP_GAP_FORCE | REP_GAP_REREQUEST))
+ rep->max_wait_pg = curinfo->max_pgno;
+ else
+ rep->max_wait_pg = rep->ready_pg;
+ } else {
+ /*
+ * If we're forcing, and waiting_pg is less than
+ * the page we want to start this request at, then
+ * we set max_wait_pg to the max pgno in the file.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE) &&
+ rep->waiting_pg < tmpfp->pgno)
+ rep->max_wait_pg = curinfo->max_pgno;
+ else
+ rep->max_wait_pg = rep->waiting_pg - 1;
+ }
+ tmpfp->max_pgno = rep->max_wait_pg;
+ /*
+ * Gap requests are "new" and can go anywhere.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+ flags = DB_REP_REREQUEST;
+ else
+ flags = DB_REP_ANYWHERE;
+ } else {
+ /*
+ * Request 1 page - set max to ready_pg.
+ */
+ rep->max_wait_pg = rep->ready_pg;
+ tmpfp->max_pgno = rep->ready_pg;
+ /*
+ * If we're dropping to singletons, this is a rerequest.
+ */
+ flags = DB_REP_REREQUEST;
+ }
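+ /*
+ * For example: with ready_pg == 3, waiting_pg == 7 and no request
+ * outstanding (max_wait_pg == PGNO_INVALID), we ask for the whole
+ * gap, pages 3 through 6. A later call with max_wait_pg still set
+ * and no gap flags drops to a singleton and asks for page 3 alone.
+ */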
+ if ((master = rep->master_id) != DB_EID_INVALID) {
+
+ STAT_INC(env,
+ rep, pg_request, rep->stat.st_pg_requested, master);
+ /*
+ * We need to request the pages, but we need to get the
+ * new info into rep->finfo. Assert that the sizes never
+ * change. The only thing this should do is change
+ * the pgno field. Everything else remains the same.
+ */
+ if (rep->infoversion < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env, rep->infoversion,
+ (__rep_fileinfo_v6_args *)tmpfp, buf,
+ msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, rep->infoversion,
+ tmpfp, buf, msgsz, &len);
+ if (ret == 0) {
+ DB_INIT_DBT(max_pg_dbt, buf, len);
+ DB_ASSERT(env, len == max_pg_dbt.size);
+ (void)__rep_send_message(env, master,
+ REP_PAGE_REQ, NULL, &max_pg_dbt, 0, flags);
+ }
+ } else
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+
+ __os_free(env, buf);
+err:
+ if (alloc)
+ __os_free(env, tmpfp);
+ return (ret);
+}
+
+/*
+ * __rep_finfo_alloc -
+ * Allocate and initialize a fileinfo structure.
+ *
+ * PUBLIC: int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *,
+ * PUBLIC: __rep_fileinfo_args **));
+ */
+int
+__rep_finfo_alloc(env, rfpsrc, rfpp)
+ ENV *env;
+ __rep_fileinfo_args *rfpsrc, **rfpp;
+{
+ __rep_fileinfo_args *rfp;
+ size_t size;
+ int ret;
+ void *dirp, *infop, *uidp;
+
+ /*
+ * Allocate enough for the structure and the DBT data areas.
+ */
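+ /*
+ * The single allocation is laid out as
+ * [__rep_fileinfo_args][uid bytes][info bytes][dir bytes],
+ * with the DBT data pointers below aimed into the tail.
+ */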
+ size = sizeof(__rep_fileinfo_args) + rfpsrc->uid.size +
+ rfpsrc->info.size + rfpsrc->dir.size;
+ if ((ret = __os_malloc(env, size, &rfp)) != 0)
+ return (ret);
+
+ /*
+ * Copy the structure itself, and then set the DBT data pointers
+ * to their space and copy the data itself as well.
+ */
+ memcpy(rfp, rfpsrc, sizeof(__rep_fileinfo_args));
+ uidp = (u_int8_t *)rfp + sizeof(__rep_fileinfo_args);
+ rfp->uid.data = uidp;
+ memcpy(uidp, rfpsrc->uid.data, rfpsrc->uid.size);
+
+ infop = (u_int8_t *)uidp + rfpsrc->uid.size;
+ rfp->info.data = infop;
+ memcpy(infop, rfpsrc->info.data, rfpsrc->info.size);
+
+ dirp = (u_int8_t *)infop + rfpsrc->info.size;
+ if (rfpsrc->dir.size > 0) {
+ rfp->dir.data = dirp;
+ memcpy(dirp, rfpsrc->dir.data, rfpsrc->dir.size);
+ } else
+ rfp->dir.data = NULL;
+ *rfpp = rfp;
+ return (ret);
+}
+
+/*
+ * __rep_log_setup -
+ * We know our first LSN and need to reset the log subsystem
+ * to get our logs set up for the proper file.
+ */
+static int
+__rep_log_setup(env, rep, file, version, lsnp)
+ ENV *env;
+ REP *rep;
+ u_int32_t file;
+ u_int32_t version;
+ DB_LSN *lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ LOG *lp;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * Set up the log starting at the file number of the first LSN we
+ * need to get from the master.
+ */
+ LOG_SYSTEM_LOCK(env);
+ if ((ret = __log_newfile(dblp, &lsn, file, version)) == 0 &&
+ lsnp != NULL)
+ *lsnp = lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * We reset first_lsn to the lp->lsn. We were given the LSN of
+ * the checkpoint and we now need the LSN for the beginning of
+ * the file, which __log_newfile conveniently set up for us
+ * in lp->lsn.
+ */
+ rep->first_lsn = lp->lsn;
+ TXN_SYSTEM_LOCK(env);
+ ZERO_LSN(region->last_ckp);
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __rep_queue_filedone -
+ * Determine if we're really done getting the pages for a queue file.
+ * Queue is handled in several steps.
+ * 1. First we get the meta page only.
+ * 2. We use the meta-page information to figure out first and last
+ * page numbers (and if the queue wraps, first can be > last).
+ * 3. If first < last, we do a REP_PAGE_REQ for all pages.
+ * 4. If first > last, we REP_PAGE_REQ from first -> max page number.
+ * Then we'll ask for page 1 -> last.
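+ *
+ * For example, a wrapped queue with first == 90 and last == 10 is
+ * fetched as pages 90 -> max page number on the first request, then
+ * pages 1 -> 10 on the second.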
+ *
+ * This function can return several things:
+ * DB_REP_PAGEDONE - if we're done with this file.
+ * 0 - if we're not done with this file.
+ * error - if we get an error doing some operations.
+ *
+ * This function will open a dbp handle to the queue file. This is needed
+ * by most of the QAM macros. We'll open it on the first pass through
+ * here and we'll close it whenever we decide we're done.
+ */
+static int
+__rep_queue_filedone(env, ip, rep, rfp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ __rep_fileinfo_args *rfp;
+{
+#ifndef HAVE_QUEUE
+ COMPQUIET(ip, NULL);
+ COMPQUIET(rep, NULL);
+ COMPQUIET(rfp, NULL);
+ return (__db_no_queue_am(env));
+#else
+ DB *queue_dbp;
+ DB_REP *db_rep;
+ db_pgno_t first, last;
+ u_int32_t flags;
+ int empty, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+ queue_dbp = NULL;
+ if (db_rep->queue_dbc == NULL) {
+ /*
+ * We need to do a sync here so that the open
+ * can find the file and file id.
+ */
+ if ((ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ goto out;
+ if ((ret =
+ __db_create_internal(&queue_dbp, env, 0)) != 0)
+ goto out;
+ flags = DB_NO_AUTO_COMMIT |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+ /*
+ * We need to check whether this is in-memory so that we pass
+ * the name correctly as either the file or the database name.
+ */
+ if ((ret = __db_open(queue_dbp, ip, NULL,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? NULL :
+ rfp->info.data,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? rfp->info.data :
+ NULL,
+ DB_QUEUE, flags, 0, PGNO_BASE_MD)) != 0)
+ goto out;
+
+ if ((ret = __db_cursor(queue_dbp,
+ ip, NULL, &db_rep->queue_dbc, 0)) != 0)
+ goto out;
+ } else
+ queue_dbp = db_rep->queue_dbc->dbp;
+
+ if ((ret = __queue_pageinfo(queue_dbp,
+ &first, &last, &empty, 0, 0)) != 0)
+ goto out;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Queue fileinfo: first %lu, last %lu, empty %d",
+ (u_long)first, (u_long)last, empty));
+ /*
+ * We can be in one of 3 possible states.
+ * 1. We have received the meta-page and now need to get the
+ * rest of the pages in the database.
+ * 2. We have received from first -> max_pgno. We might be done,
+ * or we might need to ask for wrapped pages.
+ * 3. We have received all pages in the file. We're done.
+ */
+ if (rfp->max_pgno == 0) {
+ /*
+ * We have just received the meta page. Set up the next
+ * pages to ask for and check if the file is empty.
+ */
+ if (empty)
+ goto out;
+ if (first > last) {
+ rfp->max_pgno =
+ QAM_RECNO_PAGE(db_rep->queue_dbc->dbp, UINT32_MAX);
+ } else
+ rfp->max_pgno = last;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Queue fileinfo: First req: first %lu, last %lu",
+ (u_long)first, (u_long)rfp->max_pgno));
+ goto req;
+ } else if (rfp->max_pgno != last) {
+ /*
+ * If max_pgno != last that means we're dealing with a
+ * wrapped situation. Request next batch of pages.
+ * Set npages to 1 because we already have page 0, the
+ * meta-page; now we need pages 1 through max_pgno.
+ */
+ first = 1;
+ rfp->max_pgno = last;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Queue fileinfo: Wrap req: first %lu, last %lu",
+ (u_long)first, (u_long)last));
+req:
+ /*
+ * Since we're simulating a "gap" to resend new PAGE_REQ
+ * for this file, we need to set waiting_pg to last + 1
+ * so that we'll ask for all from ready_pg -> last.
+ */
+ rep->npages = first;
+ rep->ready_pg = first;
+ rep->waiting_pg = rfp->max_pgno + 1;
+ rep->max_wait_pg = PGNO_INVALID;
+ ret = __rep_pggap_req(env, rep, rfp, 0);
+ return (ret);
+ }
+ /*
+ * max_pgno == last
+ * If we get here, we have all the pages we need.
+ * Close the dbp and return.
+ */
+out:
+ if (db_rep->queue_dbc != NULL &&
+ (t_ret = __dbc_close(db_rep->queue_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->queue_dbc = NULL;
+
+ if (queue_dbp != NULL &&
+ (t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_REP_PAGEDONE;
+ return (ret);
+#endif
+}
+
+/*
+ * PUBLIC: int __rep_remove_init_file __P((ENV *));
+ */
+int
+__rep_remove_init_file(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+ char *name;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * If running in-memory replication, return without any file
+ * operations.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ return (0);
+
+ /* Abbreviated internal init doesn't use an init file. */
+ if (F_ISSET(rep, REP_F_ABBREVIATED))
+ return (0);
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_INITNAME, NULL, &name)) != 0)
+ return (ret);
+ (void)__os_unlink(env, name, 0);
+ __os_free(env, name);
+ return (0);
+}
+
+/*
+ * Checks for the existence of the internal init flag file. If it exists, we
+ * remove all logs and databases, and then remove the flag file. This is
+ * intended to force the internal init to start over again, and thus affords
+ * protection against a client crashing during internal init. This function
+ * must be called before normal recovery in order to be properly effective.
+ *
+ * !!!
+ * This function should only be called during initial set-up of the environment,
+ * before various subsystems are initialized. It doesn't rely on the
+ * subsystems' code having been initialized, and it summarily deletes files "out
+ * from under" them, which might disturb the subsystems if they were up.
+ *
+ * PUBLIC: int __rep_reset_init __P((ENV *));
+ */
+int
+__rep_reset_init(env)
+ ENV *env;
+{
+ DB_FH *fhp;
+ __rep_update_args *rup;
+ DBT dbt;
+ char *allocated_dir, *dir, *init_name;
+ size_t cnt;
+ u_int32_t dbtvers, fvers, zero;
+ u_int8_t *next;
+ int ret, t_ret;
+
+ allocated_dir = NULL;
+ rup = NULL;
+ dbt.data = NULL;
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_INITNAME, NULL, &init_name)) != 0)
+ return (ret);
+
+ if ((ret = __os_open(
+ env, init_name, 0, DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0) {
+ if (ret == ENOENT)
+ ret = 0;
+ goto out;
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Cleaning up interrupted internal init"));
+
+ /* There are a few possibilities:
+ * 1. no init file, or less than 1 full file list
+ * 2. exactly one full file list
+ * 3. more than one, but less than a second full file list
+ * 4. second file list in full
+ *
+ * In cases 2 or 4, we need to remove all logs, and then remove files
+ * according to the (most recent) file list. (In case 1 or 3, we don't
+ * have to do anything.)
+ *
+ * The __rep_get_file_list function takes care of folding these cases
+ * into two simple outcomes.
+ *
+ * As of 4.7, the first 4 bytes are 0. Read the first 4 bytes now.
+ * If they are non-zero it means we have an old-style init file.
+ * Otherwise, pass the file version in to rep_get_file_list.
+ */
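+ /*
+ * A sketch of the 4.7+ layout as read below, each count a native
+ * u_int32_t: [0][file version], then for each list
+ * [msg version][list length][file list].
+ */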
+ if ((ret = __os_read(env, fhp, &zero, sizeof(zero), &cnt)) != 0)
+ goto out;
+ /*
+ * If we read successfully, but not enough, then unlink the file.
+ */
+ if (cnt != sizeof(zero))
+ goto rm;
+ if (zero != 0) {
+ /*
+ * Old style file. We have to set fvers to the 4.6
+ * version of the file and also rewind the file so
+ * that __rep_get_file_list can read out the length itself.
+ */
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto out;
+ fvers = REP_INITVERSION_46;
+ } else if ((ret = __os_read(env,
+ fhp, &fvers, sizeof(fvers), &cnt)) != 0)
+ goto out;
+ else if (cnt != sizeof(fvers))
+ goto rm;
+ ret = __rep_get_file_list(env, fhp, fvers, &dbtvers, &dbt);
+ if ((t_ret = __os_closehandle(env, fhp)) != 0 || ret != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto out;
+ }
+ if (dbt.data == NULL) {
+ /*
+ * The init file did not end with an intact file list. Since we
+ * never start log/db removal without an intact file list
+ * sync'ed to the init file, this must mean we don't have any
+ * partial set of files to clean up. So all we need to do is
+ * remove the init file.
+ */
+ goto rm;
+ }
+
+ /* Remove all log files. */
+ if (env->dbenv->db_log_dir == NULL)
+ dir = env->db_home;
+ else {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, env->dbenv->db_log_dir, NULL, &dir)) != 0)
+ goto out;
+ allocated_dir = dir;
+ }
+
+ if ((ret = __rep_remove_by_prefix(env,
+ dir, LFPREFIX, sizeof(LFPREFIX)-1, DB_APP_LOG)) != 0)
+ goto out;
+
+ /*
+ * Remove databases according to the list, and queue extent files by
+ * searching them out on a walk through the data_dir's.
+ */
+ if ((ret = __rep_update_unmarshal(env, dbtvers,
+ &rup, dbt.data, dbt.size, &next)) != 0)
+ goto out;
+ if ((ret = __rep_unlink_by_list(env, dbtvers,
+ next, dbt.size, rup->num_files)) != 0)
+ goto out;
+
+ /* Here, we've established that the file exists. */
+rm: (void)__os_unlink(env, init_name, 0);
+out: if (rup != NULL)
+ __os_free(env, rup);
+ if (allocated_dir != NULL)
+ __os_free(env, allocated_dir);
+ if (dbt.data != NULL)
+ __os_free(env, dbt.data);
+
+ __os_free(env, init_name);
+ return (ret);
+}
+
+/*
+ * Reads the last fully intact file list from the init file. If the file ends
+ * with a partial list (or is empty), we're not interested in it. Lack of a
+ * full file list is indicated by a NULL dbt->data. On success, the list is
+ * returned in allocated space, which becomes the responsibility of the caller.
+ *
+ * The file format is a u_int32_t buffer length, in native format, followed by
+ * the file list itself, in the same format as in an UPDATE message (though
+ * many parts of it in this case are meaningless).
+ */
+static int
+__rep_get_file_list(env, fhp, fvers, dbtvers, dbt)
+ ENV *env;
+ DB_FH *fhp;
+ u_int32_t fvers;
+ u_int32_t *dbtvers;
+ DBT *dbt;
+{
+#ifdef HAVE_REPLICATION_THREADS
+ DBT mgrdbt;
+#endif
+ u_int32_t length, mvers;
+ size_t cnt;
+ int i, ret;
+
+ /* At most 2 file lists: old and new. */
+ dbt->data = NULL;
+ mvers = DB_REPVERSION_46;
+ length = 0;
+#ifdef HAVE_REPLICATION_THREADS
+ mgrdbt.data = NULL;
+#endif
+ for (i = 1; i <= 2; i++) {
+ if (fvers >= REP_INITVERSION_47) {
+ if ((ret = __os_read(env, fhp, &mvers,
+ sizeof(mvers), &cnt)) != 0)
+ goto err;
+ if (cnt == 0 && dbt->data != NULL)
+ break;
+ if (cnt != sizeof(mvers))
+ goto err;
+ }
+ if ((ret = __os_read(env,
+ fhp, &length, sizeof(length), &cnt)) != 0)
+ goto err;
+
+ /*
+ * Reaching the end here is fine, if we've been through at least
+ * once already.
+ */
+ if (cnt == 0 && dbt->data != NULL)
+ break;
+ if (cnt != sizeof(length))
+ goto err;
+
+ if ((ret = __os_realloc(env,
+ (size_t)length, &dbt->data)) != 0)
+ goto err;
+
+ if ((ret = __os_read(
+ env, fhp, dbt->data, length, &cnt)) != 0 ||
+ cnt != (size_t)length)
+ goto err;
+ }
+
+#ifdef HAVE_REPLICATION_THREADS
+ if (i == 3) {
+ if ((ret = __os_read(env, fhp,
+ &mgrdbt.size, sizeof(mgrdbt.size), &cnt)) != 0)
+ goto err;
+ if (cnt == 0)
+ goto absent;
+ if (cnt != sizeof(mgrdbt.size))
+ goto err;
+ if ((ret = __os_malloc(env,
+ (size_t)mgrdbt.size, &mgrdbt.data)) != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp,
+ mgrdbt.data, mgrdbt.size, &cnt)) != 0 &&
+ cnt != (size_t)mgrdbt.size)
+ goto err;
+ /* Repmgr takes ownership of the allocated memory. */
+ if ((ret = __repmgr_init_restore(env, &mgrdbt)) != 0)
+ goto err;
+ }
+absent:
+#endif
+
+ *dbtvers = mvers;
+ dbt->size = length;
+ return (0);
+
+err:
+#ifdef HAVE_REPLICATION_THREADS
+ if (mgrdbt.data != NULL)
+ __os_free(env, mgrdbt.data);
+#endif
+ /*
+ * Note that it's OK to get here with a zero value in 'ret': it means we
+ * read less than we expected, and dbt->data == NULL indicates to the
+ * caller that we don't have an intact list.
+ */
+ if (dbt->data != NULL)
+ __os_free(env, dbt->data);
+ dbt->data = NULL;
+ return (ret);
+}
+
+/*
+ * Removes every file in a given directory that matches a given prefix. Notice
+ * how similar this is to __rep_walk_dir.
+ */
+static int
+__rep_remove_by_prefix(env, dir, prefix, pref_len, appname)
+ ENV *env;
+ const char *dir;
+ const char *prefix;
+ size_t pref_len;
+ APPNAME appname; /* What kind of name. */
+{
+ char *namep, **names;
+ int cnt, i, ret;
+
+ if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0)
+ return (ret);
+ for (i = 0; i < cnt; i++) {
+ if (strncmp(names[i], prefix, pref_len) == 0) {
+ if ((ret = __db_appname(env,
+ appname, names[i], NULL, &namep)) != 0)
+ goto out;
+ (void)__os_unlink(env, namep, 0);
+ __os_free(env, namep);
+ }
+ }
+out: __os_dirfree(env, names, cnt);
+ return (ret);
+}
+
+/*
+ * Removes database files according to the contents of a list.
+ *
+ * This function must support removal either during environment creation, or
+ * when an internal init is reset in the middle. This means it must work
+ * regardless of whether underlying subsystems are initialized. However, it may
+ * assume that databases are not open. That means there is no REP!
+ */
+static int
+__rep_unlink_by_list(env, version, files, size, count)
+ ENV *env;
+ u_int32_t version;
+ u_int8_t *files;
+ u_int32_t size;
+ u_int32_t count;
+{
+ DB_ENV *dbenv;
+ char **ddir, *dir;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ if ((ret = __rep_walk_filelist(env, version,
+ files, size, count, __rep_unlink_file, NULL)) != 0)
+ goto out;
+
+ /* Notice how similar this code is to __rep_find_dbs. */
+ if (dbenv->db_data_dir == NULL)
+ ret = __rep_remove_by_prefix(env, env->db_home,
+ QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX) - 1,
+ DB_APP_DATA);
+ else {
+ for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, *ddir, NULL, &dir)) != 0)
+ break;
+ ret = __rep_remove_by_prefix(env, dir,
+ QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX)-1,
+ DB_APP_DATA);
+ __os_free(env, dir);
+ if (ret != 0)
+ break;
+ }
+ }
+
+out:
+ return (ret);
+}
+
+static int
+__rep_unlink_file(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ char *namep;
+ int ret;
+
+ COMPQUIET(unused, NULL);
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, rfp->info.data, NULL, &namep)) == 0) {
+ (void)__os_unlink(env, namep, 0);
+ __os_free(env, namep);
+ }
+ return (ret);
+}
+
+static int
+__rep_remove_by_list(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ int ret;
+
+ COMPQUIET(unused, NULL);
+
+ if ((ret = __rep_remove_file(env, rfp, NULL)) == ENOENT) {
+ /*
+ * If the file already doesn't exist, that's perfectly
+ * OK. This can easily happen if we're cleaning up an
+ * interrupted internal init, and we only got part-way
+ * through the list of files.
+ */
+ ret = 0;
+ }
+ return (ret);
+}
+
+static int
+__rep_walk_filelist(env, version, files, size, count, fn, arg)
+ ENV *env;
+ u_int32_t version;
+ u_int8_t *files;
+ u_int32_t size;
+ u_int32_t count;
+ FILE_WALK_FN *fn;
+ void *arg;
+{
+ __rep_fileinfo_args *rfp, rf;
+ __rep_fileinfo_v6_args *rfpv6;
+ u_int8_t *next;
+ int ret;
+ void *rffree;
+
+ ret = 0;
+ rfp = NULL;
+ rfpv6 = NULL;
+ rffree = NULL;
+ while (count-- > 0) {
+ if (version < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env, version,
+ &rfpv6, files, size, &next)) != 0)
+ break;
+ memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args));
+ rf.dir.data = NULL;
+ rf.dir.size = 0;
+ rfp = &rf;
+ rffree = rfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env, version,
+ &rfp, files, size, &next)) != 0)
+ break;
+ rffree = rfp;
+ }
+ size -= (u_int32_t)(next - files);
+ files = next;
+
+ if ((ret = (*fn)(env, rfp, arg)) != 0)
+ break;
+ __os_free(env, rffree);
+ rfp = NULL;
+ rfpv6 = NULL;
+ rffree = NULL;
+ }
+
+ if (rffree != NULL)
+ __os_free(env, rffree);
+ return (ret);
+}
diff --git a/src/rep/rep_elect.c b/src/rep/rep_elect.c
new file mode 100644
index 00000000..9e8c5249
--- /dev/null
+++ b/src/rep/rep_elect.c
@@ -0,0 +1,1486 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * We need to check sites >= nsites, not more than half
+ * like we do in __rep_elect and the VOTE2 code. The
+ * reason is that we want to process all the incoming votes
+ * and not short-circuit once we reach more than half. The
+ * real winner's vote may be in the last half.
+ */
+#define IS_PHASE1_DONE(rep) \
+ ((rep)->sites >= (rep)->nsites && (rep)->winner != DB_EID_INVALID)
+
+#define I_HAVE_WON(rep, winner) \
+ ((rep)->votes >= (rep)->nvotes && winner == (rep)->eid)
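+ /*
+ * For example, with nsites == 5 and nvotes == 3, IS_PHASE1_DONE
+ * waits until all 5 VOTE1s are in (or phase 1 times out), while
+ * I_HAVE_WON needs only 3 VOTE2s naming this site.
+ */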
+
+static void __rep_cmp_vote __P((ENV *, REP *, int, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t));
+static int __rep_elect_init
+ __P((ENV *, u_int32_t, u_int32_t, int *, u_int32_t *));
+static int __rep_fire_elected __P((ENV *, REP *, u_int32_t));
+static void __rep_elect_master __P((ENV *, REP *));
+static int __rep_grow_sites __P((ENV *, u_int32_t));
+static void __rep_send_vote __P((ENV *, DB_LSN *, u_int32_t,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t, int,
+ u_int32_t, u_int32_t));
+static int __rep_tally __P((ENV *, REP *, int, u_int32_t *, u_int32_t, int));
+static int __rep_wait __P((ENV *, db_timeout_t *, int, u_int32_t, u_int32_t));
+
+/*
+ * __rep_elect_pp --
+ * Called after master failure to hold/participate in an election for
+ * a new master.
+ *
+ * PUBLIC: int __rep_elect_pp
+ * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_pp(dbenv, given_nsites, nvotes, flags)
+ DB_ENV *dbenv;
+ u_int32_t given_nsites, nvotes;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_elect", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3527",
+"DB_ENV->rep_elect: cannot call from Replication Manager application"));
+ return (EINVAL);
+ }
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3528",
+ "DB_ENV->rep_elect: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ if (!IS_REP_STARTED(env)) {
+ __db_errx(env, DB_STR("3529",
+ "DB_ENV->rep_elect: must be called after DB_ENV->rep_start"));
+ return (EINVAL);
+ }
+
+ if (IS_USING_LEASES(env) && given_nsites != 0) {
+ __db_errx(env, DB_STR("3530",
+ "DB_ENV->rep_elect: nsites must be zero if leases configured"));
+ return (EINVAL);
+ }
+
+ ret = __rep_elect_int(env, given_nsites, nvotes, flags);
+
+ /*
+ * The DB_REP_IGNORE return code can be of use to repmgr (which of
+ * course calls __rep_elect_int directly), but it may be too subtle to be
+ * useful for (Base API) applications: so preserve the pre-existing API
+ * behavior for applications by making this look like a 0.
+ */
+ if (ret == DB_REP_IGNORE)
+ ret = 0;
+ return (ret);
+}
+
+/*
+ * __rep_elect_int --
+ * Internal processing to hold/participate in an election for
+ * a new master after master failure.
+ *
+ * PUBLIC: int __rep_elect_int
+ * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_int(env, given_nsites, nvotes, flags)
+ ENV *env;
+ u_int32_t given_nsites, nvotes;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REP *rep;
+ int done, elected, in_progress;
+ int need_req, ret, send_vote, t_ret;
+ u_int32_t ack, ctlflags, data_gen, egen, nsites;
+ u_int32_t orig_tally, priority, realpri, repflags, tiebreaker;
+ db_timeout_t timeout;
+
+ COMPQUIET(flags, 0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ elected = 0;
+ egen = 0;
+ ret = 0;
+
+ /*
+ * Specifying 0 for nsites signals us to use the value configured
+ * previously via rep_set_nsites. Similarly, if the given nvotes is 0,
+ * it asks us to compute the value representing a simple majority.
+ */
+ nsites = given_nsites == 0 ? rep->config_nsites : given_nsites;
+ ack = nvotes == 0 ? ELECTION_MAJORITY(nsites) : nvotes;
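+ /*
+ * For example, nsites == 5 with nvotes == 0 yields ack == 3,
+ * a simple majority of the 5 sites.
+ */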
+
+ /*
+ * XXX
+ * If users give us less than a majority, they run the risk of
+ * electing two masters across a network partition. However,
+ * this also allows the scenario of master/1 client to elect
+ * the client. Allow sub-majority values, but give a warning.
+ */
+ if (ack <= (nsites / 2)) {
+ __db_errx(env, DB_STR_A("3531",
+ "DB_ENV->rep_elect:WARNING: nvotes (%d) is sub-majority with nsites (%d)",
+ "%d %d"), nvotes, nsites);
+ }
+
+ if (nsites < ack) {
+ __db_errx(env, DB_STR_A("3532",
+ "DB_ENV->rep_elect: nvotes (%d) is larger than nsites (%d)",
+ "%d %d"), ack, nsites);
+ return (EINVAL);
+ }
+
+ realpri = rep->priority;
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Start election nsites %d, ack %d, priority %d",
+ nsites, ack, realpri));
+
+ /*
+ * Special case when having an election while running with
+ * sites of potentially mixed versions. We set a bit indicating
+ * we're an electable site, but set our priority to 0.
+ * Old sites will never elect us, with 0 priority, but if all
+ * we have are new sites, then we can elect the best electable
+ * site of the group.
+ * Thus 'priority' is this special, possibly-fake, effective
+ * priority that we'll use for this election, while 'realpri' is our
+ * real, configured priority, as retrieved from REP region.
+ */
+ ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0;
+ ENV_ENTER(env, ip);
+
+ orig_tally = 0;
+ /* If we are already master, simply broadcast that fact and return. */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+master: LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ ret = __rep_lease_refresh(env);
+ if (ret == 0)
+ ret = DB_REP_IGNORE;
+ goto envleave;
+ }
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * If leases are configured, wait for them to expire, and
+ * see if we can discover the master while waiting.
+ */
+ if (IS_USING_LEASES(env) &&
+ (timeout = __rep_lease_waittime(env)) != 0) {
+ FLD_SET(rep->elect_flags, REP_E_PHASE0);
+ egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ VPRINT(env, (env, DB_VERB_REP_ELECT,
+ "PHASE0 waittime from rep_lease_waittime: %lu",
+ (u_long)timeout));
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+
+ /*
+ * The only possible non-zero return from __rep_wait() is a
+ * panic for a mutex failure. So the state of the PHASE0 flag
+ * doesn't matter much. If that changes in the future, it is
+ * still best not to clear the flag after an error, because
+ * another thread might be in the middle of its PHASE0 wait (and
+ * not getting an error), so we wouldn't want to cut short its
+ * wait. If there isn't another concurrent thread, the worst
+ * that would happen would be that we would leave the flag set,
+ * until the next time we came through here and completed a
+ * wait. Note that the code here is the only place where we
+ * check this flag.
+ */
+ if ((ret = __rep_wait(env,
+ &timeout, 0, egen, REP_E_PHASE0)) != 0)
+ goto envleave;
+ REP_SYSTEM_LOCK(env);
+ repflags = rep->elect_flags;
+ FLD_CLR(rep->elect_flags, REP_E_PHASE0);
+ /*
+ * If any other thread cleared PHASE0 while we were waiting,
+ * then we're done. Either we heard from a master, or some
+ * other thread completed its PHASE0 wait.
+ *
+ * Or, we could have waited long enough for our lease grant to
+ * expire. Check it to make sure.
+ */
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "after PHASE0 wait, flags 0x%x, elect_flags 0x%x",
+ rep->flags, rep->elect_flags));
+ if (!FLD_ISSET(repflags, REP_E_PHASE0) ||
+ __rep_islease_granted(env) || egen != rep->egen) {
+ VPRINT(env, (env, DB_VERB_REP_ELECT,
+ "PHASE0 Done: repflags 0x%x, egen %d rep->egen %d, lease_granted %d",
+ repflags, egen, rep->egen, __rep_islease_granted(env)));
+ goto unlck_lv;
+ }
+ F_SET(rep, REP_F_LEASE_EXPIRED);
+ }
+
+ /*
+ * After acquiring the mutex, and possibly waiting for leases to
+ * expire, without the mutex, we need to recheck our state. It
+ * may have changed. If we are now master, we're done.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ REP_SYSTEM_UNLOCK(env);
+ goto master;
+ }
+ if ((ret = __rep_elect_init(env, nsites, ack,
+ &in_progress, &orig_tally)) != 0)
+ goto unlck_lv;
+ /*
+ * If another thread is in the middle of an election we
+ * just quietly return and not interfere.
+ */
+ if (in_progress) {
+ ret = DB_REP_IGNORE;
+ goto unlck_lv;
+ }
+
+ /*
+ * Count threads in the guts of rep_elect, so that we only clear
+ * lockouts when the last thread is finishing. The "guts" start here,
+ * and do not include the above test where we "quietly return" via
+ * envleave.
+ *
+ * Closely associated with that is the notion that the current thread
+ * "owns" the right to process the election at the current egen. We set
+ * the local variable "egen" now to "our" egen; if rep->egen ever
+ * advances "out from under us" we know it's time to yield to a new
+ * generation. Our egen value was vetted in __rep_elect_init(), and we
+ * have not dropped the mutex since then.
+ *
+ * Other than occasionally checking that "our" egen still matches the
+ * current latest rep->egen, there should be no use of rep->egen in this
+ * function after this point.
+ */
+ rep->elect_th++;
+ egen = rep->egen;
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Election thread owns egen %lu", (u_long)egen));
+
+ priority = lp->persist.version != DB_LOGVERSION ? 0 : realpri;
+#ifdef CONFIG_TEST
+ /*
+ * This allows us to unit test the ELECTABLE flag simply by
+ * using the priority values.
+ */
+ if (priority > 0 && priority <= 5) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Artificially setting priority 0 (ELECTABLE) for CONFIG_TEST mode"));
+ DB_ASSERT(env, ctlflags == REPCTL_ELECTABLE);
+ priority = 0;
+ }
+#endif
+ __os_gettime(env, &rep->etime, 1);
+
+ /*
+ * Default to the normal timeout unless the user configured
+ * a full election timeout and we think we need a full election.
+ */
+ rep->full_elect = 0;
+ timeout = rep->elect_timeout;
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD) && rep->full_elect_timeout != 0) {
+ rep->full_elect = 1;
+ timeout = rep->full_elect_timeout;
+ }
+
+ /*
+ * We need to lock out applying incoming log records during
+ * the election. We need to use a special rep_lockout_apply
+ * instead of rep_lockout_msg because we do not want to
+ * lock out all incoming messages, like other VOTEs!
+ */
+ if ((ret = __rep_lockout_apply(env, rep, 0)) != 0)
+ goto err_locked;
+ if ((ret = __rep_lockout_archive(env, rep)) != 0)
+ goto err_locked;
+
+ /*
+ * Since the lockout step (above) could have dropped the mutex, we must
+ * check to see if we still own the right to proceed with the election
+ * at this egen.
+ */
+ if (rep->egen != egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Found egen %lu, abandon my election at egen %lu",
+ (u_long)rep->egen, (u_long)egen));
+ goto err_locked;
+ }
+
+ /* Generate a randomized tiebreaker value. */
+ __os_unique_id(env, &tiebreaker);
+
+ FLD_SET(rep->elect_flags, REP_E_PHASE1);
+ FLD_CLR(rep->elect_flags, REP_E_TALLY);
+ /*
+ * We made sure that leases were expired before starting the
+ * election, but an existing master may be slow in responding.
+ * If, during lockout, acquiring mutexes, etc, the client has now
+ * re-granted its lease, we're done - a master exists.
+ */
+ if (IS_USING_LEASES(env) &&
+ __rep_islease_granted(env)) {
+ ret = 0;
+ goto err_locked;
+ }
+
+ /*
+ * If we are in the middle of recovery or internal
+ * init, we participate, but we set our priority to 0
+ * and turn off REPCTL_ELECTABLE. If we are not in such
+ * a state, we can be elected (i.e. we are not in an
+ * inconsistent state).
+ */
+ INIT_LSN(lsn);
+ if (ISSET_LOCKOUT_BDB(rep) || IN_INTERNAL_INIT(rep) ||
+ rep->sync_state == SYNC_UPDATE) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Setting priority 0, unelectable, due to internal init/recovery"));
+ priority = 0;
+ ctlflags = 0;
+ data_gen = 0;
+ } else {
+ /*
+ * Use the last commit record as the LSN in the vote.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err_locked;
+ /*
+ * If we've walked back and there are no commit records,
+ * then reset LSN to INIT_LSN.
+ */
+ if ((ret = __rep_log_backup(env,
+ logc, &lsn, REP_REC_COMMIT)) == DB_NOTFOUND) {
+ INIT_LSN(lsn);
+ ret = 0;
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_locked;
+ if ((ret = __rep_get_datagen(env, &data_gen)) != 0)
+ goto err_locked;
+ }
+
+ /*
+ * We are about to participate at this egen. We must
+ * write out the next egen before participating in this one
+ * so that if we crash we can never participate in this egen
+ * again.
+ */
+ if ((ret = __rep_write_egen(env, rep, egen + 1)) != 0)
+ goto err_locked;
+
+ /* Tally our own vote */
+ if ((ret = __rep_tally(env, rep, rep->eid, &rep->sites, egen, 1))
+ != 0) {
+ /*
+ * __rep_tally is telling us that this vote is a duplicate. But
+ * this is our own vote in this case, and that should be
+ * impossible for a given egen.
+ */
+ DB_ASSERT(env, ret != DB_REP_IGNORE);
+ goto err_locked;
+ }
+ __rep_cmp_vote(env, rep, rep->eid, &lsn, priority, rep->gen, data_gen,
+ tiebreaker, ctlflags);
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Beginning an election"));
+
+ /*
+ * Now send vote, remembering the details in case we need them later in
+ * order to send out a duplicate VOTE1. We must save the nsites and
+ * nvotes values that we originally send in the VOTE1 message, separate
+ * from rep->nsites and rep->nvotes, since the latter can change when we
+ * receive a VOTE1 from another site.
+ */
+ send_vote = DB_EID_INVALID;
+ done = IS_PHASE1_DONE(rep);
+ rep->vote1.lsn = lsn;
+ rep->vote1.nsites = nsites;
+ rep->vote1.nvotes = ack;
+ rep->vote1.priority = priority;
+ rep->vote1.tiebreaker = tiebreaker;
+ rep->vote1.ctlflags = ctlflags;
+ rep->vote1.data_gen = data_gen;
+ REP_SYSTEM_UNLOCK(env);
+
+ __rep_send_vote(env, &lsn, nsites, ack, priority, tiebreaker, egen,
+ data_gen, DB_EID_BROADCAST, REP_VOTE1, ctlflags);
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTVOTE1, ret, NULL);
+ if (done) {
+ REP_SYSTEM_LOCK(env);
+ goto vote;
+ }
+
+ ret = __rep_wait(env, &timeout, rep->full_elect, egen, REP_E_PHASE1);
+ REP_SYSTEM_LOCK(env);
+ if (ret != 0)
+ goto err_locked;
+ if (rep->egen > egen)
+ /*
+ * For one reason or another, this election cycle is over; it
+ * doesn't matter why.
+ */
+ goto out;
+
+ if (FLD_ISSET(rep->elect_flags, REP_E_PHASE2)) {
+ /* Received enough votes while waiting to move us to phase 2. */
+ REP_SYSTEM_UNLOCK(env);
+ goto phase2;
+ }
+
+ /*
+ * If we got here, we haven't heard from everyone, but we've
+ * run out of time, so it's time to decide if we have enough
+ * votes to pick a winner and if so, to send out a vote to
+ * the winner.
+ */
+ if (rep->sites >= rep->nvotes) {
+vote:
+ /* We think we've seen enough to cast a vote. */
+ send_vote = rep->winner;
+ /*
+ * See if we won. This will make sure we
+ * don't count ourselves twice if we're racing
+ * with incoming votes.
+ */
+ if (rep->winner == rep->eid) {
+ if ((ret =__rep_tally(env,
+ rep, rep->eid, &rep->votes, egen, 2)) != 0 &&
+ ret != DB_REP_IGNORE)
+ goto err_locked;
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Counted my vote %d", rep->votes));
+ }
+ FLD_SET(rep->elect_flags, REP_E_PHASE2);
+ FLD_CLR(rep->elect_flags, REP_E_PHASE1);
+ }
+ if (send_vote == DB_EID_INVALID) {
+ /* We do not have enough votes to elect. */
+ if (rep->sites >= rep->nvotes)
+ __db_errx(env, DB_STR_A("3533",
+ "No electable site found: recvd %d of %d votes from %d sites",
+ "%d %d %d"), rep->sites, rep->nvotes, rep->nsites);
+ else
+ __db_errx(env, DB_STR_A("3534",
+ "Not enough votes to elect: recvd %d of %d from %d sites",
+ "%d %d %d"), rep->sites, rep->nvotes, rep->nsites);
+ ret = DB_REP_UNAVAIL;
+ goto err_locked;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * We have seen enough vote1's. Now we need to wait
+ * for all the vote2's.
+ */
+ if (send_vote != rep->eid) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Sending vote"));
+ __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, 0,
+ send_vote, REP_VOTE2, 0);
+ /*
+ * If we are NOT the new master we want to send
+ * our vote to the winner, and wait longer. The
+ * reason is that the winner may be "behind" us
+ * in the election waiting and if the master is
+ * down, the winner will wait the full timeout
+ * and we want to give the winner enough time to
+ * process all the votes. Otherwise we could
+ * incorrectly return DB_REP_UNAVAIL and start a
+ * new election before the winner can declare
+ * itself.
+ */
+ timeout = timeout * 2;
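+ /* E.g., a 2-second phase timeout becomes a 4-second wait here. */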
+ }
+
+phase2:
+ if (I_HAVE_WON(rep, rep->winner)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Skipping phase2 wait: already got %d votes", rep->votes));
+ REP_SYSTEM_LOCK(env);
+ goto i_won;
+ }
+ ret = __rep_wait(env, &timeout, rep->full_elect, egen, REP_E_PHASE2);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Since at "err_lock" we're expected to have the lock, it's convenient
+ * to acquire it before testing "ret" here, since we need it anyway for
+ * the following stuff.
+ */
+ if (ret != 0)
+ goto err_locked;
+ if (rep->egen > egen || !IN_ELECTION(rep))
+ goto out;
+
+ /* We must have timed out. */
+ ret = DB_REP_UNAVAIL;
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "After phase 2: votes %d, nvotes %d, nsites %d",
+ rep->votes, rep->nvotes, rep->nsites));
+
+ if (I_HAVE_WON(rep, rep->winner)) {
+i_won: __rep_elect_master(env, rep);
+ ret = 0;
+ elected = 1;
+ }
+err_locked:
+ /*
+ * If we get here because of a non-election error, then we did not tally
+ * our vote. In that case we do not want to discard all known election
+ * info.
+ */
+ if (ret == 0 || ret == DB_REP_UNAVAIL)
+ __rep_elect_done(env, rep);
+ else if (orig_tally)
+ FLD_SET(rep->elect_flags, orig_tally);
+
+#ifdef CONFIG_TEST
+ if (0) {
+DB_TEST_RECOVERY_LABEL
+ REP_SYSTEM_LOCK(env);
+ }
+#endif
+
+out:
+ /*
+ * We're leaving, so decrement thread count. If it's still >0 after
+ * that, another thread has come along to handle a later egen. Only the
+ * last thread to come through here should clear the lockouts.
+ */
+ need_req = 0;
+ DB_ASSERT(env, rep->elect_th > 0);
+ rep->elect_th--;
+ if (rep->elect_th == 0) {
+ need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) &&
+ !I_HAVE_WON(rep, rep->winner);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_APPLY);
+ F_CLR(rep, REP_F_SKIPPED_APPLY);
+ }
+ /*
+ * Only clear archiving lockout if the election failed. If
+ * it succeeded, we keep archiving disabled until we either
+ * become master or complete synchronization with a master.
+ */
+ if (ret != 0 && rep->elect_th == 0)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If we skipped any log records, request them now.
+ */
+ if (need_req && (t_ret = __rep_resend_req(env, 0)) != 0 &&
+ (ret == 0 || ret == DB_REP_UNAVAIL || ret == DB_REP_IGNORE))
+ ret = t_ret;
+
+ /* Note that "elected" implies ret cannot be DB_REP_UNAVAIL here. */
+ if (elected) {
+ /*
+ * The only way ret can be non-zero is if __rep_resend_req()
+ * failed. So we don't have to check for UNAVAIL and IGNORE in
+ * deciding whether we're overwriting ret, as we did above.
+ */
+ DB_ASSERT(env, ret != DB_REP_UNAVAIL && ret != DB_REP_IGNORE);
+ if ((t_ret = __rep_fire_elected(env, rep, egen)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "%s %d, e_th %lu, egen %lu, flag 0x%lx, e_fl 0x%lx, lo_fl 0x%lx",
+ "Ended election with ", ret,
+ (u_long) rep->elect_th, (u_long)rep->egen,
+ (u_long)rep->flags, (u_long)rep->elect_flags,
+ (u_long)rep->lockout_flags));
+
+ if (0) {
+unlck_lv: REP_SYSTEM_UNLOCK(env);
+ }
+envleave:
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_vote1 --
+ * Handle incoming vote1 message on a client.
+ *
+ * PUBLIC: int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_vote1(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DBT data_dbt;
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_OLD_VOTE_INFO *ovi;
+ VOTE1_CONTENT vote1;
+ __rep_egen_args egen_arg;
+ __rep_vote_info_v5_args tmpvi5;
+ __rep_vote_info_args tmpvi, *vi;
+ u_int32_t egen;
+ int elected, master, resend, ret;
+ u_int8_t buf[__REP_MAXMSG_SIZE];
+ size_t len;
+
+ COMPQUIET(egen, 0);
+
+ elected = resend = ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Master received vote"));
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ return (ret);
+ }
+
+ /*
+ * In 4.7 we changed to having fixed sized u_int32_t's from
+ * non-fixed 'int' fields in the vote structure.
+ */
+ if (rp->rep_version < DB_REPVERSION_47) {
+ ovi = (REP_OLD_VOTE_INFO *)rec->data;
+ tmpvi.egen = ovi->egen;
+ tmpvi.nsites = (u_int32_t)ovi->nsites;
+ tmpvi.nvotes = (u_int32_t)ovi->nvotes;
+ tmpvi.priority = (u_int32_t)ovi->priority;
+ tmpvi.tiebreaker = ovi->tiebreaker;
+ tmpvi.data_gen = 0;
+ } else if (rp->rep_version < DB_REPVERSION_52) {
+ if ((ret = __rep_vote_info_v5_unmarshal(env,
+ &tmpvi5, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ tmpvi.egen = tmpvi5.egen;
+ tmpvi.nsites = tmpvi5.nsites;
+ tmpvi.nvotes = tmpvi5.nvotes;
+ tmpvi.priority = tmpvi5.priority;
+ tmpvi.tiebreaker = tmpvi5.tiebreaker;
+ tmpvi.data_gen = 0;
+ } else
+ if ((ret = __rep_vote_info_unmarshal(env,
+ &tmpvi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ vi = &tmpvi;
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * If we get a vote from a later election gen, we
+ * clear everything from the current one, and we'll
+ * start over by tallying it. If we get an old vote,
+ * send an ALIVE to the old participant.
+ */
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Received vote1 egen %lu, egen %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ if (vi->egen < rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Received old vote %lu, egen %lu, ignoring vote1",
+ (u_long)vi->egen, (u_long)rep->egen));
+ egen_arg.egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env,
+ &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ return (0);
+ }
+ if (vi->egen > rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Received VOTE1 from egen %lu, my egen %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ /*
+ * Terminate an election that may be in progress at the old
+ * egen. Whether or not there was one, this call will result in
+ * HOLDELECTION (assuming no unexpected failures crop up).
+ */
+ __rep_elect_done(env, rep);
+ rep->egen = vi->egen;
+ }
+
+ /*
+ * If this site (sender of the VOTE1) is the first to the party, simply
+ * initialize values from the message. Otherwise, see if the site knows
+ * about more sites, and/or requires more votes, than we do.
+ */
+ if (!IN_ELECTION_TALLY(rep)) {
+ FLD_SET(rep->elect_flags, REP_E_TALLY);
+ rep->nsites = vi->nsites;
+ rep->nvotes = vi->nvotes;
+ } else {
+ if (vi->nsites > rep->nsites)
+ rep->nsites = vi->nsites;
+ if (vi->nvotes > rep->nvotes)
+ rep->nvotes = vi->nvotes;
+ }
+
+ /*
+ * Ignore vote1's if we're in phase 2.
+ */
+ if (FLD_ISSET(rep->elect_flags, REP_E_PHASE2)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "In phase 2, ignoring vote1"));
+ goto err;
+ }
+
+ /*
+ * Record this vote. If we're ignoring it, there's nothing more we need
+ * to do.
+ */
+ if ((ret = __rep_tally(env, rep, eid, &rep->sites, vi->egen, 1)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Tally returned %d, sites %d", ret, rep->sites));
+ if (ret == DB_REP_IGNORE)
+ ret = 0;
+ goto err;
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+"Incoming vote: (eid)%d (pri)%lu %s (gen)%lu (egen)%lu (datagen)%lu [%lu,%lu]",
+ eid, (u_long)vi->priority,
+ F_ISSET(rp, REPCTL_ELECTABLE) ? "ELECTABLE" : "",
+ (u_long)rp->gen, (u_long)vi->egen, (u_long)vi->data_gen,
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+ if (rep->sites > 1)
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+"Existing vote: (eid)%d (pri)%lu (gen)%lu (datagen)%lu (sites)%d [%lu,%lu]",
+ rep->winner, (u_long)rep->w_priority,
+ (u_long)rep->w_gen, (u_long)rep->w_datagen, rep->sites,
+ (u_long)rep->w_lsn.file,
+ (u_long)rep->w_lsn.offset));
+
+ __rep_cmp_vote(env, rep, eid, &rp->lsn, vi->priority,
+ rp->gen, vi->data_gen, vi->tiebreaker, rp->flags);
+ /*
+	 * If we get a vote and we're not yet "in an election" at the proper
+	 * egen, recording the vote (above) is all we need to do. If we are
+	 * in an election, check whether we ought to send an extra VOTE1. We
+	 * know the VOTE1 we received is not a duplicate, because of the
+	 * successful return from __rep_tally() above.
+ */
+ if (IN_ELECTION(rep)) {
+ /*
+ * If we're doing a full election, and we're into phase 1 (no
+ * REP_E_TALLY), then resend, in case the sender of this VOTE1
+ * missed our VOTE1.
+ */
+ if (rep->full_elect &&
+ FLD_ISSET((rep)->elect_flags, REP_E_PHASE1)) {
+ resend = 1;
+ vote1 = rep->vote1;
+ egen = rep->egen;
+ }
+ } else {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Not in election, but received vote1 0x%x 0x%x",
+ rep->flags, rep->elect_flags));
+ ret = DB_REP_HOLDELECTION;
+ goto err;
+ }
+
+ master = rep->winner;
+ lsn = rep->w_lsn;
+ if (IS_PHASE1_DONE(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Phase1 election done"));
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Voting for %d%s",
+ master, master == rep->eid ? "(self)" : ""));
+ egen = rep->egen;
+ FLD_SET(rep->elect_flags, REP_E_PHASE2);
+ FLD_CLR(rep->elect_flags, REP_E_PHASE1);
+ if (master == rep->eid) {
+			if ((ret = __rep_tally(env, rep, rep->eid,
+ &rep->votes, egen, 2)) != 0 &&
+ ret != DB_REP_IGNORE)
+ goto err;
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "After phase 1 done: counted vote %d of %d",
+ rep->votes, rep->nvotes));
+ if (I_HAVE_WON(rep, rep->winner)) {
+ __rep_elect_master(env, rep);
+ elected = 1;
+ }
+ goto err;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /* Vote for someone else. */
+ __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, 0,
+ master, REP_VOTE2, 0);
+ } else
+err: REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Note that if we're elected, there's no need for resending our VOTE1,
+ * even if we thought it might have been necessary a moment ago.
+ */
+ if (elected)
+ ret = __rep_fire_elected(env, rep, egen);
+ else if (resend)
+ __rep_send_vote(env,
+ &vote1.lsn, vote1.nsites, vote1.nvotes, vote1.priority,
+ vote1.tiebreaker, egen, vote1.data_gen,
+ eid, REP_VOTE1, vote1.ctlflags);
+ return (ret);
+}
+
+/*
+ * __rep_vote2 --
+ * Handle incoming vote2 message on a client.
+ *
+ * PUBLIC: int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_vote2(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_OLD_VOTE_INFO *ovi;
+ __rep_vote_info_args tmpvi, *vi;
+ u_int32_t egen;
+ int ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "We received a vote%s",
+ F_ISSET(rep, REP_F_MASTER) ? " (master)" : ""));
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ ret = __rep_lease_refresh(env);
+ return (ret);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ egen = rep->egen;
+
+ /*
+ * We might be the last to the party and we haven't had
+ * time to tally all the vote1's, but others have and
+ * decided we're the winner. So, if we're in the process
+ * of tallying sites, keep the vote so that when our
+ * election thread catches up we'll have the votes we
+ * already received.
+ */
+ /*
+ * In 4.7 we changed to having fixed sized u_int32_t's from
+ * non-fixed 'int' fields in the vote structure.
+ */
+ if (rp->rep_version < DB_REPVERSION_47) {
+ ovi = (REP_OLD_VOTE_INFO *)rec->data;
+ tmpvi.egen = ovi->egen;
+ tmpvi.nsites = (u_int32_t)ovi->nsites;
+ tmpvi.nvotes = (u_int32_t)ovi->nvotes;
+ tmpvi.priority = (u_int32_t)ovi->priority;
+ tmpvi.tiebreaker = ovi->tiebreaker;
+ } else
+ if ((ret = __rep_vote_info_unmarshal(env,
+ &tmpvi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ vi = &tmpvi;
+ if (!IN_ELECTION_TALLY(rep) && vi->egen >= rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Not in election gen %lu, at %lu, got vote",
+ (u_long)vi->egen, (u_long)rep->egen));
+ ret = DB_REP_HOLDELECTION;
+ goto err;
+ }
+
+ /*
+ * Record this vote. In a VOTE2, the only valid entry
+ * in the vote information is the election generation.
+ *
+ * There are several things which can go wrong that we
+ * need to account for:
+ * 1. If we receive a latent VOTE2 from an earlier election,
+ * we want to ignore it.
+ * 2. If we receive a VOTE2 from a site from which we never
+	 * received a VOTE1, we want to record it: we may simply be
+	 * processing messages out of order, or its VOTE1 to us was lost,
+	 * but that site received all the votes it needed to send a VOTE2.
+ * 3. If we have received a duplicate VOTE2 from this election
+ * from the same site we want to ignore it.
+ * 4. If this is from the current election and someone is
+ * really voting for us, then we finally get to record it.
+ */
+ /*
+ * Case 1.
+ */
+ if (vi->egen != rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Bad vote egen %lu. Mine %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ ret = 0;
+ goto err;
+ }
+
+ /*
+ * __rep_tally takes care of cases 2, 3 and 4.
+ */
+ if ((ret = __rep_tally(env, rep, eid, &rep->votes, vi->egen, 2)) != 0) {
+ if (ret == DB_REP_IGNORE)
+ ret = 0;
+ goto err;
+ }
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Counted vote %d of %d",
+ rep->votes, rep->nvotes));
+ if (I_HAVE_WON(rep, rep->winner)) {
+ __rep_elect_master(env, rep);
+ ret = DB_REP_NEWMASTER;
+ }
+
+err: REP_SYSTEM_UNLOCK(env);
+ if (ret == DB_REP_NEWMASTER)
+ ret = __rep_fire_elected(env, rep, egen);
+ return (ret);
+}
+
+/*
+ * __rep_tally --
+ * Handle incoming vote message on a client. This will record either a
+ * VOTE1 or a VOTE2, depending on the "phase" value the caller passed in.
+ *
+ * This function will return:
+ * 0 if we successfully tally the vote;
+ * DB_REP_IGNORE if the vote is properly ignored;
+ * (anything else) in case of an unexpected error.
+ *
+ * !!! Caller must hold REP_SYSTEM_LOCK.
+ */
+static int
+__rep_tally(env, rep, eid, countp, egen, phase)
+ ENV *env;
+ REP *rep;
+ int eid;
+ u_int32_t *countp;
+ u_int32_t egen;
+ int phase;
+{
+ REP_VTALLY *tally, *vtp;
+ u_int32_t i;
+ int ret;
+
+ if (rep->nsites > rep->asites &&
+ (ret = __rep_grow_sites(env, rep->nsites)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Grow sites returned error %d", ret));
+ return (ret);
+ }
+ if (phase == 1)
+ tally = R_ADDR(env->reginfo, rep->tally_off);
+ else
+ tally = R_ADDR(env->reginfo, rep->v2tally_off);
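+	/*
+	 * Each tally is an array of REP_VTALLY (eid, egen) entries in the
+	 * region, of which the first *countp are in use: VOTE1's are
+	 * recorded at tally_off, VOTE2's at v2tally_off.
+	 */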
+ vtp = &tally[0];
+ for (i = 0; i < *countp;) {
+ /*
+ * Ignore votes from earlier elections (i.e. we've heard
+ * from this site in this election, but its vote from an
+ * earlier election got delayed and we received it now).
+		 * However, if we happened to hear an earlier vote
+		 * and recorded it, and we're now hearing from a later
+		 * election, we want to keep the updated one. Note that
+ * updating the entry will not increase the count.
+ * Also ignore votes that are duplicates.
+ */
+ if (vtp->eid == eid) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Tally found[%d] (%d, %lu), this vote (%d, %lu)",
+ i, vtp->eid, (u_long)vtp->egen,
+ eid, (u_long)egen));
+ if (vtp->egen >= egen)
+ return (DB_REP_IGNORE);
+ else {
+ vtp->egen = egen;
+ return (0);
+ }
+ }
+ i++;
+ vtp = &tally[i];
+ }
+
+ /*
+ * If we get here, we have a new voter we haven't seen before. Tally
+ * this vote.
+ */
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Tallying VOTE%d[%d] (%d, %lu)",
+ phase, i, eid, (u_long)egen));
+
+ vtp->eid = eid;
+ vtp->egen = egen;
+ (*countp)++;
+ return (0);
+}
+
+/*
+ * __rep_cmp_vote --
+ * Compare incoming vote1 message on a client. Called with the db_rep
+ * mutex held.
+ *
+ */
+static void
+__rep_cmp_vote(env, rep, eid, lsnp, priority, gen, data_gen, tiebreaker, flags)
+ ENV *env;
+ REP *rep;
+ int eid;
+ DB_LSN *lsnp;
+ u_int32_t priority;
+ u_int32_t data_gen, flags, gen, tiebreaker;
+{
+ int cmp, like_pri;
+
+ cmp = LOG_COMPARE(lsnp, &rep->w_lsn);
+ /*
+ * If we've seen more than one, compare us to the best so far.
+ * If we're the first, make ourselves the winner to start.
+ */
+ if (rep->sites > 1 &&
+ (priority != 0 || LF_ISSET(REPCTL_ELECTABLE))) {
+ /*
+ * Special case, if we have a mixed version group of sites,
+ * we set priority to 0, but set the ELECTABLE flag so that
+ * all sites talking at lower versions can correctly elect.
+ * If a non-zero priority comes in and current winner is
+ * zero priority (but was electable), then the non-zero
+ * site takes precedence no matter what its LSN is.
+ *
+ * Then the data_gen determines the winner. The site with
+ * the more recent generation of data wins.
+ *
+ * Then LSN is determinant only if we're comparing
+ * like-styled version/priorities at the same data_gen. I.e.
+ * both with 0/ELECTABLE priority or both with non-zero
+		 * priority. The actual priority value decides if LSNs
+		 * are equal, and the tiebreaker decides if both are equal.
+ */
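+		/*
+		 * A hypothetical ordering example: a vote with priority 50
+		 * at data_gen 3 beats one with priority 100 at data_gen 2;
+		 * at equal data_gen the larger LSN wins; priority and then
+		 * tiebreaker break exact LSN ties.
+		 */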
+ /*
+ * Make note if we're comparing the same types of priorities
+ * that indicate electability or not. We know we are
+ * electable if we are here.
+ */
+ like_pri = (priority == 0 && rep->w_priority == 0) ||
+ (priority != 0 && rep->w_priority != 0);
+
+ if ((priority != 0 && rep->w_priority == 0) ||
+ (like_pri && data_gen > rep->w_datagen) ||
+ (like_pri && data_gen == rep->w_datagen && cmp > 0) ||
+ (cmp == 0 && (priority > rep->w_priority ||
+ (priority == rep->w_priority &&
+ (tiebreaker > rep->w_tiebreaker))))) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Accepting new vote"));
+ rep->winner = eid;
+ rep->w_priority = priority;
+ rep->w_lsn = *lsnp;
+ rep->w_gen = gen;
+ rep->w_datagen = data_gen;
+ rep->w_tiebreaker = tiebreaker;
+ }
+ } else if (rep->sites == 1) {
+ if (priority != 0 || LF_ISSET(REPCTL_ELECTABLE)) {
+ /* Make ourselves the winner to start. */
+ rep->winner = eid;
+ rep->w_priority = priority;
+ rep->w_gen = gen;
+ rep->w_datagen = data_gen;
+ rep->w_lsn = *lsnp;
+ rep->w_tiebreaker = tiebreaker;
+ } else {
+ rep->winner = DB_EID_INVALID;
+ rep->w_priority = 0;
+ rep->w_gen = 0;
+ rep->w_datagen = 0;
+ ZERO_LSN(rep->w_lsn);
+ rep->w_tiebreaker = 0;
+ }
+ }
+}
+
+/*
+ * __rep_elect_init
+ * Initialize an election. Sets beginp non-zero if the election is
+ * already in progress; makes it 0 otherwise. Leaves it untouched if we return
+ * DB_REP_NEWMASTER.
+ *
+ * Caller holds the REP_SYSTEM mutex, and relies on us not dropping it.
+ */
+static int
+__rep_elect_init(env, nsites, nvotes, beginp, otally)
+ ENV *env;
+ u_int32_t nsites, nvotes;
+ int *beginp;
+ u_int32_t *otally;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = 0;
+
+ if (otally != NULL)
+ *otally = FLD_ISSET(rep->elect_flags, REP_E_TALLY);
+
+ DB_ASSERT(env, rep->spent_egen <= rep->egen);
+ *beginp = rep->spent_egen == rep->egen;
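+	/*
+	 * spent_egen is the most recent egen at which this site began an
+	 * election; if it already equals the current egen, an election is
+	 * in progress and we must not start another.
+	 */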
+ if (!*beginp) {
+ /*
+ * Make sure that we always initialize all the election fields
+ * before putting ourselves in an election state. That means
+ * issuing calls that can fail (allocation) before setting all
+ * the variables.
+ */
+ if (nsites > rep->asites &&
+ (ret = __rep_grow_sites(env, nsites)) != 0)
+ goto err;
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTINIT, ret, NULL);
+ rep->spent_egen = rep->egen;
+
+ STAT_INC(env, rep, election, rep->stat.st_elections, rep->egen);
+
+ /*
+ * If we're the first to the party, we simply set initial
+ * values: pre-existing values would be left over from previous
+ * election.
+ */
+ if (!IN_ELECTION_TALLY(rep)) {
+ rep->nsites = nsites;
+ rep->nvotes = nvotes;
+ } else {
+ if (nsites > rep->nsites)
+ rep->nsites = nsites;
+ if (nvotes > rep->nvotes)
+ rep->nvotes = nvotes;
+ }
+ }
+DB_TEST_RECOVERY_LABEL
+err:
+ return (ret);
+}
+
+/*
+ * __rep_elect_master
+ * Set up for new master from election. Must be called with
+ * the replication region mutex held.
+ */
+static void
+__rep_elect_master(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ if (F_ISSET(rep, REP_F_MASTERELECT | REP_F_MASTER)) {
+ /* We've been through here already; avoid double counting. */
+ return;
+ }
+
+ F_SET(rep, REP_F_MASTERELECT);
+ STAT_INC(env, rep, election_won, rep->stat.st_elections_won, rep->egen);
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Got enough votes to win; election done; (prev) gen %lu",
+ (u_long)rep->gen));
+}
+
+static int
+__rep_fire_elected(env, rep, egen)
+ ENV *env;
+ REP *rep;
+ u_int32_t egen;
+{
+ REP_EVENT_LOCK(env);
+ if (rep->notified_egen < egen) {
+ __rep_fire_event(env, DB_EVENT_REP_ELECTED, NULL);
+ rep->notified_egen = egen;
+ }
+ REP_EVENT_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * Compute a sleep interval.
+ *
+ * The user specifies an overall timeout, but checking is cheap and the
+ * timeout may be a generous upper bound. So sleep for the smaller of .5s and
+ * timeout/10. Make sure we sleep at least 1usec if timeout < 10.
+ */
+#define SLEEPTIME(timeout) \
+ ((timeout > 5000000) ? 500000 : ((timeout >= 10) ? timeout / 10 : 1))
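+
+/*
+ * For example: a 2000000usec (2s) timeout yields 200000usec naps, a
+ * 30000000usec (30s) timeout is capped at 500000usec (.5s), and a timeout
+ * of 8usec sleeps the 1usec minimum.
+ */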
+
+/*
+ * __rep_wait --
+ *
+ * Sleep until the indicated phase is over, or the timeout expires. The phase
+ * is over when someone clears the phase flag (in the course of processing an
+ * incoming message). This could either be a normal progression from one phase
+ * to the other, or it could be due to receiving a NEWMASTER or an egen change.
+ * In all cases we simply return 0, and the caller should check the state of the
+ * world (generally under mutex protection) to decide what to do next.
+ */
+static int
+__rep_wait(env, timeoutp, full_elect, egen, flags)
+ ENV *env;
+ db_timeout_t *timeoutp;
+ int full_elect;
+ u_int32_t egen, flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int done;
+ u_int32_t sleeptime, sleeptotal, timeout;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ done = 0;
+
+ timeout = *timeoutp;
+ sleeptime = SLEEPTIME(timeout);
+ sleeptotal = 0;
+ while (sleeptotal < timeout) {
+ __os_yield(env, 0, sleeptime);
+ sleeptotal += sleeptime;
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check if group membership changed while we were
+ * sleeping. Specifically we're trying for a full
+ * election and someone is telling us we're joining
+ * a previously established replication group. (This is not
+ * applicable for the phase 0 wait, which uses a completely
+ * unrelated timeout value.)
+ */
+ if (!LF_ISSET(REP_E_PHASE0) &&
+ full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) {
+ *timeoutp = rep->elect_timeout;
+ timeout = *timeoutp;
+ if (sleeptotal >= timeout)
+ done = 1;
+ else
+ sleeptime = SLEEPTIME(timeout);
+ }
+
+ if (egen != rep->egen || !FLD_ISSET(rep->elect_flags, flags))
+ done = 1;
+ REP_SYSTEM_UNLOCK(env);
+
+ if (done)
+ return (0);
+ }
+ return (0);
+}
+
+/*
+ * __rep_grow_sites --
+ * Called to allocate more space in the election tally information.
+ * Called with the rep mutex held. We need to call the region mutex, so
+ * we need to make sure that we *never* acquire those mutexes in the
+ * opposite order.
+ */
+static int
+__rep_grow_sites(env, nsites)
+ ENV *env;
+ u_int32_t nsites;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int ret, *tally;
+ u_int32_t nalloc;
+
+ rep = env->rep_handle->region;
+
+ /*
+ * Allocate either twice the current allocation or nsites,
+ * whichever is more.
+ */
+ nalloc = 2 * rep->asites;
+ if (nalloc < nsites)
+ nalloc = nsites;
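+	/*
+	 * For example: with asites == 4 and nsites == 6, nalloc is 8; with
+	 * asites == 4 and nsites == 10, nalloc is 10.
+	 */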
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+ * We allocate 2 tally regions, one for tallying VOTE1's and
+	 * one for VOTE2's. Always grow them in tandem: if we can get
+	 * more VOTE1's, we can expect correspondingly more VOTE2's.
+ */
+ if ((ret = __env_alloc(infop,
+ (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) {
+ if (rep->tally_off != INVALID_ROFF)
+ __env_alloc_free(
+ infop, R_ADDR(infop, rep->tally_off));
+ rep->tally_off = R_OFFSET(infop, tally);
+ if ((ret = __env_alloc(infop,
+ (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) {
+ /* Success */
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ rep->v2tally_off = R_OFFSET(infop, tally);
+ rep->asites = nalloc;
+ rep->nsites = nsites;
+ } else {
+ /*
+ * We were unable to allocate both. So, we must
+ * free the first one and reinitialize. If
+ * v2tally_off is valid, it is from an old
+ * allocation and we are clearing it all out due
+ * to the error.
+ */
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->tally_off));
+ rep->v2tally_off = rep->tally_off = INVALID_ROFF;
+ rep->asites = 0;
+ }
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+}
+
+/*
+ * __rep_send_vote
+ * Send this site's vote for the election.
+ */
+static void
+__rep_send_vote(env, lsnp,
+ nsites, nvotes, pri, tie, egen, data_gen, eid, vtype, flags)
+ ENV *env;
+ DB_LSN *lsnp;
+ int eid;
+ u_int32_t nsites, nvotes, pri;
+ u_int32_t flags, egen, data_gen, tie, vtype;
+{
+ DB_REP *db_rep;
+ DBT vote_dbt;
+ REP *rep;
+ REP_OLD_VOTE_INFO ovi;
+ __rep_vote_info_args vi;
+ __rep_vote_info_v5_args vi5;
+ u_int8_t buf[__REP_VOTE_INFO_SIZE];
+ size_t len;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ memset(&vi, 0, sizeof(vi));
+ memset(&vote_dbt, 0, sizeof(vote_dbt));
+
+ /*
+ * In 4.7 we went to fixed sized fields. They may not be
+ * the same as the sizes in older versions. In 5.2 we
+ * added the data_gen.
+ */
+ if (rep->version < DB_REPVERSION_47) {
+ ovi.egen = egen;
+ ovi.priority = (int) pri;
+ ovi.nsites = (int) nsites;
+ ovi.nvotes = (int) nvotes;
+ ovi.tiebreaker = tie;
+ DB_INIT_DBT(vote_dbt, &ovi, sizeof(ovi));
+ } else if (rep->version < DB_REPVERSION_52) {
+ vi5.egen = egen;
+ vi5.priority = pri;
+ vi5.nsites = nsites;
+ vi5.nvotes = nvotes;
+ vi5.tiebreaker = tie;
+ (void)__rep_vote_info_v5_marshal(env, &vi5, buf,
+ __REP_VOTE_INFO_SIZE, &len);
+ DB_INIT_DBT(vote_dbt, buf, len);
+ } else {
+ vi.egen = egen;
+ vi.priority = pri;
+ vi.nsites = nsites;
+ vi.nvotes = nvotes;
+ vi.tiebreaker = tie;
+ vi.data_gen = data_gen;
+ (void)__rep_vote_info_marshal(env, &vi, buf,
+ __REP_VOTE_INFO_SIZE, &len);
+ DB_INIT_DBT(vote_dbt, buf, len);
+ }
+
+ (void)__rep_send_message(env, eid, vtype, lsnp, &vote_dbt, flags, 0);
+}
diff --git a/src/rep/rep_lease.c b/src/rep/rep_lease.c
new file mode 100644
index 00000000..047c39a7
--- /dev/null
+++ b/src/rep/rep_lease.c
@@ -0,0 +1,545 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2007, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));
+
+/*
+ * __rep_update_grant -
+ * Update a client's lease grant for this perm record
+ * and send the grant to the master. Caller must
+ * hold the mtx_clientdb mutex. Timespec given is in
+ * host local format.
+ *
+ * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
+ */
+int
+__rep_update_grant(env, ts)
+ ENV *env;
+ db_timespec *ts;
+{
+ DBT lease_dbt;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __rep_grant_info_args gi;
+ db_timespec mytime;
+ u_int8_t buf[__REP_GRANT_INFO_SIZE];
+ int master, ret;
+ size_t len;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ timespecclear(&mytime);
+
+ /*
+ * Get current time, and add in the (skewed) lease duration
+ * time to send the grant to the master.
+ */
+ __os_gettime(env, &mytime, 1);
+ timespecadd(&mytime, &rep->lease_duration);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If we are in an election, we cannot grant the lease.
+ * We need to check under the region mutex.
+ */
+ if (IN_ELECTION(rep)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
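+	/*
+	 * Move the grant expiration forward only; processing a stale
+	 * message must never shorten a grant we have already made.
+	 */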
+ if (timespeccmp(&mytime, &rep->grant_expire, >))
+ rep->grant_expire = mytime;
+ F_CLR(rep, REP_F_LEASE_EXPIRED);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Send the LEASE_GRANT message with the current lease grant
+ * no matter if we've actually extended the lease or not.
+ */
+ gi.msg_sec = (u_int32_t)ts->tv_sec;
+ gi.msg_nsec = (u_int32_t)ts->tv_nsec;
+
+ if ((ret = __rep_grant_info_marshal(env, &gi, buf,
+ __REP_GRANT_INFO_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(lease_dbt, buf, len);
+ /*
+ * Don't send to the master if this site has zero priority because
+ * our site cannot count toward the data being safe.
+ */
+ if ((master = rep->master_id) != DB_EID_INVALID && rep->priority > 0)
+ (void)__rep_send_message(env, master, REP_LEASE_GRANT,
+ &lp->max_perm_lsn, &lease_dbt, 0, 0);
+ return (0);
+}
+
+/*
+ * __rep_islease_granted -
+ * Return 0 if this client has no outstanding lease granted.
+ * Return 1 otherwise.
+ * Caller must hold the REP_SYSTEM (region) mutex, and (rep_elect) relies
+ * on us not dropping it.
+ *
+ * PUBLIC: int __rep_islease_granted __P((ENV *));
+ */
+int
+__rep_islease_granted(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ db_timespec mytime;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ /*
+ * Get current time and compare against our granted lease.
+ */
+ timespecclear(&mytime);
+ __os_gettime(env, &mytime, 1);
+
+ return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0);
+}
+
+/*
+ * __rep_lease_table_alloc -
+ * Allocate the lease table on a master. Called with rep mutex
+ * held. We need to acquire the env region mutex, so we need to
+ * make sure we never acquire those mutexes in the opposite order.
+ *
+ * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
+ */
+int
+__rep_lease_table_alloc(env, nsites)
+ ENV *env;
+ u_int32_t nsites;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ int *lease, ret;
+ u_int32_t i;
+
+ rep = env->rep_handle->region;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ /*
+ * If we have an old table from some other time, free it and
+ * allocate ourselves a new one that is known to be for
+ * the right number of sites.
+ */
+ if (rep->lease_off != INVALID_ROFF) {
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ rep->lease_off = INVALID_ROFF;
+ }
+ ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY),
+ &lease);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0)
+ return (ret);
+ else
+ rep->lease_off = R_OFFSET(infop, lease);
+ table = R_ADDR(infop, rep->lease_off);
+ for (i = 0; i < nsites; i++) {
+ le = &table[i];
+ le->eid = DB_EID_INVALID;
+ timespecclear(&le->start_time);
+ timespecclear(&le->end_time);
+ ZERO_LSN(le->lease_lsn);
+ }
+ return (0);
+}
+
+/*
+ * __rep_lease_grant -
+ * Handle incoming REP_LEASE_GRANT message on a master.
+ *
+ * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_lease_grant(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_grant_info_args gi;
+ REP_LEASE_ENTRY *le;
+ db_timespec msg_time;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if ((ret = __rep_grant_info_unmarshal(env,
+ &gi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
+ le = NULL;
+
+ /*
+	 * Find this site's entry in the lease table (or the first empty
+	 * entry) and update it while holding the region mutex.
+ */
+ REP_SYSTEM_LOCK(env);
+ __rep_find_entry(env, rep, eid, &le);
+ /*
+ * We either get back this site's entry, or an empty entry
+ * that we need to initialize.
+ */
+ DB_ASSERT(env, le != NULL);
+ /*
+ * Update the entry if it is an empty entry or if the new
+	 * lease grant has a later start time than the current one.
+ */
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_grant: grant msg time %lu %lu",
+ (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
+ if (le->eid == DB_EID_INVALID ||
+ timespeccmp(&msg_time, &le->start_time, >)) {
+ le->eid = eid;
+ le->start_time = msg_time;
+ le->end_time = le->start_time;
+ timespecadd(&le->end_time, &rep->lease_duration);
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
+ le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec,
+ (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec,
+ (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec));
+ }
+ /*
+ * Only update the lease table with a larger LSN value
+ * than the previous entry. This handles the case of a
+ * lagging record with a later start time, which is
+ * sometimes possible when a failed lease check resends
+ * the last permanent record.
+ */
+ if (LOG_COMPARE(&rp->lsn, &le->lease_lsn) > 0) {
+ le->lease_lsn = rp->lsn;
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_grant: eid %d, lease_lsn [%lu][%lu]",
+ le->eid, (u_long)le->lease_lsn.file,
+ (u_long)le->lease_lsn.offset));
+ }
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * Find the entry for the given EID. Or the first empty one.
+ */
+static void
+__rep_find_entry(env, rep, eid, lep)
+ ENV *env;
+ REP *rep;
+ int eid;
+ REP_LEASE_ENTRY **lep;
+{
+ REGINFO *infop;
+ REP_LEASE_ENTRY *le, *table;
+ u_int32_t i;
+
+ infop = env->reginfo;
+ table = R_ADDR(infop, rep->lease_off);
+
+ for (i = 0; i < rep->config_nsites; i++) {
+ le = &table[i];
+ /*
+ * Find either the one that matches the client's
+ * EID or the first empty one.
+ */
+ if (le->eid == eid || le->eid == DB_EID_INVALID) {
+ *lep = le;
+ return;
+ }
+ }
+ return;
+}
+
+/*
+ * __rep_lease_check -
+ * Return 0 if this master holds valid leases and can confirm
+ * its mastership. If leases are expired, an attempt is made
+ * to refresh the leases. If that fails, then return the
+ * DB_REP_LEASE_EXPIRED error to the user. No mutexes held.
+ *
+ * PUBLIC: int __rep_lease_check __P((ENV *, int));
+ */
+int
+__rep_lease_check(env, refresh)
+ ENV *env;
+ int refresh;
+{
+ DB_LOG *dblp;
+ DB_LSN lease_lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ db_timespec curtime;
+ int max_tries, ret, tries;
+ u_int32_t i, min_leases, valid_leases;
+
+ infop = env->reginfo;
+ tries = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lease_lsn = lp->max_perm_lsn;
+ LOG_SYSTEM_UNLOCK(env);
+#ifdef HAVE_STATISTICS
+ rep->stat.st_lease_chk++;
+#endif
+ /*
+	 * Set the maximum number of retries so that the total retry time
+	 * spans roughly 2x the lease timeout; if a site is waiting to
+	 * sync, it then has a chance to do so.
+ */
+ max_tries = (int)(rep->lease_timeout / (LEASE_REFRESH_USEC / 2));
+ if (max_tries < LEASE_REFRESH_MIN)
+ max_tries = LEASE_REFRESH_MIN;
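+	/*
+	 * Arithmetic check: max_tries == 2 * lease_timeout /
+	 * LEASE_REFRESH_USEC, and each retry below yields for
+	 * LEASE_REFRESH_USEC, so the retries span about twice the timeout.
+	 */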
+retry:
+ REP_SYSTEM_LOCK(env);
+ min_leases = rep->config_nsites / 2;
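+	/*
+	 * For example, with config_nsites == 5, min_leases is 2: this
+	 * master plus two lease holders is a majority of three sites.
+	 */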
+ ret = 0;
+ __os_gettime(env, &curtime, 1);
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+"%s %d of %d refresh %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]",
+ "lease_check: try ", tries, max_tries, refresh,
+ (u_long)min_leases, (u_long)curtime.tv_sec,
+ (u_long)curtime.tv_nsec,
+ (u_long)lease_lsn.file,
+ (u_long)lease_lsn.offset));
+ table = R_ADDR(infop, rep->lease_off);
+ for (i = 0, valid_leases = 0;
+ i < rep->config_nsites && valid_leases < min_leases; i++) {
+ le = &table[i];
+ /*
+ * Count this lease as valid if:
+ * - It is a valid entry (has an EID).
+ * - The lease has not expired.
+ * - The LSN is up to date.
+ */
+ if (le->eid != DB_EID_INVALID) {
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
+ (u_long)valid_leases, le->eid,
+ (u_long)le->lease_lsn.file,
+ (u_long)le->lease_lsn.offset));
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_check: endtime %lu %lu",
+ (u_long)le->end_time.tv_sec,
+ (u_long)le->end_time.tv_nsec));
+ }
+ if (le->eid != DB_EID_INVALID &&
+ timespeccmp(&le->end_time, &curtime, >=) &&
+ LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0)
+ valid_leases++;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Now see if we have enough.
+ */
+ VPRINT(env, (env, DB_VERB_REP_LEASE, "valid %lu, min %lu",
+ (u_long)valid_leases, (u_long)min_leases));
+ if (valid_leases < min_leases) {
+#ifdef HAVE_STATISTICS
+ rep->stat.st_lease_chk_misses++;
+#endif
+ if (!refresh || tries > max_tries)
+ ret = DB_REP_LEASE_EXPIRED;
+ else {
+ /*
+ * If we are successful, we need to recheck the leases
+ * because the lease grant messages may have raced with
+ * the PERM acknowledgement. Give the grant messages
+ * a chance to arrive and be processed.
+ */
+ if (((tries % 10) == 5 &&
+ (ret = __rep_lease_refresh(env)) == 0) ||
+ (tries % 10) != 5) {
+ /*
+ * If we were successful sending, but
+ * not in racing the message threads,
+ * then yield the processor so that
+ * the message threads get a chance
+ * to run.
+ */
+ if (tries > 0)
+ __os_yield(env, 0, LEASE_REFRESH_USEC);
+ tries++;
+#ifdef HAVE_STATISTICS
+ rep->stat.st_lease_chk_refresh++;
+#endif
+ goto retry;
+ }
+ }
+ }
+
+ if (ret == DB_REP_LEASE_EXPIRED)
+ RPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_check: Expired. Only %lu valid",
+ (u_long)valid_leases));
+ return (ret);
+}
+
+/*
+ * __rep_lease_refresh -
+ * Find the last permanent record and send that out so that it
+ * forces clients to grant their leases.
+ *
+ * If there is no permanent record, this function cannot refresh
+ * leases. That should not happen because the master should write
+ * a checkpoint when it starts, if there is no other perm record.
+ *
+ * PUBLIC: int __rep_lease_refresh __P((ENV *));
+ */
+int
+__rep_lease_refresh(env)
+ ENV *env;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ int ret, t_ret;
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ memset(&rec, 0, sizeof(rec));
+ memset(&lsn, 0, sizeof(lsn));
+ /*
+ * Use __rep_log_backup to find the last PERM record.
+ */
+ if ((ret = __rep_log_backup(env, logc, &lsn, REP_REC_PERM)) != 0) {
+ /*
+ * If there is no PERM record, then we get DB_NOTFOUND.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0)
+ goto err;
+
+ (void)__rep_send_message(env, DB_EID_BROADCAST, REP_LOG, &lsn,
+ &rec, REPCTL_LEASE, 0);
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_lease_expire -
+ * Proactively expire all leases granted to us.
+ * Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: int __rep_lease_expire __P((ENV *));
+ */
+int
+__rep_lease_expire(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ int ret;
+ u_int32_t i;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ if (rep->lease_off != INVALID_ROFF) {
+ table = R_ADDR(infop, rep->lease_off);
+ /*
+ * Expire all leases forcibly. We are guaranteed that the
+ * start_time for all leases are not in the future. Therefore,
+ * set the end_time to the start_time.
+ */
+ for (i = 0; i < rep->config_nsites; i++) {
+ le = &table[i];
+ le->end_time = le->start_time;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __rep_lease_waittime -
+ * Return the amount of time remaining on a granted lease.
+ * Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
+ */
+db_timeout_t
+__rep_lease_waittime(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ db_timespec exptime, mytime;
+ db_timeout_t to;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ exptime = rep->grant_expire;
+ to = 0;
+ /*
+ * If the lease has never been granted, we must wait a full
+ * lease timeout because we could be freshly rebooted after
+ * a crash and a lease could be granted from a previous
+ * incarnation of this client. However, if the lease has never
+ * been granted, and this client has already waited a full
+ * lease timeout, we know our lease cannot be granted and there
+ * is no need to wait again.
+ */
+ RPRINT(env, (env, DB_VERB_REP_LEASE,
+ "wait_time: grant_expire %lu %lu lease_to %lu",
+ (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec,
+ (u_long)rep->lease_timeout));
+ if (!timespecisset(&exptime)) {
+ if (!F_ISSET(rep, REP_F_LEASE_EXPIRED))
+ to = rep->lease_timeout;
+ } else {
+ __os_gettime(env, &mytime, 1);
+ RPRINT(env, (env, DB_VERB_REP_LEASE,
+ "wait_time: mytime %lu %lu, grant_expire %lu %lu",
+ (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec,
+ (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec));
+ if (timespeccmp(&mytime, &exptime, <=)) {
+ /*
+ * If the current time is before the grant expiration
+ * compute the difference and return remaining grant
+ * time.
+ */
+ timespecsub(&exptime, &mytime);
+ DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1);
+ }
+ }
+ return (to);
+}
diff --git a/src/rep/rep_log.c b/src/rep/rep_log.c
new file mode 100644
index 00000000..42300685
--- /dev/null
+++ b/src/rep/rep_log.c
@@ -0,0 +1,1060 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+static int __rep_chk_newfile __P((ENV *, DB_LOGC *, REP *,
+ __rep_control_args *, int));
+static int __rep_log_split __P((ENV *, DB_THREAD_INFO *,
+ __rep_control_args *, DBT *, DB_LSN *, DB_LSN *));
+
+/*
+ * __rep_allreq --
+ * Handle a REP_ALL_REQ message.
+ *
+ * PUBLIC: int __rep_allreq __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_allreq(env, rp, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOGC *logc;
+ DB_LSN log_end, oldfilelsn;
+ DB_REP *db_rep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ __rep_newfile_args nf_args;
+ uintptr_t bulkoff;
+ u_int32_t bulkflags, end_flag, flags, use_bulk;
+ int arch_flag, ret, t_ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ end_flag = 0;
+ arch_flag = 0;
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ /*
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * log records in. We still need to initialize the throttle info
+ * because if we encounter a log record larger than our entire bulk
+ * buffer, we need to send it as a singleton and also we want to
+ * support throttling with bulk.
+ *
+ * Use a local var so we don't need to worry if someone else turns
+ * on/off bulk in the middle of our call.
+ */
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ bulk.addr = NULL;
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_LOG)) != 0)
+ goto err;
+ memset(&repth, 0, sizeof(repth));
+ REP_SYSTEM_LOCK(env);
+ if ((ret = __rep_lockout_archive(env, rep)) != 0) {
+ REP_SYSTEM_UNLOCK(env);
+ goto err;
+ }
+ arch_flag = 1;
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ oldfilelsn = repth.lsn = rp->lsn;
+ repth.type = REP_LOG;
+ repth.data_dbt = &data_dbt;
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Get the LSN of the end of the log, so that in our reading loop
+ * (below), we can recognize when we get there, and set the
+ * REPCTL_LOG_END flag.
+ */
+ if ((ret = __logc_get(logc, &log_end, &data_dbt, DB_LAST)) != 0) {
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ goto err;
+ }
+
+ flags = IS_ZERO_LSN(rp->lsn) ||
+ IS_INIT_LSN(rp->lsn) ? DB_FIRST : DB_SET;
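+	/*
+	 * A zero or initial LSN means the client has no log at all and
+	 * wants everything from the beginning; otherwise, position the
+	 * cursor exactly at the requested LSN.
+	 */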
+ /*
+ * We get the first item so that a client servicing requests
+ * can distinguish between not having the records and reaching
+	 * the end of its log.  Return DB_NOTFOUND if the client
+	 * cannot get the record.  Return 0 if we finish the loop,
+	 * having sent all that we have.
+ */
+ ret = __logc_get(logc, &repth.lsn, &data_dbt, flags);
+ /*
+ * If the client is asking for all records
+ * because it doesn't have any, and our first
+ * record is not in the first log file, then
+ * the client is outdated and needs to get a
+ * VERIFY_FAIL.
+ */
+ if (ret == 0 && repth.lsn.file != 1 && flags == DB_FIRST) {
+ if (F_ISSET(rep, REP_F_CLIENT))
+ ret = DB_NOTFOUND;
+ else
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &repth.lsn, NULL, 0, 0);
+ goto err;
+ }
+ /*
+ * If we got DB_NOTFOUND it could be because the LSN we were
+ * given is at the end of the log file and we need to switch
+ * log files. Reinitialize and get the current record when we return.
+ */
+ if (ret == DB_NOTFOUND) {
+ ret = __rep_chk_newfile(env, logc, rep, rp, eid);
+ /*
+ * If we still get DB_NOTFOUND the client gave us a
+ * bad or unknown LSN. Ignore it if we're the master.
+ * Any other error is returned.
+ */
+ if (ret == 0)
+ ret = __logc_get(logc, &repth.lsn,
+ &data_dbt, DB_CURRENT);
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * For singleton log records, we break when we get a REP_LOG_MORE.
+	 * If we're not using throttling, or we are using bulk, we stop
+ * when we reach the end (i.e. ret != 0).
+ */
+ for (end_flag = 0;
+ ret == 0 && repth.type != REP_LOG_MORE && end_flag == 0;
+ ret = __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) {
+ /*
+ * If we just changed log files, we need to send the
+ * version of this log file to the client.
+ */
+ if (repth.lsn.file != oldfilelsn.file) {
+ if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+ break;
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt, &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_NEWFILE, &oldfilelsn, &newfiledbt,
+ REPCTL_RESEND, 0);
+ }
+
+ /*
+ * Mark the end of the ALL_REQ response to show that the
+ * receiving client should now be "caught up" with the
+ * replication group. If we're the master, then our log end is
+ * certainly authoritative. If we're another client, only if we
+ * ourselves have reached STARTUPDONE.
+ */
+ end_flag = (LOG_COMPARE(&repth.lsn, &log_end) >= 0 &&
+ (F_ISSET(rep, REP_F_MASTER) ||
+ rep->stat.st_startup_complete)) ?
+ REPCTL_LOG_END : 0;
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &data_dbt, (REPCTL_RESEND | end_flag));
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env,
+ eid, &repth, 0, end_flag);
+ if (ret != 0)
+ break;
+ /*
+ * If we are about to change files, then we'll need the
+ * last LSN in the previous file. Save it here.
+ */
+ oldfilelsn = repth.lsn;
+ oldfilelsn.offset += logc->len;
+ }
+
+ if (ret == DB_NOTFOUND || ret == DB_REP_UNAVAIL)
+ ret = 0;
+ /*
+ * We're done, force out whatever remains in the bulk buffer and
+ * free it.
+ */
+err:
+ /*
+ * We could have raced an unlink from an earlier log_archive
+ * and the user is removing the files themselves, now. If
+ * we get an error indicating the log file might no longer
+ * exist, ignore it.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ if (bulk.addr != NULL && (t_ret = __rep_bulk_free(env, &bulk,
+ (REPCTL_RESEND | end_flag))) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+ if (arch_flag) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_log --
+ * Handle a REP_LOG/REP_LOG_MORE message.
+ *
+ * PUBLIC: int __rep_log __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: __rep_control_args *, DBT *, int, time_t, DB_LSN *));
+ */
+int
+__rep_log(env, ip, rp, rec, eid, savetime, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+ time_t savetime;
+ DB_LSN *ret_lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN last_lsn, lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int is_dup, master, ret;
+ u_int32_t gapflags;
+
+ is_dup = ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = __rep_apply(env, ip, rp, rec, ret_lsnp, &is_dup, &last_lsn);
+ switch (ret) {
+ /*
+ * We're in an internal backup and we've gotten
+ * all the log we need to run recovery. Do so now.
+ */
+ case DB_REP_LOGREADY:
+ if ((ret =
+ __rep_logready(env, rep, savetime, &last_lsn)) != 0)
+ goto out;
+ break;
+ /*
+ * If we get any of the "normal" returns, we only process
+ * LOG_MORE if this is not a duplicate record. If the
+ * record is a duplicate we don't want to handle LOG_MORE
+ * and request a multiple data stream (or trigger internal
+ * initialization) since this could be a very old record
+ * that no longer exists on the master.
+ */
+ case DB_REP_ISPERM:
+ case DB_REP_NOTPERM:
+ case 0:
+ if (is_dup)
+ goto out;
+ else
+ break;
+ /*
+ * Any other return (errors), we're done.
+ */
+ default:
+ goto out;
+ }
+ if (rp->rectype == REP_LOG_MORE) {
+ master = rep->master_id;
+
+ /*
+ * Keep the cycle from stalling: In case we got the LOG_MORE out
+ * of order, before some preceding log records, we want to make
+ * sure our follow-up request resumes from where the LOG_MORE
+ * said it should. (If the preceding log records never arrive,
+ * normal gap processing should take care of asking for them.)
+ * But if we already have this record and/or more, we need to
+ * ask to resume from what we need. The upshot is we need the
+		 * max of lp->ready_lsn and the LSN from the message.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->ready_lsn;
+ if (LOG_COMPARE(&rp->lsn, &lsn) > 0)
+ lsn = rp->lsn;
+
+ /*
+ * If the master_id is invalid, this means that since
+ * the last record was sent, somebody declared an
+ * election and we may not have a master to request
+ * things of.
+ *
+ * This is not an error; when we find a new master,
+ * we'll re-negotiate where the end of the log is and
+ * try to bring ourselves up to date again anyway.
+ */
+ if (master == DB_EID_INVALID) {
+ ret = 0;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ /*
+ * If we're waiting for records, set the wait_ts
+ * high so that we avoid re-requesting too soon and
+ * end up with multiple data streams.
+ */
+ if (IS_ZERO_LSN(lp->waiting_lsn))
+ lp->wait_ts = rep->max_gap;
+ /*
+ * If preceding log records were from the master, send the
+ * request for further log records to the master instead of
+ * allowing it to default to ANYWHERE.
+ */
+ gapflags = REP_GAP_FORCE;
+ if (master == eid)
+ gapflags = gapflags | REP_GAP_REREQUEST;
+ ret = __rep_loggap_req(env, rep, &lsn, gapflags);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+out:
+ return (ret);
+}
+
+/*
+ * __rep_bulk_log --
+ * Handle a REP_BULK_LOG message.
+ *
+ * PUBLIC: int __rep_bulk_log __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: __rep_control_args *, DBT *, time_t, DB_LSN *));
+ */
+int
+__rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ time_t savetime;
+ DB_LSN *ret_lsnp;
+{
+ DB_LSN last_lsn;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = __rep_log_split(env, ip, rp, rec, ret_lsnp, &last_lsn);
+ switch (ret) {
+ /*
+ * We're in an internal backup and we've gotten
+ * all the log we need to run recovery. Do so now.
+ */
+ case DB_REP_LOGREADY:
+ ret = __rep_logready(env, rep, savetime, &last_lsn);
+ break;
+ /*
+ * Any other return (errors), we're done.
+ */
+ default:
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_log_split --
+ *	Split a log buffer into individual records.
+ *
+ * This is used by a client to process a bulk log message from the
+ * master and convert it into individual __rep_apply requests.
+ */
+static int
+__rep_log_split(env, ip, rp, rec, ret_lsnp, last_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ DB_LSN *ret_lsnp;
+ DB_LSN *last_lsnp;
+{
+ DBT logrec;
+ DB_LSN next_new_lsn, save_lsn, tmp_lsn;
+ __rep_control_args tmprp;
+ __rep_bulk_args b_args;
+ int is_dup, ret, save_ret;
+ u_int32_t save_flags;
+ u_int8_t *p, *ep;
+
+ memset(&logrec, 0, sizeof(logrec));
+ ZERO_LSN(next_new_lsn);
+ ZERO_LSN(save_lsn);
+ ZERO_LSN(tmp_lsn);
+ /*
+ * We're going to be modifying the rp LSN contents so make
+ * our own private copy to play with.
+ */
+ memcpy(&tmprp, rp, sizeof(tmprp));
+ /*
+ * We send the bulk buffer on a PERM record, so often we will have
+ * DB_LOG_PERM set. However, we only want to mark the last LSN
+ * we have as a PERM record. So clear it here, and when we're on
+ * the last record below, set it. The same applies if the sender
+ * set REPCTL_LOG_END on this message. We want the end of the
+ * bulk buffer to be marked as the end.
+ */
+ save_flags = F_ISSET(rp, REPCTL_LOG_END | REPCTL_PERM);
+ F_CLR(&tmprp, REPCTL_LOG_END | REPCTL_PERM);
+ is_dup = ret = save_ret = 0;
+ for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data;
+ p < ep; ) {
+ /*
+ * First thing in the buffer is the length. Then the LSN
+ * of this record, then the record itself.
+ */
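+		/*
+		 * For the pre-4.7 format, the buffer is a packed sequence:
+		 *
+		 *	[len][DB_LSN][record] [len][DB_LSN][record] ...
+		 *
+		 * Later versions marshal the same fields, decoded below by
+		 * __rep_bulk_unmarshal.
+		 */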
+ if (rp->rep_version < DB_REPVERSION_47) {
+ memcpy(&b_args.len, p, sizeof(b_args.len));
+ p += sizeof(b_args.len);
+ memcpy(&tmprp.lsn, p, sizeof(DB_LSN));
+ p += sizeof(DB_LSN);
+ logrec.data = p;
+ logrec.size = b_args.len;
+ p += b_args.len;
+ } else {
+ if ((ret = __rep_bulk_unmarshal(env,
+ &b_args, p, rec->size, &p)) != 0)
+ return (ret);
+ tmprp.lsn = b_args.lsn;
+ logrec.data = b_args.bulkdata.data;
+ logrec.size = b_args.len;
+ }
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_rep_split: Processing LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_rep_split: p %#lx ep %#lx logrec data %#lx, size %lu (%#lx)",
+ P_TO_ULONG(p), P_TO_ULONG(ep), P_TO_ULONG(logrec.data),
+ (u_long)logrec.size, (u_long)logrec.size));
+ if (p >= ep && save_flags)
+ F_SET(&tmprp, save_flags);
+ /*
+ * A previous call to __rep_apply indicated an earlier
+ * record is a dup and the next_new_lsn we are waiting for.
+ * Skip log records until we catch up with next_new_lsn.
+ */
+ if (is_dup && LOG_COMPARE(&tmprp.lsn, &next_new_lsn) < 0) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_split: Skip dup LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ continue;
+ }
+ is_dup = 0;
+ ret = __rep_apply(env, ip,
+ &tmprp, &logrec, &tmp_lsn, &is_dup, last_lsnp);
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_split: rep_apply ret %d, dup %d, tmp_lsn [%lu][%lu]",
+ ret, is_dup, (u_long)tmp_lsn.file, (u_long)tmp_lsn.offset));
+ if (is_dup)
+ next_new_lsn = tmp_lsn;
+ switch (ret) {
+ /*
+ * If we received the pieces we need for running recovery,
+ * short-circuit because recovery will truncate the log to
+ * the LSN we want anyway.
+ */
+ case DB_REP_LOGREADY:
+ goto out;
+ /*
+ * If we just handled a special record, retain that information.
+ */
+ case DB_REP_ISPERM:
+ case DB_REP_NOTPERM:
+ save_ret = ret;
+ save_lsn = tmp_lsn;
+ ret = 0;
+ break;
+ /*
+ * Normal processing, do nothing, just continue.
+ */
+ case 0:
+ break;
+ /*
+ * If we get an error, then stop immediately.
+ */
+ default:
+ goto out;
+ }
+ }
+out:
+ /*
+ * If we finish processing successfully, set our return values
+ * based on what we saw.
+ */
+ if (ret == 0) {
+ ret = save_ret;
+ *ret_lsnp = save_lsn;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_logreq --
+ * Handle a REP_LOG_REQ message.
+ *
+ * PUBLIC: int __rep_logreq __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_logreq(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOGC *logc;
+ DB_LSN firstlsn, lsn, oldfilelsn;
+ DB_REP *db_rep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ __rep_logreq_args lr_args;
+ __rep_newfile_args nf_args;
+ uintptr_t bulkoff;
+ u_int32_t bulkflags, use_bulk;
+ int count, ret, t_ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /* COMPQUIET_LSN is what this is... */
+ ZERO_LSN(lr_args.endlsn);
+
+ if (rec != NULL && rec->size != 0) {
+ if (rp->rep_version < DB_REPVERSION_47)
+ lr_args.endlsn = *(DB_LSN *)rec->data;
+ else if ((ret = __rep_logreq_unmarshal(env, &lr_args,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "[%lu][%lu]: LOG_REQ max lsn: [%lu][%lu]",
+ (u_long) rp->lsn.file, (u_long)rp->lsn.offset,
+ (u_long)lr_args.endlsn.file,
+ (u_long)lr_args.endlsn.offset));
+ }
+ /*
+ * There are several different cases here.
+ * 1. We asked logc_get for a particular LSN and got it.
+ * 2. We asked logc_get for an LSN and it's not found because it is
+ *	beyond the end of a log file, so we need to send a NEWFILE msg
+ *	and then the record that was requested.
+ * 3. We asked logc_get for an LSN and it is already archived.
+ * 4. We asked logc_get for an LSN and it simply doesn't exist, but
+ * doesn't meet any of those other criteria, in which case
+ * it's an error (that should never happen on a master).
+ *
+ * If we have a valid LSN and the request has a data_dbt with
+ * it, the sender is asking for a chunk of log records.
+ * Then we need to send all records up to the LSN in the data dbt.
+ */
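+ /*
+ * Of the cases above: case 1 is handled inline below; case 3
+ * via the firstlsn check; cases 2 and 4 are resolved by
+ * __rep_chk_newfile and the DB_NOTFOUND check that follows it.
+ */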
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ oldfilelsn = lsn = rp->lsn;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ if ((ret = __rep_lockout_archive(env, rep)) != 0) {
+ REP_SYSTEM_UNLOCK(env);
+ goto err;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __logc_get(logc, &lsn, &data_dbt, DB_SET)) == 0) {
+ /* Case 1 */
+ (void)__rep_send_message(env,
+ eid, REP_LOG, &lsn, &data_dbt, REPCTL_RESEND, 0);
+ oldfilelsn.offset += logc->len;
+ } else if (ret == DB_NOTFOUND) {
+ /*
+ * If logc_get races with log_archive or the user removing
+ * files from an earlier call to log_archive, it might return
+ * DB_NOTFOUND. We expect there to be some log record
+ * that is the first one. Loop until we either get
+ * a log record or some error. Since we only expect this
+ * when racing log file removal, bound it to a few
+ * tries.
+ */
+ count = 0;
+ do {
+ ret = __logc_get(logc, &firstlsn, &data_dbt, DB_FIRST);
+ /*
+ * If we've raced this many tries and we're still
+ * getting DB_NOTFOUND, then pause a bit to disrupt
+ * the timing cycle that we appear to be in.
+ */
+ if (count > 5)
+ __os_yield(env, 0, 50000);
+ count++;
+ } while (ret == DB_NOTFOUND && count < 10);
+ if (ret != 0) {
+ /*
+ * If we're master we don't want to return DB_NOTFOUND.
+ * We'll just ignore the error and this message.
+ * It will get rerequested if needed.
+ */
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ goto err;
+ }
+ if (LOG_COMPARE(&firstlsn, &rp->lsn) > 0) {
+ /* Case 3 */
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &rp->lsn, NULL, 0, 0);
+ ret = 0;
+ goto err;
+ }
+ ret = __rep_chk_newfile(env, logc, rep, rp, eid);
+ if (ret == DB_NOTFOUND) {
+ /* Case 4 */
+ /*
+ * If we still get DB_NOTFOUND the client gave us an
+ * unknown LSN, perhaps at the end of the log. Ignore
+ * it if we're the master. Return DB_NOTFOUND if
+ * we are the client.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ __db_errx(env, DB_STR_A("3501",
+ "Request for LSN [%lu][%lu] not found",
+ "%lu %lu"), (u_long)rp->lsn.file,
+ (u_long)rp->lsn.offset);
+ ret = 0;
+ goto err;
+ } else
+ ret = DB_NOTFOUND;
+ }
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If the user requested a gap, send the whole thing, while observing
+ * the limits from rep_set_limit.
+ *
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * log records in. We still need to initialize the throttle info
+ * because if we encounter a log record larger than our entire bulk
+ * buffer, we need to send it as a singleton.
+ *
+ * Use a local var so we don't need to worry if someone else turns
+ * on/off bulk in the middle of our call.
+ */
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_LOG)) != 0)
+ goto err;
+ memset(&repth, 0, sizeof(repth));
+ REP_SYSTEM_LOCK(env);
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ repth.type = REP_LOG;
+ repth.data_dbt = &data_dbt;
+ REP_SYSTEM_UNLOCK(env);
+ while (ret == 0 && rec != NULL && rec->size != 0 &&
+ repth.type == REP_LOG) {
+ if ((ret =
+ __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) != 0) {
+ /*
+ * If we're a client and we only have part of the gap,
+ * return DB_NOTFOUND so that we send a REREQUEST
+ * back to the requester and it can ask for more.
+ */
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ break;
+ }
+ if (LOG_COMPARE(&repth.lsn, &lr_args.endlsn) >= 0)
+ break;
+ if (repth.lsn.file != oldfilelsn.file) {
+ if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+ break;
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt, &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_NEWFILE, &oldfilelsn, &newfiledbt,
+ REPCTL_RESEND, 0);
+ }
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &data_dbt, REPCTL_RESEND);
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env, eid, &repth, 0, 0);
+ if (ret != 0) {
+ /* Ignore send failure, except to break the loop. */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+ break;
+ }
+ /*
+ * If we are about to change files, then we'll need the
+ * last LSN in the previous file. Save it here.
+ */
+ oldfilelsn = repth.lsn;
+ oldfilelsn.offset += logc->len;
+ }
+
+ /*
+ * We're done, force out whatever remains in the bulk buffer and
+ * free it.
+ */
+ if (use_bulk && (t_ret = __rep_bulk_free(env, &bulk,
+ REPCTL_RESEND)) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+err:
+ /*
+ * We could have raced an unlink from an earlier log_archive
+ * and the user is removing the files themselves, now. If
+ * we get an error indicating the log file might no longer
+ * exist, ignore it.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_loggap_req --
+ * Request a log gap. Assumes the caller holds the REP->mtx_clientdb.
+ *
+ * lsnp is the current LSN we're handling. It is used to help decide
+ * if we ask for a gap or singleton.
+ * gapflags are flags that may override the algorithm or control the
+ * processing in some way.
+ *
+ * PUBLIC: int __rep_loggap_req __P((ENV *, REP *, DB_LSN *, u_int32_t));
+ */
+int
+__rep_loggap_req(env, rep, lsnp, gapflags)
+ ENV *env;
+ REP *rep;
+ DB_LSN *lsnp;
+ u_int32_t gapflags;
+{
+ DBT max_lsn_dbt, *max_lsn_dbtp;
+ DB_LOG *dblp;
+ DB_LSN next_lsn;
+ LOG *lp;
+ __rep_logreq_args lr_args;
+ size_t len;
+ u_int32_t ctlflags, flags, type;
+ int master, ret;
+ u_int8_t buf[__REP_LOGREQ_SIZE];
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+ next_lsn = *lsnp;
+ else
+ next_lsn = lp->ready_lsn;
+ ctlflags = flags = 0;
+ type = REP_LOG_REQ;
+ ret = 0;
+
+ /*
+ * Check if we need to ask for the gap.
+ * We ask for the gap if:
+ * We are forced to with gapflags.
+ * If max_wait_lsn is ZERO_LSN - we've never asked for
+ * records before.
+ * If we asked for a single record and received it.
+ *
+ * If we want a gap, but don't have an ending LSN (waiting_lsn)
+ * send an ALL_REQ. This is primarily used by REP_REREQUEST when
+ * an ALL_REQ was not able to be fulfilled by another client.
+ */
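+ /*
+ * Illustrative outcomes of the check below: with a known
+ * waiting_lsn such as [3][5000] we send a LOG_REQ bounded by
+ * that LSN; if no ending LSN can be determined (max_wait_lsn
+ * remains ZERO_LSN), we fall back to an ALL_REQ.
+ */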
+ if (FLD_ISSET(gapflags, (REP_GAP_FORCE | REP_GAP_REREQUEST)) ||
+ IS_ZERO_LSN(lp->max_wait_lsn) ||
+ (lsnp != NULL && LOG_COMPARE(lsnp, &lp->max_wait_lsn) == 0)) {
+ lp->max_wait_lsn = lp->waiting_lsn;
+ /*
+ * In SYNC_LOG, make sure max_wait_lsn is set to avoid sending
+ * an ALL_REQ that could create an unnecessary dual data stream.
+ */
+ if (rep->sync_state == SYNC_LOG &&
+ IS_ZERO_LSN(lp->max_wait_lsn))
+ lp->max_wait_lsn = rep->last_lsn;
+ /*
+ * If we are forcing a gap, we need to send a max_wait_lsn
+ * that may be beyond the current gap/waiting_lsn (but
+ * it may not be). If we cannot determine any future
+ * waiting LSN, then it should be zero. If we're in
+ * internal init, it should be our ending LSN.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE)) {
+ if (LOG_COMPARE(&lp->max_wait_lsn, lsnp) <= 0) {
+ if (rep->sync_state == SYNC_LOG) {
+ DB_ASSERT(env, LOG_COMPARE(lsnp,
+ &rep->last_lsn) <= 0);
+ lp->max_wait_lsn = rep->last_lsn;
+ } else
+ ZERO_LSN(lp->max_wait_lsn);
+ }
+ }
+ if (IS_ZERO_LSN(lp->max_wait_lsn))
+ type = REP_ALL_REQ;
+ memset(&max_lsn_dbt, 0, sizeof(max_lsn_dbt));
+ lr_args.endlsn = lp->max_wait_lsn;
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(max_lsn_dbt, &lp->max_wait_lsn,
+ sizeof(DB_LSN));
+ else {
+ if ((ret = __rep_logreq_marshal(env, &lr_args, buf,
+ __REP_LOGREQ_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(max_lsn_dbt, buf, len);
+ }
+ max_lsn_dbtp = &max_lsn_dbt;
+ /*
+ * Gap requests are "new" and can go anywhere, unless
+ * this is already a re-request.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+ flags = DB_REP_REREQUEST;
+ else
+ flags = DB_REP_ANYWHERE;
+ } else {
+ max_lsn_dbtp = NULL;
+ lp->max_wait_lsn = next_lsn;
+ /*
+ * If we're dropping to singletons, this is a re-request.
+ */
+ flags = DB_REP_REREQUEST;
+ }
+ if ((master = rep->master_id) != DB_EID_INVALID) {
+ STAT_INC(env,
+ rep, log_request, rep->stat.st_log_requested, master);
+ if (rep->sync_state == SYNC_LOG)
+ ctlflags = REPCTL_INIT;
+ (void)__rep_send_message(env, master,
+ type, &next_lsn, max_lsn_dbtp, ctlflags, flags);
+ } else
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+err:
+ return (ret);
+}
+
+/*
+ * __rep_logready --
+ * Handle getting back DB_REP_LOGREADY. Any call to __rep_apply
+ * can return it.
+ *
+ * PUBLIC: int __rep_logready __P((ENV *, REP *, time_t, DB_LSN *));
+ */
+int
+__rep_logready(env, rep, savetime, last_lsnp)
+ ENV *env;
+ REP *rep;
+ time_t savetime;
+ DB_LSN *last_lsnp;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ if ((ret = __log_flush(env, NULL)) != 0)
+ goto err;
+ if ((ret = __rep_verify_match(env, last_lsnp, savetime)) != 0)
+ goto err;
+
+ REP_SYSTEM_LOCK(env);
+ ZERO_LSN(rep->first_lsn);
+
+ if (rep->originfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop, R_ADDR(infop, rep->originfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->originfo_off = INVALID_ROFF;
+ }
+
+ rep->sync_state = SYNC_OFF;
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ ret = __rep_notify_threads(env, AWAIT_NIMDB);
+ REP_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ goto err;
+
+ return (0);
+
+err:
+ DB_ASSERT(env, ret != DB_REP_WOULDROLLBACK);
+ __db_errx(env, DB_STR("3502",
+ "Client initialization failed. Need to manually restore client"));
+ return (__env_panic(env, ret));
+}
+
+/*
+ * __rep_chk_newfile --
+ * Determine if getting DB_NOTFOUND is because we're at the
+ * end of a log file and need to send a NEWFILE message.
+ *
+ * This function handles these cases:
+ * [Case 1 was that we found the record we were looking for - it
+ * is already handled by the caller.]
+ * 2. We asked logc_get for an LSN and it's not found because it is
+ * beyond the end of a log file and we need a NEWFILE msg.
+ * 3. We asked logc_get for an LSN and it simply doesn't exist, but
+ * doesn't meet any of those other criteria, in which case
+ * we return DB_NOTFOUND and the caller decides if it's an error.
+ *
+ * This function returns 0 if we had to send a message and the bad
+ * LSN has been dealt with, DB_NOTFOUND if this really is an unknown
+ * LSN (on a client), and an error if the LSN isn't found on the
+ * master.
+ */
+static int
+__rep_chk_newfile(env, logc, rep, rp, eid)
+ ENV *env;
+ DB_LOGC *logc;
+ REP *rep;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOG *dblp;
+ DB_LSN endlsn;
+ LOG *lp;
+ __rep_newfile_args nf_args;
+ int ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ LOG_SYSTEM_LOCK(env);
+ endlsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ if (endlsn.file > rp->lsn.file) {
+ /*
+ * Case 2:
+ * Need to find the LSN of the last record in
+ * file lsn.file so that we can send it with
+ * the NEWFILE call. In order to do that, we
+ * need to try to get {lsn.file + 1, 0} and
+ * then backup.
+ */
+ endlsn.file = rp->lsn.file + 1;
+ endlsn.offset = 0;
+ if ((ret = __logc_get(logc,
+ &endlsn, &data_dbt, DB_SET)) != 0 ||
+ (ret = __logc_get(logc,
+ &endlsn, &data_dbt, DB_PREV)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Unable to get prev of [%lu][%lu]",
+ (u_long)rp->lsn.file,
+ (u_long)rp->lsn.offset));
+ /*
+ * We want to push the error back
+ * to the client so that the client
+ * does an internal backup. The
+ * client asked for a log record
+ * we no longer have and it is
+ * outdated.
+ * XXX - This could be optimized by
+ * having the master generate and
+ * send a REP_UPDATE message. We
+ * currently want the client to set
+ * up its 'update' state prior to
+ * sending a REP_UPDATE_REQ.
+ *
+ * If we're a client servicing a request
+ * just return DB_NOTFOUND.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &rp->lsn,
+ NULL, 0, 0);
+ } else
+ ret = DB_NOTFOUND;
+ } else {
+ endlsn.offset += logc->len;
+ if ((ret = __logc_version(logc,
+ &nf_args.version)) == 0) {
+ memset(&newfiledbt, 0,
+ sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt,
+ &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env,
+ &nf_args, buf, __REP_NEWFILE_SIZE,
+ &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env, eid,
+ REP_NEWFILE, &endlsn,
+ &newfiledbt, REPCTL_RESEND, 0);
+ }
+ }
+ } else
+ ret = DB_NOTFOUND;
+
+ return (ret);
+}
diff --git a/src/rep/rep_method.c b/src/rep/rep_method.c
new file mode 100644
index 00000000..f9f1924c
--- /dev/null
+++ b/src/rep/rep_method.c
@@ -0,0 +1,3032 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __rep_abort_prepared __P((ENV *));
+static int __rep_await_condition __P((ENV *,
+ struct rep_waitgoal *, db_timeout_t));
+static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *));
+static int __rep_check_applied __P((ENV *,
+ DB_THREAD_INFO *, DB_COMMIT_INFO *, struct rep_waitgoal *));
+static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *));
+static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t));
+static int __rep_read_lsn_history __P((ENV *,
+ DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t,
+ __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t));
+static int __rep_restore_prepared __P((ENV *));
+static int __rep_save_lsn_hist __P((ENV *, DB_THREAD_INFO *, DB_LSN *));
+/*
+ * __rep_env_create --
+ * Replication-specific initialization of the ENV structure.
+ *
+ * PUBLIC: int __rep_env_create __P((DB_ENV *));
+ */
+int
+__rep_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if ((ret = __os_calloc(env, 1, sizeof(DB_REP), &db_rep)) != 0)
+ return (ret);
+
+ db_rep->eid = DB_EID_INVALID;
+ db_rep->bytes = REP_DEFAULT_THROTTLE;
+ DB_TIMEOUT_TO_TIMESPEC(DB_REP_REQUEST_GAP, &db_rep->request_gap);
+ DB_TIMEOUT_TO_TIMESPEC(DB_REP_MAX_GAP, &db_rep->max_gap);
+ db_rep->elect_timeout = 2 * US_PER_SEC; /* 2 seconds */
+ db_rep->chkpt_delay = 30 * US_PER_SEC; /* 30 seconds */
+ db_rep->my_priority = DB_REP_DEFAULT_PRIORITY;
+ /*
+ * Make no clock skew the default. Setting both fields
+ * to the same non-zero value means no skew.
+ */
+ db_rep->clock_skew = 1;
+ db_rep->clock_base = 1;
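+ /*
+ * Illustrative example: an application expecting clocks to
+ * differ by up to 2% might use clock_skew 102 and clock_base
+ * 100; lease durations are then scaled by this ratio (see
+ * __rep_start_int).
+ */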
+ FLD_SET(db_rep->config, REP_C_AUTOINIT);
+ FLD_SET(db_rep->config, REP_C_AUTOROLLBACK);
+
+ /*
+ * Turn on system messages by default.
+ */
+ FLD_SET(dbenv->verbose, DB_VERB_REP_SYSTEM);
+
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_env_create(env, db_rep)) != 0) {
+ __os_free(env, db_rep);
+ return (ret);
+ }
+#endif
+
+ env->rep_handle = db_rep;
+ return (0);
+}
+
+/*
+ * __rep_env_destroy --
+ * Replication-specific destruction of the ENV structure.
+ *
+ * PUBLIC: void __rep_env_destroy __P((DB_ENV *));
+ */
+void
+__rep_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (env->rep_handle != NULL) {
+#ifdef HAVE_REPLICATION_THREADS
+ __repmgr_env_destroy(env, env->rep_handle);
+#endif
+ __os_free(env, env->rep_handle);
+ env->rep_handle = NULL;
+ }
+}
+
+/*
+ * __rep_get_config --
+ * Return the replication subsystem configuration.
+ *
+ * PUBLIC: int __rep_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__rep_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+ u_int32_t mapped;
+
+ env = dbenv->env;
+
+#undef OK_FLAGS
+#define OK_FLAGS \
+ (DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
+
+ if (FLD_ISSET(which, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->rep_get_config", 0));
+
+ db_rep = env->rep_handle;
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_config", DB_INIT_REP);
+
+ mapped = 0;
+ __rep_config_map(env, &which, &mapped);
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ if (FLD_ISSET(rep->config, mapped))
+ *onp = 1;
+ else
+ *onp = 0;
+ } else {
+ if (FLD_ISSET(db_rep->config, mapped))
+ *onp = 1;
+ else
+ *onp = 0;
+ }
+ return (0);
+}
+
+/*
+ * __rep_set_config --
+ * Configure the replication subsystem.
+ *
+ * PUBLIC: int __rep_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__rep_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+ REP_BULK bulk;
+ u_int32_t mapped, orig;
+ int ret, t_ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+#undef OK_FLAGS
+#define OK_FLAGS \
+ (DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
+#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS)
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP);
+
+ if (FLD_ISSET(which, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->rep_set_config", 0));
+
+ mapped = 0;
+ __rep_config_map(env, &which, &mapped);
+
+ if (APP_IS_BASEAPI(env) && FLD_ISSET(mapped, REPMGR_FLAGS)) {
+ __db_errx(env, DB_STR_A("3548",
+ "%s cannot configure repmgr settings from base replication application",
+ "%s"), "DB_ENV->rep_set_config:");
+ return (EINVAL);
+ }
+
+ if (REP_ON(env)) {
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_valid_config(env, mapped)) != 0)
+ return (ret);
+#endif
+
+ ENV_ENTER(env, ip);
+
+ rep = db_rep->region;
+ /*
+ * In-memory replication must be configured before calling
+ * env->open. Toggling it on and off before env->open
+ * doesn't matter; any attempt to turn it on or off after
+ * env->open is intercepted by this error.
+ */
+ if (FLD_ISSET(mapped, REP_C_INMEM)) {
+ __db_errx(env, DB_STR_A("3549",
+"%s in-memory replication must be configured before DB_ENV->open",
+ "%s"), "DB_ENV->rep_set_config:");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
+ * Leases must be turned on before calling rep_start.
+ * Leases can never be turned off once they're turned on.
+ */
+ if (FLD_ISSET(mapped, REP_C_LEASE)) {
+ if (F_ISSET(rep, REP_F_START_CALLED)) {
+ __db_errx(env, DB_STR("3550",
+ "DB_ENV->rep_set_config: leases must be "
+ "configured before DB_ENV->rep_start"));
+ ret = EINVAL;
+ }
+ if (on == 0) {
+ __db_errx(env, DB_STR("3551",
+ "DB_ENV->rep_set_config: leases cannot be turned off"));
+ ret = EINVAL;
+ }
+ if (ret != 0) {
+ ENV_LEAVE(env, ip);
+ return (ret);
+ }
+ }
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ orig = rep->config;
+ if (on)
+ FLD_SET(rep->config, mapped);
+ else
+ FLD_CLR(rep->config, mapped);
+
+ /*
+ * Bulk transfer requires special processing if it is getting
+ * toggled.
+ */
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (FLD_ISSET(rep->config, REP_C_BULK) &&
+ !FLD_ISSET(orig, REP_C_BULK))
+ db_rep->bulk = R_ADDR(&dblp->reginfo, lp->bulk_buf);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * If turning bulk off and it was on, send out whatever is in
+ * the buffer already.
+ */
+ if (FLD_ISSET(orig, REP_C_BULK) &&
+ !FLD_ISSET(rep->config, REP_C_BULK) && lp->bulk_off != 0) {
+ memset(&bulk, 0, sizeof(bulk));
+ if (db_rep->bulk == NULL)
+ bulk.addr =
+ R_ADDR(&dblp->reginfo, lp->bulk_buf);
+ else
+ bulk.addr = db_rep->bulk;
+ bulk.offp = &lp->bulk_off;
+ bulk.len = lp->bulk_len;
+ bulk.type = REP_BULK_LOG;
+ bulk.eid = DB_EID_BROADCAST;
+ bulk.flagsp = &lp->bulk_flags;
+ ret = __rep_send_bulk(env, &bulk, 0);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ ENV_LEAVE(env, ip);
+
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * If turning ELECTIONS on, and it was off, check whether we
+ * need to start an election immediately.
+ */
+ if (!FLD_ISSET(orig, REP_C_ELECTIONS) &&
+ FLD_ISSET(rep->config, REP_C_ELECTIONS) &&
+ (t_ret = __repmgr_turn_on_elections(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ } else {
+ if (on)
+ FLD_SET(db_rep->config, mapped);
+ else
+ FLD_CLR(db_rep->config, mapped);
+ }
+ /* Configuring 2SITE_STRICT, etc. makes this a repmgr application */
+ if (ret == 0 && FLD_ISSET(mapped, REPMGR_FLAGS))
+ APP_SET_REPMGR(env);
+ return (ret);
+}
+
+static void
+__rep_config_map(env, inflagsp, outflagsp)
+ ENV *env;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(env, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_AUTOINIT)) {
+ FLD_SET(*outflagsp, REP_C_AUTOINIT);
+ FLD_CLR(*inflagsp, DB_REP_CONF_AUTOINIT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_AUTOROLLBACK)) {
+ FLD_SET(*outflagsp, REP_C_AUTOROLLBACK);
+ FLD_CLR(*inflagsp, DB_REP_CONF_AUTOROLLBACK);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_BULK)) {
+ FLD_SET(*outflagsp, REP_C_BULK);
+ FLD_CLR(*inflagsp, DB_REP_CONF_BULK);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_DELAYCLIENT)) {
+ FLD_SET(*outflagsp, REP_C_DELAYCLIENT);
+ FLD_CLR(*inflagsp, DB_REP_CONF_DELAYCLIENT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_INMEM)) {
+ FLD_SET(*outflagsp, REP_C_INMEM);
+ FLD_CLR(*inflagsp, DB_REP_CONF_INMEM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_LEASE)) {
+ FLD_SET(*outflagsp, REP_C_LEASE);
+ FLD_CLR(*inflagsp, DB_REP_CONF_LEASE);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_NOWAIT)) {
+ FLD_SET(*outflagsp, REP_C_NOWAIT);
+ FLD_CLR(*inflagsp, DB_REP_CONF_NOWAIT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_2SITE_STRICT)) {
+ FLD_SET(*outflagsp, REP_C_2SITE_STRICT);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_2SITE_STRICT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_ELECTIONS)) {
+ FLD_SET(*outflagsp, REP_C_ELECTIONS);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_ELECTIONS);
+ }
+ DB_ASSERT(env, *inflagsp == 0);
+}
+
+/*
+ * __rep_start_pp --
+ * Become a master or client, and start sending messages to participate
+ * in the replication environment. Must be called after the environment
+ * is open.
+ *
+ * PUBLIC: int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t));
+ */
+int
+__rep_start_pp(dbenv, dbt, flags)
+ DB_ENV *dbenv;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ ENV *env;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_start", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3552",
+"DB_ENV->rep_start: cannot call from Replication Manager application"));
+ return (EINVAL);
+ }
+
+ switch (LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER)) {
+ case DB_REP_CLIENT:
+ case DB_REP_MASTER:
+ break;
+ default:
+ __db_errx(env, DB_STR("3553",
+ "DB_ENV->rep_start: must specify DB_REP_CLIENT or DB_REP_MASTER"));
+ return (EINVAL);
+ }
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3554",
+ "DB_ENV->rep_start: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ return (__rep_start_int(env, dbt, flags));
+}
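+
+ /*
+ * Illustrative usage sketch (not part of this file): a base API
+ * application typically configures a transport callback and then
+ * calls rep_start, e.g.:
+ *
+ *	ret = dbenv->rep_set_transport(dbenv, self_eid, send_func);
+ *	if (ret == 0)
+ *		ret = dbenv->rep_start(dbenv, &opaque_cdata, DB_REP_CLIENT);
+ *
+ * where self_eid, send_func and opaque_cdata are supplied by the
+ * application.
+ */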
+
+/*
+ * __rep_start_int --
+ * Internal processing to become a master or client and start sending
+ * messages to participate in the replication environment. If this is
+ * a newly created environment, then this site has likely been in an
+ * initial, undefined state - neither master nor client. What that means
+ * is that as a non-client, it can write log records locally (such as
+ * those generated by recovery) and as a non-master, it does not attempt
+ * to send those log records elsewhere.
+ *
+ * We must protect rep_start_int, which may change the world, with the rest
+ * of the DB library. Each API interface will count itself as it enters
+ * the library. Rep_start_int checks the following:
+ *
+ * rep->msg_th - this is the count of threads currently in rep_process_message
+ * rep->handle_cnt - number of threads actively using a dbp in library.
+ * rep->txn_cnt - number of active txns.
+ * REP_LOCKOUT_* - Replication flag that indicates that we wish to run
+ * recovery, and want to prohibit new transactions from entering and cause
+ * existing ones to return immediately (with a DB_LOCK_DEADLOCK error).
+ *
+ * There is also renv->rep_timestamp, which is updated whenever significant
+ * events occur (e.g., new masters, log rollback). Upon creation, a handle
+ * is associated with the current timestamp. Each time a handle enters the
+ * library it must check if the handle timestamp is the same as the one
+ * stored in the replication region. This prevents the use of handles on
+ * clients that reference non-existent files whose creation was backed out
+ * during a synchronizing recovery.
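+ * For example (illustrative scenario), a handle opened before a
+ * synchronizing recovery rolled back the creation of its underlying
+ * file carries a stale timestamp; its next use returns
+ * DB_REP_HANDLE_DEAD so the application can close and reopen it.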
+ *
+ * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+ */
+int
+__rep_start_int(env, dbt, flags)
+ ENV *env;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn, perm_lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB_TXNREGION *region;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ db_timeout_t tmp;
+ u_int32_t new_gen, oldvers, pending_event, role;
+ int interrupting, locked, ret, role_chg, start_th, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ interrupting = locked = 0;
+ pending_event = DB_EVENT_NO_SUCH_EVENT;
+ role = LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER);
+ start_th = 0;
+
+ /*
+ * If we're using master leases, check that all needed
+ * setup has been done, including setting the lease timeout.
+ */
+ if (IS_USING_LEASES(env) && rep->lease_timeout == 0) {
+ __db_errx(env, DB_STR("3555",
+"DB_ENV->rep_start: must call DB_ENV->rep_set_timeout for leases first"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Serialize rep_start() calls. */
+ MUTEX_LOCK(env, rep->mtx_repstart);
+ start_th = 1;
+
+ /*
+ * In order to correctly check log files for old versions, we
+ * need to flush the logs. Serialize log flush to make sure it is
+ * always done just before the log old version check. Otherwise it
+ * is possible that another thread in rep_start could write LSN history
+ * and create a new log file that is not yet fully there for the log
+ * old version check.
+ */
+ if ((ret = __log_flush(env, NULL)) != 0)
+ goto out;
+
+ REP_SYSTEM_LOCK(env);
+ role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) ||
+ (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT);
+
+ /*
+ * There is no need for lockout if all we're doing is sending a message.
+ * In fact, lockout could be harmful: the typical use of this "duplicate
+ * client" style of call is when the application has to poll, seeking
+ * for a master. If the resulting NEWMASTER message were to arrive when
+ * we had messages locked out, we would discard it, resulting in further
+ * delay.
+ */
+ if (role == DB_REP_CLIENT && !role_chg) {
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __dbt_usercopy(env, dbt)) == 0)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
+ goto out;
+ }
+
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ /*
+ * There is already someone in msg lockout. Return.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Thread already in msg lockout"));
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ } else if ((ret = __rep_lockout_msg(env, rep, 0)) != 0)
+ goto errunlock;
+
+ /*
+ * If we are internal init and we try to become master, reject it.
+ * Our environment databases/logs are in an inconsistent state and
+ * we cannot become master.
+ */
+ if (IN_INTERNAL_INIT(rep) && role == DB_REP_MASTER) {
+ __db_errx(env, DB_STR("3556",
+ "DB_ENV->rep_start: Cannot become master during internal init"));
+ ret = DB_REP_UNAVAIL;
+ goto errunlock;
+ }
+
+ /*
+ * Wait for any active txns or mpool ops to complete, and
+ * prevent any new ones from occurring, only if we're
+ * changing roles.
+ */
+ if (role_chg) {
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto errunlock;
+ locked = 1;
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (role == DB_REP_MASTER) {
+ if (role_chg) {
+ /*
+ * If we were previously a client, it's possible we
+ * could have an interruptible STARTSYNC in progress.
+ * Interrupt it now, so that it doesn't slow down our
+ * transition to master, and because its effects aren't
+ * doing us any good anyway.
+ */
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1);
+ interrupting = 1;
+
+ /*
+ * If we're upgrading from having been a client,
+ * preclose, so that we close our temporary database
+ * and any files we opened while doing a rep_apply.
+ * If we don't, we can leak file ids indefinitely
+ * when the master crashed with files open (the
+ * likely case): removing such a file can fail,
+ * and long-running applications end up with an
+ * unbounded number of used fileids, each getting
+ * written on checkpoint. Just close them.
+ * Then invalidate all files open in the logging
+ * region. These are files open by other processes
+ * attached to the environment. They must be
+ * closed by the other processes when they notice
+ * the change in role.
+ */
+ if ((ret = __rep_preclose(env)) != 0)
+ goto errunlock;
+
+ new_gen = rep->gen + 1;
+ /*
+ * There could have been any number of failed
+ * elections, so jump the gen if we need to now.
+ */
+ if (rep->egen > rep->gen)
+ new_gen = rep->egen;
+ SET_GEN(new_gen);
+ /*
+ * If the "group" has only one site, it's OK to start as
+ * master without an election. This is how repmgr
+ * builds up a primordial group, by induction.
+ */
+ if (IS_USING_LEASES(env) &&
+ rep->config_nsites > 1 &&
+ !F_ISSET(rep, REP_F_MASTERELECT)) {
+ __db_errx(env, DB_STR("3557",
+"rep_start: Cannot become master without being elected when using leases."));
+ ret = EINVAL;
+ goto errunlock;
+ }
+ if (F_ISSET(rep, REP_F_MASTERELECT)) {
+ __rep_elect_done(env, rep);
+ F_CLR(rep, REP_F_MASTERELECT);
+ } else if (FLD_ISSET(rep->config, REP_C_INMEM))
+ /*
+ * Help detect whether the application has ignored
+ * our recommendation against reappointing the
+ * same master after a crash/reboot when running
+ * in-memory replication. Doing so allows a
+ * slight chance of two masters at the same
+ * generation, resulting in client crashes.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Appointed new master while running in-memory replication."));
+ if (rep->egen <= rep->gen)
+ rep->egen = rep->gen + 1;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "New master gen %lu, egen %lu",
+ (u_long)rep->gen, (u_long)rep->egen));
+ /*
+ * If not running in-memory replication, write
+ * gen file.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM) &&
+ (ret = __rep_write_gen(env, rep, rep->gen)) != 0)
+ goto errunlock;
+ }
+ /*
+ * Set lease duration assuming clients have faster clock.
+ * Master needs to compensate so that clients do not
+ * expire their grant while the master thinks it is valid.
+ */
+ if (IS_USING_LEASES(env) &&
+ (role_chg || !IS_REP_STARTED(env))) {
+ /*
+ * If we have already granted our lease, we
+ * cannot become master.
+ */
+ if ((ret = __rep_islease_granted(env))) {
+ __db_errx(env, DB_STR("3558",
+ "rep_start: Cannot become master with outstanding lease granted."));
+ ret = EINVAL;
+ goto errunlock;
+ }
+ /*
+ * Set max_perm_lsn to last PERM record on master.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto errunlock;
+ ret = __rep_log_backup(env, logc, &perm_lsn,
+ REP_REC_PERM);
+ (void)__logc_close(logc);
+ /*
+ * If we found a perm LSN use it. Otherwise, if
+ * no perm LSN exists, initialize.
+ */
+ if (ret == 0)
+ lp->max_perm_lsn = perm_lsn;
+ else if (ret == DB_NOTFOUND)
+ INIT_LSN(lp->max_perm_lsn);
+ else
+ goto errunlock;
+
+ /*
+ * Compensate the lease timeout by the clock-skew ratio to
+ * get the master's lease duration.
+ */
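+ /*
+ * Illustrative numbers: a lease_timeout of 1000000us with
+ * clock_skew 102 and clock_base 100 yields a master lease
+ * duration of about 980392us.
+ */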
+ tmp = (db_timeout_t)((double)rep->lease_timeout /
+ ((double)rep->clock_skew /
+ (double)rep->clock_base));
+ DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration);
+ if ((ret = __rep_lease_table_alloc(env,
+ rep->config_nsites)) != 0)
+ goto errunlock;
+ }
+ rep->master_id = rep->eid;
+ STAT_INC(env, rep,
+ master_change, rep->stat.st_master_changes, rep->eid);
+
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD))
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Establishing group as master."));
+#endif
+ /*
+ * When becoming a master, clear the following flags:
+ * CLIENT: Site is no longer a client.
+ * ABBREVIATED: Indicates abbreviated internal init, which
+ * cannot occur on a master.
+ * MASTERELECT: Indicates that this master is elected
+ * rather than appointed. If we're changing roles we
+ * used this flag above for error checks and election
+ * cleanup.
+ * SKIPPED_APPLY: Indicates that client apply skipped
+ * some log records during an election, no longer
+ * applicable on master.
+ * DELAY: Indicates user config to delay initial client
+ * sync with new master, doesn't apply to master.
+ * LEASE_EXPIRED: Applies to client leases which are
+ * now defunct on master.
+ * NEWFILE: Used to delay client apply during newfile
+ * operation, not applicable to master.
+ */
+ F_CLR(rep, REP_F_CLIENT | REP_F_ABBREVIATED |
+ REP_F_MASTERELECT | REP_F_SKIPPED_APPLY | REP_F_DELAY |
+ REP_F_LEASE_EXPIRED | REP_F_NEWFILE);
+ /*
+ * When becoming a master, set the following flags:
+ * MASTER: Indicate that this site is master.
+ * GROUP_ESTD: Having a master means that a replication
+ * group exists.
+ * NIMDBS_LOADED: Inmem dbs are always present on a master.
+ */
+ F_SET(rep, REP_F_MASTER | REP_F_GROUP_ESTD |
+ REP_F_NIMDBS_LOADED);
+ /* Master cannot be in internal init. */
+ rep->sync_state = SYNC_OFF;
+
+ /*
+ * We're master. Set the versions to the current ones.
+ */
+ oldvers = lp->persist.version;
+ /*
+ * If we're moving forward to the current version, we need
+ * to force the log file to advance and reset the
+ * recovery table since it contains pointers to old
+ * recovery functions.
+ */
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_start: Old log version was %lu", (u_long)oldvers));
+ if (lp->persist.version != DB_LOGVERSION) {
+ if ((ret = __env_init_rec(env, DB_LOGVERSION)) != 0)
+ goto errunlock;
+ }
+ rep->version = DB_REPVERSION;
+ /*
+ * When becoming a master, clear the following lockouts:
+ * ARCHIVE: Used to keep logs while client may be
+ * inconsistent, not needed on master.
+ * MSG: We set this above to block message processing while
+ * becoming a master, can turn messages back on here.
+ */
+ FLD_CLR(rep->lockout_flags,
+ REP_LOCKOUT_ARCHIVE | REP_LOCKOUT_MSG);
+ REP_SYSTEM_UNLOCK(env);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * Send the NEWMASTER message first so that clients know
+ * subsequent messages are coming from the right master.
+ * We need to perform all actions below no matter what
+ * regarding errors.
+ */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ ret = 0;
+ if (role_chg) {
+ pending_event = DB_EVENT_REP_MASTER;
+ /*
+ * If prepared transactions have not been restored
+ * look to see if there are any. If there are,
+ * then mark the open files, otherwise close them.
+ */
+ region = env->tx_handle->reginfo.primary;
+ if (region->stat.st_nrestores == 0 &&
+ (t_ret = __rep_restore_prepared(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (region->stat.st_nrestores != 0) {
+ if ((t_ret = __dbreg_mark_restored(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ } else {
+ ret = __dbreg_invalidate_files(env, 0);
+ if ((t_ret = __rep_closefiles(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_SYS_DB_OP);
+ REP_SYSTEM_UNLOCK(env);
+ if ((t_ret = __txn_recycle_id(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Write LSN history database, ahead of unlocking the
+ * API so that clients can always know the heritage of
+ * any transaction they receive via replication.
+ */
+ if ((t_ret = __rep_save_lsn_hist(env, ip, &lsn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ REP_SYSTEM_LOCK(env);
+ rep->gen_base_lsn = lsn;
+ rep->master_envid = renv->envid;
+ F_CLR(rep, REP_F_SYS_DB_OP);
+ CLR_LOCKOUT_BDB(rep);
+ locked = 0;
+ REP_SYSTEM_UNLOCK(env);
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ interrupting = 0;
+ }
+ } else {
+ /*
+ * Start a non-client as a client.
+ */
+ rep->master_id = DB_EID_INVALID;
+ /*
+ * A non-client should not have been participating in an
+ * election, so most election flags should be off. The TALLY
+ * flag is an exception because it is set any time we receive
+ * a VOTE1 and there is no reason to clear and lose it for an
+ * election that may begin shortly.
+ */
+ DB_ASSERT(env, !FLD_ISSET(rep->elect_flags, ~REP_E_TALLY));
+ /*
+ * A non-client should not have the following client flags
+ * set and should not be in internal init.
+ */
+ DB_ASSERT(env, !F_ISSET(rep,
+ REP_F_ABBREVIATED | REP_F_DELAY | REP_F_NEWFILE));
+ DB_ASSERT(env, rep->sync_state == SYNC_OFF);
+
+ if ((ret = __log_get_oldversion(env, &oldvers)) != 0)
+ goto errunlock;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_start: Found old version log %d", oldvers));
+ if (oldvers >= DB_LOGVERSION_MIN) {
+ __log_set_version(env, oldvers);
+ if ((ret = __env_init_rec(env, oldvers)) != 0)
+ goto errunlock;
+ oldvers = __rep_conv_vers(env, oldvers);
+ DB_ASSERT(env, oldvers != DB_REPVERSION_INVALID);
+ rep->version = oldvers;
+ }
+ /*
+ * When becoming a client, clear the following flags:
+ * MASTER: Site is no longer a master.
+ * MASTERELECT: Indicates that a master is elected
+ * rather than appointed, not applicable on client.
+ */
+ F_CLR(rep, REP_F_MASTER | REP_F_MASTERELECT);
+ F_SET(rep, REP_F_CLIENT);
+
+ /*
+ * On a client, compute the lease duration on the
+ * assumption that the client has a fast clock.
+ * Expire any existing leases we might have held as
+ * a master.
+ */
+ if (IS_USING_LEASES(env) && !IS_REP_STARTED(env)) {
+ if ((ret = __rep_lease_expire(env)) != 0)
+ goto errunlock;
+ /*
+ * Since the master is also compensating on its
+ * side as well, we're being doubly conservative
+ * to compensate on the client side. Theoretically,
+ * this compensation is not necessary, as it is
+ * effectively doubling the skew compensation.
+ * But we are making guarantees based on time and
+ * skews across machines. So we are being extra
+ * cautious.
+ */
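+ /*
+ * Illustrative numbers: with lease_timeout 1000000us and
+ * clock_skew 102 over clock_base 100, the client holds its
+ * grant for 1020000us, longer than the master's view of it.
+ */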
+ tmp = (db_timeout_t)((double)rep->lease_timeout *
+ ((double)rep->clock_skew /
+ (double)rep->clock_base));
+ DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration);
+ if (rep->lease_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->lease_off = INVALID_ROFF;
+ }
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Abort any prepared transactions that were restored
+ * by recovery. We won't be able to create any txns of
+ * our own until they're resolved, but we can't resolve
+ * them ourselves; the master has to. If any get
+ * resolved as commits, we'll redo them when commit
+ * records come in. Aborts will simply be ignored.
+ */
+ if ((ret = __rep_abort_prepared(env)) != 0)
+ goto errlock;
+
+ /*
+ * Since we're changing roles we need to init the db.
+ */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto errlock;
+ /*
+ * Ignore errors, because if the file doesn't exist,
+ * this is perfectly OK.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ (void)__db_remove(dbp, ip, NULL, REPDBNAME,
+ NULL, DB_FORCE);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * Set pending_event after calls that can fail.
+ */
+ pending_event = DB_EVENT_REP_CLIENT;
+
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ if (locked) {
+ CLR_LOCKOUT_BDB(rep);
+ locked = 0;
+ }
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ /*
+ * If we think we're a new client, and we have a
+ * private env, set our gen number down to 0.
+ * Otherwise, we can restart and think
+ * we're ready to accept a new record (because our
+ * gen is okay), but really this client needs to
+ * sync with the master.
+ */
+ SET_GEN(0);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Announce ourselves and send out our data.
+ */
+ if ((ret = __dbt_usercopy(env, dbt)) != 0)
+ goto out;
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
+ }
+
+ if (0) {
+ /*
+ * We have separate labels for errors. If we're returning an
+ * error before we've set REP_LOCKOUT_MSG, we use 'out'. If
+ * we are erroring while holding the region mutex, then we use
+ * the 'errunlock' label. If we error without holding the rep
+ * mutex we must use 'errlock'.
+ */
+errlock: REP_SYSTEM_LOCK(env);
+errunlock: FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ if (locked)
+ CLR_LOCKOUT_BDB(rep);
+ if (interrupting)
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ REP_SYSTEM_UNLOCK(env);
+ }
+out:
+ if (ret == 0) {
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_START_CALLED);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if (pending_event != DB_EVENT_NO_SUCH_EVENT)
+ __rep_fire_event(env, pending_event, NULL);
+ if (start_th)
+ MUTEX_UNLOCK(env, rep->mtx_repstart);
+ __dbt_userfree(env, dbt, NULL, NULL);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Write the current generation's base LSN into the history database.
+ */
+static int
+__rep_save_lsn_hist(env, ip, lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_LSN *lsnp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGENV *renv;
+ DB_TXN *txn;
+ DB *dbp;
+ DBT key_dbt, data_dbt;
+ __rep_lsn_hist_key_args key;
+ __rep_lsn_hist_data_args data;
+ u_int8_t key_buf[__REP_LSN_HIST_KEY_SIZE];
+ u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
+ db_timespec now;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ renv = env->reginfo->primary;
+ txn = NULL;
+ ret = 0;
+
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ return (ret);
+
+ /*
+ * Use the cached handle to the history database if it is already open.
+ * Since we're becoming master, we don't expect to need it after this,
+ * so clear the cached handle and close the database once we've written
+ * our update.
+ */
+ if ((dbp = db_rep->lsn_db) == NULL &&
+ (ret = __rep_open_sysdb(env,
+ ip, txn, REPLSNHIST, DB_CREATE, &dbp)) != 0)
+ goto err;
+
+ key.version = REP_LSN_HISTORY_FMT_VERSION;
+ key.gen = rep->gen;
+ __rep_lsn_hist_key_marshal(env, &key, key_buf);
+
+ data.envid = renv->envid;
+ data.lsn = *lsnp;
+ __os_gettime(env, &now, 0);
+ data.hist_sec = (u_int32_t)now.tv_sec;
+ data.hist_nsec = (u_int32_t)now.tv_nsec;
+ __rep_lsn_hist_data_marshal(env, &data, data_buf);
+
+ DB_INIT_DBT(key_dbt, key_buf, sizeof(key_buf));
+ DB_INIT_DBT(data_dbt, data_buf, sizeof(data_buf));
+
+ ret = __db_put(dbp, ip, txn, &key_dbt, &data_dbt, 0);
+err:
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->lsn_db = NULL;
+
+ DB_ASSERT(env, txn != NULL);
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * Open existing LSN history database, wherever it may be (on disk or in
+ * memory). If it doesn't exist, create it only if DB_CREATE is specified by
+ * our caller.
+ *
+ * If we could be sure that all sites in the replication group had matching
+ * REP_C_INMEM settings (that never changed over time), we could simply look for
+ * the database in the place where we knew it should be. The code here tries to
+ * be more flexible/resilient to mis-matching INMEM settings, even though we
+ * recommend against that.
+ * PUBLIC: int __rep_open_sysdb __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, DB **));
+ */
+int
+__rep_open_sysdb(env, ip, txn, dbname, flags, dbpp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *dbname;
+ u_int32_t flags;
+ DB **dbpp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DB *dbp;
+ char *fname;
+ u_int32_t myflags;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ myflags = DB_INTERNAL_PERSISTENT_DB |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+
+ /*
+ * First, try opening it as a sub-database within a disk-resident
+ * database file. (If success, skip to the end.)
+ */
+ if ((ret = __db_open(dbp, ip, txn,
+ REPSYSDBNAME, dbname, DB_BTREE, myflags, 0, PGNO_BASE_MD)) == 0)
+ goto found;
+ if (ret != ENOENT)
+ goto err;
+
+ /*
+ * Here, the file was not found. Next, try opening it as an in-memory
+ * database (after the necessary clean-up).
+ */
+ ret = __db_close(dbp, txn, DB_NOSYNC);
+ dbp = NULL;
+ if (ret != 0 || (ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if ((ret = __db_open(dbp, ip, txn,
+ NULL, dbname, DB_BTREE, myflags, 0, PGNO_BASE_MD)) == 0)
+ goto found;
+ if (ret != ENOENT)
+ goto err;
+
+ /*
+ * Here, the database was not found either on disk or in memory. Create
+ * it, according to our local INMEM setting.
+ */
+ ret = __db_close(dbp, txn, DB_NOSYNC);
+ dbp = NULL;
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(DB_CREATE)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if ((ret = __db_set_pagesize(dbp, REPSYSDBPGSZ)) != 0)
+ goto err;
+ FLD_SET(myflags, DB_CREATE);
+ fname = FLD_ISSET(rep->config, REP_C_INMEM) ?
+ NULL : REPSYSDBNAME;
+ if ((ret = __db_open(dbp, ip, txn, fname,
+ dbname, DB_BTREE, myflags, 0, PGNO_BASE_MD)) == 0)
+ goto found;
+ } else
+ ret = ENOENT;
+
+err:
+ if (dbp != NULL && (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 &&
+ (ret == 0 || ret == ENOENT))
+ ret = t_ret;
+ return (ret);
+
+found:
+ *dbpp = dbp;
+ return (0);
+}
+
+/*
+ * __rep_client_dbinit --
+ *
+ * Initialize the LSN database on the client side. This is called from the
+ * client initialization code. The startup flag value indicates if
+ * this is the first thread/process starting up and therefore should create
+ * the LSN database. This routine must be called once by each process acting
+ * as a client.
+ *
+ * Assumes caller holds appropriate mutex.
+ *
+ * PUBLIC: int __rep_client_dbinit __P((ENV *, int, repdb_t));
+ */
+int
+__rep_client_dbinit(env, startup, which)
+ ENV *env;
+ int startup;
+ repdb_t which;
+{
+ DB *dbp, **rdbpp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ int ret, t_ret;
+ u_int32_t flags;
+ const char *fname, *name, *subdb;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dbp = NULL;
+
+ if (which == REP_DB) {
+ name = REPDBNAME;
+ rdbpp = &db_rep->rep_db;
+ } else {
+ name = REPPAGENAME;
+ rdbpp = &db_rep->file_dbp;
+ }
+ /* Check if this has already been called on this environment. */
+ if (*rdbpp != NULL)
+ return (0);
+
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Set up arguments for __db_remove and __db_open calls. */
+ fname = name;
+ subdb = NULL;
+ if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ fname = NULL;
+ subdb = name;
+ }
+
+ if (startup) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ /*
+ * Prevent in-memory database remove from writing to
+ * non-existent logs.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ (void)__db_set_flags(dbp, DB_TXN_NOT_DURABLE);
+ /*
+ * Ignore errors, because if the file doesn't exist, this
+ * is perfectly OK.
+ */
+ (void)__db_remove(dbp, ip, NULL, fname, subdb, DB_FORCE);
+ }
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if (which == REP_DB &&
+ (ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0)
+ goto err;
+
+ /* Don't write log records on the client. */
+ if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+
+ flags = DB_NO_AUTO_COMMIT | DB_CREATE | DB_INTERNAL_TEMPORARY_DB |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+
+ if ((ret = __db_open(dbp, ip, NULL, fname, subdb,
+ (which == REP_DB ? DB_BTREE : DB_RECNO),
+ flags, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ *rdbpp = dbp;
+
+ if (0) {
+err: if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ *rdbpp = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rep_bt_cmp --
+ *
+ * Comparison function for the LSN table. We use the entire control
+ * structure as a key (for simplicity, so we don't have to merge the
+ * other fields in the control with the data field), but really only
+ * care about the LSNs.
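+ *
+ * For example, a record with LSN [2][100] sorts before one with
+ * LSN [3][0]; records equal in both file and offset compare equal.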
+ */
+static int
+__rep_bt_cmp(dbp, dbt1, dbt2)
+ DB *dbp;
+ const DBT *dbt1, *dbt2;
+{
+ DB_LSN lsn1, lsn2;
+ __rep_control_args *rp1, *rp2;
+
+ COMPQUIET(dbp, NULL);
+
+ rp1 = dbt1->data;
+ rp2 = dbt2->data;
+
+ (void)__ua_memcpy(&lsn1, &rp1->lsn, sizeof(DB_LSN));
+ (void)__ua_memcpy(&lsn2, &rp2->lsn, sizeof(DB_LSN));
+
+ if (lsn1.file > lsn2.file)
+ return (1);
+
+ if (lsn1.file < lsn2.file)
+ return (-1);
+
+ if (lsn1.offset > lsn2.offset)
+ return (1);
+
+ if (lsn1.offset < lsn2.offset)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * __rep_abort_prepared --
+ * Abort any prepared transactions that recovery restored.
+ *
+ * This is used by clients that have just run recovery, since
+ * they cannot/should not call txn_recover and handle prepared transactions
+ * themselves.
+ */
+static int
+__rep_abort_prepared(env)
+ ENV *env;
+{
+#define PREPLISTSIZE 50
+ DB_LOG *dblp;
+ DB_PREPLIST prep[PREPLISTSIZE], *p;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ LOG *lp;
+ int ret;
+ long count, i;
+ u_int32_t op;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (region->stat.st_nrestores == 0)
+ return (0);
+
+ op = DB_FIRST;
+ do {
+ if ((ret = __txn_recover(env,
+ prep, PREPLISTSIZE, &count, op)) != 0)
+ return (ret);
+ for (i = 0; i < count; i++) {
+ p = &prep[i];
+ if ((ret = __txn_abort(p->txn)) != 0)
+ return (ret);
+ env->rep_handle->region->op_cnt--;
+ env->rep_handle->region->max_prep_lsn = lp->lsn;
+ region->stat.st_nrestores--;
+ }
+ op = DB_NEXT;
+ } while (count == PREPLISTSIZE);
+
+ return (0);
+}
+
+/*
+ * __rep_restore_prepared --
+ * Restore to a prepared state any prepared but not yet committed
+ * transactions.
+ *
+ * This performs, in effect, a "mini-recovery"; it is called from
+ * __rep_start by newly upgraded masters. There may be transactions that an
+ * old master prepared but did not resolve, which we need to restore to an
+ * active state.
+ */
+static int
+__rep_restore_prepared(env)
+ ENV *env;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN ckp_lsn, lsn;
+ DB_REP *db_rep;
+ DB_TXNHEAD *txninfo;
+ REP *rep;
+ __txn_ckp_args *ckp_args;
+ __txn_regop_args *regop_args;
+ __txn_prepare_args *prep_args;
+ int ret, t_ret;
+ u_int32_t hi_txn, low_txn, rectype, status, txnid, txnop;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if (IS_ZERO_LSN(rep->max_prep_lsn)) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "restore_prep: No prepares. Skip."));
+ return (0);
+ }
+ txninfo = NULL;
+ ckp_args = NULL;
+ prep_args = NULL;
+ regop_args = NULL;
+ ZERO_LSN(ckp_lsn);
+ ZERO_LSN(lsn);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ /*
+ * Get our first LSN to see if the prepared LSN is still
+ * available. If so, it might be unresolved. If not,
+ * then it is guaranteed to be resolved.
+ */
+ memset(&rec, 0, sizeof(DBT));
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ __db_errx(env, DB_STR("3559", "First record not found"));
+ goto err;
+ }
+ /*
+ * If the max_prep_lsn is no longer available, we're sure
+ * that txn has been resolved. We're done.
+ */
+ if (rep->max_prep_lsn.file < lsn.file) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "restore_prep: Prepare resolved. Skip"));
+ ZERO_LSN(rep->max_prep_lsn);
+ goto done;
+ }
+ /*
+ * We need to consider the set of records between the most recent
+ * checkpoint LSN and the end of the log; any txn in that
+ * range, and only txns in that range, could still have been
+ * active, and thus prepared but not yet committed (PBNYC),
+ * when the old master died.
+ *
+ * Find the most recent checkpoint LSN, and get the record there.
+ * If there is no checkpoint in the log, start off by getting
+ * the very first record in the log instead.
+ */
+ if ((ret = __txn_getckp(env, &lsn)) == 0) {
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("3560",
+ "Checkpoint record at LSN [%lu][%lu] not found",
+ "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __txn_ckp_read(
+ env, rec.data, &ckp_args)) == 0) {
+ ckp_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+ if (ret != 0) {
+ __db_errx(env, DB_STR_A("3561",
+ "Invalid checkpoint record at [%lu][%lu]",
+ "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __logc_get(logc, &ckp_lsn, &rec, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("3562",
+ "Checkpoint LSN record [%lu][%lu] not found",
+ "%lu %lu"),
+ (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+ goto err;
+ }
+ } else if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ /* An empty log means no PBNYC txns. */
+ ret = 0;
+ goto done;
+ }
+ __db_errx(env, DB_STR("3563",
+ "Attempt to get first log record failed"));
+ goto err;
+ }
+
+ /*
+ * We use the same txnlist infrastructure that recovery does;
+ * it demands an estimate of the high and low txnids for
+ * initialization.
+ *
+ * First, the low txnid.
+ */
+ do {
+ /* txnid is after rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &low_txn,
+ (u_int8_t *)rec.data + sizeof(u_int32_t));
+ if (low_txn != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &rec, DB_NEXT)) == 0);
+
+ /* If there are no txns, there are no PBNYC txns. */
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto done;
+ } else if (ret != 0)
+ goto err;
+
+ /* Now, the high txnid. */
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0) {
+ /*
+ * Note that DB_NOTFOUND is unacceptable here because we
+ * had to have looked at some log record to get this far.
+ */
+ __db_errx(env, DB_STR("3564",
+ "Final log record not found"));
+ goto err;
+ }
+ do {
+ /* txnid is after rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &hi_txn,
+ (u_int8_t *)rec.data + sizeof(u_int32_t));
+ if (hi_txn != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &rec, DB_PREV)) == 0);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto done;
+ } else if (ret != 0)
+ goto err;
+
+ /* We have a high and low txnid. Initialize the txn list. */
+ if ((ret = __db_txnlist_init(env,
+ NULL, low_txn, hi_txn, NULL, &txninfo)) != 0)
+ goto err;
+
+ /*
+ * Now, walk backward from the end of the log to ckp_lsn. Any
+ * prepares that we hit without first hitting a commit or
+ * abort belong to PBNYC txns, and we need to apply them and
+ * restore them to a prepared state.
+ *
+ * Note that we wind up applying transactions out of order.
+ * Since all PBNYC txns still held locks on the old master and
+ * were isolated, this should be safe.
+ */
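+ /*
+ * Illustrative example: if the log tail holds, in order,
+ * prepare(T1), prepare(T2), commit(T2), the backward scan sees
+ * T2's commit first and marks it resolved, so only T1's prepare
+ * is re-applied and restored to its prepared state.
+ */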
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ for (ret = __logc_get(logc, &lsn, &rec, DB_LAST);
+ ret == 0 && LOG_COMPARE(&lsn, &ckp_lsn) > 0;
+ ret = __logc_get(logc, &lsn, &rec, DB_PREV)) {
+ LOGCOPY_32(env, &rectype, rec.data);
+ switch (rectype) {
+ case DB___txn_regop:
+ /*
+ * It's a commit or abort--but we don't care
+ * which! Just add it to the list of txns
+ * that are resolved.
+ */
+ if ((ret = __txn_regop_read(
+ env, rec.data, &regop_args)) != 0)
+ goto err;
+ txnid = regop_args->txnp->txnid;
+ txnop = regop_args->opcode;
+ __os_free(env, regop_args);
+
+ ret = __db_txnlist_find(env,
+ txninfo, txnid, &status);
+ if (ret == DB_NOTFOUND)
+ ret = __db_txnlist_add(env, txninfo,
+ txnid, txnop, &lsn);
+ else if (ret != 0)
+ goto err;
+ break;
+ case DB___txn_prepare:
+ /*
+ * It's a prepare. If it's not aborted and
+ * we haven't put the txn on our list yet, it
+ * hasn't been resolved, so apply and restore it.
+ */
+ if ((ret = __txn_prepare_read(
+ env, rec.data, &prep_args)) != 0)
+ goto err;
+ ret = __db_txnlist_find(env, txninfo,
+ prep_args->txnp->txnid, &status);
+ if (ret == DB_NOTFOUND) {
+ if (prep_args->opcode == TXN_ABORT)
+ ret = __db_txnlist_add(env, txninfo,
+ prep_args->txnp->txnid,
+ prep_args->opcode, &lsn);
+ else if ((ret =
+ __rep_process_txn(env, &rec)) == 0) {
+ /*
+ * We are guaranteed to be single
+ * threaded here. We need to
+ * account for this newly
+ * instantiated txn in the op_cnt
+ * so that it is counted when it is
+ * resolved.
+ */
+ rep->op_cnt++;
+ ret = __txn_restore_txn(env,
+ &lsn, prep_args);
+ }
+ } else if (ret != 0)
+ goto err;
+ __os_free(env, prep_args);
+ break;
+ default:
+ continue;
+ }
+ }
+
+ /* It's not an error to have hit the beginning of the log. */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+done:
+err: t_ret = __logc_close(logc);
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ return (ret == 0 ? t_ret : ret);
+}
+
+/*
+ * __rep_get_limit --
+ * Get the limit on the amount of data that will be sent during a single
+ * invocation of __rep_process_message.
+ *
+ * PUBLIC: int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__rep_get_limit(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_limit", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ if (gbytesp != NULL)
+ *gbytesp = rep->gbytes;
+ if (bytesp != NULL)
+ *bytesp = rep->bytes;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ if (gbytesp != NULL)
+ *gbytesp = db_rep->gbytes;
+ if (bytesp != NULL)
+ *bytesp = db_rep->bytes;
+ }
+
+ return (0);
+}
+
+/*
+ * __rep_set_limit --
+ * Set a limit on the amount of data that will be sent during a single
+ * invocation of __rep_process_message.
+ *
+ * PUBLIC: int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__rep_set_limit(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_limit", DB_INIT_REP);
+
+ if (bytes > GIGABYTE) {
+ gbytes += bytes / GIGABYTE;
+ bytes = bytes % GIGABYTE;
+ }
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ rep->gbytes = gbytes;
+ rep->bytes = bytes;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ db_rep->gbytes = gbytes;
+ db_rep->bytes = bytes;
+ }
+
+ return (0);
+}
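+
+/*
+ * Illustrative usage (a sketch, not part of this file): an application
+ * wanting to cap the data sent during any one __rep_process_message
+ * call at roughly 10 megabytes might configure:
+ *
+ *	if ((ret = dbenv->rep_set_limit(dbenv, 0, 10 * 1024 * 1024)) != 0)
+ *		goto err;
+ *
+ * Byte counts above GIGABYTE are folded into gbytes by the code above;
+ * e.g., rep_set_limit(dbenv, 0, GIGABYTE + 1) stores gbytes == 1 and
+ * bytes == 1.
+ */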
+
+/*
+ * PUBLIC: int __rep_set_nsites_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_set_nsites_pp(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t n;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_nsites", DB_INIT_REP);
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3565",
+"DB_ENV->rep_set_nsites: cannot call from Replication Manager application"));
+ return (EINVAL);
+ }
+ if ((ret = __rep_set_nsites_int(env, n)) == 0)
+ APP_SET_BASEAPI(env);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_set_nsites_int __P((ENV *, u_int32_t));
+ */
+int
+__rep_set_nsites_int(env, n)
+ ENV *env;
+ u_int32_t n;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ ret = 0;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ rep->config_nsites = n;
+ if (IS_USING_LEASES(env) &&
+ IS_REP_MASTER(env) && IS_REP_STARTED(env)) {
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_lease_table_alloc(env, n);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ } else
+ db_rep->config_nsites = n;
+ return (ret);
+}
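+
+/*
+ * Illustrative usage (a sketch; base API only, since the wrapper above
+ * rejects Replication Manager applications): a five-site group would
+ * configure its expected group size before holding elections:
+ *
+ *	if ((ret = dbenv->rep_set_nsites(dbenv, 5)) != 0)
+ *		goto err;
+ */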
+
+/*
+ * PUBLIC: int __rep_get_nsites __P((DB_ENV *, u_int32_t *));
+ */
+int
+__rep_get_nsites(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t *n;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_nsites", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env))
+ return (__repmgr_get_nsites(env, n));
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ *n = rep->config_nsites;
+ } else
+ *n = db_rep->config_nsites;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_set_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t priority;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+ u_int32_t prev;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP);
+
+ ret = 0;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ prev = rep->priority;
+ rep->priority = priority;
+#ifdef HAVE_REPLICATION_THREADS
+ ret = __repmgr_chg_prio(env, prev, priority);
+#endif
+ } else
+ db_rep->my_priority = priority;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_get_priority __P((DB_ENV *, u_int32_t *));
+ */
+int
+__rep_get_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t *priority;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_priority", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ *priority = rep->priority;
+ } else
+ *priority = db_rep->my_priority;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+ */
+int
+__rep_set_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ int repmgr_timeout, ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+ repmgr_timeout = 0;
+
+ if (timeout == 0 && (which == DB_REP_CONNECTION_RETRY ||
+ which == DB_REP_ELECTION_TIMEOUT || which == DB_REP_LEASE_TIMEOUT ||
+ which == DB_REP_ELECTION_RETRY)) {
+ __db_errx(env, DB_STR("3566", "timeout value must be > 0"));
+ return (EINVAL);
+ }
+
+ if (which == DB_REP_ACK_TIMEOUT || which == DB_REP_CONNECTION_RETRY ||
+ which == DB_REP_ELECTION_RETRY ||
+ which == DB_REP_HEARTBEAT_MONITOR ||
+ which == DB_REP_HEARTBEAT_SEND)
+ repmgr_timeout = 1;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_timeout", DB_INIT_REP);
+
+ if (APP_IS_BASEAPI(env) && repmgr_timeout) {
+ __db_errx(env, DB_STR_A("3567",
+"%scannot set Replication Manager timeout from base replication application",
+ "%s"), "DB_ENV->rep_set_timeout:");
+ return (EINVAL);
+ }
+ if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) {
+ __db_errx(env, DB_STR_A("3568",
+"%s: lease timeout must be set before DB_ENV->rep_start.",
+ "%s"), "DB_ENV->rep_set_timeout");
+ return (EINVAL);
+ }
+
+ switch (which) {
+ case DB_REP_CHECKPOINT_DELAY:
+ if (REP_ON(env))
+ rep->chkpt_delay = timeout;
+ else
+ db_rep->chkpt_delay = timeout;
+ break;
+ case DB_REP_ELECTION_TIMEOUT:
+ if (REP_ON(env))
+ rep->elect_timeout = timeout;
+ else
+ db_rep->elect_timeout = timeout;
+ break;
+ case DB_REP_FULL_ELECTION_TIMEOUT:
+ if (REP_ON(env))
+ rep->full_elect_timeout = timeout;
+ else
+ db_rep->full_elect_timeout = timeout;
+ break;
+ case DB_REP_LEASE_TIMEOUT:
+ if (REP_ON(env))
+ rep->lease_timeout = timeout;
+ else
+ db_rep->lease_timeout = timeout;
+ break;
+#ifdef HAVE_REPLICATION_THREADS
+ case DB_REP_ACK_TIMEOUT:
+ if (REP_ON(env))
+ rep->ack_timeout = timeout;
+ else
+ db_rep->ack_timeout = timeout;
+ break;
+ case DB_REP_CONNECTION_RETRY:
+ if (REP_ON(env))
+ rep->connection_retry_wait = timeout;
+ else
+ db_rep->connection_retry_wait = timeout;
+ break;
+ case DB_REP_ELECTION_RETRY:
+ if (REP_ON(env))
+ rep->election_retry_wait = timeout;
+ else
+ db_rep->election_retry_wait = timeout;
+ break;
+ case DB_REP_HEARTBEAT_MONITOR:
+ if (REP_ON(env))
+ rep->heartbeat_monitor_timeout = timeout;
+ else
+ db_rep->heartbeat_monitor_timeout = timeout;
+ break;
+ case DB_REP_HEARTBEAT_SEND:
+ if (REP_ON(env))
+ rep->heartbeat_frequency = timeout;
+ else
+ db_rep->heartbeat_frequency = timeout;
+ break;
+#endif
+ default:
+ __db_errx(env, DB_STR("3569",
+ "Unknown timeout type argument to DB_ENV->rep_set_timeout"));
+ ret = EINVAL;
+ }
+
+	/* Setting a repmgr timeout makes this a repmgr application. */
+ if (ret == 0 && repmgr_timeout)
+ APP_SET_REPMGR(env);
+ return (ret);
+}
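+
+/*
+ * Illustrative usage (a sketch): a base API application setting a
+ * two-second election timeout; db_timeout_t values are in microseconds.
+ * Note that setting one of the repmgr-only timeouts (for example,
+ * DB_REP_ACK_TIMEOUT) would mark the environment as a Replication
+ * Manager application, per the APP_SET_REPMGR call above.
+ *
+ *	if ((ret = dbenv->rep_set_timeout(dbenv,
+ *	    DB_REP_ELECTION_TIMEOUT, 2000000)) != 0)
+ *		goto err;
+ */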
+
+/*
+ * PUBLIC: int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *));
+ */
+int
+__rep_get_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t *timeout;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_timeout", DB_INIT_REP);
+
+ switch (which) {
+ case DB_REP_CHECKPOINT_DELAY:
+ *timeout = REP_ON(env) ?
+ rep->chkpt_delay : db_rep->chkpt_delay;
+ break;
+ case DB_REP_ELECTION_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->elect_timeout : db_rep->elect_timeout;
+ break;
+ case DB_REP_FULL_ELECTION_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->full_elect_timeout : db_rep->full_elect_timeout;
+ break;
+ case DB_REP_LEASE_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->lease_timeout : db_rep->lease_timeout;
+ break;
+#ifdef HAVE_REPLICATION_THREADS
+ case DB_REP_ACK_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->ack_timeout : db_rep->ack_timeout;
+ break;
+ case DB_REP_CONNECTION_RETRY:
+ *timeout = REP_ON(env) ?
+ rep->connection_retry_wait : db_rep->connection_retry_wait;
+ break;
+ case DB_REP_ELECTION_RETRY:
+ *timeout = REP_ON(env) ?
+ rep->election_retry_wait : db_rep->election_retry_wait;
+ break;
+ case DB_REP_HEARTBEAT_MONITOR:
+ *timeout = REP_ON(env) ? rep->heartbeat_monitor_timeout :
+ db_rep->heartbeat_monitor_timeout;
+ break;
+ case DB_REP_HEARTBEAT_SEND:
+ *timeout = REP_ON(env) ?
+ rep->heartbeat_frequency : db_rep->heartbeat_frequency;
+ break;
+#endif
+ default:
+ __db_errx(env, DB_STR("3570",
+ "unknown timeout type argument to DB_ENV->rep_get_timeout"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __rep_get_request --
+ *	Get the minimum and maximum amount of time we wait before
+ *	retransmitting a request.
+ *
+ * PUBLIC: int __rep_get_request
+ * PUBLIC: __P((DB_ENV *, db_timeout_t *, db_timeout_t *));
+ */
+int
+__rep_get_request(dbenv, minp, maxp)
+ DB_ENV *dbenv;
+ db_timeout_t *minp, *maxp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_request", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ /*
+ * We acquire the mtx_region or mtx_clientdb mutexes as needed.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (minp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*minp), &rep->request_gap, 0);
+ if (maxp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*maxp), &rep->max_gap, 0);
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ if (minp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*minp),
+ &db_rep->request_gap, 0);
+ if (maxp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*maxp), &db_rep->max_gap, 0);
+ }
+
+ return (0);
+}
+
+/*
+ * __rep_set_request --
+ *	Set the minimum and maximum amount of time we wait before
+ *	retransmitting a request.
+ *
+ * PUBLIC: int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t));
+ */
+int
+__rep_set_request(dbenv, min, max)
+ DB_ENV *dbenv;
+ db_timeout_t min, max;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_request", DB_INIT_REP);
+
+ if (min == 0 || max < min) {
+ __db_errx(env, DB_STR("3571",
+ "DB_ENV->rep_set_request: Invalid min or max values"));
+ return (EINVAL);
+ }
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ /*
+ * We acquire the mtx_region or mtx_clientdb mutexes as needed.
+ */
+ REP_SYSTEM_LOCK(env);
+ DB_TIMEOUT_TO_TIMESPEC(min, &rep->request_gap);
+ DB_TIMEOUT_TO_TIMESPEC(max, &rep->max_gap);
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ dblp = env->lg_handle;
+ if (dblp != NULL && (lp = dblp->reginfo.primary) != NULL) {
+ DB_TIMEOUT_TO_TIMESPEC(min, &lp->wait_ts);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+ } else {
+ DB_TIMEOUT_TO_TIMESPEC(min, &db_rep->request_gap);
+ DB_TIMEOUT_TO_TIMESPEC(max, &db_rep->max_gap);
+ }
+
+ return (0);
+}
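+
+/*
+ * Illustrative usage (a sketch; the particular numbers are arbitrary):
+ * rerequest a missing record after 40 milliseconds, backing off to at
+ * most 1.28 seconds between rerequests, both in microseconds:
+ *
+ *	if ((ret = dbenv->rep_set_request(dbenv, 40000, 1280000)) != 0)
+ *		goto err;
+ */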
+
+/*
+ * __rep_set_transport_pp --
+ * Set the transport function for replication.
+ *
+ * PUBLIC: int __rep_set_transport_pp __P((DB_ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ * PUBLIC: int, u_int32_t)));
+ */
+int
+__rep_set_transport_pp(dbenv, eid, f_send)
+ DB_ENV *dbenv;
+ int eid;
+ int (*f_send) __P((DB_ENV *,
+ const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_transport", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3572",
+ "DB_ENV->rep_set_transport: cannot call from "
+ "Replication Manager application"));
+ return (EINVAL);
+ }
+
+ if (f_send == NULL) {
+ __db_errx(env, DB_STR("3573",
+ "DB_ENV->rep_set_transport: no send function specified"));
+ return (EINVAL);
+ }
+
+ if (eid < 0) {
+ __db_errx(env, DB_STR("3574",
+ "DB_ENV->rep_set_transport: eid must be greater than or equal to 0"));
+ return (EINVAL);
+ }
+
+ if ((ret = __rep_set_transport_int(env, eid, f_send)) == 0)
+ /*
+ * Setting a non-repmgr send function makes this a base API
+ * application.
+ */
+ APP_SET_BASEAPI(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_set_transport_int --
+ * Set the internal values for the transport function for replication.
+ *
+ * PUBLIC: int __rep_set_transport_int __P((ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ * PUBLIC: int, u_int32_t)));
+ */
+int
+__rep_set_transport_int(env, eid, f_send)
+ ENV *env;
+ int eid;
+ int (*f_send) __P((DB_ENV *,
+ const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ db_rep->send = f_send;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ rep->eid = eid;
+ } else
+ db_rep->eid = eid;
+ return (0);
+}
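+
+/*
+ * Illustrative usage (a sketch; my_send and MY_EID are hypothetical
+ * application names): the callback must transmit the control and rec
+ * DBTs to the site named by envid (or to all sites, for
+ * DB_EID_BROADCAST), returning nonzero only when the message is known
+ * not to have been delivered:
+ *
+ *	static int
+ *	my_send(DB_ENV *dbenv, const DBT *control, const DBT *rec,
+ *	    const DB_LSN *lsnp, int envid, u_int32_t flags)
+ *	{
+ *		... write control and rec to the wire ...
+ *		return (0);
+ *	}
+ *
+ *	if ((ret = dbenv->rep_set_transport(dbenv, MY_EID, my_send)) != 0)
+ *		goto err;
+ */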
+
+/*
+ * PUBLIC: int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
+ DB_ENV *dbenv;
+ u_int32_t *fast_clockp, *slow_clockp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_clockskew", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ *fast_clockp = rep->clock_skew;
+ *slow_clockp = rep->clock_base;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ *fast_clockp = db_rep->clock_skew;
+ *slow_clockp = db_rep->clock_base;
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__rep_set_clockskew(dbenv, fast_clock, slow_clock)
+ DB_ENV *dbenv;
+ u_int32_t fast_clock, slow_clock;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_clockskew", DB_INIT_REP);
+
+ /*
+ * Check for valid values. The fast clock should be a larger
+ * number than the slow clock. We use the slow clock value as
+	 * our base for adjustment; therefore, a 2% difference should
+	 * be fast == 102, slow == 100.  If both values are 0, set
+	 * them both to 1 internally.
+ *
+ * We will use these numbers to compute the larger ratio to be
+ * most conservative about the user's intention.
+ */
+ if (fast_clock == 0 || slow_clock == 0) {
+ /*
+ * If one value is zero, reject if both aren't zero.
+ */
+ if (slow_clock != 0 || fast_clock != 0) {
+ __db_errx(env, DB_STR("3575",
+			    "DB_ENV->rep_set_clockskew: Zero is only valid "
+			    "when used for both arguments"));
+ return (EINVAL);
+ }
+ fast_clock = 1;
+ slow_clock = 1;
+ }
+ if (fast_clock < slow_clock) {
+ __db_errx(env, DB_STR("3576",
+ "DB_ENV->rep_set_clockskew: slow_clock value is "
+		    "larger than fast_clock value"));
+ return (EINVAL);
+ }
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ if (IS_REP_STARTED(env)) {
+ __db_errx(env, DB_STR("3577",
+ "DB_ENV->rep_set_clockskew: must be called before DB_ENV->rep_start"));
+ return (EINVAL);
+ }
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ rep->clock_skew = fast_clock;
+ rep->clock_base = slow_clock;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ db_rep->clock_skew = fast_clock;
+ db_rep->clock_base = slow_clock;
+ }
+ return (ret);
+}
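+
+/*
+ * Illustrative usage (a sketch): per the comment above, a group whose
+ * site clocks may differ by up to 2% would be configured as:
+ *
+ *	if ((ret = dbenv->rep_set_clockskew(dbenv, 102, 100)) != 0)
+ *		goto err;
+ *
+ * Per the IS_REP_STARTED check, this must precede DB_ENV->rep_start.
+ */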
+
+/*
+ * __rep_flush --
+ * Re-push the last log record to all clients, in case they've lost
+ * messages and don't know it.
+ *
+ * PUBLIC: int __rep_flush __P((DB_ENV *));
+ */
+int
+__rep_flush(dbenv)
+ DB_ENV *dbenv;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_flush", DB_INIT_REP);
+
+ if (IS_REP_CLIENT(env))
+ return (0);
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3578",
+ "DB_ENV->rep_flush: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		goto out;
+
+ memset(&rec, 0, sizeof(rec));
+ memset(&lsn, 0, sizeof(lsn));
+
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
+ goto err;
+
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_LOG, &lsn, &rec, 0, 0);
+
+err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+out:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __rep_sync --
+ * Force a synchronization to occur between this client and the master.
+ * This is the other half of configuring DELAYCLIENT.
+ *
+ * PUBLIC: int __rep_sync __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_sync(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+ int master, ret;
+ u_int32_t repflags, type;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ COMPQUIET(flags, 0);
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_sync", DB_INIT_REP);
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3579",
+ "DB_ENV->rep_sync: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ rep = db_rep->region;
+ ret = 0;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Simple cases. If we're not in the DELAY state we have nothing
+ * to do. If we don't know who the master is, send a MASTER_REQ.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ master = rep->master_id;
+ if (master == DB_EID_INVALID) {
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+ goto out;
+ }
+ /*
+ * We want to hold the rep mutex to test and then clear the
+ * DELAY flag. Racing threads in here could otherwise result
+ * in dual data streams.
+ */
+ if (!F_ISSET(rep, REP_F_DELAY)) {
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ }
+
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * If we get here, we clear the delay flag and kick off a
+ * synchronization. From this point forward, we will
+ * synchronize until the next time the master changes.
+ */
+ F_CLR(rep, REP_F_DELAY);
+ if (IS_ZERO_LSN(lsn) && !FLD_ISSET(rep->config, REP_C_AUTOINIT)) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ CLR_RECOVERY_SETTINGS(rep);
+ ret = DB_REP_JOIN_FAILURE;
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ /*
+	 * When we set REP_F_DELAY, we set verify_lsn to the real verify LSN
+	 * if we need to verify, or we zero it out if this is a client that
+	 * needs internal init.  So, now send the type of message whose
+	 * sending __rep_new_master delayed.
+ */
+ if (IS_ZERO_LSN(lsn)) {
+ DB_ASSERT(env, rep->sync_state == SYNC_UPDATE);
+ type = REP_UPDATE_REQ;
+ repflags = 0;
+ } else {
+ DB_ASSERT(env, rep->sync_state == SYNC_VERIFY);
+ type = REP_VERIFY_REQ;
+ repflags = DB_REP_ANYWHERE;
+ }
+ (void)__rep_send_message(env, master, type, &lsn, NULL, 0, repflags);
+
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
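+
+/*
+ * Illustrative usage (a sketch): the DELAYCLIENT half referred to
+ * above.  A client configured with
+ *
+ *	(void)dbenv->rep_set_config(dbenv, DB_REP_CONF_DELAYCLIENT, 1);
+ *
+ * does not synchronize with a new master until the application decides
+ * it is ready and calls
+ *
+ *	if ((ret = dbenv->rep_sync(dbenv, 0)) != 0)
+ *		goto err;
+ */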
+
+/*
+ * PUBLIC: int __rep_txn_applied __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+ */
+int
+__rep_txn_applied(env, ip, commit_info, timeout)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ db_timeout_t timeout;
+{
+ REP *rep;
+ db_timespec limit, now, t;
+ db_timeout_t duration;
+ struct rep_waitgoal reason;
+ int locked, ret, t_ret;
+
+ if (commit_info->gen == 0) {
+ __db_errx(env, DB_STR("3580",
+ "non-replication commit token in replication env"));
+ return (EINVAL);
+ }
+
+ rep = env->rep_handle->region;
+
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "checking txn_applied: gen %lu, envid %lu, LSN [%lu][%lu]",
+ (u_long)commit_info->gen, (u_long)commit_info->envid,
+ (u_long)commit_info->lsn.file, (u_long)commit_info->lsn.offset));
+ locked = 0;
+ __os_gettime(env, &limit, 1);
+ TIMESPEC_ADD_DB_TIMEOUT(&limit, timeout);
+
+retry:
+ /*
+ * The checking is done within the scope of the handle count, but if we
+ * end up having to wait that part is not. If a lockout sequence begins
+ * while we're waiting, it will wake us up, and we'll come back here to
+ * try entering the scope again, at which point we'll get an error so
+ * that we return immediately.
+ */
+ if ((ret = __op_handle_enter(env)) != 0)
+ goto out;
+
+ ret = __rep_check_applied(env, ip, commit_info, &reason);
+ t_ret = __env_db_rep_exit(env);
+
+ /*
+ * Between here and __rep_check_applied() we use DB_TIMEOUT privately to
+ * mean that the transaction hasn't been applied yet, but it still
+	 * plausibly could be soon; think of it as meaning "not yet".  So a
+	 * DB_TIMEOUT here doesn't necessarily mean that DB_TIMEOUT is the
+	 * ultimate return value the application will see.
+ *
+ * When we get this "not yet", we check the actual time remaining. If
+ * the time has expired, then indeed we can simply pass DB_TIMEOUT back
+ * up to the calling application. But if not, it tells us that we have
+ * a chance to wait and try again. This is a nice division of labor,
+ * because it means the lower level functions (__rep_check_applied() and
+ * below) do not have to mess with any actual time computations, or
+ * waiting, at all.
+ */
+ if (ret == DB_TIMEOUT && t_ret == 0 && F_ISSET(rep, REP_F_CLIENT)) {
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&now, &limit, <)) {
+
+ /* Compute how much time remains before the limit. */
+ t = limit;
+ timespecsub(&t, &now);
+ DB_TIMESPEC_TO_TIMEOUT(duration, &t, 1);
+
+ /*
+ * Wait for whatever __rep_check_applied told us we
+ * needed to wait for. But first, check the condition
+ * again under mutex protection, in case there was a
+ * close race.
+ */
+ if (reason.why == AWAIT_LSN ||
+ reason.why == AWAIT_HISTORY) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ locked = 1;
+ }
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_check_goal(env, &reason);
+ if (locked) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ locked = 0;
+ }
+ if (ret == DB_TIMEOUT) {
+ /*
+ * The usual case: we haven't reached our goal
+ * yet, even after checking again while holding
+ * mutex.
+ */
+ ret = __rep_await_condition(env,
+ &reason, duration);
+
+ /*
+ * If it were possible for
+ * __rep_await_condition() to return DB_TIMEOUT
+ * that would confuse the outer "if" statement
+ * here.
+ */
+ DB_ASSERT(env, ret != DB_TIMEOUT);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Note that the "reason" that check_applied set, and
+ * that await_condition waited for, does not necessarily
+ * represent a final result ready to return to the
+ * user. In some cases there may be a few state changes
+ * necessary before we are able to determine the final
+ * result. Thus whenever we complete a successful wait
+ * we need to cycle back and check the full txn_applied
+ * question again.
+ */
+ goto retry;
+ }
+ }
+
+ if (t_ret != 0 &&
+ (ret == 0 || ret == DB_TIMEOUT || ret == DB_NOTFOUND))
+ ret = t_ret;
+
+out:
+ return (ret);
+}
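+
+/*
+ * Illustrative usage (a sketch, assuming this release's commit-token
+ * API): a writer saves a token at commit time, and a reader later asks
+ * whether that transaction has been applied locally, waiting up to
+ * five seconds:
+ *
+ *	DB_TXN_TOKEN token;
+ *	...
+ *	if ((ret = txn->set_commit_token(txn, &token)) != 0)
+ *		goto err;
+ *	if ((ret = txn->commit(txn, 0)) != 0)
+ *		goto err;
+ *	...
+ *	ret = dbenv->txn_applied(dbenv, &token, 5000000, 0);
+ */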
+
+/*
+ * The only non-zero return code from this function is for unexpected errors.
+ * We normally return 0, regardless of whether the wait terminated because the
+ * condition was satisfied or the timeout expired.
+ */
+static int
+__rep_await_condition(env, reasonp, duration)
+ ENV *env;
+ struct rep_waitgoal *reasonp;
+ db_timeout_t duration;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ struct __rep_waiter *waiter;
+ int ret;
+
+ rep = env->rep_handle->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /*
+ * Acquire the first lock on the self-blocking mutex when we first
+ * allocate it. Thereafter when it's on the free list we know that
+ * first lock has already been taken.
+ */
+ if ((waiter = SH_TAILQ_FIRST(&rep->free_waiters,
+ __rep_waiter)) == NULL) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if ((ret = __env_alloc(env->reginfo,
+ sizeof(struct __rep_waiter), &waiter)) == 0) {
+ memset(waiter, 0, sizeof(*waiter));
+ if ((ret = __mutex_alloc(env, MTX_REP_WAITER,
+ DB_MUTEX_SELF_BLOCK, &waiter->mtx_repwait)) != 0)
+ __env_alloc_free(infop, waiter);
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0)
+ return (ret);
+
+ MUTEX_LOCK(env, waiter->mtx_repwait);
+ } else
+ SH_TAILQ_REMOVE(&rep->free_waiters,
+ waiter, links, __rep_waiter);
+ waiter->flags = 0;
+ waiter->goal = *reasonp;
+ SH_TAILQ_INSERT_HEAD(&rep->waiters,
+ waiter, links, __rep_waiter);
+
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "waiting for condition %d", (int)reasonp->why));
+ REP_SYSTEM_UNLOCK(env);
+ /* Wait here for conditions to become more favorable. */
+ MUTEX_WAIT(env, waiter->mtx_repwait, duration);
+ REP_SYSTEM_LOCK(env);
+
+ if (!F_ISSET(waiter, REP_F_WOKEN))
+ SH_TAILQ_REMOVE(&rep->waiters, waiter, links, __rep_waiter);
+ SH_TAILQ_INSERT_HEAD(&rep->free_waiters, waiter, links, __rep_waiter);
+
+ return (0);
+}
+
+/*
+ * Check whether the transaction is currently applied. If it is not, but it
+ * might likely become applied in the future, then return DB_TIMEOUT. It's the
+ * caller's duty to figure out whether to wait or not in that case. Here we
+ * only do an immediate check of the current state of affairs.
+ */
+static int
+__rep_check_applied(env, ip, commit_info, reasonp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ struct rep_waitgoal *reasonp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ DB_TXN *txn;
+ DBC *dbc;
+ __rep_lsn_hist_data_args hist, hist2;
+ DB_LSN lsn;
+ u_int32_t gen;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ gen = rep->gen;
+ txn = NULL;
+ dbc = NULL;
+
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ } else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->max_perm_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+
+ /*
+ * The first thing to consider is whether we're in the right gen.
+ * The token gen either matches our current gen, or is left over from an
+ * older gen, or in rare circumstances could be from a "future" gen that
+ * we haven't learned about yet (or that got rolled back).
+ */
+ if (commit_info->gen == gen) {
+ ret = __rep_read_lsn_history(env,
+ ip, &txn, &dbc, gen, &hist, reasonp, DB_SET);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * We haven't yet received the LSN history of the
+ * current generation from the master. Return
+			 * DB_TIMEOUT to tell the caller it needs to wait,
+			 * and that what it is waiting for is the LSN history.
+ *
+ * Note that this also helps by eliminating the weird
+ * period between receiving a new gen (from a NEWMASTER)
+ * and the subsequent syncing with that new gen. We
+ * really only want to return success at the current gen
+ * once we've synced.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_HISTORY;
+ reasonp->u.lsn = lsn;
+ }
+ if (ret != 0)
+ goto out;
+
+ if (commit_info->envid != hist.envid) {
+ /*
+ * Gens match, but envids don't: means there were two
+ * masters at the same gen, and the txn of interest was
+ * rolled back.
+ */
+ ret = DB_NOTFOUND;
+ goto out;
+ }
+
+ if (LOG_COMPARE(&commit_info->lsn, &lsn) > 0) {
+ /*
+ * We haven't yet gotten the LSN of interest, but we can
+ * expect it soon; so wait for it.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_LSN;
+ reasonp->u.lsn = commit_info->lsn;
+ goto out;
+ }
+
+ if (LOG_COMPARE(&commit_info->lsn, &hist.lsn) >= 0) {
+ /*
+ * The LSN of interest is in the past, but within the
+ * range claimed for this gen. Success! (We have read
+ * consistency.)
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * There must have been a DUPMASTER at some point: the
+ * description of the txn of interest doesn't match what we see
+ * in the history available to us now.
+ */
+ ret = DB_NOTFOUND;
+
+ } else if (commit_info->gen < gen || gen == 0) {
+ /*
+ * Transaction from an old gen. Read this gen's base LSN, plus
+ * that of the next higher gen, because we want to check that
+ * the token LSN is within the close/open range defined by
+		 * the token LSN is within the closed/open range defined by
+ */
+ ret = __rep_read_lsn_history(env,
+ ip, &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET);
+ t_ret = __rep_read_lsn_history(env,
+ ip, &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * If the desired gen is not in our database, it could
+ * mean either of two things. 1. The whole gen could
+ * have been rolled back. 2. We could just be really
+ * far behind on replication. Reading ahead to the next
+ * following gen, which we likely need anyway, helps us
+ * decide which case to conclude.
+ */
+ if (t_ret == 0)
+ /*
+ * Second read succeeded, so "being behind in
+ * replication" is not a viable reason for
+ * having failed to find the first read.
+ * Therefore, the gen must have been rolled
+ * back, and the proper result is NOTFOUND to
+ * indicate that.
+ */
+ goto out;
+ if (t_ret == DB_NOTFOUND) {
+ /*
+ * Second read also got a NOTFOUND: we're
+ * definitely "behind" (we don't even have
+ * current gen's history). So, waiting is the
+ * correct result.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_HISTORY;
+ reasonp->u.lsn = lsn;
+ goto out;
+ }
+ /*
+ * Here, t_ret is something unexpected, which trumps the
+ * NOTFOUND returned from the first read.
+ */
+ ret = t_ret;
+ goto out;
+ }
+ if (ret != 0)
+ goto out; /* Unexpected error, first read. */
+ if (commit_info->envid != hist.envid) {
+ /*
+ * (We don't need the second read in order to make this
+ * test.)
+ *
+ * We have info for the indicated gen, but the envids
+ * don't match, meaning the txn was written at a dup
+ * master and that gen instance was rolled back.
+ */
+ ret = DB_NOTFOUND;
+ goto out;
+ }
+
+ /* Examine result of second read. */
+ if ((ret = t_ret) == DB_NOTFOUND) {
+ /*
+ * We haven't even heard about our current gen yet, so
+ * it's worth waiting for it.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_HISTORY;
+ reasonp->u.lsn = lsn;
+ } else if (ret != 0)
+ goto out; /* Second read returned unexpected error. */
+
+ /*
+ * We now have the history info for the gen of the txn, and for
+ * the subsequent gen. All we have to do is see if the LSN is
+ * in range.
+ */
+ if (LOG_COMPARE(&commit_info->lsn, &hist.lsn) >= 0 &&
+ LOG_COMPARE(&commit_info->lsn, &hist2.lsn) < 0)
+ ret = 0;
+ else
+ ret = DB_NOTFOUND;
+ } else {
+ /*
+ * Token names a future gen. If we're a client and the LSN also
+ * is in the future, then it's possible we just haven't caught
+ * up yet, so we can wait for it. Otherwise, it must have been
+ * part of a generation that got lost in a roll-back.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) &&
+ LOG_COMPARE(&commit_info->lsn, &lsn) > 0) {
+ reasonp->why = AWAIT_GEN;
+ reasonp->u.gen = commit_info->gen;
+ return (DB_TIMEOUT);
+ }
+ return (DB_NOTFOUND);
+ }
+
+out:
+ if (dbc != NULL &&
+ (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (txn != NULL &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 1, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
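+
+/*
+ * A concrete reading of the old-gen case above (the numbers are
+ * invented): if the token is {gen 3, envid E, LSN [8][1000]}, the txn
+ * counts as applied iff hist(3).envid == E and
+ * hist(3).lsn <= [8][1000] < hist(4).lsn.
+ */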
+
+/*
+ * The txn and dbc handles are owned by caller, though we create them if
+ * necessary. Caller is responsible for closing them.
+ */
+static int
+__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN **txn;
+ DBC **dbc;
+ u_int32_t gen;
+ __rep_lsn_hist_data_args *gen_infop;
+ struct rep_waitgoal *reasonp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DB *dbp;
+ __rep_lsn_hist_key_args key;
+ u_int8_t key_buf[__REP_LSN_HIST_KEY_SIZE];
+ u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
+ DBT key_dbt, data_dbt;
+ u_int32_t desired_gen;
+ int ret, tries;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ DB_ASSERT(env, flags == DB_SET || flags == DB_NEXT);
+
+ /* Simply return cached info, if we already have it. */
+ desired_gen = flags == DB_SET ? gen : gen + 1;
+ REP_SYSTEM_LOCK(env);
+ if (rep->gen == desired_gen && !IS_ZERO_LSN(rep->gen_base_lsn)) {
+ gen_infop->lsn = rep->gen_base_lsn;
+ gen_infop->envid = rep->master_envid;
+ goto unlock;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ tries = 0;
+retry:
+ if (*txn == NULL &&
+ (ret = __txn_begin(env, ip, NULL, txn, 0)) != 0)
+ return (ret);
+
+ if ((dbp = db_rep->lsn_db) == NULL) {
+ if ((ret = __rep_open_sysdb(env,
+ ip, *txn, REPLSNHIST, 0, &dbp)) != 0) {
+ /*
+ * If the database isn't there, it could be because it's
+ * memory-resident, and we haven't yet sync'ed with the
+ * master to materialize it. (It could make sense to
+ * include a test for INMEM in this conditional
+ * expression, if we were sure all sites had matching
+ * INMEM settings; but since we don't enforce that,
+ * leaving it out makes for more optimistic behavior.)
+ */
+ if (ret == ENOENT &&
+ !F_ISSET(rep, REP_F_NIMDBS_LOADED | REP_F_MASTER)) {
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_NIMDB;
+ }
+ goto err;
+ }
+ db_rep->lsn_db = dbp;
+ }
+
+ if (*dbc == NULL &&
+ (ret = __db_cursor(dbp, ip, *txn, dbc, 0)) != 0)
+ goto err;
+
+ if (flags == DB_SET) {
+ key.version = REP_LSN_HISTORY_FMT_VERSION;
+ key.gen = gen;
+ __rep_lsn_hist_key_marshal(env, &key, key_buf);
+ }
+ DB_INIT_DBT(key_dbt, key_buf, __REP_LSN_HIST_KEY_SIZE);
+ key_dbt.ulen = __REP_LSN_HIST_KEY_SIZE;
+ F_SET(&key_dbt, DB_DBT_USERMEM);
+
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ data_dbt.data = data_buf;
+ data_dbt.ulen = __REP_LSN_HIST_DATA_SIZE;
+ F_SET(&data_dbt, DB_DBT_USERMEM);
+ if ((ret = __dbc_get(*dbc, &key_dbt, &data_dbt, flags)) != 0) {
+ if ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
+ ++tries < 5) { /* Limit of 5 is an arbitrary choice. */
+ ret = __dbc_close(*dbc);
+ *dbc = NULL;
+ if (ret != 0)
+ goto err;
+ ret = __txn_abort(*txn);
+ *txn = NULL;
+ if (ret != 0)
+ goto err;
+ __os_yield(env, 0, 10000); /* Arbitrary duration. */
+ goto retry;
+ }
+ goto err;
+ }
+
+ /*
+ * In the DB_NEXT case, we don't know what the next gen is. Unmarshal
+ * the key too, just so that we can check whether it matches the current
+ * gen, for setting the cache. Note that, interestingly, the caller
+ * doesn't care what the key is in that case!
+ */
+ if ((ret = __rep_lsn_hist_key_unmarshal(env,
+ &key, key_buf, __REP_LSN_HIST_KEY_SIZE, NULL)) != 0)
+ goto err;
+ ret = __rep_lsn_hist_data_unmarshal(env,
+ gen_infop, data_buf, __REP_LSN_HIST_DATA_SIZE, NULL);
+
+ REP_SYSTEM_LOCK(env);
+ if (rep->gen == key.gen) {
+ rep->gen_base_lsn = gen_infop->lsn;
+ rep->master_envid = gen_infop->envid;
+ }
+unlock:
+ REP_SYSTEM_UNLOCK(env);
+
+err:
+ return (ret);
+}
+
+/*
+ * __rep_conv_vers --
+ * Convert from a log version to the replication message version
+ * that release used.
+ */
+static u_int32_t
+__rep_conv_vers(env, log_ver)
+ ENV *env;
+ u_int32_t log_ver;
+{
+ COMPQUIET(env, NULL);
+
+ /*
+	 * We can't use a switch statement because some of the
+	 * DB_LOGVERSION_XX constants have the same value.
+ */
+ if (log_ver == DB_LOGVERSION_53)
+ return (DB_REPVERSION_53);
+ if (log_ver == DB_LOGVERSION_52)
+ return (DB_REPVERSION_52);
+ /* 5.0 and 5.1 had identical log and rep versions. */
+ if (log_ver == DB_LOGVERSION_51)
+ return (DB_REPVERSION_51);
+ if (log_ver == DB_LOGVERSION_48p2)
+ return (DB_REPVERSION_48);
+ if (log_ver == DB_LOGVERSION_48)
+ return (DB_REPVERSION_48);
+ if (log_ver == DB_LOGVERSION_47)
+ return (DB_REPVERSION_47);
+ if (log_ver == DB_LOGVERSION_46)
+ return (DB_REPVERSION_46);
+ if (log_ver == DB_LOGVERSION_45)
+ return (DB_REPVERSION_45);
+ if (log_ver == DB_LOGVERSION_44)
+ return (DB_REPVERSION_44);
+ if (log_ver == DB_LOGVERSION)
+ return (DB_REPVERSION);
+ return (DB_REPVERSION_INVALID);
+}
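+
+/*
+ * For example, both DB_LOGVERSION_48 and DB_LOGVERSION_48p2 map to
+ * DB_REPVERSION_48 above.  A switch on log_ver would need duplicate
+ * case labels for the DB_LOGVERSION_XX constants that share a value,
+ * which would not compile; hence the if-chain.
+ */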
diff --git a/src/rep/rep_record.c b/src/rep/rep_record.c
new file mode 100644
index 00000000..f4691974
--- /dev/null
+++ b/src/rep/rep_record.c
@@ -0,0 +1,2586 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __rep_collect_txn __P((ENV *, DB_LSN *, LSN_COLLECTION *));
+static int __rep_do_ckp __P((ENV *, DBT *, __rep_control_args *));
+static int __rep_fire_newmaster __P((ENV *, u_int32_t, int));
+static int __rep_fire_startupdone __P((ENV *, u_int32_t, int));
+static int __rep_getnext __P((ENV *, DB_THREAD_INFO *));
+static int __rep_lsn_cmp __P((const void *, const void *));
+static int __rep_newfile __P((ENV *, __rep_control_args *, DBT *));
+static int __rep_process_rec __P((ENV *, DB_THREAD_INFO *, __rep_control_args *,
+ DBT *, db_timespec *, DB_LSN *));
+static int __rep_remfirst __P((ENV *, DB_THREAD_INFO *, DBT *, DBT *));
+static int __rep_skip_msg __P((ENV *, REP *, int, u_int32_t));
+
+/* Used to consistently designate which messages ought to be received where. */
+
+#define MASTER_ONLY(rep, rp) do { \
+ if (!F_ISSET(rep, REP_F_MASTER)) { \
+ RPRINT(env, (env, DB_VERB_REP_MSGS, \
+ "Master record received on client")); \
+ REP_PRINT_MESSAGE(env, \
+ eid, rp, "rep_process_message", 0); \
+ /* Just skip/ignore it. */ \
+ ret = 0; \
+ goto errlock; \
+ } \
+} while (0)
+
+#define CLIENT_ONLY(rep, rp) do { \
+ if (!F_ISSET(rep, REP_F_CLIENT)) { \
+ RPRINT(env, (env, DB_VERB_REP_MSGS, \
+ "Client record received on master")); \
+ /* \
+ * Only broadcast DUPMASTER if leases are not \
+ * in effect. If I am an old master, using \
+ * leases and I get a newer message, my leases \
+ * had better all be expired. \
+ */ \
+ if (IS_USING_LEASES(env)) \
+ DB_ASSERT(env, \
+ __rep_lease_check(env, 0) == \
+ DB_REP_LEASE_EXPIRED); \
+ else { \
+ REP_PRINT_MESSAGE(env, \
+ eid, rp, "rep_process_message", 0); \
+ (void)__rep_send_message(env, DB_EID_BROADCAST, \
+ REP_DUPMASTER, NULL, NULL, 0, 0); \
+ } \
+ ret = DB_REP_DUPMASTER; \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * If a client is attempting to service a request and its gen is not in
+ * sync with its database state, it cannot service the request. Currently
+ * the only way to know this is with the heavy hammer of knowing (or not)
+ * who the master is. If the master is invalid, force a rerequest.
+ * If we receive an ALIVE, we update the gen and invalidate the
+ * master_id.
+ */
+#define CLIENT_MASTERCHK do { \
+ if (F_ISSET(rep, REP_F_CLIENT)) { \
+ if (master_id == DB_EID_INVALID) { \
+ STAT(rep->stat.st_client_svc_miss++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype);\
+ goto errlock; \
+ } \
+ } \
+} while (0)
+
+/*
+ * If a client is attempting to service a request it does not have,
+ * call __rep_skip_msg to skip this message and force a rerequest to the
+ * sender. We don't hold the mutex for the stats and may miscount.
+ */
+#define CLIENT_REREQ do { \
+ if (F_ISSET(rep, REP_F_CLIENT)) { \
+ STAT(rep->stat.st_client_svc_req++); \
+ if (ret == DB_NOTFOUND) { \
+ STAT(rep->stat.st_client_svc_miss++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype);\
+ } \
+ } \
+} while (0)
+
+#define RECOVERING_SKIP do { \
+ if (IS_REP_CLIENT(env) && recovering) { \
+ /* Not holding region mutex, may miscount */ \
+ STAT(rep->stat.st_msgs_recover++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype); \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * If we're recovering the log we only want log records that are in the
+ * range we need to recover. Otherwise we can end up storing a huge
+ * number of "new" records, only to truncate the temp database later after
+ * we run recovery. If we are actively delaying a sync-up, we also skip
+ * all incoming log records until the application requests sync-up.
+ */
+#define RECOVERING_LOG_SKIP do { \
+ if (F_ISSET(rep, REP_F_DELAY) || \
+ rep->master_id == DB_EID_INVALID || \
+ (recovering && \
+ (rep->sync_state != SYNC_LOG || \
+ LOG_COMPARE(&rp->lsn, &rep->last_lsn) >= 0))) { \
+ /* Not holding region mutex, may miscount */ \
+ STAT(rep->stat.st_msgs_recover++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype); \
+ goto errlock; \
+ } \
+} while (0)
+
+#define ANYSITE(rep)
+
+/*
+ * __rep_process_message_pp --
+ *
+ * This routine takes an incoming message and processes it.
+ *
+ * control: contains the control fields from the record
+ * rec: contains the actual record
+ * eid: the environment id of the sender of the message
+ * ret_lsnp: on DB_REP_ISPERM and DB_REP_NOTPERM returns, contains the
+ *	LSN of the maximum permanent or the current not-yet-permanent log
+ *	record, respectively
+ *
+ * PUBLIC: int __rep_process_message_pp
+ * PUBLIC: __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+ */
+int
+__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
+ DB_ENV *dbenv;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_process_message", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR_A("3512",
+ "%s cannot call from Replication Manager application",
+ "%s"), "DB_ENV->rep_process_message:");
+ return (EINVAL);
+ }
+
+	/* Control argument must be non-NULL and non-empty. */
+ if (control == NULL || control->size == 0) {
+ __db_errx(env, DB_STR("3513",
+ "DB_ENV->rep_process_message: control argument must be specified"));
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure site is a master or a client, which implies that
+ * replication has been started.
+ */
+ if (!IS_REP_MASTER(env) && !IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("3514",
+ "Environment not configured as replication master or client"));
+ return (EINVAL);
+ }
+
+ if ((ret = __dbt_usercopy(env, control)) != 0 ||
+ (ret = __dbt_usercopy(env, rec)) != 0) {
+ __dbt_userfree(env, control, rec, NULL);
+ __db_errx(env, DB_STR("3515",
+ "DB_ENV->rep_process_message: error retrieving DBT contents"));
+ return (ret);
+ }
+
+ ret = __rep_process_message_int(env, control, rec, eid, ret_lsnp);
+
+ __dbt_userfree(env, control, rec, NULL);
+ return (ret);
+}
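+
+/*
+ * Illustrative usage (a sketch; the dispatch loop and helper names are
+ * hypothetical): a base API application feeds every received message
+ * to this function and acts on the special return codes:
+ *
+ *	switch (ret = dbenv->rep_process_message(dbenv,
+ *	    &control, &rec, eid, &permlsn)) {
+ *	case 0:
+ *	case DB_REP_ISPERM:
+ *	case DB_REP_NOTPERM:
+ *		break;
+ *	case DB_REP_NEWSITE:
+ *		add_site(eid);
+ *		break;
+ *	case DB_REP_HOLDELECTION:
+ *		call_election();
+ *		break;
+ *	default:
+ *		goto err;
+ *	}
+ *
+ * where add_site and call_election stand in for application code, and
+ * permlsn is meaningful on the ISPERM and NOTPERM returns.
+ */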
+
+/*
+ * __rep_process_message_int --
+ *
+ * This routine performs the internal steps to process an incoming message.
+ *
+ * PUBLIC: int __rep_process_message_int
+ * PUBLIC: __P((ENV *, DBT *, DBT *, int, DB_LSN *));
+ */
+int
+__rep_process_message_int(env, control, rec, eid, ret_lsnp)
+ ENV *env;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ DBT data_dbt;
+ DB_LOG *dblp;
+ DB_LSN last_lsn, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ REP_46_CONTROL *rp46;
+ REP_OLD_CONTROL *orp;
+ __rep_control_args *rp, tmprp;
+ __rep_egen_args egen_arg;
+ size_t len;
+ u_int32_t gen, rep_version;
+ int cmp, do_sync, lockout, master_id, recovering, ret, t_ret;
+ time_t savetime;
+ u_int8_t buf[__REP_MAXMSG_SIZE];
+
+ ret = 0;
+ do_sync = 0;
+ lockout = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * Casting this to REP_OLD_CONTROL is just kind of stylistic: the
+ * rep_version field of course has to be in the same offset in all
+ * versions in order for this to work.
+ *
+ * We can look at the rep_version unswapped here because if we're
+ * talking to an old version, it will always be unswapped. If
+ * we're talking to a new version, the only issue is if it is
+ * swapped and we take one of the old version conditionals
+ * incorrectly. The rep_version would need to be very, very
+ * large for a swapped version to look like a small, older
+ * version. There is no problem here looking at it unswapped.
+ */
+ rep_version = ((REP_OLD_CONTROL *)control->data)->rep_version;
+ if (rep_version <= DB_REPVERSION_45) {
+ orp = (REP_OLD_CONTROL *)control->data;
+ if (rep_version == DB_REPVERSION_45 &&
+ F_ISSET(orp, REPCTL_INIT_45)) {
+ F_CLR(orp, REPCTL_INIT_45);
+ F_SET(orp, REPCTL_INIT);
+ }
+ tmprp.rep_version = orp->rep_version;
+ tmprp.log_version = orp->log_version;
+ tmprp.lsn = orp->lsn;
+ tmprp.rectype = orp->rectype;
+ tmprp.gen = orp->gen;
+ tmprp.flags = orp->flags;
+ tmprp.msg_sec = 0;
+ tmprp.msg_nsec = 0;
+ } else if (rep_version == DB_REPVERSION_46) {
+ rp46 = (REP_46_CONTROL *)control->data;
+ tmprp.rep_version = rp46->rep_version;
+ tmprp.log_version = rp46->log_version;
+ tmprp.lsn = rp46->lsn;
+ tmprp.rectype = rp46->rectype;
+ tmprp.gen = rp46->gen;
+ tmprp.flags = rp46->flags;
+ tmprp.msg_sec = (u_int32_t)rp46->msg_time.tv_sec;
+ tmprp.msg_nsec = (u_int32_t)rp46->msg_time.tv_nsec;
+ } else
+ if ((ret = __rep_control_unmarshal(env, &tmprp,
+ control->data, control->size, NULL)) != 0)
+ return (ret);
+ rp = &tmprp;
+ if (ret_lsnp != NULL)
+ ZERO_LSN(*ret_lsnp);
+
+ ENV_ENTER(env, ip);
+
+ REP_PRINT_MESSAGE(env, eid, rp, "rep_process_message", 0);
+ /*
+ * Check the version number for both rep and log. If it is
+ * an old version we support, convert it. Otherwise complain.
+ */
+ if (rp->rep_version < DB_REPVERSION) {
+ if (rp->rep_version < DB_REPVERSION_MIN) {
+ __db_errx(env, DB_STR_A("3516",
+ "unsupported old replication message version %lu, minimum version %d",
+ "%lu %d"), (u_long)rp->rep_version,
+ DB_REPVERSION_MIN);
+
+ ret = EINVAL;
+ goto errlock;
+ }
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Received record %lu with old rep version %lu",
+ (u_long)rp->rectype, (u_long)rp->rep_version));
+ rp->rectype = __rep_msg_from_old(rp->rep_version, rp->rectype);
+ DB_ASSERT(env, rp->rectype != REP_INVALID);
+ /*
+ * We should have a valid new record type for all the old
+ * versions.
+ */
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Converted to record %lu with old rep version %lu",
+ (u_long)rp->rectype, (u_long)rp->rep_version));
+ } else if (rp->rep_version > DB_REPVERSION) {
+ __db_errx(env, DB_STR_A("3517",
+ "unexpected replication message version %lu, expected %d",
+ "%lu %d"), (u_long)rp->rep_version, DB_REPVERSION);
+ ret = EINVAL;
+ goto errlock;
+ }
+
+ if (rp->log_version < DB_LOGVERSION) {
+ if (rp->log_version < DB_LOGVERSION_MIN) {
+ __db_errx(env, DB_STR_A("3518",
+ "unsupported old replication log version %lu, minimum version %d",
+ "%lu %d"), (u_long)rp->log_version,
+ DB_LOGVERSION_MIN);
+ ret = EINVAL;
+ goto errlock;
+ }
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Received record %lu with old log version %lu",
+ (u_long)rp->rectype, (u_long)rp->log_version));
+ } else if (rp->log_version > DB_LOGVERSION) {
+ __db_errx(env, DB_STR_A("3519",
+ "unexpected log record version %lu, expected %d",
+ "%lu %d"), (u_long)rp->log_version, DB_LOGVERSION);
+ ret = EINVAL;
+ goto errlock;
+ }
+
+ /*
+ * Acquire the replication lock.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ /*
+ * If we're racing with a thread in rep_start, then
+ * just ignore the message and return.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Racing replication msg lockout, ignore message."));
+ /*
+ * Although we're ignoring the message, there are a few
+ * we need to pay a bit of attention to anyway. All of
+ * these cases are mutually exclusive.
+ * 1. If it is a PERM message, we don't want to return 0.
+ * 2. If it is a NEWSITE message let the app know so it can
+ * do whatever it needs for connection purposes.
+ * 3. If it is a c2c request, tell the sender we're not
+ * going to handle it.
+ */
+ if (F_ISSET(rp, REPCTL_PERM))
+ ret = DB_REP_IGNORE;
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If this is new site information return DB_REP_NEWSITE so
+ * that the user can use whatever information may have been
+ * sent for connections.
+ */
+ if (rp->rectype == REP_NEWSITE)
+ ret = DB_REP_NEWSITE;
+ /*
+ * If another client has sent a c2c request to us, it may be a
+ * long time before it resends the request (due to its dual data
+ * streams avoidance heuristic); let it know we can't serve the
+ * request just now.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rp->rectype)) {
+ STAT(rep->stat.st_client_svc_req++);
+ STAT(rep->stat.st_client_svc_miss++);
+ (void)__rep_send_message(env,
+ eid, REP_REREQUEST, NULL, NULL, 0, 0);
+ }
+ goto out;
+ }
+ rep->msg_th++;
+ gen = rep->gen;
+ master_id = rep->master_id;
+ recovering = IS_REP_RECOVERING(rep);
+ savetime = renv->rep_timestamp;
+
+ STAT(rep->stat.st_msgs_processed++);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Check for lease configuration matching. Leases must be
+ * configured all or none. If I am a client and I receive a
+ * message requesting a lease, and I'm not using leases, that
+ * is an error.
+ */
+ if (!IS_USING_LEASES(env) &&
+ (F_ISSET(rp, REPCTL_LEASE) || rp->rectype == REP_LEASE_GRANT)) {
+ __db_errx(env, DB_STR("3520",
+ "Inconsistent lease configuration"));
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Client received lease message and not using leases"));
+ ret = EINVAL;
+ ret = __env_panic(env, ret);
+ goto errlock;
+ }
+
+ /*
+ * Check for generation number matching. Ignore any old messages
+ * except requests that are indicative of a new client that needs
+ * to get in sync.
+ */
+ if (rp->gen < gen && rp->rectype != REP_ALIVE_REQ &&
+ rp->rectype != REP_NEWCLIENT && rp->rectype != REP_MASTER_REQ &&
+ rp->rectype != REP_DUPMASTER && rp->rectype != REP_VOTE1) {
+ /*
+ * We don't hold the rep mutex, and could miscount if we race.
+ */
+ STAT(rep->stat.st_msgs_badgen++);
+ if (F_ISSET(rp, REPCTL_PERM))
+ ret = DB_REP_IGNORE;
+ goto errlock;
+ }
+
+ if (rp->gen > gen) {
+ /*
+ * If I am a master and am out of date with a lower generation
+ * number, I am in bad shape and should downgrade.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ STAT(rep->stat.st_dupmasters++);
+ ret = DB_REP_DUPMASTER;
+ /*
+ * Only broadcast DUPMASTER if leases are not
+ * in effect. If I am an old master, using
+ * leases and I get a newer message, my leases
+ * had better all be expired.
+ */
+ if (IS_USING_LEASES(env))
+ DB_ASSERT(env,
+ __rep_lease_check(env, 0) ==
+ DB_REP_LEASE_EXPIRED);
+ else if (rp->rectype != REP_DUPMASTER)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_DUPMASTER,
+ NULL, NULL, 0, 0);
+ goto errlock;
+ }
+
+ /*
+ * I am a client and am out of date. If this is an election,
+ * or a response from the first site I contacted, then I can
+ * accept the generation number and participate in future
+ * elections and communication. Otherwise, I need to hear about
+ * a new master and sync up.
+ */
+ if (rp->rectype == REP_ALIVE ||
+ rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) {
+ REP_SYSTEM_LOCK(env);
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Updating gen from %lu to %lu",
+ (u_long)gen, (u_long)rp->gen));
+ rep->master_id = DB_EID_INVALID;
+ gen = rp->gen;
+ SET_GEN(gen);
+ /*
+ * Updating of egen will happen when we process the
+ * message below for each message type.
+ */
+ REP_SYSTEM_UNLOCK(env);
+ if (rp->rectype == REP_ALIVE)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL,
+ NULL, 0, 0);
+ } else if (rp->rectype != REP_NEWMASTER) {
+ /*
+ * Ignore this message, retransmit if needed.
+ */
+ if (__rep_check_doreq(env, rep))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ,
+ NULL, NULL, 0, 0);
+ goto errlock;
+ }
+ /*
+ * If you get here, then you're a client and either you're
+ * in an election or you have a NEWMASTER or an ALIVE message
+ * whose processing will do the right thing below.
+ */
+ }
+
+ /*
+ * If the sender is part of an established group, so are we now.
+ */
+ if (F_ISSET(rp, REPCTL_GROUP_ESTD)) {
+ REP_SYSTEM_LOCK(env);
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD))
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "I am now part of an established group"));
+#endif
+ F_SET(rep, REP_F_GROUP_ESTD);
+ REP_SYSTEM_UNLOCK(env);
+ }
+
+ /*
+ * We need to check if we're in recovery and if we are
+ * then we need to ignore any messages except VERIFY*, VOTE*,
+ * NEW* and ALIVE_REQ, or backup related messages: UPDATE*,
+ * PAGE* and FILE*. We need to also accept LOG messages
+ * if we're copying the log for recovery/backup.
+ */
+ switch (rp->rectype) {
+ case REP_ALIVE:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ if (rp->rep_version < DB_REPVERSION_47)
+ egen_arg.egen = *(u_int32_t *)rec->data;
+ else if ((ret = __rep_egen_unmarshal(env, &egen_arg,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ if (egen_arg.egen > rep->egen) {
+ /*
+ * If we're currently working futilely at processing an
+ * obsolete egen, treat it like an egen update, so that
+ * we abort the current rep_elect() call and signal the
+ * application to start a new one.
+ */
+ if (rep->spent_egen == rep->egen)
+ ret = DB_REP_HOLDELECTION;
+
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Received ALIVE egen of %lu, mine %lu",
+ (u_long)egen_arg.egen, (u_long)rep->egen));
+ __rep_elect_done(env, rep);
+ rep->egen = egen_arg.egen;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ break;
+ case REP_ALIVE_REQ:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+#ifdef CONFIG_TEST
+ /*
+ * Send this first, before the ALIVE message because of the
+ * way the test suite and messaging is done sequentially.
+ * In some sequences it is possible to get into a situation
+ * where the test suite cannot get the later NEWMASTER because
+ * we break out of the messaging loop too early.
+ */
+ if (F_ISSET(rep, REP_F_MASTER))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+#endif
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env,
+ &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_ALIVE, &lsn, &data_dbt, 0, 0);
+ break;
+ case REP_ALL_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ ret = __rep_allreq(env, rp, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_BULK_LOG:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp);
+ break;
+ case REP_BULK_PAGE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_bulk_page(env, ip, eid, rp, rec);
+ break;
+ case REP_DUPMASTER:
+ /*
+ * Handle even if we're recovering.
+ */
+ if (F_ISSET(rep, REP_F_MASTER))
+ ret = DB_REP_DUPMASTER;
+ break;
+#ifdef NOTYET
+ case REP_FILE: /* TODO */
+ CLIENT_ONLY(rep, rp);
+ break;
+ case REP_FILE_REQ:
+ ret = __rep_send_file(env, rec, eid);
+ break;
+#endif
+ case REP_FILE_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ /*
+ * Clean up any internal init that was in progress.
+ */
+ if (eid == rep->master_id) {
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If we're already locking out messages, give up.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG))
+ goto errhlk;
+ /*
+ * Lock out other messages to prevent race
+ * conditions.
+ */
+ if ((ret =
+ __rep_lockout_msg(env, rep, 1)) != 0) {
+ goto errhlk;
+ }
+ lockout = 1;
+ /*
+ * Need mtx_clientdb to safely clean up
+ * page database in __rep_init_cleanup().
+ */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Clean up internal init if one was in progress.
+ */
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "FILE_FAIL is cleaning up old internal init"));
+#ifdef CONFIG_TEST
+ STAT(rep->stat.st_filefail_cleanups++);
+#endif
+ ret = __rep_init_cleanup(env, rep, DB_FORCE);
+ F_CLR(rep, REP_F_ABBREVIATED);
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "FILE_FAIL error cleaning up internal init: %d", ret));
+ goto errhlk;
+ }
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ lockout = 0;
+ /*
+ * Restart internal init, setting UPDATE flag and
+ * zeroing applicable LSNs.
+ */
+ rep->sync_state = SYNC_UPDATE;
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, eid, REP_UPDATE_REQ,
+ NULL, NULL, 0, 0);
+ }
+ break;
+ case REP_LEASE_GRANT:
+ /*
+ * Handle even if we're recovering.
+ */
+ MASTER_ONLY(rep, rp);
+ ret = __rep_lease_grant(env, rp, rec, eid);
+ break;
+ case REP_LOG:
+ case REP_LOG_MORE:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_log(env, ip, rp, rec, eid, savetime, ret_lsnp);
+ break;
+ case REP_LOG_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ if (F_ISSET(rp, REPCTL_INIT))
+ MASTER_UPDATE(env, renv);
+ ret = __rep_logreq(env, rp, rec, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_NEWSITE:
+ /*
+ * Handle even if we're recovering.
+ */
+ /* We don't hold the rep mutex, and may miscount. */
+ STAT(rep->stat.st_newsites++);
+
+ /* This is a rebroadcast; simply tell the application. */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ eid, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ }
+ ret = DB_REP_NEWSITE;
+ break;
+ case REP_NEWCLIENT:
+ /*
+ * Handle even if we're recovering.
+ */
+ /*
+ * This message was received and should have resulted in the
+ * application entering the machine ID in its machine table.
+ * We respond to this with an ALIVE to send relevant information
+ * to the new client (if we are a master, we'll send a
+ * NEWMASTER, so we only need to send the ALIVE if we're a
+ * client). But first, broadcast the new client's record to
+ * all the clients.
+ */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWSITE, &rp->lsn, rec, 0, 0);
+
+ ret = DB_REP_NEWSITE;
+
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+
+ /*
+ * Clean up any previous master remnants by making
+ * master_id invalid and cleaning up any internal
+ * init that was in progress.
+ */
+ if (eid == rep->master_id) {
+ rep->master_id = DB_EID_INVALID;
+
+ /*
+				 * If we're already locking out messages,
+				 * we must be in sync-up recovery or
+				 * internal init; give up.
+ */
+ if (FLD_ISSET(rep->lockout_flags,
+ REP_LOCKOUT_MSG))
+ goto errhlk;
+
+ /*
+ * Lock out other messages to prevent race
+ * conditions.
+ */
+ if ((t_ret =
+ __rep_lockout_msg(env, rep, 1)) != 0) {
+ ret = t_ret;
+ goto errhlk;
+ }
+ lockout = 1;
+
+ /*
+ * Need mtx_clientdb to safely clean up
+ * page database in __rep_init_cleanup().
+ */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * Clean up internal init if one was in
+ * progress.
+ */
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "NEWCLIENT is cleaning up old internal init for invalid master"));
+ t_ret = __rep_init_cleanup(env,
+ rep, DB_FORCE);
+ F_CLR(rep, REP_F_ABBREVIATED);
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (t_ret != 0) {
+ ret = t_ret;
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "NEWCLIENT error cleaning up internal init for invalid master: %d", ret));
+ goto errhlk;
+ }
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ lockout = 0;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env, &egen_arg,
+ buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ break;
+ }
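+		/*
+		 * A non-client site (normally the master) falls through to
+		 * the MASTER_REQ case below, which broadcasts NEWMASTER so
+		 * the new client learns who the master is.
+		 */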
+ /* FALLTHROUGH */
+ case REP_MASTER_REQ:
+ RECOVERING_SKIP;
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ (void)__rep_lease_refresh(env);
+ }
+ /*
+		 * If there is no master, an old client that lost the
+		 * initial ALIVE message may be calling an election under
+		 * an old gen and could never get to the current gen.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && rp->gen < gen) {
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+ if (eid == rep->master_id)
+ rep->master_id = DB_EID_INVALID;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env, &egen_arg,
+ buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env, eid,
+ REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ }
+ break;
+ case REP_NEWFILE:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_apply(env,
+ ip, rp, rec, ret_lsnp, NULL, &last_lsn);
+ if (ret == DB_REP_LOGREADY)
+ ret = __rep_logready(env, rep, savetime, &last_lsn);
+ break;
+ case REP_NEWMASTER:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ if (F_ISSET(rep, REP_F_MASTER) &&
+ eid != rep->eid) {
+ /* We don't hold the rep mutex, and may miscount. */
+ STAT(rep->stat.st_dupmasters++);
+ ret = DB_REP_DUPMASTER;
+ if (IS_USING_LEASES(env))
+ DB_ASSERT(env,
+ __rep_lease_check(env, 0) ==
+ DB_REP_LEASE_EXPIRED);
+ else
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_DUPMASTER,
+ NULL, NULL, 0, 0);
+ break;
+ }
+ if ((ret =
+ __rep_new_master(env, rp, eid)) == DB_REP_NEWMASTER)
+ ret = __rep_fire_newmaster(env, rp->gen, eid);
+ break;
+ case REP_PAGE:
+ case REP_PAGE_FAIL:
+ case REP_PAGE_MORE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_page(env, ip, eid, rp, rec);
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ case REP_PAGE_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_page_req(env, ip, eid, rp, rec);
+ CLIENT_REREQ;
+ break;
+ case REP_REREQUEST:
+ /*
+ * Handle even if we're recovering. Don't do a master
+ * check.
+ */
+ CLIENT_ONLY(rep, rp);
+ /*
+ * Don't hold any mutex, may miscount.
+ */
+ STAT(rep->stat.st_client_rerequests++);
+ ret = __rep_resend_req(env, 1);
+ break;
+ case REP_START_SYNC:
+ RECOVERING_SKIP;
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn);
+ /*
+ * The comparison needs to be <= because the LSN in
+ * the message can be the LSN of the first outstanding
+ * txn, which may be the LSN immediately after the
+ * previous commit. The ready_lsn is the LSN of the
+ * next record expected. In that case, the LSNs
+ * could be equal and the client has the commit and
+ * wants to sync. [SR #15338]
+ */
+ if (cmp <= 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
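+			/*
+			 * Defer the actual memp_sync until the end of
+			 * message processing, once all mutexes have been
+			 * dropped; see the do_sync block after this switch.
+			 */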
+ do_sync = 1;
+ } else {
+ STAT(rep->stat.st_startsync_delayed++);
+ /*
+			 * There are cases where keeping the first ckp_lsn
+			 * is advantageous and cases where keeping a later
+			 * LSN is better. If random earlier log records
+			 * are missing, keeping the later LSN seems to be
+			 * better, so that is what we do for now.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->ckp_lsn) > 0)
+ rep->ckp_lsn = rp->lsn;
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Delayed START_SYNC memp_sync due to missing records."));
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "ready LSN [%lu][%lu], ckp_lsn [%lu][%lu]",
+ (u_long)lp->ready_lsn.file, (u_long)lp->ready_lsn.offset,
+ (u_long)rep->ckp_lsn.file, (u_long)rep->ckp_lsn.offset));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ break;
+ case REP_UPDATE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ if ((ret = __rep_update_setup(env,
+ eid, rp, rec, savetime, &lsn)) == DB_REP_WOULDROLLBACK &&
+ ret_lsnp != NULL) {
+ /*
+			 * This can't happen during a normal internal init,
+			 * but it could happen here if we had to ask for an
+			 * UPDATE message in order to check for materializing
+			 * NIMDBs; in other words, during an "abbreviated
+			 * internal init."
+ */
+ *ret_lsnp = lsn;
+ }
+ break;
+ case REP_UPDATE_REQ:
+ /*
+ * Handle even if we're recovering.
+ */
+ MASTER_ONLY(rep, rp);
+ infop = env->reginfo;
+ renv = infop->primary;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_update_req(env, rp);
+ break;
+ case REP_VERIFY:
+ if (recovering) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ cmp = LOG_COMPARE(&lp->verify_lsn, &rp->lsn);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If this is not the verify record I want, skip it.
+ */
+ if (cmp != 0) {
+ ret = __rep_skip_msg(
+ env, rep, eid, rp->rectype);
+ break;
+ }
+ }
+ CLIENT_ONLY(rep, rp);
+ if ((ret = __rep_verify(env, rp, rec, eid, savetime)) ==
+ DB_REP_WOULDROLLBACK && ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ break;
+ case REP_VERIFY_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_verify_fail(env, rp);
+ break;
+ case REP_VERIFY_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ ret = __rep_verify_req(env, rp, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_VOTE1:
+ /*
+ * Handle even if we're recovering.
+ */
+ ret = __rep_vote1(env, rp, rec, eid);
+ break;
+ case REP_VOTE2:
+ /*
+ * Handle even if we're recovering.
+ */
+ ret = __rep_vote2(env, rp, rec, eid);
+ break;
+ default:
+ __db_errx(env, DB_STR_A("3521",
+ "DB_ENV->rep_process_message: unknown replication message: type %lu",
+ "%lu"), (u_long)rp->rectype);
+ ret = EINVAL;
+ break;
+ }
+
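+	/*
+	 * The error labels below encode how much lock state the jumping
+	 * code holds: "errlock" is entered with no locks held and
+	 * reacquires the region lock; "errhlk" is entered with the region
+	 * lock already held.  Both paths clear the message lockout if this
+	 * thread set it, then decrement the message-thread count.
+	 */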
+errlock:
+ REP_SYSTEM_LOCK(env);
+errhlk: if (lockout)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ rep->msg_th--;
+ REP_SYSTEM_UNLOCK(env);
+ if (do_sync) {
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ lsn = rp->lsn;
+ /*
+ * This is the REP_START_SYNC sync, and so we permit it to be
+ * interrupted.
+ */
+ ret = __memp_sync(
+ env, DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "START_SYNC: Completed sync [%lu][%lu]",
+ (u_long)lsn.file, (u_long)lsn.offset));
+ }
+out:
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ __dbt_userfree(env, control, rec, NULL);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_apply --
+ *
+ * Handle incoming log records on a client, applying when possible and
+ * entering into the bookkeeping table otherwise. This routine manages
+ * the state of the incoming message stream -- processing records, via
+ * __rep_process_rec, when possible and enqueuing in the __db.rep.db
+ * when necessary. As gaps in the stream are filled in, this is where
+ * we try to process as much as possible from __db.rep.db to catch up.
+ *
+ * PUBLIC: int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *,
+ * PUBLIC: DBT *, DB_LSN *, int *, DB_LSN *));
+ */
+int
+__rep_apply(env, ip, rp, rec, ret_lsnp, is_dupp, last_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ DB_LSN *ret_lsnp;
+ int *is_dupp;
+ DB_LSN *last_lsnp;
+{
+ DB *dbp;
+ DBT control_dbt, key_dbt;
+ DBT rec_dbt;
+ DB_LOG *dblp;
+ DB_LSN max_lsn, save_lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ db_timespec msg_time, max_ts;
+ u_int32_t gen, rectype;
+ int cmp, event, master, newfile_seen, ret, set_apply, t_ret;
+
+ COMPQUIET(gen, 0);
+ COMPQUIET(master, DB_EID_INVALID);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ event = ret = set_apply = 0;
+ memset(&control_dbt, 0, sizeof(control_dbt));
+ memset(&rec_dbt, 0, sizeof(rec_dbt));
+ ZERO_LSN(max_lsn);
+ timespecclear(&max_ts);
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+ cmp = -2; /* OOB value that LOG_COMPARE can't return. */
+
+ dblp = env->lg_handle;
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ /*
+ * Lazily open the temp db. Always set the startup flag to 0
+ * because it was initialized from rep_start.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ dbp = db_rep->rep_db;
+ lp = dblp->reginfo.primary;
+ newfile_seen = 0;
+ REP_SYSTEM_LOCK(env);
+ if (rep->sync_state == SYNC_LOG &&
+ LOG_COMPARE(&lp->ready_lsn, &rep->first_lsn) < 0)
+ lp->ready_lsn = rep->first_lsn;
+ cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn);
+ /*
+ * If we are going to skip or process any message other
+ * than a duplicate, make note of it if we're in an
+ * election so that the election can rerequest proactively.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_APPLY) && cmp >= 0)
+ F_SET(rep, REP_F_SKIPPED_APPLY);
+
+ /*
+ * If we're in the middle of processing a NEWFILE, we've dropped
+ * the mutex and if this matches it is a duplicate record. We
+ * do not want this call taking the "matching" code below because
+ * we may then process later records in the temp db and the
+ * original NEWFILE may not have the log file ready. It will
+ * process those temp db items when it completes.
+ */
+ if (F_ISSET(rep, REP_F_NEWFILE) && cmp == 0)
+ cmp = -1;
+
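+	/*
+	 * Dispatch on cmp: 0 means this is the record we're expecting, so
+	 * apply it (and anything it unblocks in the temp db); > 0 means it
+	 * arrived early, so queue it in the temp db; < 0 means it is a
+	 * duplicate we have already handled.
+	 */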
+ if (cmp == 0) {
+ /*
+ * If we are in an election (i.e. we've sent a vote
+ * with an LSN in it), then we drop the next record
+ * we're expecting. When we find a master, we'll
+ * either go into sync, or if it was an existing
+ * master, rerequest this one record (later records
+ * are accumulating in the temp db).
+ *
+ * We can simply return here, and rep_process_message
+ * will set NOTPERM if necessary for this record.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_APPLY)) {
+ /*
+ * We will simply return now. All special return
+ * processing should be ignored because the special
+ * values are just initialized. Variables like
+ * max_lsn are still 0.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_apply: In election. Ignoring [%lu][%lu]",
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ rep->apply_th++;
+ set_apply = 1;
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_apply: Set apply_th %d", rep->apply_th));
+ REP_SYSTEM_UNLOCK(env);
+ if (rp->rectype == REP_NEWFILE)
+ newfile_seen = 1;
+ if ((ret = __rep_process_rec(env, ip,
+ rp, rec, &max_ts, &max_lsn)) != 0)
+ goto err;
+ /*
+ * If we get the record we are expecting, reset
+ * the count of records we've received and are applying
+ * towards the request interval.
+ */
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->max_wait_lsn);
+
+ /*
+ * The __rep_remfirst() and __rep_getnext() functions each open,
+ * use and then close a cursor on the temp db, each time through
+ * the loop. Although this may seem excessive, it is necessary
+ * to avoid locking problems with checkpoints.
+ */
+ while (ret == 0 &&
+ LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0) {
+ /*
+ * We just filled in a gap in the log record stream.
+ * Write subsequent records to the log.
+ */
+gap_check:
+ if ((ret = __rep_remfirst(env, ip,
+ &control_dbt, &rec_dbt)) != 0)
+ goto err;
+
+ rp = (__rep_control_args *)control_dbt.data;
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+ rec = &rec_dbt;
+ if (rp->rectype == REP_NEWFILE)
+ newfile_seen = 1;
+ if ((ret = __rep_process_rec(env, ip,
+ rp, rec, &max_ts, &max_lsn)) != 0)
+ goto err;
+
+ STAT(--rep->stat.st_log_queued);
+
+ /*
+			 * Since we just filled a gap in the log stream and
+			 * are writing subsequent records to the log, reset
+			 * rcvd_ts and wait_ts so that we will request the
+			 * next gap if we end up with one and the temp db
+			 * holds only stale records, but will not request
+			 * if recent records are in the temp db and likely
+			 * to arrive on their own shortly. Also reset
+			 * max_wait_lsn because the next gap is a fresh gap.
+ */
+ lp->rcvd_ts = lp->last_ts;
+ lp->wait_ts = rep->request_gap;
+ if ((ret = __rep_getnext(env, ip)) == DB_NOTFOUND) {
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ret = 0;
+ break;
+ } else if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Check if we're at a gap in the table and if so, whether we
+ * need to ask for any records.
+ */
+ if (!IS_ZERO_LSN(lp->waiting_lsn) &&
+ LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) != 0) {
+ /*
+ * We got a record and processed it, but we may
+ * still be waiting for more records. If we
+ * filled a gap we keep a count of how many other
+ * records are in the temp database and if we should
+ * request the next gap at this time.
+ */
+ if (__rep_check_doreq(env, rep) && (ret =
+ __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0)
+ goto err;
+ } else {
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->max_wait_lsn);
+ }
+
+ } else if (cmp > 0) {
+ /*
+ * The LSN is higher than the one we were waiting for.
+ * This record isn't in sequence; add it to the temporary
+ * database, update waiting_lsn if necessary, and perform
+ * calculations to determine if we should issue requests
+ * for new records.
+ */
+ REP_SYSTEM_UNLOCK(env);
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = rp;
+ key_dbt.size = sizeof(*rp);
+ ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE);
+ if (ret == 0) {
+ STAT(rep->stat.st_log_queued++);
+ __os_gettime(env, &lp->last_ts, 1);
+#ifdef HAVE_STATISTICS
+ rep->stat.st_log_queued_total++;
+ if (rep->stat.st_log_queued_max <
+ rep->stat.st_log_queued)
+ rep->stat.st_log_queued_max =
+ rep->stat.st_log_queued;
+#endif
+ }
+
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ if (ret != 0 && ret != ENOMEM)
+ goto done;
+
+ /*
+		 * If we are using an in-memory temp database and got
+		 * ENOMEM, it is not an error. But in that case we want
+		 * to skip comparing the message LSN since we're not
+		 * storing it. However, we do want to continue checking
+		 * whether we need to send a request for the gap.
+ */
+ if (ret == 0 && (IS_ZERO_LSN(lp->waiting_lsn) ||
+ LOG_COMPARE(&rp->lsn, &lp->waiting_lsn) < 0)) {
+ /*
+ * If this is a new gap, then reset the rcvd_ts so
+ * that an out-of-order record after an idle period
+ * does not (likely) immediately rerequest.
+ */
+ if (IS_ZERO_LSN(lp->waiting_lsn))
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->waiting_lsn = rp->lsn;
+ }
+
+ if (__rep_check_doreq(env, rep) &&
+		    (ret = __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0)
+ goto err;
+
+ /*
+		 * If this is permanent, let the caller know that we have
+		 * accepted it but have not yet written it to disk.
+ */
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) {
+ max_lsn = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ goto done;
+ } else {
+ STAT(rep->stat.st_log_duplicated++);
+ REP_SYSTEM_UNLOCK(env);
+ if (is_dupp != NULL) {
+ *is_dupp = 1;
+ /*
+ * Could get overwritten by max_lsn later.
+ * But max_lsn is guaranteed <= ready_lsn, so
+ * it would be a more conservative LSN to return.
+ */
+ *ret_lsnp = lp->ready_lsn;
+ }
+ LOGCOPY_32(env, &rectype, rec->data);
+ if (rectype == DB___txn_regop || rectype == DB___txn_ckp)
+ max_lsn = lp->max_perm_lsn;
+ /*
+		 * We check REPCTL_LEASE here because this client may
+		 * have leases configured but the master may not
+		 * (especially in a mixed-version group). If the master
+		 * has leases configured, all clients must also.
+ */
+ if (IS_USING_LEASES(env) &&
+ F_ISSET(rp, REPCTL_LEASE) &&
+ timespecisset(&msg_time)) {
+ if (timespeccmp(&msg_time, &lp->max_lease_ts, >))
+ max_ts = msg_time;
+ else
+ max_ts = lp->max_lease_ts;
+ }
+ goto done;
+ }
+
+ /* Check if we need to go back into the table. */
+ if (ret == 0 && LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0)
+ goto gap_check;
+
+done:
+err: /*
+ * In case of a race, to make sure only one thread can get
+ * DB_REP_LOGREADY, zero out rep->last_lsn to show that we've gotten to
+ * this point.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (ret == 0 &&
+ rep->sync_state == SYNC_LOG &&
+ !IS_ZERO_LSN(rep->last_lsn) &&
+ LOG_COMPARE(&lp->ready_lsn, &rep->last_lsn) >= 0) {
+ *last_lsnp = max_lsn;
+ ZERO_LSN(rep->last_lsn);
+ ZERO_LSN(max_lsn);
+ ret = DB_REP_LOGREADY;
+ }
+ /*
+ * Only decrement if we were actually applying log records.
+ * We do not care if we processed a dup record or put one
+ * in the temp db.
+ */
+ if (set_apply) {
+ rep->apply_th--;
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_apply: Decrement apply_th %d [%lu][%lu]",
+ rep->apply_th, (u_long)lp->ready_lsn.file,
+ (u_long)lp->ready_lsn.offset));
+ }
+
+ if (ret == 0 && rep->sync_state != SYNC_LOG &&
+ !IS_ZERO_LSN(max_lsn)) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = max_lsn;
+ ret = DB_REP_ISPERM;
+ DB_ASSERT(env, LOG_COMPARE(&max_lsn, &lp->max_perm_lsn) >= 0);
+ lp->max_perm_lsn = max_lsn;
+ if ((t_ret = __rep_notify_threads(env, AWAIT_LSN)) != 0)
+ ret = t_ret;
+ }
+
+ /*
+ * Start-up is complete when we process (or have already processed) up
+ * to the end of the replication group's log. In case we miss that
+ * message, as a back-up, we also recognize start-up completion when we
+ * actually process a live log record. Having cmp==0 here (with a good
+ * "ret" value) implies we actually processed the record.
+ */
+ if ((ret == 0 || ret == DB_REP_ISPERM) &&
+ rep->stat.st_startup_complete == 0 &&
+ rep->sync_state != SYNC_LOG &&
+ ((cmp <= 0 && F_ISSET(rp, REPCTL_LOG_END)) ||
+ (cmp == 0 && !F_ISSET(rp, REPCTL_RESEND)))) {
+ rep->stat.st_startup_complete = 1;
+ event = 1;
+ gen = rep->gen;
+ master = rep->master_id;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If we've processed beyond the needed LSN for a pending
+ * start sync, start it now. We must compare > here
+ * because ready_lsn is the next record we expect and if
+ * the last record is a commit, that will dirty pages on
+ * a client as that txn is applied.
+ */
+ if (!IS_ZERO_LSN(rep->ckp_lsn) &&
+ LOG_COMPARE(&lp->ready_lsn, &rep->ckp_lsn) > 0) {
+ save_lsn = rep->ckp_lsn;
+ ZERO_LSN(rep->ckp_lsn);
+ } else
+ ZERO_LSN(save_lsn);
+
+ /*
+	 * If this is a perm record and we are using leases, update the lease
+ * grant. We must hold the clientdb mutex. We must not hold
+ * the region mutex because rep_update_grant will acquire it.
+ */
+ if (ret == DB_REP_ISPERM && IS_USING_LEASES(env) &&
+ timespecisset(&max_ts)) {
+ if ((t_ret = __rep_update_grant(env, &max_ts)) != 0)
+ ret = t_ret;
+ else if (timespeccmp(&max_ts, &lp->max_lease_ts, >))
+ lp->max_lease_ts = max_ts;
+ }
+
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!IS_ZERO_LSN(save_lsn)) {
+ /*
+ * Now call memp_sync holding only the ckp mutex.
+ */
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Starting delayed __memp_sync call [%lu][%lu]",
+ (u_long)save_lsn.file, (u_long)save_lsn.offset));
+ t_ret = __memp_sync(env,
+ DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &save_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ }
+ if (event) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Start-up is done [%lu][%lu]",
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+
+ if ((t_ret = __rep_fire_startupdone(env, gen, master)) != 0) {
+ DB_ASSERT(env, ret == 0 || ret == DB_REP_ISPERM);
+ /* Failure trumps either of those values. */
+ ret = t_ret;
+ goto out;
+ }
+ }
+ if ((ret == 0 || ret == DB_REP_ISPERM) &&
+ newfile_seen && lp->db_log_autoremove)
+ __log_autoremove(env);
+ if (control_dbt.data != NULL)
+ __os_ufree(env, control_dbt.data);
+ if (rec_dbt.data != NULL)
+ __os_ufree(env, rec_dbt.data);
+
+out:
+ switch (ret) {
+ case 0:
+ break;
+ case DB_REP_ISPERM:
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning ISPERM [%lu][%lu], cmp = %d",
+ (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+ break;
+ case DB_REP_LOGREADY:
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning LOGREADY up to [%lu][%lu], cmp = %d",
+ (u_long)last_lsnp->file,
+ (u_long)last_lsnp->offset, cmp));
+ break;
+ case DB_REP_NOTPERM:
+ if (rep->sync_state != SYNC_LOG &&
+ !IS_ZERO_LSN(max_lsn) && ret_lsnp != NULL)
+ *ret_lsnp = max_lsn;
+
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning NOTPERM [%lu][%lu], cmp = %d",
+ (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+ break;
+ default:
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning %d [%lu][%lu], cmp = %d", ret,
+ (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rep_process_txn --
+ *
+ * This is the routine that actually gets a transaction ready for
+ * processing.
+ *
+ * PUBLIC: int __rep_process_txn __P((ENV *, DBT *));
+ */
+int
+__rep_process_txn(env, rec)
+ ENV *env;
+ DBT *rec;
+{
+ DBT data_dbt, *lock_dbt;
+ DB_LOCKER *locker;
+ DB_LOCKREQ req, *lvp;
+ DB_LOGC *logc;
+ DB_LSN prev_lsn, *lsnp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB_TXNHEAD *txninfo;
+ LSN_COLLECTION lc;
+ REP *rep;
+ __txn_regop_args *txn_args;
+ __txn_regop_42_args *txn42_args;
+ __txn_prepare_args *prep_args;
+ u_int32_t rectype;
+ u_int i;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ logc = NULL;
+ txn_args = NULL;
+ txn42_args = NULL;
+ prep_args = NULL;
+ txninfo = NULL;
+
+ ENV_ENTER(env, ip);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ if (F_ISSET(env, ENV_THREAD))
+ F_SET(&data_dbt, DB_DBT_REALLOC);
+
+ /*
+ * There are two phases: First, we have to traverse backwards through
+ * the log records gathering the list of all LSNs in the transaction.
+ * Once we have this information, we can loop through and then apply it.
+ *
+ * We may be passed a prepare (if we're restoring a prepare on upgrade)
+ * instead of a commit (the common case). Check which it is and behave
+ * appropriately.
+ */
+ LOGCOPY_32(env, &rectype, rec->data);
+ memset(&lc, 0, sizeof(lc));
+ if (rectype == DB___txn_regop) {
+ /*
+ * We're the end of a transaction. Make sure this is
+ * really a commit and not an abort!
+ */
+ if (rep->version >= DB_REPVERSION_44) {
+ if ((ret = __txn_regop_read(
+ env, rec->data, &txn_args)) != 0)
+ return (ret);
+ if (txn_args->opcode != TXN_COMMIT) {
+ __os_free(env, txn_args);
+ return (0);
+ }
+ prev_lsn = txn_args->prev_lsn;
+ lock_dbt = &txn_args->locks;
+ } else {
+ if ((ret = __txn_regop_42_read(
+ env, rec->data, &txn42_args)) != 0)
+ return (ret);
+ if (txn42_args->opcode != TXN_COMMIT) {
+ __os_free(env, txn42_args);
+ return (0);
+ }
+ prev_lsn = txn42_args->prev_lsn;
+ lock_dbt = &txn42_args->locks;
+ }
+ } else {
+ /* We're a prepare. */
+ DB_ASSERT(env, rectype == DB___txn_prepare);
+
+ if ((ret = __txn_prepare_read(
+ env, rec->data, &prep_args)) != 0)
+ return (ret);
+ prev_lsn = prep_args->prev_lsn;
+ lock_dbt = &prep_args->locks;
+ }
+
+ /* Get locks. */
+ if ((ret = __lock_id(env, NULL, &locker)) != 0)
+ goto err1;
+
+ /* We are always more important than user transactions. */
+ locker->priority = DB_LOCK_MAXPRIORITY;
+
+ if ((ret =
+ __lock_get_list(env, locker, 0, DB_LOCK_WRITE, lock_dbt)) != 0)
+ goto err;
+
+ /* Phase 1. Get a list of the LSNs in this transaction, and sort it. */
+ if ((ret = __rep_collect_txn(env, &prev_lsn, &lc)) != 0)
+ goto err;
+ qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp);
+
+ /*
+ * The set of records for a transaction may include dbreg_register
+ * records. Create a txnlist so that they can keep track of file
+ * state between records.
+ */
+ if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
+ goto err;
+
+ /* Phase 2: Apply updates. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ for (lsnp = &lc.array[0], i = 0; i < lc.nlsns; i++, lsnp++) {
+ if ((ret = __logc_get(logc, lsnp, &data_dbt, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("3522",
+ "failed to read the log at [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ goto err;
+ }
+ if ((ret = __db_dispatch(env, &env->recover_dtab,
+ &data_dbt, lsnp, DB_TXN_APPLY, txninfo)) != 0) {
+ __db_errx(env, DB_STR_A("3523",
+ "transaction failed at [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ goto err;
+ }
+ }
+
+err: memset(&req, 0, sizeof(req));
+ req.op = DB_LOCK_PUT_ALL;
+ if ((t_ret =
+ __lock_vec(env, locker, 0, &req, 1, &lvp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+
+err1: if (txn_args != NULL)
+ __os_free(env, txn_args);
+ if (txn42_args != NULL)
+ __os_free(env, txn42_args);
+ if (prep_args != NULL)
+ __os_free(env, prep_args);
+ if (lc.array != NULL)
+ __os_free(env, lc.array);
+
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ if (F_ISSET(&data_dbt, DB_DBT_REALLOC) && data_dbt.data != NULL)
+ __os_ufree(env, data_dbt.data);
+
+#ifdef HAVE_STATISTICS
+ if (ret == 0)
+ /*
+ * We don't hold the rep mutex, and could miscount if we race.
+ */
+ rep->stat.st_txns_applied++;
+#endif
+
+ return (ret);
+}
+
+/*
+ * __rep_collect_txn
+ * Recursive function that will let us visit every entry in a transaction
+ * chain including all child transactions so that we can then apply
+ * the entire transaction family at once.
+ */
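+/*
+ * Sketch of the traversal (illustrative): each log record ends with a
+ * prev_lsn pointing at the previous record of the same transaction, and
+ * a __txn_child record branches to the child's chain:
+ *
+ *	commit -> rec -> __txn_child -> rec -> ... (parent chain)
+ *	                      |
+ *	                      +-> child's last rec -> ... (recursed)
+ *
+ * The walk ends when prev_lsn is zero, and every visited LSN is
+ * appended to the LSN_COLLECTION for sorting and apply.
+ */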
+static int
+__rep_collect_txn(env, lsnp, lc)
+ ENV *env;
+ DB_LSN *lsnp;
+ LSN_COLLECTION *lc;
+{
+ __txn_child_args *argp;
+ DB_LOGC *logc;
+ DB_LSN c_lsn;
+ DBT data;
+ u_int32_t rectype;
+ u_int nalloc;
+ int ret, t_ret;
+
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_REALLOC);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ while (!IS_ZERO_LSN(*lsnp) &&
+ (ret = __logc_get(logc, lsnp, &data, DB_SET)) == 0) {
+ LOGCOPY_32(env, &rectype, data.data);
+ if (rectype == DB___txn_child) {
+ if ((ret = __txn_child_read(
+ env, data.data, &argp)) != 0)
+ goto err;
+ c_lsn = argp->c_lsn;
+ *lsnp = argp->prev_lsn;
+ __os_free(env, argp);
+ ret = __rep_collect_txn(env, &c_lsn, lc);
+ } else {
+ if (lc->nalloc < lc->nlsns + 1) {
+ nalloc = lc->nalloc == 0 ? 20 : lc->nalloc * 2;
+ if ((ret = __os_realloc(env,
+ nalloc * sizeof(DB_LSN), &lc->array)) != 0)
+ goto err;
+ lc->nalloc = nalloc;
+ }
+ lc->array[lc->nlsns++] = *lsnp;
+
+ /*
+ * Explicitly copy the previous lsn. The record
+ * starts with a u_int32_t record type, a u_int32_t
+ * txn id, and then the DB_LSN (prev_lsn) that we
+ * want. We copy explicitly because we have no idea
+ * what kind of record this is.
+ */
+ LOGCOPY_TOLSN(env, lsnp, (u_int8_t *)data.data +
+ sizeof(u_int32_t) + sizeof(u_int32_t));
+ }
+
+ if (ret != 0)
+ goto err;
+ }
+ if (ret != 0)
+ __db_errx(env, DB_STR_A("3524",
+ "collect failed at: [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ return (ret);
+}
+
+/*
+ * __rep_lsn_cmp --
+ * qsort-type-compatible wrapper for LOG_COMPARE.
+ */
+static int
+__rep_lsn_cmp(lsn1, lsn2)
+ const void *lsn1, *lsn2;
+{
+
+ return (LOG_COMPARE((DB_LSN *)lsn1, (DB_LSN *)lsn2));
+}
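+
+/*
+ * For illustration, __rep_process_txn above sorts the collected LSN
+ * array with this comparator before applying the records in order:
+ *
+ *	qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp);
+ *
+ * LOG_COMPARE orders LSNs first by file number, then by offset, so the
+ * array ends up in log-sequential order.
+ */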
+
+/*
+ * __rep_newfile --
+ * NEWFILE messages have the LSN of the last record in the previous
+ * log file. When applying a NEWFILE message, make sure we haven't already
+ * swapped files. Assumes the caller holds mtx_clientdb.
+ */
+static int
+__rep_newfile(env, rp, rec)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ DB_LOG *dblp;
+ DB_LSN tmplsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __rep_newfile_args nf_args;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * If a newfile is already in progress, just ignore.
+ */
+ if (F_ISSET(rep, REP_F_NEWFILE))
+ return (0);
+ if (rp->lsn.file + 1 > lp->ready_lsn.file) {
+ if (rec == NULL || rec->size == 0) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+"rep_newfile: Old-style NEWFILE msg. Use control msg log version: %lu",
+ (u_long) rp->log_version));
+ nf_args.version = rp->log_version;
+ } else if (rp->rep_version < DB_REPVERSION_47)
+ nf_args.version = *(u_int32_t *)rec->data;
+ else if ((ret = __rep_newfile_unmarshal(env, &nf_args,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_newfile: File %lu vers %lu",
+ (u_long)rp->lsn.file + 1, (u_long)nf_args.version));
+
+ /*
+ * We drop the mtx_clientdb mutex during
+ * the file operation, and then reacquire it when
+ * we're done. We avoid colliding with new incoming
+ * log records because lp->ready_lsn is not getting
+ * updated and there is no real log record at this
+ * ready_lsn. We avoid colliding with a duplicate
+ * NEWFILE message by setting an in-progress flag.
+ */
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_NEWFILE);
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_newfile(dblp, &tmplsn, 0, nf_args.version);
+ LOG_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_NEWFILE);
+ REP_SYSTEM_UNLOCK(env);
+ if (ret == 0)
+ lp->ready_lsn = tmplsn;
+ return (ret);
+ } else
+ /* We've already applied this NEWFILE. Just ignore it. */
+ return (0);
+}
+
+/*
+ * __rep_do_ckp --
+ * Perform the memp_sync necessary for this checkpoint without holding the
+ * REP->mtx_clientdb. Callers of this function must hold REP->mtx_clientdb
+ * and must not be holding the region mutex.
+ */
+static int
+__rep_do_ckp(env, rec, rp)
+ ENV *env;
+ DBT *rec;
+ __rep_control_args *rp;
+{
+ DB_ENV *dbenv;
+ __txn_ckp_args *ckp_args;
+ DB_LSN ckp_lsn;
+ REP *rep;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ /* Crack the log record and extract the checkpoint LSN. */
+ if ((ret = __txn_ckp_read(env, rec->data, &ckp_args)) != 0)
+ return (ret);
+ ckp_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+
+ rep = env->rep_handle->region;
+
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ DB_TEST_WAIT(env, env->test_check);
+
+ /*
+ * Sync the memory pool.
+ *
+ * This is the real PERM lock record/ckp. We cannot return ISPERM
+ * if we haven't truly completed the checkpoint, so we don't allow
+ * this call to be interrupted.
+ *
+ * We may be overlapping our log record with an in-progress startsync
+ * of this checkpoint; suppress the max_write settings on any running
+ * cache-flush operation so it completes quickly.
+ */
+ (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 1);
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ ret = __memp_sync(env, DB_SYNC_CHECKPOINT, &ckp_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0);
+
+ /* Update the last_ckp in the txn region. */
+ if (ret == 0)
+ ret = __txn_updateckp(env, &rp->lsn);
+ else {
+ __db_errx(env, DB_STR_A("3525",
+ "Error syncing ckp [%lu][%lu]", "%lu %lu"),
+ (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+ ret = __env_panic(env, ret);
+ }
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ return (ret);
+}
+
+/*
+ * __rep_remfirst --
+ * Remove the first entry from the __db.rep.db
+ */
+static int
+__rep_remfirst(env, ip, cntrl, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DBT *cntrl;
+ DBT *rec;
+{
+ DB *dbp;
+ DBC *dbc;
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ dbp = db_rep->rep_db;
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* The DBTs need to persist through another call. */
+ F_SET(cntrl, DB_DBT_REALLOC);
+ F_SET(rec, DB_DBT_REALLOC);
+ if ((ret = __dbc_get(dbc, cntrl, rec, DB_RMW | DB_FIRST)) == 0)
+ ret = __dbc_del(dbc, 0);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_getnext --
+ * Get the next record out of the __db.rep.db table.
+ */
+static int
+__rep_getnext(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ DB *dbp;
+ DBC *dbc;
+ DBT lsn_dbt, nextrec_dbt;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ __rep_control_args *rp;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ db_rep = env->rep_handle;
+ dbp = db_rep->rep_db;
+
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Update waiting_lsn. We need to move it
+ * forward to the LSN of the next record
+ * in the queue.
+ *
+ * If the next item in the database is a log
+ * record--the common case--we're not
+ * interested in its contents, just in its LSN.
+ * Optimize by doing a partial get of the data item.
+ */
+ memset(&nextrec_dbt, 0, sizeof(nextrec_dbt));
+ F_SET(&nextrec_dbt, DB_DBT_PARTIAL);
+ nextrec_dbt.ulen = nextrec_dbt.dlen = 0;
+
+ memset(&lsn_dbt, 0, sizeof(lsn_dbt));
+ ret = __dbc_get(dbc, &lsn_dbt, &nextrec_dbt, DB_FIRST);
+ if (ret != DB_NOTFOUND && ret != 0)
+ goto err;
+
+ if (ret == DB_NOTFOUND) {
+ ZERO_LSN(lp->waiting_lsn);
+ /*
+ * Whether or not the current record is
+ * simple, there's no next one, and
+ * therefore we haven't got anything
+ * else to do right now. Break out.
+ */
+ goto err;
+ }
+ rp = (__rep_control_args *)lsn_dbt.data;
+ lp->waiting_lsn = rp->lsn;
+
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_process_rec --
+ *
+ * Given a record in 'rp', process it. In the case of a NEWFILE, that means
+ * potentially switching files. In the case of a checkpoint, it means doing
+ * the checkpoint, and in other cases, it means simply writing the record into
+ * the log.
+ */
+static int
+__rep_process_rec(env, ip, rp, rec, ret_tsp, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ db_timespec *ret_tsp;
+ DB_LSN *ret_lsnp;
+{
+ DB *dbp;
+ DBT control_dbt, key_dbt, rec_dbt;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_LOGC *logc;
+ LOG *lp;
+ REP *rep;
+ DB_LSN lsn;
+ db_timespec msg_time;
+ u_int32_t rectype, txnid;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ dbp = db_rep->rep_db;
+ ret = 0;
+
+ memset(&rec_dbt, 0, sizeof(rec_dbt));
+ if (rp->rectype == REP_NEWFILE) {
+ if ((ret = __rep_newfile(env, rp, rec)) != 0)
+ return (ret);
+
+ /*
+ * In SYNC_LOG, in case the end-of-log sync point happens to be
+ * right at the file boundary, we need to make sure ret_lsnp
+ * points to a real log record, rather than the "dead space" at
+ * the end of the file that the NEWFILE msg normally points to.
+ */
+ if (rep->sync_state == SYNC_LOG) {
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ if ((ret = __logc_get(logc,
+ &lsn, &rec_dbt, DB_LAST)) == 0)
+ *ret_lsnp = lsn;
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+ }
+
+ LOGCOPY_32(env, &rectype, rec->data);
+ memset(&control_dbt, 0, sizeof(control_dbt));
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+
+ /*
+ * We write all records except for checkpoint records here.
+ * All non-checkpoint records need to appear in the log before
+ * we take action upon them (i.e., we enforce write-ahead logging).
+ * However, we can't write the checkpoint record here until the
+ * data buffers are actually written to disk, else we are creating
+ * an invalid log -- one that says all data before a certain point
+ * has been written to disk.
+ *
+ * If two threads are both processing the same checkpoint record
+ * (because, for example, it was resent and the original finally
+ * arrived), we handle that below by checking for the existence of
+ * the log record when we add it to the replication database.
+ *
+ * Any log records that arrive while we are processing the checkpoint
+ * are added to the bookkeeping database because ready_lsn is not yet
+ * updated to point after the checkpoint record.
+ */
+ if (rectype != DB___txn_ckp || rep->sync_state == SYNC_LOG) {
+ if ((ret = __log_rep_put(env, &rp->lsn, rec, 0)) != 0)
+ return (ret);
+ STAT(rep->stat.st_log_records++);
+ if (rep->sync_state == SYNC_LOG) {
+ *ret_lsnp = rp->lsn;
+ goto out;
+ }
+ }
+
+ switch (rectype) {
+ case DB___dbreg_register:
+ /*
+ * DB opens occur in the context of a transaction, so we can
+ * simply handle them when we process the transaction. Closes,
+ * however, are not transaction-protected, so we have to handle
+ * them here.
+ *
+		 * The master should never close a file that was opened
+		 * in an active transaction, so we should be guaranteed
+		 * to get the ordering right.
+ *
+ * !!!
+ * The txn ID is the second 4-byte field of the log record.
+ * We should really be calling __dbreg_register_read() and
+ * working from the __dbreg_register_args structure, but this
+ * is considerably faster and the order of the fields won't
+ * change.
+ */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)rec->data + sizeof(u_int32_t));
+ if (txnid == TXN_INVALID)
+ ret = __db_dispatch(env, &env->recover_dtab,
+ rec, &rp->lsn, DB_TXN_APPLY, NULL);
+ break;
+ case DB___txn_regop:
+ /*
+ * If an application is doing app-specific recovery
+ * and acquires locks while applying a transaction,
+ * it can deadlock. Any other locks held by this
+ * thread should have been discarded in the
+ * __rep_process_txn error path, so if we simply
+ * retry, we should eventually succeed.
+ */
+ do {
+ ret = 0;
+ if (!F_ISSET(db_rep, DBREP_OPENFILES)) {
+ ret = __txn_openfiles(env, ip, NULL, 1);
+ F_SET(db_rep, DBREP_OPENFILES);
+ }
+ if (ret == 0)
+ ret = __rep_process_txn(env, rec);
+ } while (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED);
+
+ /* Now flush the log unless we're running TXN_NOSYNC. */
+ if (ret == 0 && !F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
+ ret = __log_flush(env, NULL);
+ if (ret != 0) {
+ __db_errx(env, DB_STR_A("3526",
+ "Error processing txn [%lu][%lu]", "%lu %lu"),
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset);
+ ret = __env_panic(env, ret);
+ }
+ *ret_lsnp = rp->lsn;
+ break;
+ case DB___txn_prepare:
+ ret = __log_flush(env, NULL);
+ /*
+ * Save the biggest prepared LSN we've seen.
+ */
+ rep->max_prep_lsn = rp->lsn;
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "process_rec: prepare at [%lu][%lu]",
+ (u_long)rep->max_prep_lsn.file,
+ (u_long)rep->max_prep_lsn.offset));
+ break;
+ case DB___txn_ckp:
+ /*
+ * We do not want to hold the REP->mtx_clientdb mutex while
+ * syncing the mpool, so if we get a checkpoint record we are
+ * supposed to process, add it to the __db.rep.db, do the
+ * memp_sync and then go back and process it later, when the
+ * sync has finished. If this record is already in the table,
+ * then some other thread will process it, so simply return
+		 * DB_REP_NOTPERM.
+ */
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = rp;
+ key_dbt.size = sizeof(*rp);
+
+ /*
+ * We want to put this record into the tmp DB only if
+ * it doesn't exist, so use DB_NOOVERWRITE.
+ */
+ ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE);
+ if (ret == DB_KEYEXIST) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ if (ret != 0)
+ break;
+
+ /*
+ * Now, do the checkpoint. Regardless of
+ * whether the checkpoint succeeds or not,
+ * we need to remove the record we just put
+ * in the temporary database. If the
+ * checkpoint failed, return an error. We
+ * will act like we never received the
+ * checkpoint.
+ */
+ if ((ret = __rep_do_ckp(env, rec, rp)) == 0)
+ ret = __log_rep_put(env, &rp->lsn, rec,
+ DB_LOG_CHKPNT);
+ if ((t_ret = __rep_remfirst(env, ip,
+ &control_dbt, &rec_dbt)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If we're successful putting the log record in the
+ * log, flush it for a checkpoint.
+ */
+ if (ret == 0) {
+ *ret_lsnp = rp->lsn;
+ ret = __log_flush(env, NULL);
+ if (ret == 0 && lp->db_log_autoremove)
+ __log_autoremove(env);
+ }
+ break;
+ default:
+ break;
+ }
+
+out:
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM))
+ *ret_lsnp = rp->lsn;
+ if (IS_USING_LEASES(env) &&
+ F_ISSET(rp, REPCTL_LEASE))
+ *ret_tsp = msg_time;
+ /*
+ * Set ret_lsnp before flushing the log because if the
+ * flush fails, we've still written the record to the
+ * log and the LSN has been entered.
+ */
+ if (ret == 0 && F_ISSET(rp, REPCTL_FLUSH))
+ ret = __log_flush(env, NULL);
+ if (control_dbt.data != NULL)
+ __os_ufree(env, control_dbt.data);
+ if (rec_dbt.data != NULL)
+ __os_ufree(env, rec_dbt.data);
+
+ return (ret);
+}
+
+/*
+ * __rep_resend_req --
+ * We might have dropped a message, we need to resend our request.
+ * The request we send is dependent on what recovery state we're in.
+ * The caller holds no locks.
+ *
+ * PUBLIC: int __rep_resend_req __P((ENV *, int));
+ */
+int
+__rep_resend_req(env, rereq)
+ ENV *env;
+ int rereq;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn, *lsnp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int master, ret;
+ repsync_t sync_state;
+ u_int32_t gapflags, msgtype, repflags, sendflags;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ lsnp = NULL;
+ msgtype = REP_INVALID;
+ sendflags = 0;
+
+ repflags = rep->flags;
+ sync_state = rep->sync_state;
+ /*
+ * If we are delayed we do not rerequest anything.
+ */
+ if (FLD_ISSET(repflags, REP_F_DELAY))
+ return (ret);
+ gapflags = rereq ? REP_GAP_REREQUEST : 0;
+
+ if (sync_state == SYNC_VERIFY) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!IS_ZERO_LSN(lsn)) {
+ msgtype = REP_VERIFY_REQ;
+ lsnp = &lsn;
+ sendflags = DB_REP_REREQUEST;
+ }
+ } else if (sync_state == SYNC_UPDATE) {
+ /*
+ * UPDATE_REQ only goes to the master.
+ */
+ msgtype = REP_UPDATE_REQ;
+ } else if (sync_state == SYNC_PAGE) {
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_pggap_req(env, rep, NULL, gapflags);
+ REP_SYSTEM_UNLOCK(env);
+ } else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ ret = __rep_loggap_req(env, rep, NULL, gapflags);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
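+	/*
+	 * Summary of the dispatch above: SYNC_VERIFY resends VERIFY_REQ,
+	 * SYNC_UPDATE resends UPDATE_REQ (to the master only), SYNC_PAGE
+	 * re-issues the page-gap request and anything else re-issues the
+	 * log-gap request.
+	 */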
+
+ if (msgtype != REP_INVALID) {
+ master = rep->master_id;
+ if (master == DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+ else
+ (void)__rep_send_message(env,
+ master, msgtype, lsnp, NULL, 0, sendflags);
+ }
+
+ return (ret);
+}
+
+/*
+ * __rep_check_doreq --
+ * PUBLIC: int __rep_check_doreq __P((ENV *, REP *));
+ *
+ * Check if we need to send another request. If so, compare with
+ * the request limits the user might have set. This assumes the
+ * caller holds the REP->mtx_clientdb mutex. Returns 1 if a request
+ * needs to be made, and 0 if it does not.
+ */
+int
+__rep_check_doreq(env, rep)
+ ENV *env;
+ REP *rep;
+{
+
+ DB_LOG *dblp;
+ LOG *lp;
+ db_timespec now;
+ int req;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ __os_gettime(env, &now, 1);
+ timespecsub(&now, &lp->rcvd_ts);
+ req = timespeccmp(&now, &lp->wait_ts, >=);
+ if (req) {
+ /*
+ * Add wait_ts to itself to double it.
+ */
+ timespecadd(&lp->wait_ts, &lp->wait_ts);
+ if (timespeccmp(&lp->wait_ts, &rep->max_gap, >))
+ lp->wait_ts = rep->max_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ }
+ return (req);
+}
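+
+/*
+ * Illustration of the backoff above (values hypothetical, not
+ * necessarily the defaults): starting from a configured request gap of
+ * 40ms with a maximum gap of 1280ms, successive triggered requests
+ * wait 40ms, 80ms, 160ms, ... capped at 1280ms, until a matching
+ * record arrives and __rep_apply resets wait_ts to rep->request_gap.
+ */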
+
+/*
+ * __rep_skip_msg -
+ *
+ * If we're in recovery we want to skip/ignore the message, but
+ * we also need to see if we need to re-request any retransmissions.
+ */
+static int
+__rep_skip_msg(env, rep, eid, rectype)
+ ENV *env;
+ REP *rep;
+ int eid;
+ u_int32_t rectype;
+{
+ int do_req, ret;
+
+ ret = 0;
+ /*
+ * If we have a request message from a client then immediately
+ * send a REP_REREQUEST back to that client since we're skipping it.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rectype))
+ do_req = 1;
+ else {
+ /* Check for need to retransmit. */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ do_req = __rep_check_doreq(env, rep);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ /*
+ * Don't respond to a MASTER_REQ with
+ * a MASTER_REQ or REREQUEST.
+ */
+ if (do_req && rectype != REP_MASTER_REQ) {
+ /*
+ * There are three cases:
+ * 1. If we don't know who the master is, then send MASTER_REQ.
+ * 2. If the message we're skipping came from the master,
+ * then we need to rerequest.
+		 * 3. If we are a client and the message came from another
+		 * client (i.e. client to client), send a rerequest back to
+		 * the sender so the sender can rerequest it elsewhere.
+ */
+ if (rep->master_id == DB_EID_INVALID) /* Case 1. */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+ else if (eid == rep->master_id) /* Case 2. */
+ ret = __rep_resend_req(env, 0);
+ else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. */
+ (void)__rep_send_message(env,
+ eid, REP_REREQUEST, NULL, NULL, 0, 0);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_check_missing --
+ * PUBLIC: int __rep_check_missing __P((ENV *, u_int32_t, DB_LSN *));
+ *
+ * Check for and request any missing client information.
+ */
+int
+__rep_check_missing(env, gen, master_perm_lsn)
+ ENV *env;
+ u_int32_t gen;
+ DB_LSN *master_perm_lsn;
+{
+ DB_LOG *dblp;
+ DB_LSN *end_lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGINFO *infop;
+ REP *rep;
+ __rep_fileinfo_args *curinfo;
+ int do_req, has_log_gap, has_page_gap, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ infop = env->reginfo;
+ has_log_gap = has_page_gap = ret = 0;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check if we are okay to proceed with this operation. If not,
+ * do not rerequest anything.
+ */
+ if (!F_ISSET(rep, REP_F_CLIENT) || rep->master_id == DB_EID_INVALID ||
+ gen != rep->gen || FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If this client is out-of-date, ask the master to identify
+ * itself so that this client will synchronize with the
+ * master's later generation.
+ */
+ if (gen > rep->gen && __rep_check_doreq(env, rep))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ,
+ NULL, NULL, 0, 0);
+ goto out;
+ }
+
+ /*
+	 * Prevent message lockout by counting ourselves here.
+ * Setting rep->msg_th will prevent a major system
+ * change, such as a role change or running recovery, from
+ * occurring before sending out any rerequests.
+ */
+ rep->msg_th++;
+ REP_SYSTEM_UNLOCK(env);
+
+ /* Check that it is time to request missing information. */
+ if ((do_req = __rep_check_doreq(env, rep))) {
+ /* Check for interior or tail page gap. */
+ REP_SYSTEM_LOCK(env);
+ if (rep->sync_state == SYNC_PAGE &&
+ rep->curinfo_off != INVALID_ROFF) {
+ GET_CURINFO(rep, infop, curinfo);
+ has_page_gap =
+ rep->waiting_pg != PGNO_INVALID ||
+ rep->ready_pg <= curinfo->max_pgno;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ }
+ /* Check for interior or tail log gap. */
+ if (do_req && !has_page_gap) {
+ lp = dblp->reginfo.primary;
+ /*
+ * The LOG_COMPARE test is <= because ready_lsn is
+ * the next LSN we are expecting but we do not have
+ * it yet. If the needed LSN is at this LSN, it
+ * means we are missing the last record we need.
+ */
+ if (rep->sync_state == SYNC_LOG)
+ end_lsn = &rep->last_lsn;
+ else
+ end_lsn = master_perm_lsn;
+ has_log_gap = !IS_ZERO_LSN(lp->waiting_lsn) ||
+ LOG_COMPARE(&lp->ready_lsn, end_lsn) <= 0;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If it is time to send a request, only do so if we
+ * have a log gap or a page gap, or we need to resend an
+ * UPDATE_REQ or VERIFY_REQ, or we are in SYNC_LOG to keep
+ * requesting to the current known end of the log.
+ */
+ do_req = do_req && (has_log_gap || has_page_gap ||
+ rep->sync_state == SYNC_LOG ||
+ rep->sync_state == SYNC_UPDATE ||
+ rep->sync_state == SYNC_VERIFY);
+ /*
+	 * This determines the request type from the current
+	 * replication state and resends the request. The request
+	 * may have the DB_REP_ANYWHERE flag set if appropriate.
+ */
+ if (do_req)
+ ret = __rep_resend_req(env, 0);
+
+ REP_SYSTEM_LOCK(env);
+ rep->msg_th--;
+ REP_SYSTEM_UNLOCK(env);
+
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__rep_fire_newmaster(env, gen, master)
+ ENV *env;
+ u_int32_t gen;
+ int master;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_EVENT_LOCK(env);
+ /*
+ * The firing of this event should be idempotent with respect to a
+ * particular generation number.
+ */
+ if (rep->newmaster_event_gen < gen) {
+ __rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master);
+ rep->newmaster_event_gen = gen;
+ }
+ REP_EVENT_UNLOCK(env);
+ return (0);
+}
+
+static int
+__rep_fire_startupdone(env, gen, master)
+ ENV *env;
+ u_int32_t gen;
+ int master;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_EVENT_LOCK(env);
+ /*
+ * Usually NEWMASTER will already have been fired. But if not, fire
+ * it here now, to ensure the application receives events in the
+ * expected order.
+ */
+ if (rep->newmaster_event_gen < gen) {
+ __rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master);
+ rep->newmaster_event_gen = gen;
+ }
+
+ /*
+ * Caller already ensures that it only tries to fire STARTUPDONE once
+ * per generation. If we did not want to rely on that, we could add a
+ * simple boolean flag (to the set of data protected by the mtx_event).
+ * The precise meaning of that flag would be "STARTUPDONE has been fired
+ * for the generation value stored in `newmaster_event_gen'". Then the
+ * more accurate test here would be simply to check that flag, and fire
+ * the event (and set the flag) if it were not already set.
+ */
+ if (rep->newmaster_event_gen == gen)
+ __rep_fire_event(env, DB_EVENT_REP_STARTUPDONE, NULL);
+ REP_EVENT_UNLOCK(env);
+ return (0);
+}
diff --git a/src/rep/rep_region.c b/src/rep/rep_region.c
new file mode 100644
index 00000000..f1d69dff
--- /dev/null
+++ b/src/rep/rep_region.c
@@ -0,0 +1,610 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+static int __rep_egen_init __P((ENV *, REP *));
+static int __rep_gen_init __P((ENV *, REP *));
+
+/*
+ * __rep_open --
+ * Initialize the shared memory state for the replication system.
+ *
+ * PUBLIC: int __rep_open __P((ENV *));
+ */
+int
+__rep_open(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int i, ret;
+ char *p;
+ char fname[sizeof(REP_DIAGNAME) + 3];
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+ DB_ASSERT(env, DBREP_DIAG_FILES < 100);
+
+ if (renv->rep_off == INVALID_ROFF) {
+ /* Must create the region. */
+ if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0)
+ return (ret);
+ memset(rep, 0, sizeof(*rep));
+
+ /*
+ * We have the region; fill in the values. Some values may
+ * have been configured before we open the region, and those
+ * are taken from the DB_REP structure.
+ */
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_REGION, 0, &rep->mtx_region)) != 0)
+ return (ret);
+ /*
+ * Because we have no way to prevent deadlocks and cannot log
+ * changes made to it, we single-thread access to the client
+ * bookkeeping database. This is suboptimal, but it only gets
+ * accessed when messages arrive out-of-order, so it should
+ * stay small and not be used in a high-performance app.
+ */
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_DATABASE, 0, &rep->mtx_clientdb)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_CHKPT, 0, &rep->mtx_ckp)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_DIAG, 0, &rep->mtx_diag)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_EVENT, 0, &rep->mtx_event)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_START, 0, &rep->mtx_repstart)) != 0)
+ return (ret);
+
+ rep->diag_off = 0;
+ rep->diag_index = 0;
+ rep->newmaster_event_gen = 0;
+ rep->notified_egen = 0;
+ rep->curinfo_off = INVALID_ROFF;
+ rep->lease_off = INVALID_ROFF;
+ rep->originfo_off = INVALID_ROFF;
+ rep->tally_off = INVALID_ROFF;
+ rep->v2tally_off = INVALID_ROFF;
+ rep->eid = db_rep->eid;
+ rep->master_id = DB_EID_INVALID;
+ rep->version = DB_REPVERSION;
+
+ SH_TAILQ_INIT(&rep->waiters);
+ SH_TAILQ_INIT(&rep->free_waiters);
+
+ rep->config = db_rep->config;
+ /*
+		 * The in-memory replication-files setting must be made before
+		 * we open the env, so we know here whether it is in memory.
+		 * In-memory replication cannot use the diag files, so turn
+		 * off the DB_VERB_REP_SYSTEM verbose setting.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ FLD_CLR(env->dbenv->verbose, DB_VERB_REP_SYSTEM);
+
+ if ((ret = __rep_gen_init(env, rep)) != 0)
+ return (ret);
+ if ((ret = __rep_egen_init(env, rep)) != 0)
+ return (ret);
+ rep->gbytes = db_rep->gbytes;
+ rep->bytes = db_rep->bytes;
+ rep->request_gap = db_rep->request_gap;
+ rep->max_gap = db_rep->max_gap;
+ rep->config_nsites = db_rep->config_nsites;
+ rep->elect_timeout = db_rep->elect_timeout;
+ rep->full_elect_timeout = db_rep->full_elect_timeout;
+ rep->lease_timeout = db_rep->lease_timeout;
+ rep->clock_skew = db_rep->clock_skew;
+ rep->clock_base = db_rep->clock_base;
+ timespecclear(&rep->lease_duration);
+ timespecclear(&rep->grant_expire);
+ rep->chkpt_delay = db_rep->chkpt_delay;
+ rep->priority = db_rep->my_priority;
+
+ if ((ret = __rep_lockout_archive(env, rep)) != 0)
+ return (ret);
+
+ /* Copy application type flags if set before env open. */
+ if (F_ISSET(db_rep, DBREP_APP_REPMGR))
+ F_SET(rep, REP_F_APP_REPMGR);
+ if (F_ISSET(db_rep, DBREP_APP_BASEAPI))
+ F_SET(rep, REP_F_APP_BASEAPI);
+
+ /* Initialize encapsulating region. */
+ renv->rep_off = R_OFFSET(infop, rep);
+ (void)time(&renv->rep_timestamp);
+ renv->op_timestamp = 0;
+ F_CLR(renv, DB_REGENV_REPLOCKED);
+
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_open(env, rep)) != 0)
+ return (ret);
+#endif
+ } else {
+ rep = R_ADDR(infop, renv->rep_off);
+ /*
+ * Prevent an application type mismatch between a process
+ * and the environment it is trying to join.
+ */
+ if ((F_ISSET(db_rep, DBREP_APP_REPMGR) &&
+ F_ISSET(rep, REP_F_APP_BASEAPI)) ||
+ (F_ISSET(db_rep, DBREP_APP_BASEAPI) &&
+ F_ISSET(rep, REP_F_APP_REPMGR))) {
+ __db_errx(env, DB_STR("3535",
+ "Application type mismatch for a replication "
+ "process joining the environment"));
+ return (EINVAL);
+ }
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_join(env, rep)) != 0)
+ return (ret);
+#endif
+ }
+
+ db_rep->region = rep;
+ /*
+	 * Open the diagnostic message files for this env handle.  We do
+	 * this whether or not we created the environment.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ goto out;
+ for (i = 0; i < DBREP_DIAG_FILES; i++) {
+ db_rep->diagfile[i] = NULL;
+ (void)snprintf(fname, sizeof(fname), REP_DIAGNAME, i);
+ if ((ret = __db_appname(env, DB_APP_NONE, fname,
+ NULL, &p)) != 0)
+ goto err;
+ ret = __os_open(env, p, 0, DB_OSO_CREATE, DB_MODE_600,
+ &db_rep->diagfile[i]);
+ __os_free(env, p);
+ if (ret != 0)
+ goto err;
+ }
+
+out:
+ return (0);
+
+err:
+ (void)__rep_close_diagfiles(env);
+ return (ret);
+}
+
+/*
+ * __rep_close_diagfiles --
+ * Close any diag message files that are open.
+ *
+ * PUBLIC: int __rep_close_diagfiles __P((ENV *));
+ */
+int
+__rep_close_diagfiles(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int i, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = t_ret = 0;
+
+ for (i = 0; i < DBREP_DIAG_FILES; i++) {
+ if (db_rep->diagfile[i] != NULL &&
+ (t_ret = __os_closehandle(env, db_rep->diagfile[i])) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ db_rep->diagfile[i] = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_env_refresh --
+ * Replication-specific refresh of the ENV structure.
+ *
+ * PUBLIC: int __rep_env_refresh __P((ENV *));
+ */
+int
+__rep_env_refresh(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ struct __rep_waiter *waiter;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+
+ /*
+	 * If we are the last reference closing the env, clear our knowledge
+	 * of belonging to a group and of there being a valid handle on
+	 * which rep_start has already been called.
+ */
+ if (renv->refcnt == 1) {
+ F_CLR(rep, REP_F_GROUP_ESTD);
+ F_CLR(rep, REP_F_START_CALLED);
+ }
+
+#ifdef HAVE_REPLICATION_THREADS
+ ret = __repmgr_env_refresh(env);
+#endif
+
+ /*
+	 * If a private region, return the memory to the heap.  This is not
+	 * needed for filesystem-backed or system shared memory regions;
+	 * that memory isn't owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ if (rep != NULL) {
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_region)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_clientdb)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_ckp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_diag)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_event)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_repstart)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard commit queue elements. */
+ DB_ASSERT(env, SH_TAILQ_EMPTY(&rep->waiters));
+ while ((waiter = SH_TAILQ_FIRST(&rep->free_waiters,
+ __rep_waiter)) != NULL) {
+ SH_TAILQ_REMOVE(&rep->free_waiters,
+ waiter, links, __rep_waiter);
+ __env_alloc_free(env->reginfo, waiter);
+ }
+
+ if (rep->curinfo_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->curinfo_off));
+ if (rep->lease_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ if (rep->originfo_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->originfo_off));
+ if (rep->tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->tally_off));
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ }
+
+ if (renv->rep_off != INVALID_ROFF)
+ __env_alloc_free(infop, R_ADDR(infop, renv->rep_off));
+ }
+ if ((t_ret = __rep_close_diagfiles(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ env->rep_handle->region = NULL;
+ return (ret);
+}
+
+/*
+ * __rep_env_close --
+ * Shut down all of replication.
+ *
+ * PUBLIC: int __rep_env_close __P((ENV *));
+ */
+int
+__rep_env_close(env)
+ ENV *env;
+{
+ int ret, t_ret;
+
+ ret = __rep_preclose(env);
+ if ((t_ret = __rep_closefiles(env)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_preclose --
+ * If we are a client, shut down our client database and send
+ * any outstanding bulk buffers.
+ *
+ * PUBLIC: int __rep_preclose __P((ENV *));
+ */
+int
+__rep_preclose(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ DB *dbp;
+ REP_BULK bulk;
+ int ret, t_ret;
+
+ ret = 0;
+
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+
+ /*
+ * If we have a rep region, we can preclose. Otherwise, return.
+ * If we're on an error path from env open, we may not have
+ * a region, even though we have a handle.
+ */
+ if (db_rep == NULL || db_rep->region == NULL)
+ return (ret);
+
+ if ((dbp = db_rep->lsn_db) != NULL) {
+ ret = __db_close(dbp, NULL, DB_NOSYNC);
+ db_rep->lsn_db = NULL;
+ }
+
+ MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
+ if (db_rep->rep_db != NULL) {
+ if ((t_ret = __db_close(db_rep->rep_db,
+ NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->rep_db = NULL;
+ }
+ /*
+ * We could be called early in an env_open error path, so
+ * only do this if we have a log region set up.
+ */
+ if (dblp == NULL)
+ goto out;
+ lp = dblp->reginfo.primary;
+ /*
+	 * If there is anything in the bulk buffer, send it if we are
+	 * able to.
+ */
+ if (lp->bulk_off != 0 && db_rep->send != NULL) {
+ memset(&bulk, 0, sizeof(bulk));
+ bulk.addr = R_ADDR(&dblp->reginfo, lp->bulk_buf);
+ bulk.offp = &lp->bulk_off;
+ bulk.len = lp->bulk_len;
+ bulk.type = REP_BULK_LOG;
+ bulk.eid = DB_EID_BROADCAST;
+ bulk.flagsp = &lp->bulk_flags;
+ /*
+ * Ignore send errors here. This can be called on the
+ * env->close path - make a best attempt to send.
+ */
+ (void)__rep_send_bulk(env, &bulk, 0);
+ }
+out: MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
+ return (ret);
+}
+
+/*
+ * __rep_closefiles --
+ * If we were a client and are now a master, close all databases
+ * we've opened while applying messages as a client. This can
+ *	be called from __env_close, so we need to check whether the
+ *	env, handles and regions are set up.
+ *
+ * PUBLIC: int __rep_closefiles __P((ENV *));
+ */
+int
+__rep_closefiles(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ int ret;
+
+ ret = 0;
+
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+
+ if (db_rep == NULL || db_rep->region == NULL)
+ return (ret);
+ if (dblp == NULL)
+ return (ret);
+ if ((ret = __dbreg_close_files(env, 0)) == 0)
+ F_CLR(db_rep, DBREP_OPENFILES);
+
+ return (ret);
+}
+
+/*
+ * __rep_egen_init --
+ *	Initialize the value of egen in the region.  Called only from
+ *	__rep_open, which is guaranteed to be single-threaded
+ * as we create the rep region. We set the rep->egen field which
+ * is normally protected by db_rep->region->mutex.
+ */
+static int
+__rep_egen_init(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_EGENNAME, NULL, &p)) != 0)
+ return (ret);
+ /*
+	 * If the file doesn't exist, create it now and initialize egen
+	 * to one greater than the current generation.
+ */
+ if (__os_exists(env, p, NULL) != 0) {
+ rep->egen = rep->gen + 1;
+ if ((ret = __rep_write_egen(env, rep, rep->egen)) != 0)
+ goto err;
+ } else {
+ /*
+ * File exists, open it and read in our egen.
+ */
+ if ((ret = __os_open(env, p, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp, &rep->egen, sizeof(u_int32_t),
+ &cnt)) != 0 || cnt != sizeof(u_int32_t))
+ goto err1;
+ RPRINT(env, (env, DB_VERB_REP_MISC, "Read in egen %lu",
+ (u_long)rep->egen));
+err1: (void)__os_closehandle(env, fhp);
+ }
+err: __os_free(env, p);
+ return (ret);
+}
+
+/*
+ * __rep_write_egen --
+ * Write out the egen into the env file.
+ *
+ * PUBLIC: int __rep_write_egen __P((ENV *, REP *, u_int32_t));
+ *
+ * Caller relies on us not dropping the REP_SYSTEM_LOCK.
+ */
+int
+__rep_write_egen(env, rep, egen)
+ ENV *env;
+ REP *rep;
+ u_int32_t egen;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ /*
+ * If running in-memory replication, return without any file
+ * operations.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ return (0);
+ }
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_EGENNAME, NULL, &p)) != 0)
+ return (ret);
+ if ((ret = __os_open(
+ env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) {
+ if ((ret = __os_write(env, fhp, &egen, sizeof(u_int32_t),
+ &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0))
+ __db_err(env, ret, "%s", p);
+ (void)__os_closehandle(env, fhp);
+ }
+ __os_free(env, p);
+ return (ret);
+}
+
+/*
+ * __rep_gen_init --
+ *	Initialize the value of gen in the region.  Called only from
+ *	__rep_open, which is guaranteed to be single-threaded
+ * as we create the rep region. We set the rep->gen field which
+ * is normally protected by db_rep->region->mutex.
+ */
+static int
+__rep_gen_init(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_GENNAME, NULL, &p)) != 0)
+ return (ret);
+
+ if (__os_exists(env, p, NULL) != 0) {
+ /*
+ * File doesn't exist, create it now and initialize with 0.
+ */
+ SET_GEN(0);
+ if ((ret = __rep_write_gen(env, rep, rep->gen)) != 0)
+ goto err;
+ } else {
+ /*
+ * File exists, open it and read in our gen.
+ */
+ if ((ret = __os_open(env, p, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp, &rep->gen, sizeof(u_int32_t),
+		    &cnt)) != 0 || cnt != sizeof(u_int32_t))
+ goto err1;
+ RPRINT(env, (env, DB_VERB_REP_MISC, "Read in gen %lu",
+ (u_long)rep->gen));
+err1: (void)__os_closehandle(env, fhp);
+ }
+err: __os_free(env, p);
+ return (ret);
+}
+
+/*
+ * __rep_write_gen --
+ * Write out the gen into the env file.
+ *
+ * PUBLIC: int __rep_write_gen __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_write_gen(env, rep, gen)
+ ENV *env;
+ REP *rep;
+ u_int32_t gen;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ /*
+ * If running in-memory replication, return without any file
+ * operations.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ return (0);
+ }
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_GENNAME, NULL, &p)) != 0)
+ return (ret);
+ if ((ret = __os_open(
+ env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) {
+ if ((ret = __os_write(env, fhp, &gen, sizeof(u_int32_t),
+ &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0))
+ __db_err(env, ret, "%s", p);
+ (void)__os_closehandle(env, fhp);
+ }
+ __os_free(env, p);
+ return (ret);
+}
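+
+/*
+ * A note on the on-disk format (a sketch, not part of the original
+ * code's contract): as the read/write pairs above show, the gen and
+ * egen files each hold one raw, native-endian u_int32_t, rewritten
+ * with DB_OSO_TRUNC and fsync'd on every update.  Assuming REP_GENNAME
+ * names a file such as "__db.rep.gen" in the environment's metadata
+ * directory, a minimal out-of-band reader would be:
+ *
+ *	unsigned int v;
+ *	FILE *fp;
+ *
+ *	if ((fp = fopen("__db.rep.gen", "rb")) != NULL) {
+ *		if (fread(&v, sizeof(v), 1, fp) == 1)
+ *			printf("gen %u\n", v);
+ *		(void)fclose(fp);
+ *	}
+ */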
diff --git a/src/rep/rep_stat.c b/src/rep/rep_stat.c
new file mode 100644
index 00000000..addfee25
--- /dev/null
+++ b/src/rep/rep_stat.c
@@ -0,0 +1,692 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __rep_print_all __P((ENV *, u_int32_t));
+static int __rep_print_stats __P((ENV *, u_int32_t));
+static int __rep_stat __P((ENV *, DB_REP_STAT **, u_int32_t));
+static int __rep_stat_summary_print __P((ENV *));
+static const char *__rep_syncstate_to_string __P((repsync_t));
+
+/*
+ * Print the individual statistic for items that appear both in the full and
+ * the summary replication statistics output.
+ */
+#define PRINT_LOGQUEUED(sp) do { \
+ __db_dl(env, "Number of log records currently queued", \
+ (u_long)(sp)->st_log_queued); \
+} while (0)
+
+#define PRINT_MAXPERMLSN(sp) do { \
+ __db_msg(env, "%lu/%lu\t%s", \
+ (u_long)(sp)->st_max_perm_lsn.file, \
+ (u_long)(sp)->st_max_perm_lsn.offset, \
+ (sp)->st_max_perm_lsn.file == 0 ? \
+ "No maximum permanent LSN" : \
+ "Maximum permanent LSN"); \
+} while (0)
+
+#define PRINT_MSGSRECOVER(sp) do { \
+ __db_dl(env, "Number of messages ignored due to pending recovery", \
+ (u_long)(sp)->st_msgs_recover); \
+} while (0)
+
+#define PRINT_MSGSSENDFAILURES(sp) do { \
+ __db_dl(env, "Number of failed message sends", \
+ (u_long)(sp)->st_msgs_send_failures); \
+} while (0)
+
+#define PRINT_STARTUPCOMPLETE(sp) do { \
+ if ((sp)->st_startup_complete == 0) \
+ __db_msg(env, "Startup incomplete"); \
+ else \
+ __db_msg(env, "Startup complete"); \
+} while (0)
+
+#define PRINT_STATUS(sp, is_client) do { \
+ is_client = 0; \
+ switch ((sp)->st_status) { \
+ case DB_REP_MASTER: \
+ __db_msg(env, \
+ "Environment configured as a replication master"); \
+ break; \
+ case DB_REP_CLIENT: \
+ __db_msg(env, \
+ "Environment configured as a replication client"); \
+ is_client = 1; \
+ break; \
+ default: \
+ __db_msg(env, \
+ "Environment not configured for replication"); \
+ break; \
+ } \
+} while (0)
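+
+/*
+ * These shared print macros are wrapped in do { ... } while (0) so that
+ * each one expands to a single statement and composes safely with
+ * unbraced if/else, as in the summary report below:
+ *
+ *	if (is_client)
+ *		PRINT_STARTUPCOMPLETE(sp);
+ */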
+
+/*
+ * __rep_stat_pp --
+ * ENV->rep_stat pre/post processing.
+ *
+ * PUBLIC: int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+ */
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_stat", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->rep_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __rep_stat(env, statp, flags);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
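+
+/*
+ * A usage sketch for the method backed by this function (assumptions:
+ * a DB_ENV opened with DB_INIT_REP, and the usual Berkeley DB
+ * convention that statistics memory allocated on behalf of the
+ * application is released with free(3)):
+ *
+ *	DB_REP_STAT *sp;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->rep_stat(dbenv, &sp, DB_STAT_CLEAR)) == 0) {
+ *		printf("gen %lu egen %lu\n",
+ *		    (u_long)sp->st_gen, (u_long)sp->st_egen);
+ *		free(sp);
+ *	}
+ */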
+
+/*
+ * __rep_stat --
+ * ENV->rep_stat.
+ */
+static int
+__rep_stat(env, statp, flags)
+ ENV *env;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_REP_STAT *stats;
+ LOG *lp;
+ REP *rep;
+ u_int32_t startupdone;
+ uintmax_t queued;
+ int dolock, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ *statp = NULL;
+
+ /* Allocate a stat struct to return to the user. */
+ if ((ret = __os_umalloc(env, sizeof(DB_REP_STAT), &stats)) != 0)
+ return (ret);
+
+ /*
+ * Read without holding the lock. If we are in client recovery, we
+ * copy just the stats struct so we won't block. We only copy out
+ * those stats that don't require acquiring any mutex.
+ */
+ dolock = IS_REP_RECOVERING(rep) ? 0 : 1;
+ memcpy(stats, &rep->stat, sizeof(*stats));
+
+ /* Copy out election stats. */
+ if (FLD_ISSET(rep->elect_flags, REP_E_PHASE1))
+ stats->st_election_status = 1;
+ else if (FLD_ISSET(rep->elect_flags, REP_E_PHASE2))
+ stats->st_election_status = 2;
+
+ stats->st_election_nsites = rep->sites;
+ stats->st_election_cur_winner = rep->winner;
+ stats->st_election_priority = rep->w_priority;
+ stats->st_election_gen = rep->w_gen;
+ stats->st_election_datagen = rep->w_datagen;
+ stats->st_election_lsn = rep->w_lsn;
+ stats->st_election_votes = rep->votes;
+ stats->st_election_nvotes = rep->nvotes;
+ stats->st_election_tiebreaker = rep->w_tiebreaker;
+
+ /* Copy out other info that's protected by the rep mutex. */
+ stats->st_env_id = rep->eid;
+ stats->st_env_priority = rep->priority;
+ stats->st_nsites = rep->nsites;
+ stats->st_master = rep->master_id;
+ stats->st_gen = rep->gen;
+ stats->st_egen = rep->egen;
+
+ if (F_ISSET(rep, REP_F_MASTER))
+ stats->st_status = DB_REP_MASTER;
+ else if (F_ISSET(rep, REP_F_CLIENT))
+ stats->st_status = DB_REP_CLIENT;
+ else
+ stats->st_status = 0;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ queued = rep->stat.st_log_queued;
+ startupdone = rep->stat.st_startup_complete;
+ memset(&rep->stat, 0, sizeof(rep->stat));
+ rep->stat.st_log_queued = rep->stat.st_log_queued_total =
+ rep->stat.st_log_queued_max = queued;
+ rep->stat.st_startup_complete = startupdone;
+ }
+
+ /*
+ * Log-related replication info is stored in the log system and
+ * protected by the log region lock.
+ */
+ if (dolock)
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ stats->st_next_lsn = lp->ready_lsn;
+ stats->st_waiting_lsn = lp->waiting_lsn;
+ stats->st_next_pg = rep->ready_pg;
+ stats->st_waiting_pg = rep->waiting_pg;
+ stats->st_max_lease_sec = (u_int32_t)lp->max_lease_ts.tv_sec;
+ stats->st_max_lease_usec = (u_int32_t)
+ (lp->max_lease_ts.tv_nsec / NS_PER_US);
+ } else {
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ stats->st_next_lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ } else
+ ZERO_LSN(stats->st_next_lsn);
+ ZERO_LSN(stats->st_waiting_lsn);
+ stats->st_max_lease_sec = 0;
+ stats->st_max_lease_usec = 0;
+ }
+ stats->st_max_perm_lsn = lp->max_perm_lsn;
+ if (dolock)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __rep_stat_print_pp --
+ * ENV->rep_stat_print pre/post processing.
+ *
+ * PUBLIC: int __rep_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_stat_print", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env, "DB_ENV->rep_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_CLEAR | DB_STAT_SUMMARY)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __rep_stat_print(env, flags);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __rep_stat_print --
+ * ENV->rep_stat_print method.
+ *
+ * PUBLIC: int __rep_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__rep_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (LF_ISSET(DB_STAT_SUMMARY))
+ return (__rep_stat_summary_print(env));
+
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __rep_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __rep_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
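+
+/*
+ * A usage sketch of the flag dispatch above: DB_STAT_SUMMARY
+ * short-circuits to the brief report, while DB_STAT_ALL adds the
+ * debugging region dump to the default statistics:
+ *
+ *	(void)dbenv->rep_stat_print(dbenv, DB_STAT_SUMMARY);
+ *	(void)dbenv->rep_stat_print(dbenv, DB_STAT_ALL | DB_STAT_CLEAR);
+ */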
+
+/*
+ * __rep_print_stats --
+ * Print out default statistics.
+ */
+static int
+__rep_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP_STAT *sp;
+ int is_client, ret;
+ char *p;
+
+ if ((ret = __rep_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default replication region information:");
+ PRINT_STATUS(sp, is_client);
+
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_next_lsn.file, (u_long)sp->st_next_lsn.offset,
+ is_client ? "Next LSN expected" : "Next LSN to be used");
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_waiting_lsn.file, (u_long)sp->st_waiting_lsn.offset,
+ sp->st_waiting_lsn.file == 0 ?
+ "Not waiting for any missed log records" :
+ "LSN of first log record we have after missed log records");
+ PRINT_MAXPERMLSN(sp);
+
+ __db_dl(env, "Next page number expected", (u_long)sp->st_next_pg);
+ p = sp->st_waiting_pg == PGNO_INVALID ?
+ "Not waiting for any missed pages" :
+ "Page number of first page we have after missed pages";
+ __db_msg(env, "%lu\t%s", (u_long)sp->st_waiting_pg, p);
+ __db_dl(env,
+ "Number of duplicate master conditions originally detected at this site",
+ (u_long)sp->st_dupmasters);
+ if (sp->st_env_id != DB_EID_INVALID)
+ __db_dl(env, "Current environment ID", (u_long)sp->st_env_id);
+ else
+ __db_msg(env, "No current environment ID");
+ __db_dl(env,
+ "Current environment priority", (u_long)sp->st_env_priority);
+ __db_dl(env, "Current generation number", (u_long)sp->st_gen);
+ __db_dl(env,
+ "Election generation number for the current or next election",
+ (u_long)sp->st_egen);
+ __db_dl(env, "Number of lease validity checks",
+ (u_long)sp->st_lease_chk);
+ __db_dl(env, "Number of invalid lease validity checks",
+ (u_long)sp->st_lease_chk_misses);
+ __db_dl(env,
+ "Number of lease refresh attempts during lease validity checks",
+ (u_long)sp->st_lease_chk_refresh);
+ __db_dl(env, "Number of live messages sent while using leases",
+ (u_long)sp->st_lease_sends);
+ __db_dl(env, "Number of duplicate log records received",
+ (u_long)sp->st_log_duplicated);
+ PRINT_LOGQUEUED(sp);
+ __db_dl(env, "Maximum number of log records ever queued at once",
+ (u_long)sp->st_log_queued_max);
+ __db_dl(env, "Total number of log records queued",
+ (u_long)sp->st_log_queued_total);
+ __db_dl(env,
+ "Number of log records received and appended to the log",
+ (u_long)sp->st_log_records);
+ __db_dl(env, "Number of log records missed and requested",
+ (u_long)sp->st_log_requested);
+ if (sp->st_master != DB_EID_INVALID)
+ __db_dl(env, "Current master ID", (u_long)sp->st_master);
+ else
+ __db_msg(env, "No current master ID");
+ __db_dl(env, "Number of times the master has changed",
+ (u_long)sp->st_master_changes);
+ __db_dl(env,
+ "Number of messages received with a bad generation number",
+ (u_long)sp->st_msgs_badgen);
+ __db_dl(env, "Number of messages received and processed",
+ (u_long)sp->st_msgs_processed);
+ PRINT_MSGSRECOVER(sp);
+ PRINT_MSGSSENDFAILURES(sp);
+ __db_dl(env, "Number of messages sent", (u_long)sp->st_msgs_sent);
+ __db_dl(env,
+ "Number of new site messages received", (u_long)sp->st_newsites);
+ __db_dl(env,
+ "Number of environments used in the last election",
+ (u_long)(sp)->st_nsites);
+ __db_dl(env, "Transmission limited", (u_long)sp->st_nthrottles);
+ __db_dl(env, "Number of outdated conditions detected",
+ (u_long)sp->st_outdated);
+ __db_dl(env, "Number of duplicate page records received",
+ (u_long)sp->st_pg_duplicated);
+ __db_dl(env, "Number of page records received and added to databases",
+ (u_long)sp->st_pg_records);
+ __db_dl(env, "Number of page records missed and requested",
+ (u_long)sp->st_pg_requested);
+ PRINT_STARTUPCOMPLETE(sp);
+ __db_dl(env,
+ "Number of transactions applied", (u_long)sp->st_txns_applied);
+
+ __db_dl(env, "Number of startsync messages delayed",
+ (u_long)sp->st_startsync_delayed);
+
+ __db_dl(env, "Number of elections held", (u_long)sp->st_elections);
+ __db_dl(env,
+ "Number of elections won", (u_long)sp->st_elections_won);
+
+ if (sp->st_election_status == 0) {
+ __db_msg(env, "No election in progress");
+ if (sp->st_election_sec > 0 || sp->st_election_usec > 0)
+ __db_msg(env,
+ "%lu.%.6lu\tDuration of last election (seconds)",
+ (u_long)sp->st_election_sec,
+ (u_long)sp->st_election_usec);
+ } else {
+ __db_dl(env, "Current election phase",
+ (u_long)sp->st_election_status);
+ __db_dl(env,
+ "Environment ID of the winner of the current or last election",
+ (u_long)sp->st_election_cur_winner);
+ __db_dl(env,
+ "Master generation number of the winner of the current or last election",
+ (u_long)sp->st_election_gen);
+ __db_dl(env,
+ "Master data generation number of the winner of the current or last election",
+ (u_long)sp->st_election_datagen);
+ __db_msg(env,
+ "%lu/%lu\tMaximum LSN of the winner of the current or last election",
+ (u_long)sp->st_election_lsn.file,
+ (u_long)sp->st_election_lsn.offset);
+ __db_dl(env,
+ "Number of sites responding to this site during the current election",
+ (u_long)sp->st_election_nsites);
+ __db_dl(env,
+ "Number of votes required in the current or last election",
+ (u_long)sp->st_election_nvotes);
+ __db_dl(env,
+ "Priority of the winner of the current or last election",
+ (u_long)sp->st_election_priority);
+ __db_dl(env,
+ "Tiebreaker value of the winner of the current or last election",
+ (u_long)sp->st_election_tiebreaker);
+ __db_dl(env,
+ "Number of votes received during the current election",
+ (u_long)sp->st_election_votes);
+ }
+ __db_dl(env, "Number of bulk buffer sends triggered by full buffer",
+ (u_long)sp->st_bulk_fills);
+ __db_dl(env, "Number of single records exceeding bulk buffer size",
+ (u_long)sp->st_bulk_overflows);
+ __db_dl(env, "Number of records added to a bulk buffer",
+ (u_long)sp->st_bulk_records);
+ __db_dl(env, "Number of bulk buffers sent",
+ (u_long)sp->st_bulk_transfers);
+ __db_dl(env, "Number of re-request messages received",
+ (u_long)sp->st_client_rerequests);
+ __db_dl(env,
+ "Number of request messages this client failed to process",
+ (u_long)sp->st_client_svc_miss);
+ __db_dl(env, "Number of request messages received by this client",
+ (u_long)sp->st_client_svc_req);
+ if (sp->st_max_lease_sec > 0 || sp->st_max_lease_usec > 0)
+ __db_msg(env,
+ "%lu.%.6lu\tDuration of maximum lease (seconds)",
+ (u_long)sp->st_max_lease_sec,
+ (u_long)sp->st_max_lease_usec);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __rep_print_all --
+ * Display debugging replication region statistics.
+ */
+static int
+__rep_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN rep_cfn[] = {
+ { REP_C_2SITE_STRICT, "REP_C_2SITE_STRICT" },
+ { REP_C_AUTOINIT, "REP_C_AUTOINIT" },
+ { REP_C_AUTOROLLBACK, "REP_C_AUTOROLLBACK" },
+ { REP_C_BULK, "REP_C_BULK" },
+ { REP_C_DELAYCLIENT, "REP_C_DELAYCLIENT" },
+ { REP_C_ELECTIONS, "REP_C_ELECTIONS" },
+ { REP_C_INMEM, "REP_C_INMEM" },
+ { REP_C_LEASE, "REP_C_LEASE" },
+ { REP_C_NOWAIT, "REP_C_NOWAIT" },
+ { 0, NULL }
+ };
+ static const FN rep_efn[] = {
+ { REP_E_PHASE0, "REP_E_PHASE0" },
+ { REP_E_PHASE1, "REP_E_PHASE1" },
+ { REP_E_PHASE2, "REP_E_PHASE2" },
+ { REP_E_TALLY, "REP_E_TALLY" },
+ { 0, NULL }
+ };
+ static const FN rep_fn[] = {
+ { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" },
+ { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" },
+ { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" },
+ { REP_F_CLIENT, "REP_F_CLIENT" },
+ { REP_F_DELAY, "REP_F_DELAY" },
+ { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" },
+ { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" },
+ { REP_F_MASTER, "REP_F_MASTER" },
+ { REP_F_MASTERELECT, "REP_F_MASTERELECT" },
+ { REP_F_NEWFILE, "REP_F_NEWFILE" },
+ { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" },
+ { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" },
+ { REP_F_START_CALLED, "REP_F_START_CALLED" },
+ { 0, NULL }
+ };
+ static const FN rep_lfn[] = {
+ { REP_LOCKOUT_API, "REP_LOCKOUT_API" },
+ { REP_LOCKOUT_APPLY, "REP_LOCKOUT_APPLY" },
+ { REP_LOCKOUT_ARCHIVE, "REP_LOCKOUT_ARCHIVE" },
+ { REP_LOCKOUT_MSG, "REP_LOCKOUT_MSG" },
+ { REP_LOCKOUT_OP, "REP_LOCKOUT_OP" },
+ { 0, NULL }
+ };
+ static const FN dbrep_fn[] = {
+ { DBREP_APP_BASEAPI, "DBREP_APP_BASEAPI" },
+ { DBREP_APP_REPMGR, "DBREP_APP_REPMGR" },
+ { DBREP_OPENFILES, "DBREP_OPENFILES" },
+ { 0, NULL }
+ };
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ char time_buf[CTIME_BUFLEN];
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_ENTER(env, ip);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_REP handle information:");
+
+ if (db_rep->rep_db == NULL)
+ STAT_ISSET("Bookkeeping database", db_rep->rep_db);
+ else
+ (void)__db_stat_print(db_rep->rep_db, ip, flags);
+
+ __db_prflags(env, NULL, db_rep->flags, dbrep_fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "REP handle information:");
+ __mutex_print_debug_single(env,
+ "Replication region mutex", rep->mtx_region, flags);
+ __mutex_print_debug_single(env,
+ "Bookkeeping database mutex", rep->mtx_clientdb, flags);
+
+ STAT_LONG("Environment ID", rep->eid);
+ STAT_LONG("Master environment ID", rep->master_id);
+ STAT_ULONG("Election generation", rep->egen);
+ STAT_ULONG("Last active egen", rep->spent_egen);
+ STAT_ULONG("Master generation", rep->gen);
+ STAT_LONG("Space allocated for sites", rep->asites);
+ STAT_LONG("Sites in group", rep->nsites);
+ STAT_LONG("Votes needed for election", rep->nvotes);
+ STAT_LONG("Priority in election", rep->priority);
+ __db_dlbytes(env, "Limit on data sent in a single call",
+ rep->gbytes, (u_long)0, rep->bytes);
+ STAT_LONG("Request gap seconds", rep->request_gap.tv_sec);
+ STAT_LONG("Request gap microseconds",
+ rep->request_gap.tv_nsec / NS_PER_US);
+ STAT_LONG("Maximum gap seconds", rep->max_gap.tv_sec);
+ STAT_LONG("Maximum gap microseconds",
+ rep->max_gap.tv_nsec / NS_PER_US);
+
+ STAT_ULONG("Callers in rep_proc_msg", rep->msg_th);
+ STAT_ULONG("Callers in rep_elect", rep->elect_th);
+ STAT_ULONG("Library handle count", rep->handle_cnt);
+ STAT_ULONG("Multi-step operation count", rep->op_cnt);
+ __db_msg(env, "%.24s\tRecovery timestamp",
+ renv->rep_timestamp == 0 ?
+ "0" : __os_ctime(&renv->rep_timestamp, time_buf));
+
+ STAT_LONG("Sites heard from", rep->sites);
+ STAT_LONG("Current winner", rep->winner);
+ STAT_LONG("Winner priority", rep->w_priority);
+ STAT_ULONG("Winner generation", rep->w_gen);
+ STAT_ULONG("Winner data generation", rep->w_datagen);
+ STAT_LSN("Winner LSN", &rep->w_lsn);
+ STAT_LONG("Winner tiebreaker", rep->w_tiebreaker);
+ STAT_LONG("Votes for this site", rep->votes);
+
+ STAT_STRING("Synchronization State",
+ __rep_syncstate_to_string(rep->sync_state));
+ __db_prflags(env, NULL, rep->config, rep_cfn, NULL,
+ "\tConfig Flags");
+ __db_prflags(env, NULL, rep->elect_flags, rep_efn, NULL,
+ "\tElect Flags");
+ __db_prflags(env, NULL, rep->lockout_flags, rep_lfn,
+ NULL, "\tLockout Flags");
+ __db_prflags(env, NULL, rep->flags, rep_fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG replication information:");
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ STAT_LSN("First log record after a gap", &lp->waiting_lsn);
+ STAT_LSN("Maximum permanent LSN processed", &lp->max_perm_lsn);
+ STAT_LSN("LSN waiting to verify", &lp->verify_lsn);
+ STAT_LSN("Maximum LSN requested", &lp->max_wait_lsn);
+ STAT_LONG("Time to wait before requesting seconds", lp->wait_ts.tv_sec);
+ STAT_LONG("Time to wait before requesting microseconds",
+ lp->wait_ts.tv_nsec / NS_PER_US);
+ STAT_LSN("Next LSN expected", &lp->ready_lsn);
+ STAT_LONG("Maximum lease timestamp seconds", lp->max_lease_ts.tv_sec);
+ STAT_LONG("Maximum lease timestamp microseconds",
+ lp->max_lease_ts.tv_nsec / NS_PER_US);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+
+ return (0);
+}
+
+static const char *
+__rep_syncstate_to_string(state)
+ repsync_t state;
+{
+ switch (state) {
+ case SYNC_OFF:
+ return ("Not Synchronizing");
+ case SYNC_LOG:
+ return ("SYNC_LOG");
+ case SYNC_PAGE:
+ return ("SYNC_PAGE");
+ case SYNC_UPDATE:
+ return ("SYNC_UPDATE");
+ case SYNC_VERIFY:
+ return ("SYNC_VERIFY");
+ default:
+ break;
+ }
+ return ("UNKNOWN STATE");
+}
+
+/*
+ * __rep_stat_summary_print --
+ * Print out a brief summary of replication statistics.
+ */
+static int
+__rep_stat_summary_print(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ DB_REP_STAT *sp;
+ REP *rep;
+ int is_client, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+ if ((ret = __rep_stat(env, &sp, 0)) == 0) {
+ PRINT_STATUS(sp, is_client);
+ if (is_client)
+ PRINT_STARTUPCOMPLETE(sp);
+ PRINT_MAXPERMLSN(sp);
+ /*
+ * Use the number of sites that is kept up-to-date most
+ * frequently. The rep_stat st_nsites is only current
+ * as of the last election.
+ */
+ __db_dl(env, "Number of environments in the replication group",
+ (u_long)rep->config_nsites);
+ PRINT_MSGSSENDFAILURES(sp);
+ PRINT_MSGSRECOVER(sp);
+ PRINT_LOGQUEUED(sp);
+ __os_ufree(env, sp);
+ }
+ return (ret);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/rep/rep_stub.c b/src/rep/rep_stub.c
new file mode 100644
index 00000000..2d96ea59
--- /dev/null
+++ b/src/rep/rep_stub.c
@@ -0,0 +1,425 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_REPLICATION
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * If the library wasn't compiled with replication support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_norep __P((ENV *));
+
+/*
+ * __db_norep --
+ * Error when a Berkeley DB build doesn't include replication support.
+ */
+static int
+__db_norep(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("3581",
+ "library build did not include support for replication"));
+ return (DB_OPNOTSUP);
+}
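+
+/*
+ * A caller-side sketch: because most of the stubs below funnel through
+ * __db_norep, an application linked against a build without
+ * replication support can detect the condition and degrade gracefully:
+ *
+ *	if ((ret = dbenv->rep_set_priority(dbenv, 100)) == DB_OPNOTSUP)
+ *		fprintf(stderr, "replication not built in\n");
+ */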
+
+int
+__db_rep_enter(dbp, checkgen, checklock, return_now)
+ DB *dbp;
+ int checkgen, checklock, return_now;
+{
+ COMPQUIET(checkgen, 0);
+ COMPQUIET(checklock, 0);
+ COMPQUIET(return_now, 0);
+ return (__db_norep(dbp->env));
+}
+
+int
+__env_rep_enter(env, checklock)
+ ENV *env;
+ int checklock;
+{
+ COMPQUIET(checklock, 0);
+ return (__db_norep(env));
+}
+
+int
+__env_db_rep_exit(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+int
+__op_rep_enter(env, local_nowait, obey_user)
+ ENV *env;
+ int local_nowait, obey_user;
+{
+ COMPQUIET(local_nowait, 0);
+ COMPQUIET(obey_user, 0);
+ return (__db_norep(env));
+}
+
+int
+__op_rep_exit(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+int
+__archive_rep_enter(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__archive_rep_exit(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__rep_bulk_message(env, bulkp, repth, lsnp, dbt, flags)
+ ENV *env;
+ REP_BULK *bulkp;
+ REP_THROTTLE *repth;
+ DB_LSN *lsnp;
+ const DBT *dbt;
+ u_int32_t flags;
+{
+ COMPQUIET(bulkp, NULL);
+ COMPQUIET(repth, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_env_refresh(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__rep_elect_pp(dbenv, nsites, nvotes, flags)
+ DB_ENV *dbenv;
+ u_int32_t nsites, nvotes;
+ u_int32_t flags;
+{
+ COMPQUIET(nsites, 0);
+ COMPQUIET(nvotes, 0);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_flush(dbenv)
+ DB_ENV *dbenv;
+{
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_lease_check(env, refresh)
+ ENV *env;
+ int refresh;
+{
+ COMPQUIET(refresh, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_lease_expire(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+void
+__rep_msg(env, msg)
+ const ENV *env;
+ const char *msg;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(msg, NULL);
+ return;
+}
+
+int
+__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
+ DB_ENV *dbenv;
+ u_int32_t *fast_clockp, *slow_clockp;
+{
+ COMPQUIET(fast_clockp, NULL);
+ COMPQUIET(slow_clockp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_clockskew(dbenv, fast_clock, slow_clock)
+ DB_ENV *dbenv;
+ u_int32_t fast_clock, slow_clock;
+{
+ COMPQUIET(fast_clock, 0);
+ COMPQUIET(slow_clock, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_nsites_pp(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t n;
+{
+ COMPQUIET(n, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_nsites(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t *n;
+{
+ COMPQUIET(n, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t priority;
+{
+ COMPQUIET(priority, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t *priority;
+{
+ COMPQUIET(priority, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t timeout;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(timeout, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t *timeout;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(timeout, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(onp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(on, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_limit(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ COMPQUIET(gbytesp, NULL);
+ COMPQUIET(bytesp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_open(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__rep_preclose(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+int
+__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
+ DB_ENV *dbenv;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ COMPQUIET(control, NULL);
+ COMPQUIET(rec, NULL);
+ COMPQUIET(eid, 0);
+ COMPQUIET(ret_lsnp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_send_message(env, eid, rtype, lsnp, dbtp, logflags, repflags)
+ ENV *env;
+ int eid;
+ u_int32_t rtype;
+ DB_LSN *lsnp;
+ const DBT *dbtp;
+ u_int32_t logflags, repflags;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(rtype, 0);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(logflags, 0);
+ COMPQUIET(repflags, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_set_limit(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ COMPQUIET(gbytes, 0);
+ COMPQUIET(bytes, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_transport_pp(dbenv, eid, f_send)
+ DB_ENV *dbenv;
+ int eid;
+ int (*f_send) __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ int, u_int32_t));
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(f_send, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_request(dbenv, min, max)
+ DB_ENV *dbenv;
+ u_int32_t min, max;
+{
+ COMPQUIET(min, 0);
+ COMPQUIET(max, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_request(dbenv, minp, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *minp, *maxp;
+{
+ COMPQUIET(minp, NULL);
+ COMPQUIET(maxp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_start_pp(dbenv, dbt, flags)
+ DB_ENV *dbenv;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_sync(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_txn_applied(env, ip, commit_info, timeout)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ db_timeout_t timeout;
+{
+	COMPQUIET(ip, NULL);
+ COMPQUIET(commit_info, NULL);
+ COMPQUIET(timeout, 0);
+ return (__db_norep(env));
+}
+#endif /* !HAVE_REPLICATION */
diff --git a/src/rep/rep_util.c b/src/rep/rep_util.c
new file mode 100644
index 00000000..0dfe6122
--- /dev/null
+++ b/src/rep/rep_util.c
@@ -0,0 +1,2791 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef REP_DIAGNOSTIC
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#endif
+
+/*
+ * rep_util.c:
+ * Miscellaneous replication-related utility functions, including
+ * those called by other subsystems.
+ */
+#define TIMESTAMP_CHECK(env, ts, renv) do { \
+ if (renv->op_timestamp != 0 && \
+ renv->op_timestamp + DB_REGENV_TIMEOUT < ts) { \
+ REP_SYSTEM_LOCK(env); \
+ F_CLR(renv, DB_REGENV_REPLOCKED); \
+ renv->op_timestamp = 0; \
+ REP_SYSTEM_UNLOCK(env); \
+ } \
+} while (0)
+
+static int __rep_lockout_int __P((ENV *, REP *, u_int32_t *, u_int32_t,
+ const char *, u_int32_t));
+static int __rep_newmaster_empty __P((ENV *, int));
+static int __rep_print_int __P((ENV *, u_int32_t, const char *, va_list));
+#ifdef REP_DIAGNOSTIC
+static void __rep_print_logmsg __P((ENV *, const DBT *, DB_LSN *));
+#endif
+static int __rep_show_progress __P((ENV *, const char *, int mins));
+
+/*
+ * __rep_bulk_message --
+ * This is a wrapper for putting a record into a bulk buffer. Since
+ * we have different bulk buffers, the caller must hand us the information
+ * we need to put the record into the correct buffer. All bulk buffers
+ * are protected by the REP->mtx_clientdb.
+ *
+ * PUBLIC: int __rep_bulk_message __P((ENV *, REP_BULK *, REP_THROTTLE *,
+ * PUBLIC: DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__rep_bulk_message(env, bulk, repth, lsn, dbt, flags)
+ ENV *env;
+ REP_BULK *bulk;
+ REP_THROTTLE *repth;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_bulk_args b_args;
+ size_t len;
+ int ret;
+ u_int32_t recsize, typemore;
+ u_int8_t *p;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ /*
+ * Figure out the total number of bytes needed for this record.
+ * !!! The marshalling code includes the given len, but also
+ * puts its own copy of the dbt->size with the DBT portion of
+ * the record. Account for that here.
+ */
+ recsize = sizeof(len) + dbt->size + sizeof(DB_LSN) + sizeof(dbt->size);
+
+ /*
+ * If *this* buffer is actively being transmitted, don't wait,
+ * just return so that it can be sent as a singleton.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ if (FLD_ISSET(*(bulk->flagsp), BULK_XMIT)) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (DB_REP_BULKOVF);
+ }
+
+ /*
+	 * If the record is bigger than the entire buffer, send the
+ * current buffer and then return DB_REP_BULKOVF so that this
+ * record is sent as a singleton. Do we have enough info to
+ * do that here? XXX
+ */
+ if (recsize > bulk->len) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Record %d (0x%x) larger than entire buffer 0x%x",
+ recsize, recsize, bulk->len));
+ STAT(rep->stat.st_bulk_overflows++);
+ (void)__rep_send_bulk(env, bulk, flags);
+ /*
+ * XXX __rep_send_message...
+ */
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (DB_REP_BULKOVF);
+ }
+ /*
+ * If this record doesn't fit, send the current buffer.
+ * Sending the buffer will reset the offset, but we will
+ * drop the mutex while sending so we need to keep checking
+ * if we're racing.
+ */
+ while (recsize + *(bulk->offp) > bulk->len) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Record %lu (%#lx) doesn't fit. Send %lu (%#lx) now.",
+ (u_long)recsize, (u_long)recsize,
+ (u_long)bulk->len, (u_long)bulk->len));
+ STAT(rep->stat.st_bulk_fills++);
+ if ((ret = __rep_send_bulk(env, bulk, flags)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ }
+
+ /*
+ * If we're using throttling, see if we are at the throttling
+ * limit before we do any more work here, by checking if the
+ * call to rep_send_throttle changed the repth->type to the
+ * *_MORE message type. If the throttling code hits the limit
+ * then we're done here.
+ */
+ if (bulk->type == REP_BULK_LOG)
+ typemore = REP_LOG_MORE;
+ else
+ typemore = REP_PAGE_MORE;
+ if (repth != NULL) {
+ if ((ret = __rep_send_throttle(env,
+ bulk->eid, repth, REP_THROTTLE_ONLY, flags)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ if (repth->type == typemore) {
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Record %lu (0x%lx) hit throttle limit.",
+ (u_long)recsize, (u_long)recsize));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ }
+
+ /*
+ * Now we own the buffer, and we know our record fits into it.
+ * The buffer is structured with the len, LSN and then the record.
+ * Copy the record into the buffer. Then if we need to,
+ * send the buffer.
+ */
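+	/*
+	 * A sketch of the pre-4.7 layout implied by the memcpy sequence
+	 * below (4.7 and later buffers are laid out by __rep_bulk_marshal
+	 * instead, so their exact format may differ):
+	 *
+	 *	+-----------+--------+-----------------+
+	 *	| dbt->size | DB_LSN | dbt->data ...   |
+	 *	+-----------+--------+-----------------+
+	 */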
+ p = bulk->addr + *(bulk->offp);
+ b_args.len = dbt->size;
+ b_args.lsn = *lsn;
+ b_args.bulkdata = *dbt;
+ /*
+	 * If this is the first record, we need to save the first
+ * LSN in the bulk structure.
+ */
+ if (*(bulk->offp) == 0)
+ bulk->lsn = *lsn;
+ if (rep->version < DB_REPVERSION_47) {
+ len = 0;
+ memcpy(p, &dbt->size, sizeof(dbt->size));
+ p += sizeof(dbt->size);
+ memcpy(p, lsn, sizeof(DB_LSN));
+ p += sizeof(DB_LSN);
+ memcpy(p, dbt->data, dbt->size);
+ p += dbt->size;
+ } else if ((ret = __rep_bulk_marshal(env, &b_args, p,
+ bulk->len, &len)) != 0)
+ goto err;
+ *(bulk->offp) = (roff_t)(p + len - bulk->addr);
+ STAT(rep->stat.st_bulk_records++);
+ /*
+ * Send the buffer if it is a perm record or a force.
+ */
+ if (LF_ISSET(REPCTL_PERM)) {
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Send buffer after copy due to PERM"));
+ ret = __rep_send_bulk(env, bulk, flags);
+ }
+err:
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+}
+
+/*
+ * __rep_send_bulk --
+ * This function transmits the bulk buffer given. It assumes the
+ * caller holds the REP->mtx_clientdb. We may release it and reacquire
+ * it during this call. We will return with it held.
+ *
+ * PUBLIC: int __rep_send_bulk __P((ENV *, REP_BULK *, u_int32_t));
+ */
+int
+__rep_send_bulk(env, bulkp, ctlflags)
+ ENV *env;
+ REP_BULK *bulkp;
+ u_int32_t ctlflags;
+{
+ DBT dbt;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ /*
+ * If the offset is 0, we're done. There is nothing to send.
+ */
+ if (*(bulkp->offp) == 0)
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+	 * Mark this buffer as being actively transmitted.
+ */
+ FLD_SET(*(bulkp->flagsp), BULK_XMIT);
+ DB_INIT_DBT(dbt, bulkp->addr, *(bulkp->offp));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "send_bulk: Send %d (0x%x) bulk buffer bytes", dbt.size, dbt.size));
+
+ /*
+	 * We have unlocked the mutex; now send the message.
+ */
+ STAT(rep->stat.st_bulk_transfers++);
+ if ((ret = __rep_send_message(env,
+ bulkp->eid, bulkp->type, &bulkp->lsn, &dbt, ctlflags, 0)) != 0)
+ ret = DB_REP_UNAVAIL;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ /*
+ * Ready the buffer for further records.
+ */
+ *(bulkp->offp) = 0;
+ FLD_CLR(*(bulkp->flagsp), BULK_XMIT);
+ return (ret);
+}
+
+/*
+ * __rep_bulk_alloc --
+ * This function allocates and initializes an internal bulk buffer.
+ * This is used by the master when fulfilling a request for a chunk of
+ * log records or a bunch of pages.
+ *
+ * PUBLIC: int __rep_bulk_alloc __P((ENV *, REP_BULK *, int, uintptr_t *,
+ * PUBLIC: u_int32_t *, u_int32_t));
+ */
+int
+__rep_bulk_alloc(env, bulkp, eid, offp, flagsp, type)
+ ENV *env;
+ REP_BULK *bulkp;
+ int eid;
+ uintptr_t *offp;
+ u_int32_t *flagsp, type;
+{
+ int ret;
+
+ memset(bulkp, 0, sizeof(REP_BULK));
+ *offp = *flagsp = 0;
+ bulkp->len = MEGABYTE;
+ if ((ret = __os_malloc(env, bulkp->len, &bulkp->addr)) != 0)
+ return (ret);
+
+ /*
+ * The cast is safe because offp is an "out" parameter. The value
+ * of offp is meaningless when calling __rep_bulk_alloc.
+ */
+ bulkp->offp = (roff_t *)offp;
+ bulkp->type = type;
+ bulkp->eid = eid;
+ bulkp->flagsp = flagsp;
+ return (ret);
+}
+
+/*
+ * __rep_bulk_free --
+ * This function sends the remainder of the bulk buffer and frees it.
+ *
+ * PUBLIC: int __rep_bulk_free __P((ENV *, REP_BULK *, u_int32_t));
+ */
+int
+__rep_bulk_free(env, bulkp, flags)
+ ENV *env;
+ REP_BULK *bulkp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
+ ret = __rep_send_bulk(env, bulkp, flags);
+ MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
+ __os_free(env, bulkp->addr);
+ return (ret);
+}
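+
+/*
+ * A lifecycle sketch for the three bulk-buffer routines above, as a
+ * master might use them to batch log records (error handling elided;
+ * "lsn" and "rec" stand for each record's LSN and data):
+ *
+ *	REP_BULK bulk;
+ *	uintptr_t off;
+ *	u_int32_t bulkflags;
+ *
+ *	(void)__rep_bulk_alloc(env, &bulk, eid, &off, &bulkflags,
+ *	    REP_BULK_LOG);
+ *	for each record:
+ *		if (__rep_bulk_message(env, &bulk, NULL, &lsn, &rec,
+ *		    flags) == DB_REP_BULKOVF)
+ *			send the record as a singleton;
+ *	(void)__rep_bulk_free(env, &bulk, flags);
+ */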
+
+/*
+ * __rep_send_message --
+ * This is a wrapper for sending a message. It takes care of constructing
+ * the control structure and calling the user's specified send function.
+ *
+ * PUBLIC: int __rep_send_message __P((ENV *, int,
+ * PUBLIC: u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t));
+ */
+int
+__rep_send_message(env, eid, rtype, lsnp, dbt, ctlflags, repflags)
+ ENV *env;
+ int eid;
+ u_int32_t rtype;
+ DB_LSN *lsnp;
+ const DBT *dbt;
+ u_int32_t ctlflags, repflags;
+{
+ DBT cdbt, scrap_dbt;
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_46_CONTROL cntrl46;
+ REP_OLD_CONTROL ocntrl;
+ __rep_control_args cntrl;
+ db_timespec msg_time;
+ int ret;
+ u_int32_t myflags;
+ u_int8_t buf[__REP_CONTROL_SIZE];
+ size_t len;
+
+ dbenv = env->dbenv;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+#if defined(DEBUG_ROP) || defined(DEBUG_WOP)
+ if (db_rep->send == NULL)
+ return (0);
+#endif
+
+ /* Set up control structure. */
+ memset(&cntrl, 0, sizeof(cntrl));
+ memset(&ocntrl, 0, sizeof(ocntrl));
+ memset(&cntrl46, 0, sizeof(cntrl46));
+ if (lsnp == NULL)
+ ZERO_LSN(cntrl.lsn);
+ else
+ cntrl.lsn = *lsnp;
+ /*
+ * Set the rectype based on the version we need to speak.
+ */
+ if (rep->version == DB_REPVERSION)
+ cntrl.rectype = rtype;
+ else if (rep->version < DB_REPVERSION) {
+ cntrl.rectype = __rep_msg_to_old(rep->version, rtype);
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "rep_send_msg: rtype %lu to version %lu record %lu.",
+ (u_long)rtype, (u_long)rep->version,
+ (u_long)cntrl.rectype));
+ if (cntrl.rectype == REP_INVALID)
+ return (ret);
+ } else {
+ __db_errx(env, DB_STR_A("3503",
+ "rep_send_message: Unknown rep version %lu, my version %lu",
+ "%lu %lu"), (u_long)rep->version, (u_long)DB_REPVERSION);
+ return (__env_panic(env, EINVAL));
+ }
+ cntrl.flags = ctlflags;
+ cntrl.rep_version = rep->version;
+ cntrl.log_version = lp->persist.version;
+ cntrl.gen = rep->gen;
+
+ /* Don't assume the send function will be tolerant of NULL records. */
+ if (dbt == NULL) {
+ memset(&scrap_dbt, 0, sizeof(DBT));
+ dbt = &scrap_dbt;
+ }
+
+ /*
+ * There are several types of records: commit and checkpoint records
+ * that affect database durability, regular log records that might
+ * be buffered on the master before being transmitted, and control
+ * messages which don't require the guarantees of permanency, but
+ * should not be buffered.
+ *
+ * There are request records that can be sent anywhere, and there
+ * are rerequest records that the app might want to send to the master.
+ */
+ myflags = repflags;
+ if (FLD_ISSET(ctlflags, REPCTL_PERM)) {
+ /*
+ * When writing to a system database, skip setting the PERMANENT
+ * flag. We don't care; we don't want to wait; and the
+ * application shouldn't be distracted/confused in case there is
+ * a failure.
+ */
+ if (!F_ISSET(rep, REP_F_SYS_DB_OP))
+ myflags |= DB_REP_PERMANENT;
+ } else if (rtype != REP_LOG || FLD_ISSET(ctlflags, REPCTL_RESEND))
+ myflags |= DB_REP_NOBUFFER;
+
+ /*
+ * Let everyone know if we've been in an established group.
+ */
+ if (F_ISSET(rep, REP_F_GROUP_ESTD))
+ F_SET(&cntrl, REPCTL_GROUP_ESTD);
+
+ /*
+ * If we are a master sending a perm record, then set the
+ * REPCTL_LEASE flag to have the client reply. Also set
+ * the start time that the client will echo back to us.
+ *
+ * !!! If we are a master, using leases, we had better not be
+ * sending to an older version.
+ */
+ if (IS_REP_MASTER(env) && IS_USING_LEASES(env) &&
+ FLD_ISSET(ctlflags, REPCTL_LEASE | REPCTL_PERM)) {
+ F_SET(&cntrl, REPCTL_LEASE);
+ DB_ASSERT(env, rep->version == DB_REPVERSION);
+ __os_gettime(env, &msg_time, 1);
+ cntrl.msg_sec = (u_int32_t)msg_time.tv_sec;
+ cntrl.msg_nsec = (u_int32_t)msg_time.tv_nsec;
+ }
+
+ REP_PRINT_MESSAGE(env, eid, &cntrl, "rep_send_message", myflags);
+#ifdef REP_DIAGNOSTIC
+ if (FLD_ISSET(
+ env->dbenv->verbose, DB_VERB_REP_MSGS) && rtype == REP_LOG)
+ __rep_print_logmsg(env, dbt, lsnp);
+#endif
+
+ /*
+ * If DB_REP_PERMANENT is set, the LSN better be non-zero.
+ */
+ DB_ASSERT(env, !FLD_ISSET(myflags, DB_REP_PERMANENT) ||
+ !IS_ZERO_LSN(cntrl.lsn));
+
+ /*
+ * If we're talking to an old version, send an old control structure.
+ */
+ memset(&cdbt, 0, sizeof(cdbt));
+ if (rep->version <= DB_REPVERSION_45) {
+ if (rep->version == DB_REPVERSION_45 &&
+ F_ISSET(&cntrl, REPCTL_INIT)) {
+ F_CLR(&cntrl, REPCTL_INIT);
+ F_SET(&cntrl, REPCTL_INIT_45);
+ }
+ ocntrl.rep_version = cntrl.rep_version;
+ ocntrl.log_version = cntrl.log_version;
+ ocntrl.lsn = cntrl.lsn;
+ ocntrl.rectype = cntrl.rectype;
+ ocntrl.gen = cntrl.gen;
+ ocntrl.flags = cntrl.flags;
+ cdbt.data = &ocntrl;
+ cdbt.size = sizeof(ocntrl);
+ } else if (rep->version == DB_REPVERSION_46) {
+ cntrl46.rep_version = cntrl.rep_version;
+ cntrl46.log_version = cntrl.log_version;
+ cntrl46.lsn = cntrl.lsn;
+ cntrl46.rectype = cntrl.rectype;
+ cntrl46.gen = cntrl.gen;
+ cntrl46.msg_time.tv_sec = (time_t)cntrl.msg_sec;
+ cntrl46.msg_time.tv_nsec = (long)cntrl.msg_nsec;
+ cntrl46.flags = cntrl.flags;
+ cdbt.data = &cntrl46;
+ cdbt.size = sizeof(cntrl46);
+ } else {
+ (void)__rep_control_marshal(env, &cntrl, buf,
+ __REP_CONTROL_SIZE, &len);
+ DB_INIT_DBT(cdbt, buf, len);
+ }
+
+ /*
+ * We set the LSN above to something valid. Give the master the
+ * actual LSN so that they can coordinate with permanent records from
+ * the client if they want to.
+ *
+ * !!! Even though we marshalled the control message for transmission,
+ * give the transport function the real LSN.
+ */
+ ret = db_rep->send(dbenv, &cdbt, dbt, &cntrl.lsn, eid, myflags);
+
+ /*
+ * We don't hold the rep lock, so this could miscount if we race.
+ * I don't think it's worth grabbing the mutex for that bit of
+ * extra accuracy.
+ */
+ if (ret != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "rep_send_function returned: %d", ret));
+#ifdef HAVE_STATISTICS
+ rep->stat.st_msgs_send_failures++;
+ } else
+ rep->stat.st_msgs_sent++;
+#else
+ }
+#endif
+ return (ret);
+}
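+
+/*
+ * A transport-callback sketch: db_rep->send above is the application
+ * function registered via DB_ENV->rep_set_transport (the signature is
+ * visible in __rep_set_transport_pp).  Assuming a hypothetical
+ * queue_to_site() that hands the message to the app's messaging layer:
+ *
+ *	int
+ *	my_send(dbenv, control, rec, lsnp, envid, flags)
+ *		DB_ENV *dbenv;
+ *		const DBT *control, *rec;
+ *		const DB_LSN *lsnp;
+ *		int envid;
+ *		u_int32_t flags;
+ *	{
+ *		return (queue_to_site(envid, control, rec, lsnp, flags));
+ *	}
+ *
+ * A nonzero return tells the library the send failed; for bulk
+ * transfers that surfaces as DB_REP_UNAVAIL in __rep_send_bulk.
+ */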
+
+#ifdef REP_DIAGNOSTIC
+/*
+ * __rep_print_logmsg --
+ * This is a debugging routine for printing out log records that
+ * we are about to transmit to a client.
+ */
+static void
+__rep_print_logmsg(env, logdbt, lsnp)
+ ENV *env;
+ const DBT *logdbt;
+ DB_LSN *lsnp;
+{
+ static int first = 1;
+ static DB_DISTAB dtab;
+
+ if (first) {
+ first = 0;
+
+ (void)__bam_init_print(env, &dtab);
+ (void)__crdel_init_print(env, &dtab);
+ (void)__db_init_print(env, &dtab);
+ (void)__dbreg_init_print(env, &dtab);
+ (void)__fop_init_print(env, &dtab);
+ (void)__ham_init_print(env, &dtab);
+ (void)__qam_init_print(env, &dtab);
+ (void)__repmgr_init_print(env, &dtab);
+ (void)__txn_init_print(env, &dtab);
+ }
+
+ (void)__db_dispatch(
+ env, &dtab, (DBT *)logdbt, lsnp, DB_TXN_PRINT, NULL);
+}
+#endif
+
+/*
+ * __rep_new_master --
+ * Called after a master election to sync back up with a new master.
+ * It's possible that we already know of this new master, in which case
+ * we don't need to do anything.
+ *
+ * This is written assuming that this message came from the master; we
+ * need to enforce that in __rep_process_record, but right now, we have
+ * no way to identify the master.
+ *
+ * PUBLIC: int __rep_new_master __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_new_master(env, cntrl, eid)
+ ENV *env;
+ __rep_control_args *cntrl;
+ int eid;
+{
+ DBT dbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN first_lsn, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ db_timeout_t lease_to;
+ u_int32_t unused, vers;
+ int change, do_req, lockout_msg, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ logc = NULL;
+ lockout_msg = 0;
+ REP_SYSTEM_LOCK(env);
+ change = rep->gen != cntrl->gen || rep->master_id != eid;
+ /*
+ * If we're hearing from a current or new master, then we
+ * want to clear EPHASE0 in case this site is waiting to
+ * hear from the master.
+ */
+ FLD_CLR(rep->elect_flags, REP_E_PHASE0);
+ if (change) {
+ /*
+ * If we are already locking out others, we're either
+ * in the middle of sync-up recovery or internal init
+		 * when this newmaster comes in (we also lock out in
+ * rep_start, but we cannot be racing that because we
+ * don't allow rep_proc_msg when rep_start is going on).
+ *
+ * We're about to become the client of a new master. Since we
+ * want to be able to sync with the new master as quickly as
+ * possible, interrupt any STARTSYNC from the old master. The
+ * new master may need to rely on acks from us and the old
+ * STARTSYNC is now irrelevant.
+ *
+ * Note that, conveniently, the "lockout_msg" flag defines the
+ * section of this code path during which both "message lockout"
+ * and "memp sync interrupt" are in effect.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG))
+ goto lckout;
+
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto errlck;
+
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1);
+ lockout_msg = 1;
+ /*
+ * We must wait any remaining lease time before accepting
+ * this new master. This must be after the lockout above
+ * so that no new message can be processed and re-grant
+ * the lease out from under us.
+ */
+ if (IS_USING_LEASES(env) &&
+ ((lease_to = __rep_lease_waittime(env)) != 0)) {
+ REP_SYSTEM_UNLOCK(env);
+ __os_yield(env, 0, (u_long)lease_to);
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_LEASE_EXPIRED);
+ }
+
+ vers = lp->persist.version;
+ if (cntrl->log_version != vers) {
+ /*
+			 * Set everything up to the lower version. If we're
+			 * going to be upgrading to the latest version, that
+			 * can happen automatically as we process later log
+			 * records. Until then, we must sync using the
+			 * earlier version.
+ */
+ DB_ASSERT(env, vers != 0);
+ if (cntrl->log_version < vers)
+ vers = cntrl->log_version;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+			    "newmaster: Setting log version to %lu",
+			    (u_long)vers));
+ __log_set_version(env, vers);
+ if ((ret = __env_init_rec(env, vers)) != 0)
+ goto errlck;
+ }
+
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ /*
+ * Open if we need to, in preparation for the truncate
+ * we'll do in a moment.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto err;
+ }
+
+ /*
+ * If we were in the middle of an internal initialization
+ * and we've discovered a new master instead, clean up
+ * our old internal init information. We need to clean
+ * up any flags and unlock our lockout.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ ret = __rep_init_cleanup(env, rep, DB_FORCE);
+ /*
+ * Note that if an in-progress internal init was indeed
+ * "cleaned up", clearing these flags now will allow the
+ * application to see a completely empty database
+ * environment for a moment (until the master responds
+ * to our ALL_REQ).
+ */
+ F_CLR(rep, REP_F_ABBREVIATED);
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0) {
+ /* TODO: consider add'l error recovery steps. */
+ goto errlck;
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused))
+ != 0)
+ goto errlck;
+ STAT(rep->stat.st_log_queued = 0);
+
+ /*
+ * This needs to be performed under message lockout
+ * if we're actually changing master.
+ */
+ __rep_elect_done(env, rep);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Updating gen from %lu to %lu from master %d",
+ (u_long)rep->gen, (u_long)cntrl->gen, eid));
+ SET_GEN(cntrl->gen);
+ rep->mgen = cntrl->gen;
+ if ((ret = __rep_notify_threads(env, AWAIT_GEN)) != 0)
+ goto errlck;
+ (void)__rep_write_gen(env, rep, rep->gen);
+ if (rep->egen <= rep->gen)
+ rep->egen = rep->gen + 1;
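+		/*
+		 * Illustrative note: egen must always lead gen so that the
+		 * next election uses a fresh election generation. For
+		 * example, if the new master reports gen 12 while our egen
+		 * is still 10, the lines above leave gen = 12 and egen = 13.
+		 */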
+ rep->master_id = eid;
+ STAT(rep->stat.st_master_changes++);
+ rep->stat.st_startup_complete = 0;
+ rep->version = cntrl->rep_version;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "egen: %lu. rep version %lu",
+ (u_long)rep->egen, (u_long)rep->version));
+
+ /*
+ * If we're delaying client sync-up, we know we have a
+ * new/changed master now, set flag indicating we are
+ * actively delaying.
+ */
+ if (FLD_ISSET(rep->config, REP_C_DELAYCLIENT))
+ F_SET(rep, REP_F_DELAY);
+ if ((ret = __rep_lockout_archive(env, rep)) != 0)
+ goto errlck;
+ rep->sync_state = SYNC_VERIFY;
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ lockout_msg = 0;
+ } else
+ __rep_elect_done(env, rep);
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->ready_lsn;
+
+ if (!change) {
+ ret = 0;
+ do_req = __rep_check_doreq(env, rep);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If there wasn't a change, we might still have some
+ * catching up or verification to do.
+ */
+ if (do_req &&
+ (rep->sync_state != SYNC_OFF ||
+ LOG_COMPARE(&lsn, &cntrl->lsn) < 0)) {
+ ret = __rep_resend_req(env, 0);
+ if (ret != 0)
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "resend_req ret is %lu", (u_long)ret));
+ }
+ /*
+ * If we're not in one of the recovery modes, we need to
+		 * clear the ARCHIVE flag. Elections set ARCHIVE, so if we
+		 * called an election and found the same master, we must
+		 * clear ARCHIVE here.
+ */
+ if (rep->sync_state == SYNC_OFF) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ return (ret);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ /*
+ * If the master changed, we need to start the process of
+ * figuring out what our last valid log record is. However,
+ * if both the master and we agree that the max LSN is 0,0,
+ * then there is no recovery to be done. If we are at 0 and
+ * the master is not, then we just need to request all the log
+ * records from the master.
+ */
+ if (IS_INIT_LSN(lsn) || IS_ZERO_LSN(lsn)) {
+ if ((ret = __rep_newmaster_empty(env, eid)) != 0)
+ goto err;
+ goto newmaster_complete;
+ }
+
+ memset(&dbt, 0, sizeof(dbt));
+ /*
+	 * If this client is in a later log file than the master, see if
+	 * there is any overlap in the logs. If not, the client is too
+	 * far ahead of the master and must start over.
+ */
+ if (cntrl->lsn.file < lsn.file) {
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ ret = __logc_get(logc, &first_lsn, &dbt, DB_FIRST);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ goto notfound;
+ else if (ret != 0)
+ goto err;
+ if (cntrl->lsn.file < first_lsn.file)
+ goto notfound;
+ }
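+	/*
+	 * Walk our log backwards to the most recent permanent record (a
+	 * checkpoint or commit, see __rep_log_backup); that record's LSN
+	 * is the candidate we ask the new master to verify below.
+	 */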
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ ret = __rep_log_backup(env, logc, &lsn, REP_REC_PERM);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ goto notfound;
+ else if (ret != 0)
+ goto err;
+
+ /*
+ * Finally, we have a record to ask for.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->verify_lsn = lsn;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!F_ISSET(rep, REP_F_DELAY))
+ (void)__rep_send_message(env,
+ eid, REP_VERIFY_REQ, &lsn, NULL, 0, DB_REP_ANYWHERE);
+ goto newmaster_complete;
+
+err: /*
+ * If we failed, we need to clear the flags we may have set above
+ * because we're not going to be setting the verify_lsn.
+ */
+ REP_SYSTEM_LOCK(env);
+errlck: if (lockout_msg) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ }
+ F_CLR(rep, REP_F_DELAY);
+ CLR_RECOVERY_SETTINGS(rep);
+lckout: REP_SYSTEM_UNLOCK(env);
+ return (ret);
+
+notfound:
+ /*
+ * If we don't have an identification record, we still
+ * might have some log records but we're discarding them
+ * to sync up with the master from the start.
+ * Therefore, truncate our log and treat it as if it
+ * were empty. In-memory logs can't be completely
+ * zeroed using __log_vtruncate, so just zero them out.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "No commit or ckp found. Truncate log."));
+ if (lp->db_log_inmemory) {
+ ZERO_LSN(lsn);
+ ret = __log_zero(env, &lsn);
+ } else {
+ INIT_LSN(lsn);
+ ret = __log_vtruncate(env, &lsn, &lsn, NULL);
+ }
+ if (ret != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ infop = env->reginfo;
+ renv = infop->primary;
+ REP_SYSTEM_LOCK(env);
+ (void)time(&renv->rep_timestamp);
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __rep_newmaster_empty(env, eid)) != 0)
+ goto err;
+newmaster_complete:
+ return (DB_REP_NEWMASTER);
+}
+
+/*
+ * __rep_newmaster_empty --
+ *	Handle the case of a NEWMASTER message received when we have an empty
+ * log. This requires internal init. If we can't do that because
+ * AUTOINIT is off, return JOIN_FAILURE. If F_DELAY is in effect, don't
+ * even consider AUTOINIT yet, because the application could change it
+ * before calling rep_sync.
+ */
+static int
+__rep_newmaster_empty(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int msg, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ lp = env->lg_handle->reginfo.primary;
+ msg = ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ lp->wait_ts = rep->request_gap;
+
+ /* Usual case is to skip to UPDATE state; we may revise this below. */
+ rep->sync_state = SYNC_UPDATE;
+
+ if (F_ISSET(rep, REP_F_DELAY)) {
+ /*
+ * Having properly set up wait_ts for later, nothing more to
+ * do now.
+ */
+ } else if (!FLD_ISSET(rep->config, REP_C_AUTOINIT)) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ CLR_RECOVERY_SETTINGS(rep);
+ ret = DB_REP_JOIN_FAILURE;
+ } else {
+ /* Normal case: not DELAY but AUTOINIT. */
+ msg = 1;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ if (msg)
+ (void)__rep_send_message(env, eid, REP_UPDATE_REQ,
+ NULL, NULL, 0, 0);
+ return (ret);
+}
+
+/*
+ * __rep_elect_done --
+ *	Clear all election information for this site. Assumes the
+ * caller holds the region mutex.
+ *
+ * PUBLIC: void __rep_elect_done __P((ENV *, REP *));
+ */
+void
+__rep_elect_done(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ int inelect;
+ db_timespec endtime;
+
+ inelect = IN_ELECTION(rep);
+ FLD_CLR(rep->elect_flags, REP_E_PHASE1 | REP_E_PHASE2 | REP_E_TALLY);
+
+ rep->sites = 0;
+ rep->votes = 0;
+ if (inelect) {
+ if (timespecisset(&rep->etime)) {
+ __os_gettime(env, &endtime, 1);
+ timespecsub(&endtime, &rep->etime);
+#ifdef HAVE_STATISTICS
+ rep->stat.st_election_sec = (u_int32_t)endtime.tv_sec;
+ rep->stat.st_election_usec = (u_int32_t)
+ (endtime.tv_nsec / NS_PER_US);
+#endif
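+			/*
+			 * Worked example: an election lasting 2.5 seconds
+			 * leaves endtime = {2, 500000000} after timespecsub,
+			 * recorded as st_election_sec = 2 and
+			 * st_election_usec = 500000.
+			 */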
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Election finished in %lu.%09lu sec",
+ (u_long)endtime.tv_sec, (u_long)endtime.tv_nsec));
+ timespecclear(&rep->etime);
+ }
+ rep->egen++;
+ }
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Election done; egen %lu", (u_long)rep->egen));
+}
+
+/*
+ * __env_rep_enter --
+ *
+ * Check if we are in the middle of replication initialization and/or
+ * recovery, and if so, disallow operations. If operations are allowed,
+ * increment handle-counts, so that we do not start recovery while we
+ * are operating in the library.
+ *
+ * PUBLIC: int __env_rep_enter __P((ENV *, int));
+ */
+int
+__env_rep_enter(env, checklock)
+ ENV *env;
+ int checklock;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int cnt, ret;
+ time_t timestamp;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+ (void)time(&timestamp);
+ TIMESTAMP_CHECK(env, timestamp, renv);
+ /*
+ * Check if we're still locked out after checking
+ * the timestamp.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+ return (EINVAL);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ for (cnt = 0; FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_API);) {
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * We're spinning - environment may be hung. Check if
+ * recovery has been initiated.
+ */
+ PANIC_CHECK(env);
+ if (FLD_ISSET(rep->config, REP_C_NOWAIT)) {
+ __db_errx(env, DB_STR("3504",
+ "Operation locked out. Waiting for replication lockout to complete"));
+ return (DB_REP_LOCKOUT);
+ }
+ __os_yield(env, 1, 0);
+ if (++cnt % 60 == 0 &&
+ (ret = __rep_show_progress(env,
+ DB_STR_P("DB_ENV handle"), cnt / 60)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ }
+ rep->handle_cnt++;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+static int
+__rep_show_progress(env, which, mins)
+ ENV *env;
+ const char *which;
+ int mins;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ REP *rep;
+ DB_LSN ready_lsn;
+
+ rep = env->rep_handle->region;
+ dblp = env->lg_handle;
+ lp = dblp == NULL ? NULL : dblp->reginfo.primary;
+
+#define WAITING_MSG DB_STR_A("3505", \
+ "%s waiting %d minutes for replication lockout to complete", "%s %d")
+#define WAITING_ARGS WAITING_MSG, which, mins
+
+ __db_errx(env, WAITING_ARGS);
+ RPRINT(env, (env, DB_VERB_REP_SYNC, WAITING_ARGS));
+
+ if (lp == NULL)
+ ZERO_LSN(ready_lsn);
+ else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ ready_lsn = lp->ready_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ REP_SYSTEM_LOCK(env);
+ switch (rep->sync_state) {
+ case SYNC_PAGE:
+#define PAGE_MSG DB_STR_A("3506", \
+ "SYNC_PAGE: files %lu/%lu; pages %lu (%lu next)", "%lu %lu %lu %lu")
+#define PAGE_ARGS (u_long)rep->curfile, (u_long)rep->nfiles, \
+ (u_long)rep->npages, (u_long)rep->ready_pg
+ __db_errx(env, PAGE_MSG, PAGE_ARGS);
+ RPRINT(env, (env, DB_VERB_REP_SYNC, PAGE_MSG, PAGE_ARGS));
+ break;
+ case SYNC_LOG:
+#define LSN_ARG(lsn) (u_long)(lsn).file, (u_long)(lsn).offset
+#define LOG_LSN_ARGS LSN_ARG(ready_lsn), \
+ LSN_ARG(rep->first_lsn), LSN_ARG(rep->last_lsn)
+#ifdef HAVE_STATISTICS
+#define LOG_MSG DB_STR_A("3507", \
+ "SYNC_LOG: thru [%lu][%lu] from [%lu][%lu]/[%lu][%lu] (%lu queued)",\
+ "%lu %lu %lu %lu %lu %lu %lu")
+#define LOG_ARGS LOG_LSN_ARGS, (u_long)rep->stat.st_log_queued
+#else
+#define LOG_MSG DB_STR_A("3508", \
+ "SYNC_LOG: thru [%lu][%lu] from [%lu][%lu]/[%lu][%lu]", \
+ "%lu %lu %lu %lu %lu %lu")
+#define LOG_ARGS LOG_LSN_ARGS
+#endif
+ __db_errx(env, LOG_MSG, LOG_ARGS);
+ RPRINT(env, (env, DB_VERB_REP_SYNC, LOG_MSG, LOG_ARGS));
+ break;
+ default:
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sync state %d", (int)rep->sync_state));
+ break;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __env_db_rep_exit --
+ *
+ * Decrement handle count upon routine exit.
+ *
+ * PUBLIC: int __env_db_rep_exit __P((ENV *));
+ */
+int
+__env_db_rep_exit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ rep->handle_cnt--;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __db_rep_enter --
+ * Called in replicated environments to keep track of in-use handles
+ * and prevent any concurrent operation during recovery. If checkgen is
+ * non-zero, we verify that the dbp's replication timestamp matches the
+ * env's; if it doesn't, the environment has been rolled back or rebuilt
+ * since the handle was opened and we return DB_REP_HANDLE_DEAD.
+ *
+ * If return_now is non-zero, we'll return DB_LOCK_DEADLOCK immediately,
+ * else we'll sleep before returning DB_LOCK_DEADLOCK. Without the sleep,
+ * it is likely
+ * the application will immediately try again and could reach a retry
+ * limit before replication has a chance to finish. The sleep increases
+ * the probability that an application retry will succeed.
+ *
+ * Typically calls with txns set return_now so that we return immediately.
+ * We want to return immediately because we want the txn to abort ASAP
+ * so that the lockout can proceed.
+ *
+ * PUBLIC: int __db_rep_enter __P((DB *, int, int, int));
+ */
+int
+__db_rep_enter(dbp, checkgen, checklock, return_now)
+ DB *dbp;
+ int checkgen, checklock, return_now;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ time_t timestamp;
+
+ env = dbp->env;
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+ (void)time(&timestamp);
+ TIMESTAMP_CHECK(env, timestamp, renv);
+ /*
+ * Check if we're still locked out after checking
+ * the timestamp.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+ return (EINVAL);
+ }
+
+ /*
+ * Return a dead handle if an internal handle is trying to
+ * get an exclusive lock on this database.
+ */
+ if (checkgen && dbp->mpf->mfp && IS_REP_CLIENT(env)) {
+ if (dbp->mpf->mfp->excl_lockout)
+ return (DB_REP_HANDLE_DEAD);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ /*
+ * !!!
+ * Note, we are checking REP_LOCKOUT_OP, but we are
+ * incrementing rep->handle_cnt. That seems like a mismatch,
+ * but the intention is to return DEADLOCK to the application
+ * which will cause them to abort the txn quickly and allow
+ * the lockout to proceed.
+ *
+ * The correctness of doing this depends on the fact that
+ * lockout of the API always sets REP_LOCKOUT_OP first.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_OP)) {
+ REP_SYSTEM_UNLOCK(env);
+ if (!return_now)
+ __os_yield(env, 5, 0);
+ return (DB_LOCK_DEADLOCK);
+ }
+
+ if (checkgen && dbp->timestamp != renv->rep_timestamp) {
+ REP_SYSTEM_UNLOCK(env);
+ return (DB_REP_HANDLE_DEAD);
+ }
+ rep->handle_cnt++;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * Check for permission to increment handle_cnt, and do so if possible. Used in
+ * cases where we want to count an operation in the context of a transaction,
+ * but the operation does not involve a DB handle.
+ *
+ * PUBLIC: int __op_handle_enter __P((ENV *));
+ */
+int
+__op_handle_enter(env)
+ ENV *env;
+{
+ REP *rep;
+ int ret;
+
+ rep = env->rep_handle->region;
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_OP))
+ ret = DB_LOCK_DEADLOCK;
+ else {
+ rep->handle_cnt++;
+ ret = 0;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __op_rep_enter --
+ *
+ * Check if we are in the middle of replication initialization and/or
+ * recovery, and if so, disallow new multi-step operations, such as
+ * transaction and memp gets. If operations are allowed,
+ * increment the op_cnt, so that we do not start recovery while we have
+ * active operations.
+ *
+ * PUBLIC: int __op_rep_enter __P((ENV *, int, int));
+ */
+int
+__op_rep_enter(env, local_nowait, obey_user)
+ ENV *env;
+ int local_nowait, obey_user;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int cnt, ret;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ for (cnt = 0; FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_OP);) {
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * We're spinning - environment may be hung. Check if
+ * recovery has been initiated.
+ */
+ PANIC_CHECK(env);
+ if (local_nowait)
+ return (DB_REP_LOCKOUT);
+ if (FLD_ISSET(rep->config, REP_C_NOWAIT) && obey_user) {
+ __db_errx(env, DB_STR("3509",
+ "Operation locked out. Waiting for replication lockout to complete"));
+ return (DB_REP_LOCKOUT);
+ }
+ __os_yield(env, 5, 0);
+		cnt += 5;		/* We slept for 5 seconds. */
+		if (cnt % 60 == 0 &&
+ (ret = __rep_show_progress(env,
+ "__op_rep_enter", cnt / 60)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ }
+ rep->op_cnt++;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __op_rep_exit --
+ *
+ * Decrement op count upon transaction commit/abort/discard or
+ * memp_fput.
+ *
+ * PUBLIC: int __op_rep_exit __P((ENV *));
+ */
+int
+__op_rep_exit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ DB_ASSERT(env, rep->op_cnt > 0);
+ rep->op_cnt--;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __archive_rep_enter --
+ * Used by log_archive to determine if it is okay to remove
+ * log files.
+ *
+ * PUBLIC: int __archive_rep_enter __P((ENV *));
+ */
+int
+__archive_rep_enter(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ time_t timestamp;
+ int ret;
+
+ ret = 0;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /*
+ * This is tested before REP_ON below because we always need
+ * to obey if any replication process has disabled archiving.
+ * Everything is in the environment region that we need here.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+ (void)time(&timestamp);
+ TIMESTAMP_CHECK(env, timestamp, renv);
+ /*
+ * Check if we're still locked out after checking
+ * the timestamp.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+ return (DB_REP_LOCKOUT);
+ }
+
+ if (!REP_ON(env))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_ARCHIVE))
+ ret = DB_REP_LOCKOUT;
+ else
+ rep->arch_th++;
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __archive_rep_exit --
+ * Clean up accounting for log archive threads.
+ *
+ * PUBLIC: int __archive_rep_exit __P((ENV *));
+ */
+int
+__archive_rep_exit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ if (!REP_ON(env))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ REP_SYSTEM_LOCK(env);
+ rep->arch_th--;
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __rep_lockout_archive --
+ * Coordinate with other threads archiving log files so that
+ * we can run and know that no log files will be removed out
+ * from underneath us.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_archive __P((ENV *, REP *));
+ */
+int
+__rep_lockout_archive(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ return (__rep_lockout_int(env, rep, &rep->arch_th, 0,
+ "arch_th", REP_LOCKOUT_ARCHIVE));
+}
+
+/*
+ * __rep_lockout_api --
+ * Coordinate with other threads in the library and active txns so
+ * that we can run single-threaded, for recovery or internal backup.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_api __P((ENV *, REP *));
+ */
+int
+__rep_lockout_api(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ int ret;
+
+ /*
+ * We must drain long-running operations first. We check
+ * REP_LOCKOUT_OP in __db_rep_enter in order to allow them
+ * to abort existing txns quickly. Therefore, we must
+ * always lockout REP_LOCKOUT_OP first, then REP_LOCKOUT_API.
+ */
+ if ((ret = __rep_lockout_int(env, rep, &rep->op_cnt, 0,
+ "op_cnt", REP_LOCKOUT_OP)) != 0)
+ return (ret);
+ if ((ret = __rep_lockout_int(env, rep, &rep->handle_cnt, 0,
+ "handle_cnt", REP_LOCKOUT_API)) != 0)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_OP);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_take_apilockout __P((ENV *));
+ *
+ * For use by repmgr (keep the module boundaries reasonably clean).
+ */
+int
+__rep_take_apilockout(env)
+ ENV *env;
+{
+ REP *rep;
+ int ret;
+
+ rep = env->rep_handle->region;
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_lockout_api(env, rep);
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_clear_apilockout __P((ENV *));
+ */
+int
+__rep_clear_apilockout(env)
+ ENV *env;
+{
+ REP *rep;
+
+ rep = env->rep_handle->region;
+
+ REP_SYSTEM_LOCK(env);
+ CLR_LOCKOUT_BDB(rep);
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __rep_lockout_apply --
+ * Coordinate with other threads processing messages so that
+ * we can run single-threaded and know that no incoming
+ * message can apply new log records.
+ * This call should be short-term covering a specific critical
+ * operation where we need to make sure no new records change
+ * the log. Currently used to coordinate with elections.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_apply __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_lockout_apply(env, rep, apply_th)
+ ENV *env;
+ REP *rep;
+ u_int32_t apply_th;
+{
+ return (__rep_lockout_int(env, rep, &rep->apply_th, apply_th,
+ "apply_th", REP_LOCKOUT_APPLY));
+}
+
+/*
+ * __rep_lockout_msg --
+ * Coordinate with other threads processing messages so that
+ * we can run single-threaded and know that no incoming
+ * message can change the world (i.e., like a NEWMASTER message).
+ * This call should be short-term covering a specific critical
+ * operation where we need to make sure no new messages arrive
+ * in the middle and all message threads are out before we start it.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_msg __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_lockout_msg(env, rep, msg_th)
+ ENV *env;
+ REP *rep;
+ u_int32_t msg_th;
+{
+ return (__rep_lockout_int(env, rep, &rep->msg_th, msg_th,
+ "msg_th", REP_LOCKOUT_MSG));
+}
+
+/*
+ * __rep_lockout_int --
+ * Internal common code for locking out and coordinating
+ * with other areas of the code.
+ * Assumes the caller holds the region mutex.
+ *
+ */
+static int
+__rep_lockout_int(env, rep, fieldp, field_val, msg, lockout_flag)
+ ENV *env;
+ REP *rep;
+ u_int32_t *fieldp;
+ const char *msg;
+ u_int32_t field_val, lockout_flag;
+{
+ int ret, wait_cnt;
+
+ FLD_SET(rep->lockout_flags, lockout_flag);
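+	/*
+	 * Descriptive note: once the lockout flag is set, no new thread can
+	 * enter the locked-out code path; we then wait for the count of
+	 * threads already inside (*fieldp) to drain down to field_val,
+	 * typically 0, or 1 when the caller itself is counted (as in the
+	 * __rep_lockout_msg(env, rep, 1) call from __rep_new_master).
+	 */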
+ for (wait_cnt = 0; *fieldp > field_val;) {
+ if ((ret = __rep_notify_threads(env, LOCKOUT)) != 0)
+ return (ret);
+ REP_SYSTEM_UNLOCK(env);
+		/*
+		 * We're spinning - environment may be hung. Check if
+		 * recovery has been initiated.
+		 */
+ PANIC_CHECK(env);
+ __os_yield(env, 1, 0);
+#ifdef DIAGNOSTIC
+ if (wait_cnt == 5) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Waiting for %s (%lu) to complete lockout to %lu",
+ msg, (u_long)*fieldp, (u_long)field_val));
+ __db_errx(env, DB_STR_A("3510",
+"Waiting for %s (%lu) to complete replication lockout",
+ "%s %lu"), msg, (u_long)*fieldp);
+ }
+ if (++wait_cnt % 60 == 0)
+ __db_errx(env, DB_STR_A("3511",
+"Waiting for %s (%lu) to complete replication lockout for %d minutes",
+ "%s %lu %d"), msg, (u_long)*fieldp, wait_cnt / 60);
+#endif
+ REP_SYSTEM_LOCK(env);
+ }
+
+ COMPQUIET(msg, NULL);
+ return (0);
+}
+
+/*
+ * __rep_send_throttle --
+ * Send a record, throttling if necessary. Callers of this function
+ * will throttle - breaking out of their loop, if the repth->type field
+ * changes from the normal message type to the *_MORE message type.
+ * This function will send the normal type unless throttling gets invoked.
+ * Then it sets the type field and sends the _MORE message.
+ *
+ * Throttling is always only relevant in serving requests, so we always send
+ * with REPCTL_RESEND. Additional desired flags can be passed in the ctlflags
+ * argument.
+ *
+ * PUBLIC: int __rep_send_throttle __P((ENV *, int, REP_THROTTLE *,
+ * PUBLIC: u_int32_t, u_int32_t));
+ */
+int
+__rep_send_throttle(env, eid, repth, flags, ctlflags)
+ ENV *env;
+ int eid;
+ REP_THROTTLE *repth;
+ u_int32_t ctlflags, flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t size, typemore;
+ int check_limit;
+
+ check_limit = repth->gbytes != 0 || repth->bytes != 0;
+ /*
+ * If we only want to do throttle processing and we don't have it
+ * turned on, return immediately.
+ */
+ if (!check_limit && LF_ISSET(REP_THROTTLE_ONLY))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ typemore = 0;
+ if (repth->type == REP_LOG)
+ typemore = REP_LOG_MORE;
+ if (repth->type == REP_PAGE)
+ typemore = REP_PAGE_MORE;
+ DB_ASSERT(env, typemore != 0);
+
+ /*
+ * data_dbt.size is only the size of the log
+ * record; it doesn't count the size of the
+ * control structure. Factor that in as well
+ * so we're not off by a lot if our log records
+ * are small.
+ */
+ size = repth->data_dbt->size + sizeof(__rep_control_args);
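+	/*
+	 * Worked example of the accounting below: with a remaining budget
+	 * of gbytes = 1 and bytes = 100, a 64KB record first converts the
+	 * gigabyte into bytes (bytes += GIGABYTE, gbytes = 0), then sends
+	 * and deducts its size from bytes. Only once both gbytes and bytes
+	 * are exhausted do we switch to the *_MORE type, which tells the
+	 * caller to break out of its send loop.
+	 */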
+ if (check_limit) {
+ while (repth->bytes <= size) {
+ if (repth->gbytes > 0) {
+ repth->bytes += GIGABYTE;
+ --(repth->gbytes);
+ continue;
+ }
+ /*
+ * We don't hold the rep mutex,
+ * and may miscount.
+ */
+ STAT(rep->stat.st_nthrottles++);
+ repth->type = typemore;
+ goto snd;
+ }
+ repth->bytes -= size;
+ }
+ /*
+ * Always send if it is typemore, otherwise send only if
+ * REP_THROTTLE_ONLY is not set.
+ *
+ * NOTE: It is the responsibility of the caller to marshal, if
+ * needed, the data_dbt. This function just sends what it is given.
+ */
+snd: if ((repth->type == typemore || !LF_ISSET(REP_THROTTLE_ONLY)) &&
+ (__rep_send_message(env, eid, repth->type,
+ &repth->lsn, repth->data_dbt, (REPCTL_RESEND | ctlflags), 0) != 0))
+ return (DB_REP_UNAVAIL);
+ return (0);
+}
+
+/*
+ * __rep_msg_to_old --
+ * Convert current message numbers to old message numbers.
+ *
+ * PUBLIC: u_int32_t __rep_msg_to_old __P((u_int32_t, u_int32_t));
+ */
+u_int32_t
+__rep_msg_to_old(version, rectype)
+ u_int32_t version, rectype;
+{
+ /*
+ * We need to convert from current message numbers to old numbers and
+ * we need to convert from old numbers to current numbers. Offset by
+ * one for more readable code.
+ */
+ /*
+	 * Everything for version 0 is invalid; there is no version 0.
+ */
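+	/*
+	 * Worked example (values taken from the table below): under
+	 * DB_REPVERSION 3 (4.4/4.5), REP_LOG (current number 11) converts
+	 * to old number 10, and REP_LEASE_GRANT converts to REP_INVALID
+	 * because leases did not exist in 4.4/4.5.
+	 */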
+ static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = {
+ /* There is no DB_REPVERSION 0. */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.2/DB_REPVERSION 1 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.3/DB_REPVERSION 2 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * From 4.7 message number To 4.4/4.5 message number
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ REP_INVALID, /* REP_LEASE_GRANT */
+ 10, /* REP_LOG */
+ 11, /* REP_LOG_MORE */
+ 12, /* REP_LOG_REQ */
+ 13, /* REP_MASTER_REQ */
+ 14, /* REP_NEWCLIENT */
+ 15, /* REP_NEWFILE */
+ 16, /* REP_NEWMASTER */
+ 17, /* REP_NEWSITE */
+ 18, /* REP_PAGE */
+ 19, /* REP_PAGE_FAIL */
+ 20, /* REP_PAGE_MORE */
+ 21, /* REP_PAGE_REQ */
+ 22, /* REP_REREQUEST */
+ REP_INVALID, /* REP_START_SYNC */
+ 23, /* REP_UPDATE */
+ 24, /* REP_UPDATE_REQ */
+ 25, /* REP_VERIFY */
+ 26, /* REP_VERIFY_FAIL */
+ 27, /* REP_VERIFY_REQ */
+ 28, /* REP_VOTE1 */
+ 29 /* REP_VOTE2 */
+ },
+ /*
+ * From 4.7 message number To 4.6 message number. There are
+ * NO message differences between 4.6 and 4.7. The
+ * control structure changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ },
+ /*
+ * From 5.2 message number To 4.7 message number. There are
+ * NO message differences between 4.7 and 5.2. The
+ * content of vote1 changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ },
+ /*
+ * From 5.3 message number To 4.7 message number. There are
+ * NO message differences between 4.7 and 5.3. The
+ * content of fileinfo changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ }
+ };
+ return (table[version][rectype]);
+}
+
+/*
+ * __rep_msg_from_old --
+ * Convert old message numbers to current message numbers.
+ *
+ * PUBLIC: u_int32_t __rep_msg_from_old __P((u_int32_t, u_int32_t));
+ */
+u_int32_t
+__rep_msg_from_old(version, rectype)
+ u_int32_t version, rectype;
+{
+ /*
+ * We need to convert from current message numbers to old numbers and
+ * we need to convert from old numbers to current numbers. Offset by
+ * one for more readable code.
+ */
+ /*
+	 * Everything for version 0 is invalid; there is no version 0.
+ */
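+	/*
+	 * Worked example (values taken from the table below): under
+	 * DB_REPVERSION 3 (4.4/4.5), old message number 10 (REP_LOG in
+	 * that version's numbering) converts to the current number 11.
+	 */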
+ static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = {
+ /* There is no DB_REPVERSION 0. */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.2/DB_REPVERSION 1 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.3/DB_REPVERSION 2 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * From 4.4/4.5 message number To 4.7 message number
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ /* 10, REP_LEASE_GRANT doesn't exist */
+ 11, /* 10, REP_LOG */
+ 12, /* 11, REP_LOG_MORE */
+ 13, /* 12, REP_LOG_REQ */
+ 14, /* 13, REP_MASTER_REQ */
+ 15, /* 14, REP_NEWCLIENT */
+ 16, /* 15, REP_NEWFILE */
+ 17, /* 16, REP_NEWMASTER */
+ 18, /* 17, REP_NEWSITE */
+ 19, /* 18, REP_PAGE */
+ 20, /* 19, REP_PAGE_FAIL */
+ 21, /* 20, REP_PAGE_MORE */
+ 22, /* 21, REP_PAGE_REQ */
+ 23, /* 22, REP_REREQUEST */
+ /* 24, REP_START_SYNC doesn't exist */
+ 25, /* 23, REP_UPDATE */
+ 26, /* 24, REP_UPDATE_REQ */
+ 27, /* 25, REP_VERIFY */
+ 28, /* 26, REP_VERIFY_FAIL */
+ 29, /* 27, REP_VERIFY_REQ */
+ 30, /* 28, REP_VOTE1 */
+ 31, /* 29, REP_VOTE2 */
+ REP_INVALID, /* 30, 4.4/4.5 no message */
+ REP_INVALID /* 31, 4.4/4.5 no message */
+ },
+ /*
+ * From 4.6 message number To 4.7 message number. There are
+ * NO message differences between 4.6 and 4.7. The
+ * control structure changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ 10, /* 10, REP_LEASE_GRANT */
+ 11, /* 11, REP_LOG */
+ 12, /* 12, REP_LOG_MORE */
+ 13, /* 13, REP_LOG_REQ */
+ 14, /* 14, REP_MASTER_REQ */
+ 15, /* 15, REP_NEWCLIENT */
+ 16, /* 16, REP_NEWFILE */
+ 17, /* 17, REP_NEWMASTER */
+ 18, /* 18, REP_NEWSITE */
+ 19, /* 19, REP_PAGE */
+ 20, /* 20, REP_PAGE_FAIL */
+ 21, /* 21, REP_PAGE_MORE */
+ 22, /* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+ 24, /* 24, REP_START_SYNC */
+ 25, /* 25, REP_UPDATE */
+ 26, /* 26, REP_UPDATE_REQ */
+ 27, /* 27, REP_VERIFY */
+ 28, /* 28, REP_VERIFY_FAIL */
+ 29, /* 29, REP_VERIFY_REQ */
+ 30, /* 30, REP_VOTE1 */
+ 31 /* 31, REP_VOTE2 */
+ },
+ /*
+ * From 4.7 message number To 5.2 message number. There are
+ * NO message differences between them. The vote1 contents
+ * changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ 10, /* 10, REP_LEASE_GRANT */
+ 11, /* 11, REP_LOG */
+ 12, /* 12, REP_LOG_MORE */
+ 13, /* 13, REP_LOG_REQ */
+ 14, /* 14, REP_MASTER_REQ */
+ 15, /* 15, REP_NEWCLIENT */
+ 16, /* 16, REP_NEWFILE */
+ 17, /* 17, REP_NEWMASTER */
+ 18, /* 18, REP_NEWSITE */
+ 19, /* 19, REP_PAGE */
+ 20, /* 20, REP_PAGE_FAIL */
+ 21, /* 21, REP_PAGE_MORE */
+ 22, /* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+ 24, /* 24, REP_START_SYNC */
+ 25, /* 25, REP_UPDATE */
+ 26, /* 26, REP_UPDATE_REQ */
+ 27, /* 27, REP_VERIFY */
+ 28, /* 28, REP_VERIFY_FAIL */
+ 29, /* 29, REP_VERIFY_REQ */
+ 30, /* 30, REP_VOTE1 */
+ 31 /* 31, REP_VOTE2 */
+ },
+ /*
+ * From 4.7 message number To 5.3 message number. There are
+ * NO message differences between them. The fileinfo contents
+ * changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ 10, /* 10, REP_LEASE_GRANT */
+ 11, /* 11, REP_LOG */
+ 12, /* 12, REP_LOG_MORE */
+ 13, /* 13, REP_LOG_REQ */
+ 14, /* 14, REP_MASTER_REQ */
+ 15, /* 15, REP_NEWCLIENT */
+ 16, /* 16, REP_NEWFILE */
+ 17, /* 17, REP_NEWMASTER */
+ 18, /* 18, REP_NEWSITE */
+ 19, /* 19, REP_PAGE */
+ 20, /* 20, REP_PAGE_FAIL */
+ 21, /* 21, REP_PAGE_MORE */
+ 22, /* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+ 24, /* 24, REP_START_SYNC */
+ 25, /* 25, REP_UPDATE */
+ 26, /* 26, REP_UPDATE_REQ */
+ 27, /* 27, REP_VERIFY */
+ 28, /* 28, REP_VERIFY_FAIL */
+ 29, /* 29, REP_VERIFY_REQ */
+ 30, /* 30, REP_VOTE1 */
+ 31 /* 31, REP_VOTE2 */
+ }
+ };
+ return (table[version][rectype]);
+}
+
+/*
+ * __rep_print_system --
+ * Optionally print a verbose message, including to the system file.
+ *
+ * PUBLIC: int __rep_print_system __P((ENV *, u_int32_t, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__rep_print_system(ENV *env, u_int32_t verbose, const char *fmt, ...)
+#else
+__rep_print_system(env, verbose, fmt, va_alist)
+ ENV *env;
+ u_int32_t verbose;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __rep_print_int(env, verbose | DB_VERB_REP_SYSTEM, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __rep_print --
+ * Optionally print a verbose message.
+ *
+ * PUBLIC: int __rep_print __P((ENV *, u_int32_t, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__rep_print(ENV *env, u_int32_t verbose, const char *fmt, ...)
+#else
+__rep_print(env, verbose, fmt, va_alist)
+ ENV *env;
+ u_int32_t verbose;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __rep_print_int(env, verbose, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __rep_print_int --
+ * Optionally print a verbose message.
+ *
+ * NOTE:
+ * One anomaly is that the messaging functions are expected to be
+ * void functions, but the use of a mutex in __rep_print_int requires
+ * a return value.
+ */
+static int
+__rep_print_int(env, verbose, fmt, ap)
+ ENV *env;
+ u_int32_t verbose;
+ const char *fmt;
+ va_list ap;
+{
+ DB_MSGBUF mb;
+ REP *rep;
+ db_timespec ts;
+ pid_t pid;
+ db_threadid_t tid;
+ int diag_msg;
+ u_int32_t regular_msg, tmp_verbose;
+ const char *s;
+ char buf[DB_THREADID_STRLEN];
+
+ tmp_verbose = env->dbenv->verbose;
+ if (FLD_ISSET(tmp_verbose, verbose | DB_VERB_REPLICATION) == 0)
+ return (0);
+ DB_MSGBUF_INIT(&mb);
+
+ diag_msg = 0;
+ if (REP_ON(env)) {
+ rep = env->rep_handle->region;
+ /*
+ * If system diag messages are configured and this message's
+ * verbose level includes DB_VERB_REP_SYSTEM, this is a diag
+ * message. This means it will be written to the diagnostic
+ * message files.
+ */
+ diag_msg = FLD_ISSET(tmp_verbose, DB_VERB_REP_SYSTEM) &&
+ FLD_ISSET(verbose, DB_VERB_REP_SYSTEM) &&
+ !FLD_ISSET(rep->config, REP_C_INMEM);
+ } else
+ rep = NULL;
+ /*
+ * We need to know if this message should be printed out
+ * via the regular, user mechanism.
+ */
+ FLD_CLR(tmp_verbose, DB_VERB_REP_SYSTEM);
+ regular_msg = FLD_ISSET(tmp_verbose,
+ verbose | DB_VERB_REPLICATION);
+
+ /*
+	 * It is possible we are called before the env has finished being
+	 * set up; skip printing in that case.
+ */
+ if (diag_msg == 0 && regular_msg == 0)
+ return (0);
+ s = NULL;
+ if (env->dbenv->db_errpfx != NULL)
+ s = env->dbenv->db_errpfx;
+ else if (rep != NULL) {
+ if (F_ISSET(rep, REP_F_CLIENT))
+ s = "CLIENT";
+ else if (F_ISSET(rep, REP_F_MASTER))
+ s = "MASTER";
+ }
+ if (s == NULL)
+ s = "REP_UNDEF";
+ __os_id(env->dbenv, &pid, &tid);
+ if (diag_msg)
+ MUTEX_LOCK(env, rep->mtx_diag);
+ __os_gettime(env, &ts, 1);
+ __db_msgadd(env, &mb, "[%lu:%lu][%s] %s: ",
+ (u_long)ts.tv_sec, (u_long)ts.tv_nsec/NS_PER_US,
+ env->dbenv->thread_id_string(env->dbenv, pid, tid, buf), s);
+
+ __db_msgadd_ap(env, &mb, fmt, ap);
+
+ DB_MSGBUF_REP_FLUSH(env, &mb, diag_msg, regular_msg);
+ if (diag_msg)
+ MUTEX_UNLOCK(env, rep->mtx_diag);
+ return (0);
+}
+
+/*
+ * PUBLIC: void __rep_print_message
+ * PUBLIC: __P((ENV *, int, __rep_control_args *, char *, u_int32_t));
+ */
+void
+__rep_print_message(env, eid, rp, str, flags)
+ ENV *env;
+ int eid;
+ __rep_control_args *rp;
+ char *str;
+ u_int32_t flags;
+{
+ u_int32_t ctlflags, rectype, verbflag;
+ char ftype[64], *home, *type;
+
+ rectype = rp->rectype;
+ ctlflags = rp->flags;
+ verbflag = DB_VERB_REP_MSGS | DB_VERB_REPLICATION;
+ if (rp->rep_version != DB_REPVERSION)
+ rectype = __rep_msg_from_old(rp->rep_version, rectype);
+ switch (rectype) {
+ case REP_ALIVE:
+ FLD_SET(verbflag, DB_VERB_REP_ELECT | DB_VERB_REP_MISC);
+ type = "alive";
+ break;
+ case REP_ALIVE_REQ:
+ type = "alive_req";
+ break;
+ case REP_ALL_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "all_req";
+ break;
+ case REP_BULK_LOG:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "bulk_log";
+ break;
+ case REP_BULK_PAGE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "bulk_page";
+ break;
+ case REP_DUPMASTER:
+ FLD_SET(verbflag, DB_VERB_REP_SYSTEM);
+ type = "dupmaster";
+ break;
+ case REP_FILE:
+ type = "file";
+ break;
+ case REP_FILE_FAIL:
+ type = "file_fail";
+ break;
+ case REP_FILE_REQ:
+ type = "file_req";
+ break;
+ case REP_LEASE_GRANT:
+ FLD_SET(verbflag, DB_VERB_REP_LEASE);
+ type = "lease_grant";
+ break;
+ case REP_LOG:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "log";
+ break;
+ case REP_LOG_MORE:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "log_more";
+ break;
+ case REP_LOG_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "log_req";
+ break;
+ case REP_MASTER_REQ:
+ type = "master_req";
+ break;
+ case REP_NEWCLIENT:
+ FLD_SET(verbflag, DB_VERB_REP_MISC | DB_VERB_REP_SYSTEM);
+ type = "newclient";
+ break;
+ case REP_NEWFILE:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "newfile";
+ break;
+ case REP_NEWMASTER:
+ FLD_SET(verbflag, DB_VERB_REP_MISC | DB_VERB_REP_SYSTEM);
+ type = "newmaster";
+ break;
+ case REP_NEWSITE:
+ type = "newsite";
+ break;
+ case REP_PAGE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page";
+ break;
+ case REP_PAGE_FAIL:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page_fail";
+ break;
+ case REP_PAGE_MORE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page_more";
+ break;
+ case REP_PAGE_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page_req";
+ break;
+ case REP_REREQUEST:
+ type = "rerequest";
+ break;
+ case REP_START_SYNC:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "start_sync";
+ break;
+ case REP_UPDATE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "update";
+ break;
+ case REP_UPDATE_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "update_req";
+ break;
+ case REP_VERIFY:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "verify";
+ break;
+ case REP_VERIFY_FAIL:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "verify_fail";
+ break;
+ case REP_VERIFY_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "verify_req";
+ break;
+ case REP_VOTE1:
+ FLD_SET(verbflag, DB_VERB_REP_ELECT | DB_VERB_REP_SYSTEM);
+ type = "vote1";
+ break;
+ case REP_VOTE2:
+ FLD_SET(verbflag, DB_VERB_REP_ELECT | DB_VERB_REP_SYSTEM);
+ type = "vote2";
+ break;
+ default:
+ type = "NOTYPE";
+ break;
+ }
+
+ /*
+ * !!!
+	 * If adding new flags to print out, make sure the aggregate
+	 * length cannot overflow the buffer.
+ */
+ ftype[0] = '\0';
+ if (LF_ISSET(DB_REP_ANYWHERE))
+ (void)strcat(ftype, " any"); /* 4 */
+ if (FLD_ISSET(ctlflags, REPCTL_FLUSH))
+ (void)strcat(ftype, " flush"); /* 10 */
+ /*
+	 * We expect that most of the time the messages will indicate
+	 * group membership, so we only print a flag when the message
+	 * does not.
+ */
+ if (!FLD_ISSET(ctlflags, REPCTL_GROUP_ESTD))
+ (void)strcat(ftype, " nogroup"); /* 18 */
+ if (FLD_ISSET(ctlflags, REPCTL_LEASE))
+ (void)strcat(ftype, " lease"); /* 24 */
+ if (LF_ISSET(DB_REP_NOBUFFER))
+ (void)strcat(ftype, " nobuf"); /* 30 */
+ if (FLD_ISSET(ctlflags, REPCTL_PERM))
+ (void)strcat(ftype, " perm"); /* 35 */
+ if (LF_ISSET(DB_REP_REREQUEST))
+ (void)strcat(ftype, " rereq"); /* 41 */
+ if (FLD_ISSET(ctlflags, REPCTL_RESEND))
+ (void)strcat(ftype, " resend"); /* 48 */
+ if (FLD_ISSET(ctlflags, REPCTL_LOG_END))
+ (void)strcat(ftype, " logend"); /* 55 */
+
+ /*
+ * !!!
+ * We selectively turned on bits using different verbose settings
+ * that relate to each message type. Therefore, since the
+ * DB_VERB_REP_SYSTEM flag is explicitly set above when wanted,
+ * we *must* use the VPRINT macro here. It will correctly
+ * handle the messages whether or not the SYSTEM flag is set.
+ */
+ if ((home = env->db_home) == NULL)
+ home = "NULL";
+ VPRINT(env, (env, verbflag,
+ "%s %s: msgv = %lu logv %lu gen = %lu eid %d, type %s, LSN [%lu][%lu] %s",
+ home, str,
+ (u_long)rp->rep_version, (u_long)rp->log_version, (u_long)rp->gen,
+ eid, type, (u_long)rp->lsn.file, (u_long)rp->lsn.offset, ftype));
+ /*
+ * Make sure the version is close, and not swapped
+ * here. Check for current version, +/- a little bit.
+ */
+ DB_ASSERT(env, rp->rep_version <= DB_REPVERSION+10);
+ DB_ASSERT(env, rp->log_version <= DB_LOGVERSION+10);
+}
+
+/*
+ * PUBLIC: void __rep_fire_event __P((ENV *, u_int32_t, void *));
+ */
+void
+__rep_fire_event(env, event, info)
+ ENV *env;
+ u_int32_t event;
+ void *info;
+{
+ int ret;
+
+ /*
+ * Give repmgr first crack at handling all replication-related events.
+ * If it can't (or chooses not to) handle the event fully, then pass it
+ * along to the application.
+ */
+ ret = __repmgr_handle_event(env, event, info);
+ DB_ASSERT(env, ret == 0 || ret == DB_EVENT_NOT_HANDLED);
+
+ if (ret == DB_EVENT_NOT_HANDLED)
+ DB_EVENT(env, event, info);
+}
+
+/*
+ * __rep_msg --
+ * Rep system diagnostic messaging routine.
+ * This function is called from the __db_msg subsystem to
+ * write out diagnostic messages to replication-owned files.
+ *
+ * PUBLIC: void __rep_msg __P((const ENV *, const char *));
+ */
+void
+__rep_msg(env, msg)
+ const ENV *env;
+ const char *msg;
+{
+ DB_FH *fhp;
+ DB_REP *db_rep;
+ REP *rep;
+ int i;
+ size_t cnt, nlcnt;
+ char nl = '\n';
+
+ if (PANIC_ISSET(env))
+ return;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ DB_ASSERT((ENV *)env, !FLD_ISSET(rep->config, REP_C_INMEM));
+ /*
+	 * We know the only way we get here is with the mutex locked, so
+	 * we can safely read and modify all the diag-related fields.
+ */
+ i = rep->diag_index;
+ fhp = db_rep->diagfile[i];
+
+ if (db_rep->diag_off != rep->diag_off)
+ (void)__os_seek((ENV *)env, fhp, 0, 0, rep->diag_off);
+ if (__os_write((ENV *)env, fhp, (void *)msg, strlen(msg), &cnt) != 0)
+ return;
+ if (__os_write((ENV *)env, fhp, &nl, 1, &nlcnt) != 0)
+ return;
+ db_rep->diag_off = rep->diag_off += (cnt + nlcnt);
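+	/*
+	 * Descriptive note: db_rep->diag_off is this process's cached copy
+	 * of the shared rep->diag_off; keeping the two equal lets us skip
+	 * the __os_seek above when no other process wrote in between.
+	 */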
+ /*
+ * If writing this message put us over the file size threshold,
+	 * then we rotate to the next file. We don't care whether we land
+	 * exactly at the size; some amount over the threshold is fine.
+ */
+ if (rep->diag_off >= REP_DIAGSIZE) {
+ rep->diag_index = (++i % DBREP_DIAG_FILES);
+ rep->diag_off = 0;
+ }
+ return;
+}
+
+/*
+ * PUBLIC: int __rep_notify_threads __P((ENV *, rep_waitreason_t));
+ *
+ * Caller must hold rep region mutex. In the AWAIT_LSN case, caller must also
+ * hold mtx_clientdb.
+ */
+int
+__rep_notify_threads(env, wake_reason)
+ ENV *env;
+ rep_waitreason_t wake_reason;
+{
+ REP *rep;
+ struct __rep_waiter *waiter;
+ struct rep_waitgoal *goal;
+ int ret, wake;
+
+ ret = 0;
+ rep = env->rep_handle->region;
+
+ SH_TAILQ_FOREACH(waiter, &rep->waiters, links, __rep_waiter) {
+ goal = &waiter->goal;
+ wake = 0;
+ if (wake_reason == LOCKOUT) {
+ F_SET(waiter, REP_F_PENDING_LOCKOUT);
+ wake = 1;
+ } else if (wake_reason == goal->why ||
+ (goal->why == AWAIT_HISTORY && wake_reason == AWAIT_LSN)) {
+ /*
+ * It's important that we only call __rep_check_goal
+ * with "goals" that match the wake_reason passed to us
+ * (modulo the LSN-to-HISTORY equivalence), because the
+ * caller has ensured that it is holding the appropriate
+ * mutexes depending on the wake_reason.
+ */
+ if ((ret = __rep_check_goal(env, goal)) == 0)
+ wake = 1;
+ else if (ret == DB_TIMEOUT)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ if (wake) {
+ MUTEX_UNLOCK(env, waiter->mtx_repwait);
+ SH_TAILQ_REMOVE(&rep->waiters,
+ waiter, links, __rep_waiter);
+ F_SET(waiter, REP_F_WOKEN);
+ }
+ }
+
+out:
+ return (ret);
+}
+
+/*
+ * A "wait goal" describes a condition that a thread may be waiting for.
+ * Evaluate the condition, returning 0 if the condition has been satisfied, and
+ * DB_TIMEOUT if not.
+ *
+ * Caller must hold REP_SYSTEM lock and/or mtx_clientdb as appropriate.
+ *
+ * PUBLIC: int __rep_check_goal __P((ENV *, struct rep_waitgoal *));
+ */
+int
+__rep_check_goal(env, goal)
+ ENV *env;
+ struct rep_waitgoal *goal;
+{
+ REP *rep;
+ LOG *lp;
+ int ret;
+
+ rep = env->rep_handle->region;
+ lp = env->lg_handle->reginfo.primary;
+ ret = DB_TIMEOUT; /* Pessimistic, to start. */
+
+ /*
+ * Note that while AWAIT_LSN and AWAIT_HISTORY look similar, they are
+ * actually quite different. With AWAIT_LSN, the u.lsn is the LSN of
+ * the commit of the transaction the caller is waiting for. So we need
+ * to make sure we have gotten at least that far, thus ">=".
+ *
+ * For AWAIT_HISTORY, the u.lsn is simply a copy of whatever the current
+	 * max_perm_lsn was at the time we last checked. So if we have
+	 * anything *beyond* that, we should wake up again and check to see
+	 * if we now have the desired history (thus ">"). Thus when we're
+ * waiting for HISTORY we're going to get woken *at every commit we
+ * receive*! Fortunately it should be coming as the first transaction
+ * after the gen change, and waiting for HISTORY should be extremely
+ * rare anyway.
+ */
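+	/*
+	 * Illustrative example: if max_perm_lsn is [5][2000], an AWAIT_LSN
+	 * goal of [5][2000] is satisfied (">="), while an AWAIT_HISTORY
+	 * goal recorded at [5][2000] keeps waiting until some later LSN
+	 * arrives (">").
+	 */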
+ switch (goal->why) {
+ case AWAIT_LSN:
+ /* Have we reached our goal LSN? */
+ if (LOG_COMPARE(&lp->max_perm_lsn, &goal->u.lsn) >= 0)
+ ret = 0;
+ break;
+ case AWAIT_HISTORY:
+ /*
+ * Have we made any progress whatsoever, beyond where we were at
+ * the time the waiting thread noted the current LSN?
+ * When we have to wait for replication of the LSN history
+ * database, we don't know what LSN it's going to occur at. So
+ * we have to wake up every time we get a new transaction.
+ * Fortunately, this should be exceedingly rare, and the number
+ * of transactions we have to plow through should almost never
+ * be more than 1.
+ */
+ if (LOG_COMPARE(&lp->max_perm_lsn, &goal->u.lsn) > 0)
+ ret = 0;
+ break;
+ case AWAIT_GEN:
+ if (rep->gen >= goal->u.gen)
+ ret = 0;
+ break;
+ case AWAIT_NIMDB:
+ if (F_ISSET(rep, REP_F_NIMDBS_LOADED))
+ ret = 0;
+ break;
+ default:
+ DB_ASSERT(env, 0);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_log_backup --
+ *
+ * Walk backwards in the log looking for specific kinds of records.
+ *
+ * PUBLIC: int __rep_log_backup __P((ENV *, DB_LOGC *, DB_LSN *, u_int32_t));
+ */
+int
+__rep_log_backup(env, logc, lsn, match)
+ ENV *env;
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ u_int32_t match;
+{
+ DBT mylog;
+ u_int32_t rectype;
+ int ret;
+
+ ret = 0;
+ memset(&mylog, 0, sizeof(mylog));
+ while ((ret = __logc_get(logc, lsn, &mylog, DB_PREV)) == 0) {
+ LOGCOPY_32(env, &rectype, mylog.data);
+ /*
+ * Check the record type against the desired match type(s).
+ */
+ if ((match == REP_REC_COMMIT &&
+ rectype == DB___txn_regop) ||
+ (match == REP_REC_PERM &&
+ (rectype == DB___txn_ckp || rectype == DB___txn_regop)))
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_get_maxpermlsn --
+ *
+ * Safely retrieve the current max_perm_lsn value.
+ *
+ * PUBLIC: int __rep_get_maxpermlsn __P((ENV *, DB_LSN *));
+ */
+int
+__rep_get_maxpermlsn(env, max_perm_lsnp)
+ ENV *env;
+ DB_LSN *max_perm_lsnp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ *max_perm_lsnp = lp->max_perm_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+ return (0);
+}
+
+/*
+ * __rep_is_internal_rep_file --
+ *
+ * Return 1 if filename is an internal replication file; 0 otherwise.
+ * Works for all internal replication files including internal database
+ * files.
+ *
+ * PUBLIC: int __rep_is_internal_rep_file __P((char *));
+ */
+int
+__rep_is_internal_rep_file(filename)
+ char *filename;
+{
+ return (strncmp(filename,
+ REPFILEPREFIX, sizeof(REPFILEPREFIX) - 1) == 0 ? 1 : 0);
+}
+
+/*
+ * Get the last generation number from the LSN history database.
+ *
+ * PUBLIC: int __rep_get_datagen __P((ENV *, u_int32_t *));
+ */
+int
+__rep_get_datagen(env, data_genp)
+ ENV *env;
+ u_int32_t *data_genp;
+{
+ DB_REP *db_rep;
+ DB_TXN *txn;
+ DB *dbp;
+ DBC *dbc;
+ __rep_lsn_hist_key_args key;
+ u_int8_t key_buf[__REP_LSN_HIST_KEY_SIZE];
+ u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
+ DBT key_dbt, data_dbt;
+ u_int32_t flags;
+ int ret, t_ret, tries;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+ *data_genp = 0;
+ tries = 0;
+ flags = DB_LAST;
+retry:
+ if ((ret = __txn_begin(env, NULL, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ return (ret);
+
+ if ((dbp = db_rep->lsn_db) == NULL) {
+ if ((ret = __rep_open_sysdb(env,
+ NULL, txn, REPLSNHIST, 0, &dbp)) != 0) {
+ /*
+			 * If the database isn't there, it could be because it's
+			 * memory-resident and we haven't yet sync'ed with the
+			 * master to materialize it, or because this is a brand
+			 * new environment. Either way a datagen of 0 is
+			 * correct; it is not an error.
+ */
+ ret = 0;
+ goto out;
+ }
+ db_rep->lsn_db = dbp;
+ }
+
+ if ((ret = __db_cursor(dbp, NULL, txn, &dbc, 0)) != 0)
+ goto out;
+
+ DB_INIT_DBT(key_dbt, key_buf, __REP_LSN_HIST_KEY_SIZE);
+ key_dbt.ulen = __REP_LSN_HIST_KEY_SIZE;
+ F_SET(&key_dbt, DB_DBT_USERMEM);
+
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ data_dbt.data = data_buf;
+ data_dbt.ulen = __REP_LSN_HIST_DATA_SIZE;
+ F_SET(&data_dbt, DB_DBT_USERMEM);
+ if ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, flags)) != 0) {
+ if ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
+ ++tries < 5) /* Limit of 5 is an arbitrary choice. */
+ ret = 0;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If we have any kind of error at this point, bail.
+ * Otherwise pause and try again.
+ */
+ if (ret != 0)
+ goto err;
+ __os_yield(env, 0, 10000); /* Arbitrary duration. */
+ goto retry;
+ }
+ if ((ret = __dbc_close(dbc)) == 0 &&
+ (ret = __rep_lsn_hist_key_unmarshal(env,
+ &key, key_buf, __REP_LSN_HIST_KEY_SIZE, NULL)) == 0)
+ *data_genp = key.gen;
+out:
+ if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+err:
+ return (ret);
+}
diff --git a/src/rep/rep_verify.c b/src/rep/rep_verify.c
new file mode 100644
index 00000000..5238f900
--- /dev/null
+++ b/src/rep/rep_verify.c
@@ -0,0 +1,751 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+static int __rep_internal_init __P((ENV *, u_int32_t));
+
+/*
+ * __rep_verify --
+ * Handle a REP_VERIFY message.
+ *
+ * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *,
+ * PUBLIC: int, time_t));
+ */
+int
+__rep_verify(env, rp, rec, eid, savetime)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+ time_t savetime;
+{
+ DBT mylog;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn, prev_ckp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __txn_ckp_args *ckp_args;
+ u_int32_t logflag, rectype;
+ int master, match, ret, t_ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Do nothing if VERIFY is not set. */
+ if (rep->sync_state != SYNC_VERIFY)
+ return (ret);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ if (IS_USING_LEASES(env)) {
+ REP_SYSTEM_LOCK(env);
+ DB_ASSERT(env, __rep_islease_granted(env) == 0);
+ REP_SYSTEM_UNLOCK(env);
+ }
+#endif
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&mylog, 0, sizeof(mylog));
+	/* If a verify_lsn of ZERO is passed in, get the last log record. */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET;
+ prev_ckp = lp->prev_ckp;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0)
+ goto out;
+ match = 0;
+ if (mylog.size == rec->size &&
+ memcmp(mylog.data, rec->data, rec->size) == 0)
+ match = 1;
+ /*
+	 * If we don't have a match, back up to the previous
+ * identification record and try again.
+ */
+ if (match == 0) {
+ master = rep->master_id;
+ /*
+ * We will eventually roll back over this log record (unless we
+ * ultimately have to give up and do an internal init). So, if
+ * it was a checkpoint, make sure we don't end up without any
+ * checkpoints left in the entire log.
+ */
+ LOGCOPY_32(env, &rectype, mylog.data);
+ DB_ASSERT(env, ret == 0);
+ if (!lp->db_log_inmemory && rectype == DB___txn_ckp) {
+ if ((ret = __txn_ckp_read(env,
+ mylog.data, &ckp_args)) != 0)
+ goto out;
+ lsn = ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->prev_ckp = lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (IS_ZERO_LSN(lsn)) {
+ /*
+ * No previous checkpoints? The only way this
+ * is OK is if we have the entire log, all the
+ * way back to file #1.
+ */
+ if ((ret = __logc_get(logc,
+ &lsn, &mylog, DB_FIRST)) != 0)
+ goto out;
+ if (lsn.file != 1) {
+ ret = __rep_internal_init(env, 0);
+ goto out;
+ }
+
+ /* Restore position of log cursor. */
+ if ((ret = __logc_get(logc,
+ &rp->lsn, &mylog, DB_SET)) != 0)
+ goto out;
+ }
+ }
+ if ((ret = __rep_log_backup(env, logc, &lsn,
+ REP_REC_PERM)) == 0) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->verify_lsn = lsn;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (master != DB_EID_INVALID)
+ eid = master;
+ (void)__rep_send_message(env, eid, REP_VERIFY_REQ,
+ &lsn, NULL, 0, DB_REP_ANYWHERE);
+ } else if (ret == DB_NOTFOUND) {
+ /*
+ * We've either run out of records because
+ * logs have been removed or we've rolled back
+ * all the way to the beginning.
+ */
+ ret = __rep_internal_init(env, 0);
+ }
+ } else {
+ /*
+ * We have a match, so we can probably do a simple sync, without
+ * needing internal init. But first, check for a couple of
+ * special cases.
+ */
+
+ if (!lp->db_log_inmemory && !IS_ZERO_LSN(prev_ckp)) {
+ /*
+ * We previously saw a checkpoint, which means we may
+ * now be about to roll back over it and lose it. Make
+ * sure we'll end up still having at least one other
+ * checkpoint. (Note that if the current record -- the
+ * one we've just matched -- happens to be a checkpoint,
+ * then it must be the same as the prev_ckp we're now
+ * about to try reading. Which means we wouldn't really
+ * have to read it. But checking for that special case
+ * doesn't seem worth the trouble.)
+ */
+ if ((ret = __logc_get(logc,
+ &prev_ckp, &mylog, DB_SET)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = __rep_internal_init(env, 0);
+ goto out;
+ }
+ /*
+			 * We succeeded in reading the prev_ckp record, so it's
+			 * safe to fall through to the verify_match.
+ */
+ }
+ /*
+ * Mixed version internal init doesn't work with 4.4, so we
+		 * can't load NIMDBs from a very old-version master. Fib to
+		 * ourselves that they're already loaded, so that we don't try.
+ */
+ if (rep->version == DB_REPVERSION_44) {
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if (F_ISSET(rep, REP_F_NIMDBS_LOADED))
+ ret = __rep_verify_match(env, &rp->lsn, savetime);
+ else {
+ /*
+ * Even though we found a match, we haven't yet loaded
+ * any NIMDBs, so we have to do an abbreviated internal
+ * init. We leave lp->verify_lsn set to the matching
+ * sync point, in case upon eventual examination of the
+ * UPDATE message it turns out there are no NIMDBs
+ * (since we can then skip back to a verify_match
+ * outcome).
+ */
+ ret = __rep_internal_init(env, REP_F_ABBREVIATED);
+ }
+ }
+
+out: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__rep_internal_init(env, abbrev)
+ ENV *env;
+ u_int32_t abbrev;
+{
+ REP *rep;
+ int master, ret;
+
+ rep = env->rep_handle->region;
+ REP_SYSTEM_LOCK(env);
+#ifdef HAVE_STATISTICS
+ if (!abbrev)
+ rep->stat.st_outdated++;
+#endif
+
+ /*
+ * What we call "abbreviated internal init" is really just NIMDB
+ * materialization, and we always do that even if AUTOINIT has been
+ * turned off.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_AUTOINIT) && !abbrev)
+ ret = DB_REP_JOIN_FAILURE;
+ else {
+ rep->sync_state = SYNC_UPDATE;
+ if (abbrev) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "send UPDATE_REQ, merely to check for NIMDB refresh"));
+ F_SET(rep, REP_F_ABBREVIATED);
+ } else
+ F_CLR(rep, REP_F_ABBREVIATED);
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ ret = 0;
+ }
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ if (ret == 0 && master != DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+ return (ret);
+}
+
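+/*
+ * Application-side sketch (editor's illustration, not part of this
+ * file): disabling automatic internal init through the public API,
+ * after which the code above returns DB_REP_JOIN_FAILURE rather than
+ * starting one.  Assumes an open DB_ENV handle; error handling elided.
+ */
+#if 0
+	/* Turn off automatic internal initialization... */
+	(void)dbenv->rep_set_config(dbenv, DB_REP_CONF_AUTOINIT, 0);
+	/* ...and be prepared for DB_REP_JOIN_FAILURE from rep calls. */
+#endif
+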
+/*
+ * __rep_verify_fail --
+ * Handle a REP_VERIFY_FAIL message.
+ *
+ * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *));
+ */
+int
+__rep_verify_fail(env, rp)
+ ENV *env;
+ __rep_control_args *rp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int clnt_lock_held, lockout, master, ret;
+
+ clnt_lock_held = lockout = 0;
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we are already in the middle of updating (PAGE or UPDATE state),
+ * then we ignore this message.
+ */
+ if (rep->sync_state == SYNC_PAGE || rep->sync_state == SYNC_UPDATE)
+ return (0);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * Clean up old internal init in progress if:
+ * REP_C_AUTOINIT is configured and
+ * we are recovering LOG and this LSN is in the range we need.
+ */
+ if (rep->sync_state == SYNC_LOG &&
+ LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+ LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) {
+ /*
+ * Already locking out messages, give up.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG))
+ goto unlock;
+
+ /*
+ * Lock out other messages to prevent race conditions.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto unlock;
+ lockout = 1;
+
+ /*
+ * Clean up internal init if one was in progress.
+ */
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "VERIFY_FAIL is cleaning up old internal init for missing log"));
+ if ((ret =
+ __rep_init_cleanup(env, rep, DB_FORCE)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "VERIFY_FAIL error cleaning up internal init for missing log: %d", ret));
+ goto msglck;
+ }
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ lockout = 0;
+ }
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ clnt_lock_held = 1;
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Commence an internal init if:
+ * We are in VERIFY state and the failing LSN is the one we
+ * were verifying or
+ * we're recovering LOG and this LSN is in the range we need or
+ * we are in normal state (no recovery flags set) and
+ * the failing LSN is the one we're ready for.
+ *
+ * We don't want an old or delayed VERIFY_FAIL message to throw us
+ * into internal initialization when we shouldn't be.
+ */
+ if ((rep->sync_state == SYNC_VERIFY &&
+ LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
+ (rep->sync_state == SYNC_LOG &&
+ LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+ LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) ||
+ (rep->sync_state == SYNC_OFF &&
+ LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) {
+ /*
+ * Update stats.
+ */
+ STAT(rep->stat.st_outdated++);
+
+ /*
+ * If REP_C_AUTOINIT is turned off, return
+ * DB_REP_JOIN_FAILURE instead of doing internal init.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_AUTOINIT)) {
+ ret = DB_REP_JOIN_FAILURE;
+ goto unlock;
+ }
+
+ /*
+ * Do the internal init.
+ */
+ rep->sync_state = SYNC_UPDATE;
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ lp->wait_ts = rep->request_gap;
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (master != DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+ } else {
+ /*
+ * Otherwise ignore this message.
+ */
+msglck: if (lockout)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+unlock: REP_SYSTEM_UNLOCK(env);
+ if (clnt_lock_held)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_verify_req --
+ * Handle a REP_VERIFY_REQ message.
+ *
+ * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_verify_req(env, rp, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT *d, data_dbt;
+ DB_LOGC *logc;
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t type;
+ int old, ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ type = REP_VERIFY;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ d = &data_dbt;
+ memset(d, 0, sizeof(data_dbt));
+ F_SET(logc, DB_LOG_SILENT_ERR);
+ ret = __logc_get(logc, &rp->lsn, d, DB_SET);
+ /*
+	 * If the LSN was invalid, we might get a DB_NOTFOUND, we might
+	 * get an EIO, we could get anything.
+	 * If we get a DB_NOTFOUND, there is a chance that the LSN comes
+	 * before the first file present, in which case we need to send a
+	 * VERIFY_FAIL so that the client can perform an internal init or
+	 * return a DB_REP_JOIN_FAILURE.
+ *
+ * If we're a client servicing this request and we get a
+ * NOTFOUND, return it so the caller can rerequest from
+ * a better source.
+ */
+ if (ret == DB_NOTFOUND) {
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ (void)__logc_close(logc);
+ return (DB_NOTFOUND);
+ }
+ if (__log_is_outdated(env, rp->lsn.file, &old) == 0 &&
+ old != 0)
+ type = REP_VERIFY_FAIL;
+ }
+
+ if (ret != 0)
+ d = NULL;
+
+ (void)__rep_send_message(env, eid, type, &rp->lsn, d, 0, 0);
+ return (__logc_close(logc));
+}
+
+/*
+ * PUBLIC: int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__rep_dorecovery(env, lsnp, trunclsnp)
+ ENV *env;
+ DB_LSN *lsnp, *trunclsnp;
+{
+ DBT mylog;
+ DB_LOGC *logc;
+ DB_LSN last_ckp, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ int ret, rollback, skip_rec, t_ret, update;
+ u_int32_t rectype, opcode;
+ __txn_regop_args *txnrec;
+ __txn_regop_42_args *txn42rec;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Figure out if we are backing out any committed transactions. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ memset(&mylog, 0, sizeof(mylog));
+ if (rep->sync_state == SYNC_LOG) {
+ /*
+ * Internal init can never skip recovery.
+ * Internal init must always update the timestamp and
+ * force dead handles.
+ */
+ skip_rec = 0;
+ update = 1;
+ } else {
+ skip_rec = 1;
+ update = 0;
+ }
+ rollback = 0;
+ while (update == 0 &&
+ (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 &&
+ LOG_COMPARE(&lsn, lsnp) > 0) {
+ LOGCOPY_32(env, &rectype, mylog.data);
+ /*
+ * Find out if we can skip recovery completely. If we
+ * are backing up over any record a client usually
+ * cares about, we must run recovery.
+ *
+ * Skipping sync-up recovery can be pretty scary!
+ * Here's why we can do it:
+ * If a master downgraded to client and is now running
+ * sync-up to a new master, that old master must have
+ * waited for any outstanding txns to resolve before
+ * becoming a client. Also we are in lockout so there
+ * can be no other operations right now.
+ *
+ * If the client wrote a commit record to the log, but
+ * was descheduled before processing the txn, and then
+ * a new master was found, we must've let the txn get
+ * processed because right now we are the only message
+ * thread allowed to be running.
+ */
+ DB_ASSERT(env, rep->op_cnt == 0);
+ DB_ASSERT(env, rep->msg_th == 1);
+ if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
+ rectype == DB___dbreg_register)
+ skip_rec = 0;
+ if (rectype == DB___txn_regop) {
+ if (rep->version >= DB_REPVERSION_44) {
+ if ((ret = __txn_regop_read(
+ env, mylog.data, &txnrec)) != 0)
+ goto err;
+ opcode = txnrec->opcode;
+ __os_free(env, txnrec);
+ } else {
+ if ((ret = __txn_regop_42_read(
+ env, mylog.data, &txn42rec)) != 0)
+ goto err;
+ opcode = txn42rec->opcode;
+ __os_free(env, txn42rec);
+ }
+ if (opcode != TXN_ABORT) {
+ rollback = 1;
+ update = 1;
+ }
+ }
+ }
+ /*
+	 * Handle the case where the __logc_get loop above failed.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we successfully run recovery, we've opened all the necessary
+ * files. We are guaranteed to be single-threaded here, so no mutex
+ * is necessary.
+ */
+ if (skip_rec) {
+ if ((ret = __log_get_stable_lsn(env, &last_ckp, 0)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ ZERO_LSN(last_ckp);
+ }
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Skip sync-up rec. Truncate log to [%lu][%lu], ckp [%lu][%lu]",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)last_ckp.file, (u_long)last_ckp.offset));
+ ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp);
+ } else {
+ if (rollback && !FLD_ISSET(rep->config, REP_C_AUTOROLLBACK)) {
+ ret = DB_REP_WOULDROLLBACK;
+ goto err;
+ }
+ ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0);
+ }
+
+ if (ret != 0)
+ goto err;
+ F_SET(db_rep, DBREP_OPENFILES);
+
+ /*
+ * If we've just updated the env handle timestamp, then we would get
+ * HANDLE_DEAD next time we tried to use our LSN history database. So,
+ * close it here now, to save ourselves the trouble of worrying about it
+ * later.
+ */
+ if (update && db_rep->lsn_db != NULL) {
+ ret = __db_close(db_rep->lsn_db, NULL, DB_NOSYNC);
+ db_rep->lsn_db = NULL;
+ }
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_verify_match --
+ * We have just received a matching log record during verification.
+ * Figure out if we're going to need to run recovery. If so, wait until
+ * everything else has exited the library. If not, set up the world
+ * correctly and move forward.
+ *
+ * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
+ */
+int
+__rep_verify_match(env, reclsnp, savetime)
+ ENV *env;
+ DB_LSN *reclsnp;
+ time_t savetime;
+{
+ DB_LOG *dblp;
+ DB_LSN trunclsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int done, event, master, ret;
+ u_int32_t unused;
+
+ dblp = env->lg_handle;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ event = 0;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+	 * Check if the savetime is different from our current timestamp.
+ * If it is, then we're racing with another thread trying to recover
+ * and we lost. We must give up.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ done = savetime != renv->rep_timestamp;
+ if (done) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (0);
+ }
+ ZERO_LSN(lp->verify_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ /*
+ * Make sure the world hasn't changed while we tried to get
+ * the lock. If it hasn't then it's time for us to kick all
+ * operations out of DB and run recovery.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG) ||
+ (rep->sync_state != SYNC_LOG &&
+ ISSET_LOCKOUT_BDB(rep))) {
+ /*
+ * We lost. The world changed and we should do nothing.
+ */
+ STAT(rep->stat.st_msgs_recover++);
+ goto errunlock;
+ }
+
+ /*
+ * Lockout all message threads but ourselves.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto errunlock;
+
+ /*
+ * Lockout the API and wait for operations to complete.
+ */
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto errunlock;
+
+ /* OK, everyone is out, we can now run recovery. */
+ REP_SYSTEM_UNLOCK(env);
+
+ if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 ||
+ (ret = __rep_remove_init_file(env)) != 0) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags,
+ REP_LOCKOUT_API | REP_LOCKOUT_MSG | REP_LOCKOUT_OP);
+ goto errunlock;
+ }
+
+ /*
+	 * The log has been truncated (either directly by us or by __db_apprec).
+ * We want to make sure we're waiting for the LSN at the new end-of-log,
+ * not some later point.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->ready_lsn = trunclsn;
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ lp->max_perm_lsn = *reclsnp;
+ lp->wait_ts = rep->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+
+ /*
+ * Discard any log records we have queued; we're about to re-request
+ * them, and can't trust the ones in the queue. We need to set the
+ * DB_AM_RECOVER bit in this handle, so that the operation doesn't
+ * deadlock.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+
+ F_SET(db_rep->rep_db, DB_AM_RECOVER);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+
+ REP_SYSTEM_LOCK(env);
+ STAT(rep->stat.st_log_queued = 0);
+ if (IN_INTERNAL_INIT(rep))
+ event = 1;
+ CLR_RECOVERY_SETTINGS(rep);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE | REP_LOCKOUT_MSG);
+ if (ret != 0)
+ goto errunlock2;
+
+ /*
+ * If the master_id is invalid, this means that since
+ * the last record was sent, something happened to the
+ * master and we may not have a master to request
+ * things of.
+ *
+ * This is not an error; when we find a new master,
+ * we'll re-negotiate where the end of the log is and
+ * try to bring ourselves up to date again anyway.
+ */
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ if (master == DB_EID_INVALID) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = 0;
+ } else {
+ /*
+ * We're making an ALL_REQ. But now that we've
+ * cleared the flags, we're likely receiving new
+ * log records from the master, resulting in a gap
+ * immediately. So to avoid multiple data streams,
+ * set the wait_ts value high now to give the master
+ * a chance to start sending us these records before
+ * the gap code re-requests the same gap. Wait_recs
+ * will get reset once we start receiving these
+ * records.
+ */
+ lp->wait_ts = rep->max_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ (void)__rep_send_message(env,
+ master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE);
+ }
+ if (event)
+ __rep_fire_event(env, DB_EVENT_REP_INIT_DONE, NULL);
+ if (0) {
+errunlock2: MUTEX_UNLOCK(env, rep->mtx_clientdb);
+errunlock: REP_SYSTEM_UNLOCK(env);
+ }
+out: return (ret);
+}
diff --git a/src/repmgr/repmgr.msg b/src/repmgr/repmgr.msg
new file mode 100644
index 00000000..020f2e9c
--- /dev/null
+++ b/src/repmgr/repmgr.msg
@@ -0,0 +1,119 @@
+PREFIX __repmgr
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_swap.h"
+INCLUDE
+
+BEGIN_MSG handshake
+ARG port u_int16_t
+ARG alignment u_int16_t
+ARG ack_policy u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG v3handshake
+ARG port u_int16_t
+ARG priority u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG v2handshake
+ARG port u_int16_t
+ARG priority u_int32_t
+END
+
+BEGIN_MSG parm_refresh
+ARG ack_policy u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG permlsn
+ARG generation u_int32_t
+ARG lsn DB_LSN
+END
+
+BEGIN_MSG version_proposal
+ARG min u_int32_t
+ARG max u_int32_t
+END
+
+BEGIN_MSG version_confirmation
+ARG version u_int32_t
+END
+
+BEGIN_MSG msg_hdr
+ARG type u_int8_t
+ARG word1 u_int32_t
+ARG word2 u_int32_t
+END
+
+/* Metadata that goes along with a user message on a DB_CHANNEL. */
+BEGIN_MSG msg_metadata
+ARG tag u_int32_t
+ARG limit u_int32_t
+ARG flags u_int32_t
+END
+
+/*
+ * The membership database has a record for each site in the group, plus one
+ * extra meta-data record. The key of the meta-data record has a zero-length
+ * host, and a port value of 0.
+ */
+BEGIN_MSG membership_key check_length
+ARG host DBT
+ARG port u_int16_t
+END
+
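+/*
+ * Editor's illustration (not a .msg directive): the meta-data record's
+ * key described above -- zero-length host, port 0 -- could be built
+ * with the generated marshal routine, roughly:
+ *
+ *	__repmgr_membership_key_args key;
+ *	u_int8_t buf[__REPMGR_MEMBERSHIP_KEY_SIZE];
+ *	size_t len;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	(void)__repmgr_membership_key_marshal(env, &key, buf,
+ *	    sizeof(buf), &len);
+ */
+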
+BEGIN_MSG membership_data
+ARG flags u_int32_t
+END
+
+/*
+ * The "format" identifies the content and layout of the records within the
+ * membership database (i.e., some of the items defined here in this *.msg
+ * file). It should be incremented when the layouts change in future Berkeley
+ * DB releases. The "version" counts group changes that the application makes
+ * by adding or removing sites; thus it varies dynamically throughout the
+ * life of a group, during a single release of Berkeley DB.
+ */
+BEGIN_MSG member_metadata
+ARG format u_int32_t
+ARG version u_int32_t
+END
+
+/*
+ * When a new site wants to join a group, it "guesses" that the configured
+ * "helper" site is the master, and sends the request there. When that guess
+ * is wrong, the helper site responds with the location of the current master,
+ * in effect "forwarding" the request.
+ */
+BEGIN_MSG gm_fwd check_length
+ARG host DBT
+ARG port u_int16_t
+ARG gen u_int32_t
+END
+
+/* Membership list version header: */
+BEGIN_MSG membr_vers
+ARG version u_int32_t
+ARG gen u_int32_t
+END
+BEGIN_MSG site_info check_length
+ARG host DBT
+ARG port u_int16_t
+ARG flags u_int32_t
+END
+
+/*
+ * If site A breaks or rejects a connection from site B, it first
+ * tries to send B this message containing site A's currently known
+ * membership DB version. Site B can use this to decide what to do.
+ * If site B knows of a later version, it should retry the connection
+ * to site A later, polling until site A catches up. However, if
+ * site B's known version is less, it means that site B is no longer in
+ * the group, and so instead it should shut down and notify the application.
+ */
+BEGIN_MSG connect_reject
+ARG version u_int32_t
+ARG gen u_int32_t
+END
diff --git a/src/repmgr/repmgr.src b/src/repmgr/repmgr.src
new file mode 100644
index 00000000..68d8c239
--- /dev/null
+++ b/src/repmgr/repmgr.src
@@ -0,0 +1,23 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+DBPRIVATE
+PREFIX __repmgr
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc_auto/repmgr_auto.h"
+INCLUDE
+
+BEGIN member 52 200
+ARG version u_int32_t lu
+ARG prev_status u_int32_t lu
+ARG status u_int32_t lu
+DBT host DBT s
+ARG port u_int32_t lu
+END
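+
+/*
+ * Editor's note (illustrative): each ARG line above reads "name type
+ * printf-format"; gen_rec.awk expands this description into the
+ * __repmgr_member_desc recovery spec and the log-record plumbing in
+ * repmgr_auto.c below.
+ */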
diff --git a/src/repmgr/repmgr_auto.c b/src/repmgr/repmgr_auto.c
new file mode 100644
index 00000000..19eb24d4
--- /dev/null
+++ b/src/repmgr/repmgr_auto.c
@@ -0,0 +1,32 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+DB_LOG_RECSPEC __repmgr_member_desc[] = {
+ {LOGREC_ARG, SSZ(__repmgr_member_args, version), "version", "%lu"},
+ {LOGREC_ARG, SSZ(__repmgr_member_args, prev_status), "prev_status", "%lu"},
+ {LOGREC_ARG, SSZ(__repmgr_member_args, status), "status", "%lu"},
+ {LOGREC_DBT, SSZ(__repmgr_member_args, host), "host", ""},
+ {LOGREC_ARG, SSZ(__repmgr_member_args, port), "port", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__repmgr_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __repmgr_member_recover, DB___repmgr_member)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/repmgr/repmgr_automsg.c b/src/repmgr/repmgr_automsg.c
new file mode 100644
index 00000000..90af08ff
--- /dev/null
+++ b/src/repmgr/repmgr_automsg.c
@@ -0,0 +1,757 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_swap.h"
+
+/*
+ * PUBLIC: void __repmgr_handshake_marshal __P((ENV *,
+ * PUBLIC: __repmgr_handshake_args *, u_int8_t *));
+ */
+void
+__repmgr_handshake_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_handshake_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONS_COPYOUT(env, bp, argp->alignment);
+ DB_HTONL_COPYOUT(env, bp, argp->ack_policy);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_handshake_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_handshake_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_handshake_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_handshake_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_HANDSHAKE_SIZE)
+ goto too_few;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHS_COPYIN(env, argp->alignment, bp);
+ DB_NTOHL_COPYIN(env, argp->ack_policy, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_handshake message"));
+ return (EINVAL);
+}
+
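+/*
+ * Editor's round-trip sketch (not generated by gen_msg.awk): marshal a
+ * handshake into a stack buffer sized by the same macro the unmarshal
+ * routine checks, then decode it back.  Field values are arbitrary.
+ */
+#if 0
+	__repmgr_handshake_args in, out;
+	u_int8_t buf[__REPMGR_HANDSHAKE_SIZE];
+
+	out.port = 6000;
+	out.alignment = 0;
+	out.ack_policy = 0;
+	out.flags = 0;
+	__repmgr_handshake_marshal(env, &out, buf);
+	if (__repmgr_handshake_unmarshal(env, &in, buf,
+	    sizeof(buf), NULL) == 0)
+		DB_ASSERT(env, in.port == out.port);
+#endif
+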
+/*
+ * PUBLIC: void __repmgr_v3handshake_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v3handshake_args *, u_int8_t *));
+ */
+void
+__repmgr_v3handshake_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_v3handshake_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_v3handshake_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v3handshake_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_v3handshake_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v3handshake_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_V3HANDSHAKE_SIZE)
+ goto too_few;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v3handshake message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_v2handshake_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v2handshake_args *, u_int8_t *));
+ */
+void
+__repmgr_v2handshake_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_v2handshake_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+}
+
+/*
+ * PUBLIC: int __repmgr_v2handshake_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v2handshake_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_v2handshake_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v2handshake_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_V2HANDSHAKE_SIZE)
+ goto too_few;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v2handshake message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_parm_refresh_marshal __P((ENV *,
+ * PUBLIC: __repmgr_parm_refresh_args *, u_int8_t *));
+ */
+void
+__repmgr_parm_refresh_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_parm_refresh_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->ack_policy);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_parm_refresh_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_parm_refresh_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_parm_refresh_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_parm_refresh_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_PARM_REFRESH_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->ack_policy, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_parm_refresh message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_permlsn_marshal __P((ENV *,
+ * PUBLIC: __repmgr_permlsn_args *, u_int8_t *));
+ */
+void
+__repmgr_permlsn_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_permlsn_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->generation);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+}
+
+/*
+ * PUBLIC: int __repmgr_permlsn_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_permlsn_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_permlsn_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_permlsn_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_PERMLSN_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->generation, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_permlsn message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_version_proposal_marshal __P((ENV *,
+ * PUBLIC: __repmgr_version_proposal_args *, u_int8_t *));
+ */
+void
+__repmgr_version_proposal_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_version_proposal_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->min);
+ DB_HTONL_COPYOUT(env, bp, argp->max);
+}
+
+/*
+ * PUBLIC: int __repmgr_version_proposal_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_version_proposal_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_version_proposal_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_version_proposal_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_VERSION_PROPOSAL_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->min, bp);
+ DB_NTOHL_COPYIN(env, argp->max, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_version_proposal message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_version_confirmation_marshal __P((ENV *,
+ * PUBLIC: __repmgr_version_confirmation_args *, u_int8_t *));
+ */
+void
+__repmgr_version_confirmation_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_version_confirmation_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+}
+
+/*
+ * PUBLIC: int __repmgr_version_confirmation_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_version_confirmation_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_version_confirmation_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_version_confirmation_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_VERSION_CONFIRMATION_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_version_confirmation message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_msg_hdr_marshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_hdr_args *, u_int8_t *));
+ */
+void
+__repmgr_msg_hdr_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_msg_hdr_args *argp;
+ u_int8_t *bp;
+{
+ *bp++ = argp->type;
+ DB_HTONL_COPYOUT(env, bp, argp->word1);
+ DB_HTONL_COPYOUT(env, bp, argp->word2);
+}
+
+/*
+ * PUBLIC: int __repmgr_msg_hdr_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_hdr_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_msg_hdr_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_msg_hdr_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MSG_HDR_SIZE)
+ goto too_few;
+ argp->type = *bp++;
+ DB_NTOHL_COPYIN(env, argp->word1, bp);
+ DB_NTOHL_COPYIN(env, argp->word2, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_msg_hdr message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_msg_metadata_marshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_metadata_args *, u_int8_t *));
+ */
+void
+__repmgr_msg_metadata_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_msg_metadata_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->tag);
+ DB_HTONL_COPYOUT(env, bp, argp->limit);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_msg_metadata_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_metadata_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_msg_metadata_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_msg_metadata_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MSG_METADATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->tag, bp);
+ DB_NTOHL_COPYIN(env, argp->limit, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_msg_metadata message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __repmgr_membership_key_marshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_key_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_membership_key_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_membership_key_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_MEMBERSHIP_KEY_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
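+/*
+ * Editor's note on the wire layout produced above (illustrative):
+ *
+ *	+------------------+------------------+-----------+
+ *	| host.size        | host.data        | port      |
+ *	| (4B, net order)  | (host.size B)    | (2B, net) |
+ *	+------------------+------------------+-----------+
+ */
+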
+/*
+ * PUBLIC: int __repmgr_membership_key_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_key_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_membership_key_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_membership_key_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_MEMBERSHIP_KEY_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_membership_key message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_membership_data_marshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_data_args *, u_int8_t *));
+ */
+void
+__repmgr_membership_data_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_membership_data_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_membership_data_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_data_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_membership_data_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_membership_data_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MEMBERSHIP_DATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_membership_data message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_member_metadata_marshal __P((ENV *,
+ * PUBLIC: __repmgr_member_metadata_args *, u_int8_t *));
+ */
+void
+__repmgr_member_metadata_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_member_metadata_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->format);
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+}
+
+/*
+ * PUBLIC: int __repmgr_member_metadata_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_member_metadata_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_member_metadata_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_member_metadata_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MEMBER_METADATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->format, bp);
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_member_metadata message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __repmgr_gm_fwd_marshal __P((ENV *,
+ * PUBLIC: __repmgr_gm_fwd_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_gm_fwd_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_gm_fwd_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_GM_FWD_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_gm_fwd_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_gm_fwd_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_gm_fwd_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_gm_fwd_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_GM_FWD_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_gm_fwd message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_membr_vers_marshal __P((ENV *,
+ * PUBLIC: __repmgr_membr_vers_args *, u_int8_t *));
+ */
+void
+__repmgr_membr_vers_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_membr_vers_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __repmgr_membr_vers_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_membr_vers_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_membr_vers_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_membr_vers_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MEMBR_VERS_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_membr_vers message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_info_marshal __P((ENV *,
+ * PUBLIC: __repmgr_site_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_site_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_site_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_SITE_INFO_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_info_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_site_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_site_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_site_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_SITE_INFO_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_site_info message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_connect_reject_marshal __P((ENV *,
+ * PUBLIC: __repmgr_connect_reject_args *, u_int8_t *));
+ */
+void
+__repmgr_connect_reject_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_connect_reject_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __repmgr_connect_reject_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_connect_reject_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_connect_reject_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_connect_reject_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_CONNECT_REJECT_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_connect_reject message"));
+ return (EINVAL);
+}
+
diff --git a/src/repmgr/repmgr_autop.c b/src/repmgr/repmgr_autop.c
new file mode 100644
index 00000000..8d7c1974
--- /dev/null
+++ b/src/repmgr/repmgr_autop.c
@@ -0,0 +1,44 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_REPLICATION_THREADS
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+/*
+ * PUBLIC: int __repmgr_member_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__repmgr_member_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__repmgr_member", __repmgr_member_desc, info));
+}
+
+/*
+ * PUBLIC: int __repmgr_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__repmgr_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __repmgr_member_print, DB___repmgr_member)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_REPLICATION_THREADS */
diff --git a/src/repmgr/repmgr_elect.c b/src/repmgr/repmgr_elect.c
new file mode 100644
index 00000000..3a84694a
--- /dev/null
+++ b/src/repmgr/repmgr_elect.c
@@ -0,0 +1,585 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static db_timeout_t __repmgr_compute_response_time __P((ENV *));
+static int __repmgr_elect __P((ENV *, u_int32_t, db_timespec *));
+static int __repmgr_elect_main __P((ENV *, REPMGR_RUNNABLE *));
+static void *__repmgr_elect_thread __P((void *));
+static int send_membership __P((ENV *));
+
+/*
+ * __repmgr_init_election --
+ *	Starts an election thread.
+ *
+ * PUBLIC: int __repmgr_init_election __P((ENV *, u_int32_t));
+ *
+ * !!!
+ * Caller must hold mutex.
+ */
+int
+__repmgr_init_election(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *th;
+ int ret;
+ u_int i, new_size;
+
+ COMPQUIET(th, NULL);
+
+ db_rep = env->rep_handle;
+ if (db_rep->repmgr_status == stopped) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ignoring elect thread request %#lx; repmgr is stopped",
+ (u_long)flags));
+ return (0);
+ }
+
+ /* Find an available slot, indexed by 'i'; allocate more if needed. */
+ for (i = 0; i < db_rep->aelect_threads; i++) {
+ th = db_rep->elect_threads[i];
+ if (th == NULL)
+ break;
+ if (th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ /* Reuse the space in a moment. */
+ break;
+ }
+ }
+ if (i == db_rep->aelect_threads) {
+ new_size = db_rep->aelect_threads + 1;
+ if ((ret = __os_realloc(env,
+ sizeof(REPMGR_RUNNABLE*) * new_size,
+ &db_rep->elect_threads)) != 0)
+ return (ret);
+ db_rep->aelect_threads = new_size;
+ STAT(db_rep->region->mstat.st_max_elect_threads = new_size);
+ th = db_rep->elect_threads[i] = NULL;
+ }
+
+ if (th == NULL &&
+ (ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &th)) != 0)
+ return (ret);
+ th->run = __repmgr_elect_thread;
+ th->args.flags = flags;
+
+ if ((ret = __repmgr_thread_start(env, th)) == 0)
+ STAT(db_rep->region->mstat.st_elect_threads++);
+ else {
+ __os_free(env, th);
+ th = NULL;
+ }
+ db_rep->elect_threads[i] = th;
+
+ return (ret);
+}
+
+static void *
+__repmgr_elect_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *th;
+ ENV *env;
+ int ret;
+
+ th = argsp;
+ env = th->env;
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting election thread"));
+
+ if ((ret = __repmgr_elect_main(env, th)) != 0) {
+ __db_err(env, ret, "election thread failed");
+ (void)__repmgr_thread_failure(env, ret);
+ }
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "election thread is exiting"));
+ th->finished = TRUE;
+ return (NULL);
+}
+
+static int
+__repmgr_elect_main(env, th)
+ ENV *env;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REP *rep;
+#ifdef DB_WIN32
+ DWORD duration;
+ db_timeout_t t;
+#else
+ struct timespec deadline;
+#endif
+ db_timespec failtime, now, repstart_time, target, wait_til;
+ db_timeout_t delay_time, response_time, tmp_time;
+ u_long sec, usec;
+ u_int32_t flags;
+ int done_repstart, ret, suppress_election;
+ enum { ELECTION, REPSTART } action;
+
+ COMPQUIET(action, ELECTION);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ flags = th->args.flags;
+
+ if (LF_ISSET(ELECT_F_EVENT_NOTIFY))
+ DB_EVENT(env, DB_EVENT_REP_MASTER_FAILURE, NULL);
+
+ /*
+	 * If leases are enabled, delay the election to allow processing of
+	 * any straggler messages that might grant our lease again and fool
+	 * the base code into thinking the master is still there. Any delay
+	 * here offsets the time the election code will wait for a lease
+	 * grant to expire, so with leases we're not adding more delay.
+ */
+ if (FLD_ISSET(db_rep->region->config, REP_C_LEASE)) {
+ /*
+ * Use the smallest of the lease timeout, ack timeout,
+ * or connection retry timeout. We want to give straggler
+ * messages a chance to get processed, but get an election
+ * underway as soon as possible to find a master.
+ */
+ if ((ret = __rep_get_timeout(env->dbenv,
+ DB_REP_LEASE_TIMEOUT, &delay_time)) != 0)
+ goto out;
+ if ((ret = __rep_get_timeout(env->dbenv,
+ DB_REP_ACK_TIMEOUT, &tmp_time)) != 0)
+ goto out;
+ if (tmp_time < delay_time)
+ delay_time = tmp_time;
+ if ((ret = __rep_get_timeout(env->dbenv,
+ DB_REP_CONNECTION_RETRY, &tmp_time)) != 0)
+ goto out;
+ if (tmp_time < delay_time)
+ delay_time = tmp_time;
+ sec = delay_time / US_PER_SEC;
+ usec = delay_time % US_PER_SEC;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Election with leases pause sec %lu, usec %lu", sec, usec));
+ __os_yield(env, sec, usec);
+ }
+
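+	/*
+	 * Worked illustration (editor's note): with a lease timeout of
+	 * 3 seconds, an ack timeout of 1 second and a connection retry
+	 * of 5 seconds, the minimum computed above is 1 second, so the
+	 * pause is sec=1, usec=0.
+	 */
+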
+ /*
+ * As a freshly started thread, lay claim to the title of being
+ * "preferred". If an older thread is sleeping for retry, when it wakes
+ * up it will relinquish its role (since there's no need for multiple
+ * threads to sleep and retry).
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->preferred_elect_thr = th;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * The 'done_repstart' flag keeps track of which was our most recent
+ * operation (repstart or election), so that we can alternate
+ * appropriately. There are a few different ways this thread can be
+	 * invoked, and all but one call for some form of immediate
+	 * election. The one exception is at initial start-up, where we
+ * first probe for a master by sending out rep_start(CLIENT) calls.
+ */
+ if (LF_ISSET(ELECT_F_IMMED)) {
+ /*
+ * When the election succeeds, we've successfully completed
+ * everything we need to do. If it fails in an unexpected way,
+ * we abort all processing as usual. The only time we need to
+ * stay in here and do some more work is on DB_REP_UNAVAIL,
+ * in which case we want to wait a while and retry later.
+ */
+ if ((ret = __repmgr_elect(env, flags, &failtime)) ==
+ DB_REP_UNAVAIL)
+ done_repstart = FALSE;
+ else
+ goto out;
+ } else {
+ /*
+ * We didn't really have an election failure, because in this
+ * case we haven't even done an election yet. But the timing
+ * we want turns out the same: we want to wait for the election
+ * retry time and then call for an election if nothing else
+ * interesting happens before then.
+ */
+ __os_gettime(env, &failtime, 1);
+
+ /*
+ * Although we didn't do a repstart in this thread, we know that
+ * our caller did one just before creating the thread.
+ */
+ done_repstart = TRUE;
+ }
+
+ LOCK_MUTEX(db_rep->mutex);
+ for (;;) {
+ ret = 0;
+
+ if (db_rep->repmgr_status == stopped)
+ goto unlock;
+
+ /*
+ * If we've become the master (which could happen after an
+ * election in another election thread), or we find we have a
+ * working connection to a known master, then we're quite
+ * content: that's really the essential purpose of this whole
+ * thread.
+ */
+ if (__repmgr_master_is_known(env))
+ goto unlock;
+
+ /*
+ * When circumstances force us to do an immediate election, we
+ * may be forced to create multiple threads in order to do so.
+ * But we certainly don't need multiple threads sleeping,
+ * alternating and retrying. The "preferred election thread" is
+ * the one that has the authority and responsibility to
+ * persevere until our work is done. Note that this role can
+ * switch from one thread to another, depending on the timing of
+ * events. In particular, when an election fails the thread
+ * that got the failure becomes the chosen one that will remain
+ * to avenge the failure.
+ */
+ if (db_rep->preferred_elect_thr != th)
+ goto unlock;
+
+ timespecclear(&wait_til);
+ __os_gettime(env, &now, 1);
+
+ /*
+ * See if it's time to retry the operation. Normally it's an
+ * election we're interested in retrying. But we refrain from
+ * calling for elections if so configured.
+ */
+ suppress_election = LF_ISSET(ELECT_F_STARTUP) ?
+ db_rep->init_policy == DB_REP_CLIENT :
+ !FLD_ISSET(rep->config, REP_C_ELECTIONS);
+ repstart_time = db_rep->repstart_time;
+ target = suppress_election ? repstart_time : failtime;
+ TIMESPEC_ADD_DB_TIMEOUT(&target, rep->election_retry_wait);
+ if (timespeccmp(&now, &target, >=)) {
+ /*
+ * We've surpassed our target retry time.
+ * However, elections should generally alternate with
+ * rep_start calls, so do that if we haven't done one
+ * since the last election.
+ */
+ action = suppress_election ? REPSTART :
+ (done_repstart ? ELECTION : REPSTART);
+
+ } else if (db_rep->new_connection) {
+ /* Seen a recent new connection, let's do rep_start. */
+ action = REPSTART;
+ } else
+ wait_til = target;
+
+ if (!timespecisset(&wait_til)) {
+ response_time = __repmgr_compute_response_time(env);
+ target = repstart_time;
+ TIMESPEC_ADD_DB_TIMEOUT(&target, response_time);
+ if (timespeccmp(&now, &target, <)) {
+ /* We haven't waited long enough. */
+ wait_til = target;
+ }
+ }
+
+ if (timespecisset(&wait_til)) {
+#ifdef DB_WIN32
+ timespecsub(&wait_til, &now);
+ DB_TIMESPEC_TO_TIMEOUT(t, &wait_til, TRUE);
+ duration = t / US_PER_MS;
+ if ((ret = SignalObjectAndWait(*db_rep->mutex,
+ db_rep->check_election, duration, FALSE)) !=
+ WAIT_OBJECT_0 && ret != WAIT_TIMEOUT)
+ goto out;
+
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * Although there could be multiple threads, only the
+ * "preferred" thread resets the event object. If the
+ * others tried to do so, the preferred thread might
+ * miss the wake-up. Another way of saying this is that
+ * the precise meaning of the check_election event is
+ * that "there may be some election-thread-related work
+ * to do, and the correct thread to do it has not yet
+ * been woken up".
+ */
+ if (ret == WAIT_OBJECT_0 &&
+ db_rep->preferred_elect_thr == th &&
+ !ResetEvent(db_rep->check_election)) {
+ ret = GetLastError();
+ goto unlock;
+ }
+#else
+ deadline.tv_sec = wait_til.tv_sec;
+ deadline.tv_nsec = wait_til.tv_nsec;
+ if ((ret = pthread_cond_timedwait(
+ &db_rep->check_election, db_rep->mutex, &deadline))
+ != ETIMEDOUT && ret != 0)
+ goto unlock;
+#endif
+ continue;
+ }
+
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (action == ELECTION) {
+ db_rep->new_connection = FALSE;
+ if ((ret = __repmgr_elect(env, 0, &failtime)) ==
+ DB_REP_UNAVAIL)
+ done_repstart = FALSE;
+ else
+ goto out;
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->preferred_elect_thr = th;
+ } else {
+ DB_ASSERT(env, action == REPSTART);
+
+ db_rep->new_connection = FALSE;
+ if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0)
+ goto out;
+ done_repstart = TRUE;
+
+ LOCK_MUTEX(db_rep->mutex);
+ __os_gettime(env, &db_rep->repstart_time, 1);
+ }
+ }
+
+#ifdef HAVE_STATISTICS
+ /*
+ * We normally don't bother taking a mutex to increment statistics. But
+ * in this case, since we're incrementing and decrementing in pairs, it
+ * could be very weird if we were "off somewhat". For example, we could
+ * get a negative value. And this is not a high-traffic, performance-
+ * critical path.
+ * On the other hand, it suffices to take repmgr's (handle-based)
+ * mutex, rather than the rep mutex which normally protects shared
+ * memory, since all election thread activity must be occurring in the
+ * single listener process, under control of one single rep handle.
+ */
+out:
+ LOCK_MUTEX(db_rep->mutex);
+unlock:
+ rep->mstat.st_elect_threads--;
+ UNLOCK_MUTEX(db_rep->mutex);
+#else
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+out:
+#endif
+ return (ret);
+}
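+
+/*
+ * A worked example of the retry arithmetic above, with made-up numbers:
+ * suppose election_retry_wait is 10 seconds and the last election failed
+ * at time 100.  Until time 110 the preferred election thread simply
+ * sleeps (wait_til = target).  At time 110 it acts: if it has done a
+ * rep_start call since the last election (done_repstart), it calls for
+ * an election; otherwise it alternates back to another rep_start call.
+ * A recently seen new connection short-circuits the wait and forces an
+ * immediate rep_start, subject to the "response time" damper computed
+ * below.
+ */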
+
+static db_timeout_t
+__repmgr_compute_response_time(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ db_timeout_t ato, eto;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Avoid crowding operations too close together. If we've just recently
+ * done a rep_start, wait a moment in case there's a master out there,
+ * to give it a chance to respond with a NEWMASTER message. This is
+ * particularly an issue at start-up time, when we're likely to have
+ * several "new connection establishment" events bombarding us with lots
+ * of rep_start requests in quick succession.
+ *
+ * We don't have a separate user configuration for rep_start response,
+ * but it's reasonable to expect it to be similar to either the ack
+ * timeout or the election timeout, whichever is smaller. However, only
+ * consider the ack timeout if all signs point to it being in use.
+ */
+ ato = rep->ack_timeout;
+ eto = rep->elect_timeout;
+ if (ato > 0 &&
+ rep->perm_policy != DB_REPMGR_ACKS_NONE &&
+ rep->priority > 0 &&
+ ato < eto)
+ return (ato);
+
+ return (eto);
+}
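+
+/*
+ * For example (illustrative values only): with an ack timeout of 1
+ * second, an election timeout of 2 seconds, a non-zero priority and a
+ * perm policy other than DB_REPMGR_ACKS_NONE, the response time is 1
+ * second; if acks are not in use, or the ack timeout is not the smaller
+ * of the two, we use the election timeout instead.
+ */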
+
+static int
+__repmgr_elect(env, flags, failtimep)
+ ENV *env;
+ u_int32_t flags;
+ db_timespec *failtimep;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t invitation, nsites, nvotes;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ nsites = db_rep->region->config_nsites;
+ DB_ASSERT(env, nsites > 0);
+
+ /*
+ * With only 2 sites in the group, even a single failure could make it
+ * impossible to get a majority. So, fudge a little, unless the user
+ * really wants strict safety.
+ */
+ if (nsites == 2 &&
+ !FLD_ISSET(db_rep->region->config, REP_C_2SITE_STRICT))
+ nvotes = 1;
+ else
+ nvotes = ELECTION_MAJORITY(nsites);
+
+ if (LF_ISSET(ELECT_F_INVITEE)) {
+ /*
+ * We're going to the election party because we were invited by
+ * another site. Accept the other site's suggested value, if
+ * it's reasonable. (I.e., the other site may have wanted to do
+ * a "fast" election after losing contact with the master. If
+ * so, let's not spoil it by imposing our own full nsites count
+ * on it.)
+ */
+ rep = db_rep->region;
+ invitation = rep->nsites;
+ if (invitation == nsites || invitation == nsites - 1) {
+ nsites = invitation;
+ }
+ }
+ if (LF_ISSET(ELECT_F_FAST) && nsites > nvotes) {
+ /*
+ * If we're doing an election because we noticed that the master
+ * failed, it's reasonable to expect that the master won't
+ * participate. By not waiting for its vote, we can probably
+ * complete the election faster. But note that we shouldn't
+ * allow this to affect nvotes calculation.
+ *
+ * However, if we have 2 sites, and strict majority is turned
+ * on, now nvotes would be 2, and it doesn't make sense to
+ * rep_elect to see nsites of 1 in that case. So only decrement
+ * nsites if it currently exceeds nvotes.
+ */
+ nsites--;
+ }
+ /* The rule for leases overrides all of the above. */
+ if (IS_USING_LEASES(env))
+ nsites = 0;
+
+ switch (ret = __rep_elect_int(env, nsites, nvotes, 0)) {
+ case DB_REP_UNAVAIL:
+ __os_gettime(env, failtimep, 1);
+ DB_EVENT(env, DB_EVENT_REP_ELECTION_FAILED, NULL);
+ if ((t_ret = send_membership(env)) != 0)
+ ret = t_ret;
+ break;
+
+ case 0:
+ if (db_rep->takeover_pending)
+ ret = __repmgr_claim_victory(env);
+ break;
+
+ case DB_REP_IGNORE:
+ ret = 0;
+ break;
+
+ default:
+ __db_err(env, ret, DB_STR("3629",
+ "unexpected election failure"));
+ break;
+ }
+ return (ret);
+}
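+
+/*
+ * To make the nsites/nvotes arithmetic above concrete (illustrative
+ * numbers only): in a 5-site group, nvotes = ELECTION_MAJORITY(5) = 3.
+ * For a "fast" election after losing the master, nsites drops to 4 but
+ * nvotes stays 3, so the election can finish without the dead master's
+ * vote.  In a 2-site group without REP_C_2SITE_STRICT, nvotes is 1, so
+ * the surviving site can elect itself.  When leases are in use, nsites
+ * is passed as 0 and the lease machinery governs instead.
+ */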
+
+/*
+ * If an election fails with DB_REP_UNAVAIL, it could be because a participating
+ * site has an obsolete, too-high notion of the group size. (This could happen
+ * if the site was down/disconnected during removal of some (other) sites.) To
+ * remedy this, broadcast a current copy of the membership list. Since all
+ * sites are doing this, and we always ratchet to the most up-to-date version,
+ * this should bring all sites up to date. We only do this after a failure,
+ * during what will normally be an idle period anyway, so that we don't slow
+ * down a first election following the loss of an active master.
+ */
+static int
+send_membership(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ u_int8_t *buf;
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ buf = NULL;
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) != 0)
+ goto out;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Broadcast latest membership list"));
+ ret = __repmgr_bcast_own_msg(env, REPMGR_SHARING, buf, len);
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
+
+/*
+ * Becomes master after we've won an election, if we can.
+ *
+ * PUBLIC: int __repmgr_claim_victory __P((ENV *));
+ */
+int
+__repmgr_claim_victory(env)
+ ENV *env;
+{
+ int ret;
+
+ env->rep_handle->takeover_pending = FALSE;
+ if ((ret = __repmgr_become_master(env)) == DB_REP_UNAVAIL) {
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Won election but lost race with DUPMASTER client intent"));
+ }
+ return (ret);
+}
+
+/*
+ * When turning on elections in an already-running system, check to see if we're
+ * in a state where we need an election (i.e., we would have started one
+ * previously if elections hadn't been turned off), and if so start one.
+ *
+ * PUBLIC: int __repmgr_turn_on_elections __P((ENV *));
+ */
+int
+__repmgr_turn_on_elections(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ DB_ASSERT(env, REP_ON(env));
+ LOCK_MUTEX(db_rep->mutex);
+ if (db_rep->selector == NULL ||
+ !FLD_ISSET(rep->config, REP_C_ELECTIONS) ||
+ __repmgr_master_is_known(env))
+ goto out;
+
+ ret = __repmgr_init_election(env, ELECT_F_IMMED);
+
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_method.c b/src/repmgr/repmgr_method.c
new file mode 100644
index 00000000..229cf650
--- /dev/null
+++ b/src/repmgr/repmgr_method.c
@@ -0,0 +1,3092 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
+/* Context for an API thread waiting for response to a synchronous request. */
+struct response_wait {
+ REPMGR_CONNECTION *conn;
+ u_int32_t index;
+};
+
+static int addr_chk __P((const ENV *, const char *, u_int));
+static void adjust_bulk_response __P((ENV *, DBT *));
+static int bad_callback_method __P((DB_CHANNEL *, const char *));
+static void copy_body __P((u_int8_t *, REPMGR_IOVECS *));
+static int get_shared_netaddr __P((ENV *, int, repmgr_netaddr_t *));
+static int establish_connection __P((ENV *, int, REPMGR_CONNECTION **));
+static int get_channel_connection __P((CHANNEL *, REPMGR_CONNECTION **));
+static int init_dbsite __P((ENV *, int, const char *, u_int, DB_SITE **));
+static int join_group_at_site __P((ENV *, repmgr_netaddr_t *));
+static int kick_blockers __P((ENV *, REPMGR_CONNECTION *, void *));
+static int make_request_conn __P((ENV *,
+ repmgr_netaddr_t *, REPMGR_CONNECTION **));
+static int set_local_site __P((DB_SITE *, u_int32_t));
+static int read_own_msg __P((ENV *,
+ REPMGR_CONNECTION *, u_int32_t *, u_int8_t **, size_t *));
+static int refresh_site __P((DB_SITE *));
+static int __repmgr_await_threads __P((ENV *));
+static int __repmgr_build_data_out __P((ENV *,
+ DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp));
+static int __repmgr_build_msg_out __P((ENV *,
+ DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp));
+static int repmgr_only __P((ENV *, const char *));
+static int __repmgr_restart __P((ENV *, int, u_int32_t));
+static int __repmgr_remove_site __P((DB_SITE *));
+static int __repmgr_remove_site_pp __P((DB_SITE *));
+static int __repmgr_start_msg_threads __P((ENV *, u_int));
+static int request_self __P((ENV *, DBT *, u_int32_t, DBT *, u_int32_t));
+static int response_complete __P((ENV *, void *));
+static int send_msg_conn __P((ENV *, REPMGR_CONNECTION *, DBT *, u_int32_t));
+static int send_msg_self __P((ENV *, REPMGR_IOVECS *, u_int32_t));
+static int site_by_addr __P((ENV *, const char *, u_int, DB_SITE **));
+
+/*
+ * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+ */
+int
+__repmgr_start(dbenv, nthreads, flags)
+ DB_ENV *dbenv;
+ int nthreads;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *me, *site;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int first, is_listener, locked, min, need_masterseek, ret, start_master;
+ u_int i, n;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ switch (flags) {
+ case 0:
+ case DB_REP_CLIENT:
+ case DB_REP_ELECTION:
+ case DB_REP_MASTER:
+ break;
+ default:
+ __db_errx(env, DB_STR("3635",
+ "repmgr_start: unrecognized flags parameter value"));
+ return (EINVAL);
+ }
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->repmgr_start", DB_INIT_REP);
+ if (!F_ISSET(env, ENV_THREAD)) {
+ __db_errx(env, DB_STR("3636",
+ "Replication Manager needs an environment with DB_THREAD"));
+ return (EINVAL);
+ }
+
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_start"));
+
+ /* Check that the required initialization has been done. */
+ if (!IS_VALID_EID(db_rep->self_eid)) {
+ __db_errx(env, DB_STR("3637",
+ "A local site must be named before calling repmgr_start"));
+ return (EINVAL);
+ }
+
+	/* If this site was previously shut down, clean up its resources. */
+ if (db_rep->repmgr_status == stopped) {
+ if ((ret = __repmgr_stop(env)) != 0) {
+ __db_errx(env, DB_STR("3638",
+ "Could not clean up repmgr"));
+ return (ret);
+ }
+ db_rep->repmgr_status = ready;
+ }
+
+ db_rep->init_policy = flags;
+ if ((ret = __rep_set_transport_int(env,
+ db_rep->self_eid, __repmgr_send)) != 0)
+ return (ret);
+ if (!REPMGR_INITED(db_rep) && (ret = __repmgr_init(env)) != 0)
+ return (ret);
+ /*
+ * As a prerequisite to starting replication, get our list of remote
+	 * sites properly set up.  Mainly this involves reading the group
+	 * membership database; when it's not present, we must instead decide
+	 * what to do, which depends on various conditions.
+ */
+ start_master = (flags == DB_REP_MASTER);
+
+ if (db_rep->restored_list != NULL) {
+ ret = __repmgr_refresh_membership(env,
+ db_rep->restored_list, db_rep->restored_list_length);
+ __os_free(env, db_rep->restored_list);
+ db_rep->restored_list = NULL;
+ } else {
+ ret = __repmgr_reload_gmdb(env);
+ me = SITE_FROM_EID(db_rep->self_eid);
+ if (ret == 0) {
+ if (me->membership != SITE_PRESENT)
+ /*
+ * We have a database but the local site is not
+ * shown as "present" in the group. We must
+ * have been removed from the group, or perhaps
+ * we're being created via hot backup. In
+ * either case the thing to do is to try to
+ * join.
+ */
+ ret = __repmgr_join_group(env);
+ } else if (ret == ENOENT) {
+ ENV_ENTER(env, ip);
+ if (FLD_ISSET(me->config, DB_GROUP_CREATOR))
+ start_master = TRUE;
+ /*
+ * LEGACY is inconsistent with CREATOR, but start_master
+ * could still be true due to "flags" being passed as
+ * DB_REP_MASTER. In that case, being started as master
+ * is irrelevant to establishing initial membership
+ * list: LEGACY always takes precedence if set.
+ */
+ if (FLD_ISSET(me->config, DB_LEGACY)) {
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->membership_version = 1;
+ db_rep->member_version_gen = 1;
+ for (n = i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (!FLD_ISSET(site->config, DB_LEGACY))
+ continue;
+ if ((ret = __repmgr_set_membership(env,
+ site->net_addr.host,
+ site->net_addr.port,
+ SITE_PRESENT)) != 0)
+ break;
+ n++;
+ }
+ ret = __rep_set_nsites_int(env, n);
+ DB_ASSERT(env, ret == 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else if (start_master) {
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->membership_version = 1;
+ db_rep->member_version_gen = 1;
+ if ((ret = __repmgr_set_membership(env,
+ me->net_addr.host, me->net_addr.port,
+ SITE_PRESENT)) == 0) {
+ ret = __rep_set_nsites_int(env, 1);
+ DB_ASSERT(env, ret == 0);
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else
+ ret = __repmgr_join_group(env);
+ ENV_LEAVE(env, ip);
+ } else if (ret == DB_DELETED)
+ ret = DB_REP_UNAVAIL;
+ }
+ if (ret != 0)
+ return (ret);
+
+ DB_ASSERT(env, start_master ||
+ SITE_FROM_EID(db_rep->self_eid)->membership == SITE_PRESENT);
+
+ /*
+	 * If this is the first repmgr_start() call, we will have to start
+	 * threads.  Therefore, we require a flags value (to tell us how).
+ */
+ if (db_rep->repmgr_status != running && flags == 0) {
+ __db_errx(env, DB_STR("3639",
+ "a non-zero flags value is required for initial repmgr_start() call"));
+ return (EINVAL);
+ }
+
+ /*
+ * Figure out the current situation. The current invocation of
+ * repmgr_start() is either the first one (on the given env handle), or
+ * a subsequent one.
+ *
+ * Then, in case there could be multiple processes, we're either the
+ * main listener process or a subordinate process. On a "subsequent"
+ * repmgr_start() call we already have enough information to know which
+ * it is. Otherwise, negotiate with information in the shared region to
+ * claim the listener role if possible.
+ *
+ * To avoid a race, once we decide we're in the first call, mark the
+ * handle as started, so that no other thread thinks the same thing.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if (db_rep->repmgr_status == running) {
+ first = FALSE;
+ is_listener = !IS_SUBORDINATE(db_rep);
+ } else {
+ first = TRUE;
+ db_rep->repmgr_status = running;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ if (rep->listener == 0) {
+ is_listener = TRUE;
+ __os_id(dbenv, &rep->listener, NULL);
+ } else {
+ is_listener = FALSE;
+ nthreads = 0;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if (!first) {
+ /*
+		 * A subsequent call is allowed when ELECTIONS are turned off,
+		 * so that the application can make its own dynamic role
+		 * changes.  It's also allowed in any case, if not trying to
+		 * change roles (flags == 0), in order to change the number of
+		 * message processing threads.  The __repmgr_restart() function
+		 * will take care of these cases entirely.
+ */
+ if (!is_listener || (flags != 0 &&
+ FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS))) {
+ __db_errx(env, DB_STR("3640",
+ "repmgr is already started"));
+ ret = EINVAL;
+ } else
+ ret = __repmgr_restart(env, nthreads, flags);
+ return (ret);
+ }
+
+ /*
+ * The minimum legal number of threads is either 1 or 0, depending upon
+ * whether we're the main process or a subordinate.
+ */
+ min = is_listener ? 1 : 0;
+ if (nthreads < min) {
+ __db_errx(env, DB_STR_A("3641",
+ "repmgr_start: nthreads parameter must be >= %d",
+ "%d"), min);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Ensure at least one more thread (for channel messages and GMDB
+ * requests) beyond those set aside to avoid starvation of rep
+ * messages.
+ *
+ * Note that it's OK to silently fudge the number here, because the
+ * documentation says that "[i]n addition to these message processing
+ * threads, the Replication Manager creates and manages a few of its own
+ * threads of control."
+ */
+ min = RESERVED_MSG_TH(env) + 1;
+ if (nthreads < min && is_listener)
+ nthreads = min;
+
+ if (is_listener) {
+ if ((ret = __repmgr_listen(env)) != 0)
+ goto err;
+ /*
+ * Make some sort of call to rep_start before starting message
+ * processing threads, to ensure that incoming messages being
+ * processed always have a rep context properly configured.
+ * Note that even if we're starting without recovery, we need a
+ * rep_start call in case we're using leases. Leases keep track
+ * of rep_start calls even within an env region lifetime.
+ */
+ if (start_master) {
+ ret = __repmgr_become_master(env);
+ /* No other repmgr threads running yet. */
+ DB_ASSERT(env, ret != DB_REP_UNAVAIL);
+ if (ret != 0)
+ goto err;
+ need_masterseek = FALSE;
+ } else {
+ if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0)
+ goto err;
+ /*
+ * The repmgr election code starts elections only if
+ * the DB_REP_ELECTION start flag was specified, but
+ * it performs other actions to help find a master for
+ * DB_REP_CLIENT, which is why we need_masterseek for
+ * both cases.
+ */
+ need_masterseek = TRUE;
+ }
+
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ /*
+ * Since these allocated memory blocks are used by other
+ * threads, we have to be a bit careful about freeing them in
+ * case of any errors. __repmgr_await_threads (which we call in
+ * the err: coda below) takes care of that.
+ *
+ * Start by allocating enough space for 2 election threads. We
+ * occasionally need that many; more are possible, but would be
+ * extremely rare.
+ */
+#define ELECT_THREADS_ALLOC 2
+
+ if ((ret = __os_calloc(env, ELECT_THREADS_ALLOC,
+ sizeof(REPMGR_RUNNABLE *), &db_rep->elect_threads)) != 0)
+ goto err;
+ db_rep->aelect_threads = ELECT_THREADS_ALLOC;
+ STAT(rep->mstat.st_max_elect_threads = ELECT_THREADS_ALLOC);
+
+ if ((ret = __os_calloc(env, (u_int)nthreads,
+ sizeof(REPMGR_RUNNABLE *), &db_rep->messengers)) != 0)
+ goto err;
+ db_rep->athreads = (u_int)nthreads;
+
+ db_rep->nthreads = 0;
+ if ((ret =
+ __repmgr_start_msg_threads(env, (u_int)nthreads)) != 0)
+ goto err;
+
+ if (need_masterseek) {
+ /*
+ * The repstart_time field records that time when we
+ * last issued a rep_start(CLIENT) that sent out a
+ * NEWCLIENT message. We use it to avoid doing so
+ * twice in quick succession (to give the master a
+ * reasonable chance to respond). The rep_start()
+ * that we just issued above doesn't count, because we
+ * haven't established any connections yet, and so no
+ * message could have been sent out. The instant we
+ * get our first connection set up we want to send out
+ * our first real NEWCLIENT.
+ */
+ timespecclear(&db_rep->repstart_time);
+
+ if ((ret = __repmgr_init_election(env,
+ ELECT_F_STARTUP)) != 0)
+ goto err;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+ }
+ /* All processes (even non-listeners) need a select() thread. */
+ if ((ret = __repmgr_start_selector(env)) == 0)
+ return (is_listener ? 0 : DB_REP_IGNORE);
+
+err:
+ /* If we couldn't succeed at everything, undo the parts we did do. */
+ if (db_rep->selector != NULL) {
+ if (!locked)
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+ }
+ (void)__repmgr_await_threads(env);
+ if (!locked)
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_net_close(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
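+
+/*
+ * For orientation, a minimal sketch of how a hypothetical application
+ * would drive this function (host name, port and thread count are
+ * placeholders; error handling omitted):
+ *
+ *	DB_SITE *site;
+ *	dbenv->repmgr_site(dbenv, "site1.example.com", 6000, &site, 0);
+ *	site->set_config(site, DB_LOCAL_SITE, 1);
+ *	site->close(site);
+ *	dbenv->repmgr_start(dbenv, 3, DB_REP_ELECTION);
+ *
+ * The first repmgr_start() call in a process settles the listener-vs-
+ * subordinate question; subsequent calls fall through to
+ * __repmgr_restart() below.
+ */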
+
+/*
+ * PUBLIC: int __repmgr_valid_config __P((ENV *, u_int32_t));
+ */
+int
+__repmgr_valid_config(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ DB_ASSERT(env, REP_ON(env));
+ LOCK_MUTEX(db_rep->mutex);
+
+ /* (Can't check IS_SUBORDINATE if select thread isn't running yet.) */
+ if (LF_ISSET(REP_C_ELECTIONS) &&
+ db_rep->selector != NULL && IS_SUBORDINATE(db_rep)) {
+ __db_errx(env, DB_STR("3642",
+ "can't configure repmgr elections from subordinate process"));
+ ret = EINVAL;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Starts message processing threads. On entry, the actual number of threads
+ * already active is db_rep->nthreads; the desired number of threads is passed
+ * as "n".
+ *
+ * Caller must hold mutex.
+ */
+static int
+__repmgr_start_msg_threads(env, n)
+ ENV *env;
+ u_int n;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *messenger;
+ int ret;
+
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, db_rep->athreads >= n);
+ while (db_rep->nthreads < n) {
+ if ((ret = __os_calloc(env,
+ 1, sizeof(REPMGR_RUNNABLE), &messenger)) != 0)
+ return (ret);
+
+ messenger->run = __repmgr_msg_thread;
+ if ((ret = __repmgr_thread_start(env, messenger)) != 0) {
+ __os_free(env, messenger);
+ return (ret);
+ }
+ db_rep->messengers[db_rep->nthreads++] = messenger;
+ }
+ return (0);
+}
+
+/*
+ * Handles a repmgr_start() call that occurs when repmgr is already running.
+ * This is allowed (when elections are not in use) to dynamically change the
+ * master/client role. It is also allowed (regardless of the ELECTIONS setting)
+ * to change the number of msg processing threads.
+ */
+static int
+__repmgr_restart(env, nthreads, flags)
+ ENV *env;
+ int nthreads;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_RUNNABLE **th;
+ u_int32_t cur_repflags;
+ int locked, ret, t_ret;
+ u_int delta, i, min, nth;
+
+ th = NULL;
+ locked = FALSE;
+
+ if (flags == DB_REP_ELECTION) {
+ __db_errx(env, DB_STR("3643",
+ "subsequent repmgr_start() call may not specify DB_REP_ELECTION"));
+ return (EINVAL);
+ }
+ if (nthreads < 0) {
+ __db_errx(env, DB_STR("3644",
+ "repmgr_start: nthreads parameter must be >= 0"));
+ return (EINVAL);
+ }
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, REP_ON(env));
+ rep = db_rep->region;
+
+ cur_repflags = F_ISSET(rep, REP_F_MASTER | REP_F_CLIENT);
+ DB_ASSERT(env, cur_repflags);
+ if (FLD_ISSET(cur_repflags, REP_F_MASTER) &&
+ flags == DB_REP_CLIENT)
+ ret = __repmgr_become_client(env);
+ else if (FLD_ISSET(cur_repflags, REP_F_CLIENT) &&
+ flags == DB_REP_MASTER)
+ ret = __repmgr_become_master(env);
+ if (ret != 0)
+ return (ret);
+
+ if (nthreads == 0)
+ return (0);
+ nth = (u_int)nthreads;
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ min = RESERVED_MSG_TH(env) + db_rep->non_rep_th;
+ if (nth < min)
+ nth = min;
+
+ if (nth > db_rep->nthreads) {
+ /*
+ * To increase the number of threads, first allocate more space,
+ * unless we already have enough unused space available.
+ */
+ if (db_rep->athreads < nth) {
+ if ((ret = __os_realloc(env,
+ sizeof(REPMGR_RUNNABLE *) * nth,
+ &db_rep->messengers)) != 0)
+ goto out;
+ db_rep->athreads = nth;
+ }
+ ret = __repmgr_start_msg_threads(env, nth);
+ } else if (nth < db_rep->nthreads) {
+ /*
+ * Remove losers from array, and then wait for each of them. We
+ * have to make an array copy, because we have to drop the mutex
+ * to wait for the threads to complete, and if we left the real
+ * array in the handle in the pending state while waiting,
+ * another thread could come along wanting to make another
+ * change, and would make a mess.
+ * The alternative is about as inelegant: we could do these
+ * one at a time here if we added another field to the handle,
+ * to keep track of both the actual number of threads and the
+ * user's desired number of threads.
+ */
+ /*
+ * Make sure signalling the condition variable works, before
+ * making a mess of the data structures. Although it may seem a
+ * little backwards, it doesn't really matter since we're
+ * holding the mutex. Once we allocate the temp array and grab
+ * ownership of the loser thread structs, we must continue
+ * trying (even if errors) so that we definitely free the
+ * memory.
+ */
+ if ((ret = __repmgr_wake_msngers(env, nth)) != 0)
+ goto out;
+ delta = db_rep->nthreads - nth;
+ if ((ret = __os_calloc(env, (size_t)delta,
+ sizeof(REPMGR_RUNNABLE *), &th)) != 0)
+ goto out;
+ for (i = 0; i < delta; i++) {
+ th[i] = db_rep->messengers[nth + i];
+ th[i]->quit_requested = TRUE;
+ db_rep->messengers[nth + i] = NULL;
+ }
+ db_rep->nthreads = nth;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ DB_ASSERT(env, ret == 0);
+ for (i = 0; i < delta; i++) {
+ if ((t_ret = __repmgr_thread_join(th[i])) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ __os_free(env, th[i]);
+ }
+ __os_free(env, th);
+ }
+
+out: if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
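+
+/*
+ * For example, an application that manages roles itself (with elections
+ * configured off) might demote the master via a second call such as
+ * (hypothetical snippet):
+ *
+ *	dbenv->repmgr_start(dbenv, 0, DB_REP_CLIENT);
+ *
+ * With nthreads == 0 the message-thread pool is left alone; a positive
+ * value resizes the pool as coded above.
+ */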
+
+/*
+ * PUBLIC: int __repmgr_autostart __P((ENV *));
+ *
+ * Preconditions: rep_start() has been called; we're within an ENV_ENTER.
+ */
+int
+__repmgr_autostart(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ DB_ASSERT(env, REP_ON(env));
+ LOCK_MUTEX(db_rep->mutex);
+
+ if (REPMGR_INITED(db_rep))
+ ret = 0;
+ else
+ ret = __repmgr_init(env);
+ if (ret != 0)
+ goto out;
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Automatically joining existing repmgr env"));
+
+ /*
+ * We're only called if we're a master, which means we've had a
+ * rep_start() call, which means we must have had a previous
+ * rep_set_transport() call (in the region, in a separate env handle).
+ * We could therefore get away with simply poking in a pointer to our
+ * send function; but we need to dig up our EID value anyway, so we
+ * might as well set it properly.
+ */
+ db_rep->self_eid = rep->eid;
+ if ((ret = __rep_set_transport_int(env,
+ db_rep->self_eid, __repmgr_send)) != 0)
+ goto out;
+
+ if (db_rep->selector == NULL && db_rep->repmgr_status != running)
+ ret = __repmgr_start_selector(env);
+
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_start_selector __P((ENV *));
+ */
+int
+__repmgr_start_selector(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *selector;
+ int ret;
+
+ db_rep = env->rep_handle;
+ if ((ret = __os_calloc(env, 1, sizeof(REPMGR_RUNNABLE), &selector))
+ != 0)
+ return (ret);
+ selector->run = __repmgr_select_thread;
+
+ /*
+ * In case the select thread ever examines db_rep->selector, set it
+ * before starting the thread (since once we create it we could be
+ * racing with it).
+ */
+ db_rep->selector = selector;
+ if ((ret = __repmgr_thread_start(env, selector)) != 0) {
+ __db_err(env, ret, DB_STR("3645",
+ "can't start selector thread"));
+ __os_free(env, selector);
+ db_rep->selector = NULL;
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_close __P((ENV *));
+ *
+ * Close repmgr during env close.  It stops repmgr and frees the sites array
+ * and its addresses.
+ */
+int
+__repmgr_close(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ret = __repmgr_stop(env);
+ if (db_rep->sites != NULL) {
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+ DB_ASSERT(env, TAILQ_EMPTY(&site->sub_conns));
+ __repmgr_cleanup_netaddr(env, &site->net_addr);
+ }
+ __os_free(env, db_rep->sites);
+ db_rep->sites = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_stop __P((ENV *));
+ *
+ * Stop repmgr, either when closing the env or when removing the current site
+ * from the replication group.  It stops threads if necessary, frees resources
+ * allocated after __repmgr_start, and cleans up site membership.
+ */
+int
+__repmgr_stop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret, t_ret;
+ u_int i;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+
+ if (db_rep->selector != NULL) {
+ if (db_rep->repmgr_status != stopped) {
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ if ((t_ret = __repmgr_await_threads(env)) != 0 && ret == 0)
+ ret = t_ret;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Repmgr threads are finished"));
+ }
+ __repmgr_net_destroy(env, db_rep);
+ if ((t_ret = __repmgr_deinit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_queue_destroy(env)) != 0 && ret == 0)
+ ret = t_ret;
+ if (db_rep->restored_list != NULL) {
+ __os_free(env, db_rep->restored_list);
+ db_rep->restored_list = NULL;
+ }
+ /*
+ * Clean up current site membership and state, so that the obsolete
+ * membership won't mislead us for the next repmgr start.
+ */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+ site->state = SITE_IDLE;
+ site->membership = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_set_ack_policy __P((DB_ENV *, int));
+ */
+int
+__repmgr_set_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int policy;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->repmgr_set_ack_policy", DB_INIT_REP);
+
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_set_ack_policy"));
+
+ switch (policy) {
+ case DB_REPMGR_ACKS_ALL:
+ case DB_REPMGR_ACKS_ALL_AVAILABLE:
+ case DB_REPMGR_ACKS_ALL_PEERS:
+ case DB_REPMGR_ACKS_NONE:
+ case DB_REPMGR_ACKS_ONE:
+ case DB_REPMGR_ACKS_ONE_PEER:
+ case DB_REPMGR_ACKS_QUORUM:
+ if (REP_ON(env)) {
+ if (rep->perm_policy != policy) {
+ rep->perm_policy = policy;
+ if ((ret = __repmgr_bcast_parm_refresh(env))
+ != 0)
+ return (ret);
+ }
+ } else
+ db_rep->perm_policy = policy;
+ /*
+ * Setting an ack policy makes this a replication manager
+ * application.
+ */
+ APP_SET_REPMGR(env);
+ return (0);
+ default:
+ __db_errx(env, DB_STR("3646",
+ "unknown ack_policy in DB_ENV->repmgr_set_ack_policy"));
+ return (EINVAL);
+ }
+}
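+
+/*
+ * Hypothetical usage (error handling omitted):
+ *
+ *	dbenv->repmgr_set_ack_policy(dbenv, DB_REPMGR_ACKS_ALL);
+ *
+ * If replication is already running, the new policy is recorded in the
+ * shared region and broadcast to the other sites via
+ * __repmgr_bcast_parm_refresh(); before the env is opened it is simply
+ * saved in the handle.
+ */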
+
+/*
+ * PUBLIC: int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+ */
+int
+__repmgr_get_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int *policy;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ *policy = REP_ON(env) ? rep->perm_policy : db_rep->perm_policy;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_env_create __P((ENV *, DB_REP *));
+ */
+int
+__repmgr_env_create(env, db_rep)
+ ENV *env;
+ DB_REP *db_rep;
+{
+ int ret;
+
+ /* Set some default values. */
+ db_rep->ack_timeout = DB_REPMGR_DEFAULT_ACK_TIMEOUT;
+ db_rep->connection_retry_wait = DB_REPMGR_DEFAULT_CONNECTION_RETRY;
+ db_rep->election_retry_wait = DB_REPMGR_DEFAULT_ELECTION_RETRY;
+ db_rep->config_nsites = 0;
+ db_rep->perm_policy = DB_REPMGR_ACKS_QUORUM;
+ FLD_SET(db_rep->config, REP_C_ELECTIONS);
+ FLD_SET(db_rep->config, REP_C_2SITE_STRICT);
+
+ db_rep->self_eid = DB_EID_INVALID;
+ db_rep->listen_fd = INVALID_SOCKET;
+ TAILQ_INIT(&db_rep->connections);
+ TAILQ_INIT(&db_rep->retries);
+
+ db_rep->input_queue.size = 0;
+ STAILQ_INIT(&db_rep->input_queue.header);
+
+ __repmgr_env_create_pf(db_rep);
+ ret = __repmgr_create_mutex(env, &db_rep->mutex);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: void __repmgr_env_destroy __P((ENV *, DB_REP *));
+ */
+void
+__repmgr_env_destroy(env, db_rep)
+ ENV *env;
+ DB_REP *db_rep;
+{
+ if (db_rep->mutex != NULL) {
+ (void)__repmgr_destroy_mutex(env, db_rep->mutex);
+ db_rep->mutex = NULL;
+ }
+}
+
+/*
+ * PUBLIC: int __repmgr_stop_threads __P((ENV *));
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_stop_threads(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ db_rep->repmgr_status = stopped;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Stopping repmgr threads"));
+ if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
+ return (ret);
+
+ /*
+ * Because we've set "finished", it's enough to wake msg_avail, even on
+ * Windows. (We don't need to wake per-thread Event Objects here, as we
+ * did in the case of only wanting to stop a subset of msg threads.)
+ */
+ if ((ret = __repmgr_signal(&db_rep->msg_avail)) != 0)
+ return (ret);
+
+ if ((ret = __repmgr_each_connection(env,
+ kick_blockers, NULL, TRUE)) != 0)
+ return (ret);
+
+ return (__repmgr_wake_main_thread(env));
+}
+
+static int
+kick_blockers(env, conn, unused)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *unused;
+{
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+
+ ret = __repmgr_signal(&conn->drained);
+ if ((t_ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * "Joins" all repmgr background threads.
+ */
+static int
+__repmgr_await_threads(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *th;
+ REPMGR_SITE *site;
+ int ret, t_ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ /*
+ * First wait for the threads we started explicitly. Then wait for
+ * those "remote descendent" threads that these first threads may have
+ * started. This order is important, because, for example, the select
+ * thread, in its last gasp, may have started yet another new instance
+ * of a connector thread.
+ */
+
+ /* Message processing threads. */
+ for (i = 0;
+ i < db_rep->nthreads && db_rep->messengers[i] != NULL; i++) {
+ th = db_rep->messengers[i];
+ if ((t_ret = __repmgr_thread_join(th)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, th);
+ }
+ __os_free(env, db_rep->messengers);
+ db_rep->messengers = NULL;
+
+ /* The select() loop thread. */
+ if (db_rep->selector != NULL) {
+ if ((t_ret = __repmgr_thread_join(db_rep->selector)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ __os_free(env, db_rep->selector);
+ db_rep->selector = NULL;
+ }
+
+ /* Election threads. */
+ for (i = 0; i < db_rep->aelect_threads; i++) {
+ th = db_rep->elect_threads[i];
+ if (th != NULL) {
+ if ((t_ret = __repmgr_thread_join(th)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, th);
+ }
+ }
+ __os_free(env, db_rep->elect_threads);
+ db_rep->aelect_threads = 0;
+
+ /* Threads opening outgoing socket connections. */
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ LOCK_MUTEX(db_rep->mutex);
+ site = SITE_FROM_EID(i);
+ th = site->connector;
+ site->connector = NULL;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (th != NULL) {
+ if ((t_ret = __repmgr_thread_join(th)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, th);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+ */
+int
+__repmgr_local_site(dbenv, sitep)
+ DB_ENV *dbenv;
+ DB_SITE **sitep;
+{
+ DB_REP *db_rep;
+ ENV *env;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ if (!IS_VALID_EID(db_rep->self_eid))
+ return (DB_NOTFOUND);
+ return (__repmgr_site_by_eid(dbenv, db_rep->self_eid, sitep));
+}
+
+static int
+addr_chk(env, host, port)
+ const ENV *env;
+ const char *host;
+ u_int port;
+{
+ if (host == NULL || host[0] == '\0') {
+ __db_errx(env, DB_STR("3648",
+ "repmgr_site: a host name is required"));
+ return (EINVAL);
+ }
+ if (port == 0 || port > UINT16_MAX) {
+ __db_errx(env, DB_STR_A("3649",
+ "repmgr_site: port out of range [1,%u]", "%u"), UINT16_MAX);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+ */
+int
+__repmgr_channel(dbenv, eid, dbchannelp, flags)
+ DB_ENV *dbenv;
+ int eid;
+ DB_CHANNEL **dbchannelp;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ DB_REP *db_rep;
+ DB_CHANNEL *dbchannel;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ int cur_eid, master, ret;
+
+ channel = NULL;
+ dbchannel = NULL;
+ conn = NULL;
+
+ env = dbenv->env;
+ if ((ret = __db_fchk(env, "DB_ENV->repmgr_channel", flags, 0)) != 0)
+ return (ret);
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (db_rep->selector == NULL) {
+ __db_errx(env, DB_STR("3650",
+ "DB_ENV->repmgr_channel: must be called after DB_ENV->repmgr_start"));
+ return (EINVAL);
+ }
+ /*
+ * Note that repmgr_start() checks DB_INIT_REP, ENV_THREAD and
+ * APP_IS_BASEAPI.
+ */
+ if (db_rep->repmgr_status == stopped) {
+ __db_errx(env, DB_STR("3651", "repmgr is stopped"));
+ return (EINVAL);
+ }
+
+ if (eid == DB_EID_MASTER) {
+ if ((master = rep->master_id) == DB_EID_INVALID)
+ return (DB_REP_UNAVAIL);
+ cur_eid = master;
+ } else if (IS_KNOWN_REMOTE_SITE(eid))
+ cur_eid = eid;
+ else {
+ __db_errx(env, DB_STR_A("3652",
+ "%d is not a valid remote EID", "%d"), eid);
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ if ((ret = __os_calloc(env, 1, sizeof(DB_CHANNEL), &dbchannel)) != 0 ||
+ (ret = __os_calloc(env, 1, sizeof(CHANNEL), &channel)) != 0)
+ goto err;
+ dbchannel->channel = channel;
+ channel->db_channel = dbchannel;
+ channel->env = env;
+
+ /* Preserve EID as passed by the caller (not cur_eid). */
+ dbchannel->eid = eid;
+ dbchannel->timeout = DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT;
+
+ dbchannel->close = __repmgr_channel_close;
+ dbchannel->send_msg = __repmgr_send_msg;
+ dbchannel->send_request = __repmgr_send_request;
+ dbchannel->set_timeout = __repmgr_channel_timeout;
+
+ if (cur_eid != db_rep->self_eid &&
+ (ret = establish_connection(env, cur_eid, &conn)) != 0)
+ goto err;
+
+ if (IS_VALID_EID(eid)) {
+ DB_ASSERT(env, conn != NULL);
+ channel->c.conn = conn;
+ } else {
+ /*
+ * If the configured EID is one of the special ones (MASTER or
+ * BROADCAST) we need a mutex for dynamic messing with
+ * connections that could happen later.
+ */
+ if ((ret = __repmgr_create_mutex(env,
+ &channel->c.conns.mutex)) != 0)
+ goto err;
+
+ if (conn != NULL) {
+ /*
+ * Allocate enough array elements to use cur_eid as an
+ * index; save the number of slots allocated as "cnt."
+ */
+ if ((ret = __os_calloc(env,
+ (u_int)cur_eid + 1, sizeof(REPMGR_CONNECTION *),
+ &channel->c.conns.array)) != 0)
+ goto err;
+ channel->c.conns.cnt = (u_int)cur_eid + 1;
+ channel->c.conns.array[cur_eid] = conn;
+ }
+ }
+
+ if (conn != NULL) {
+ LOCK_MUTEX(db_rep->mutex);
+ conn->ref_count++;
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+
+ *dbchannelp = dbchannel;
+
+err:
+ if (ret != 0) {
+ if (conn != NULL)
+ (void)__repmgr_disable_connection(env, conn);
+ if (channel != NULL) {
+ if (!IS_VALID_EID(eid) &&
+ channel->c.conns.mutex != NULL)
+ (void)__repmgr_destroy_mutex(env,
+ channel->c.conns.mutex);
+ __os_free(env, channel);
+ }
+ if (dbchannel != NULL)
+ __os_free(env, dbchannel);
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
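+
+/*
+ * A hypothetical round trip over a channel created by this function
+ * (payload contents are placeholders; error handling omitted):
+ *
+ *	DB_CHANNEL *ch;
+ *	DBT req, resp;
+ *	dbenv->repmgr_channel(dbenv, DB_EID_MASTER, &ch, 0);
+ *	memset(&req, 0, sizeof(req));
+ *	req.data = "ping";
+ *	req.size = 5;
+ *	memset(&resp, 0, sizeof(resp));
+ *	resp.flags = DB_DBT_MALLOC;
+ *	ch->send_request(ch, &req, 1, &resp, 0, 0);
+ *	ch->close(ch, 0);
+ *
+ * With DB_DBT_MALLOC the library allocates resp.data on the caller's
+ * behalf.
+ */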
+
+static int
+get_shared_netaddr(env, eid, netaddr)
+ ENV *env;
+ int eid;
+ repmgr_netaddr_t *netaddr;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ SITEINFO *base, *p;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ if ((u_int)eid >= rep->site_cnt) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ DB_ASSERT(env, rep->siteinfo_off != INVALID_ROFF);
+
+ infop = env->reginfo;
+ base = R_ADDR(infop, rep->siteinfo_off);
+ p = &base[eid];
+ netaddr->host = R_ADDR(infop, p->addr.host);
+ netaddr->port = p->addr.port;
+ ret = 0;
+
+err:
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ return (ret);
+}
+
+static int
+establish_connection(env, eid, connp)
+ ENV *env;
+ int eid;
+ REPMGR_CONNECTION **connp;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ DBT vi;
+ repmgr_netaddr_t netaddr;
+ __repmgr_msg_hdr_args msg_hdr;
+ __repmgr_version_confirmation_args conf;
+ int alloc, locked, ret, unused;
+
+ db_rep = env->rep_handle;
+ alloc = locked = FALSE;
+
+ if ((ret = get_shared_netaddr(env, eid, &netaddr)) != 0)
+ return (ret);
+
+ if ((ret = __repmgr_connect(env, &netaddr, &conn, &unused)) != 0)
+ return (ret);
+ conn->type = APP_CONNECTION;
+
+ /* Read a handshake msg, to get version confirmation and parameters. */
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto out;
+ /*
+ * We can only get here after having read the full 9 bytes that we
+ * expect, so this can't fail.
+ */
+ DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+ __repmgr_iovec_init(&conn->iovecs);
+ conn->reading_phase = DATA_PHASE;
+
+ if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0)
+ goto out;
+ alloc = TRUE;
+
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto out;
+
+ /*
+ * Analyze the handshake msg, and stash relevant info.
+ */
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ goto out;
+ DB_ASSERT(env, vi.size > 0);
+ if ((ret = __repmgr_version_confirmation_unmarshal(env,
+ &conf, vi.data, vi.size, NULL)) != 0)
+ goto out;
+
+ if (conf.version < CHANNEL_MIN_VERSION) {
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+
+ conn->version = conf.version;
+
+ if ((ret = __repmgr_send_handshake(env,
+ conn, NULL, 0, APP_CHANNEL_CONNECTION)) != 0)
+ goto out;
+ conn->state = CONN_READY;
+ __repmgr_reset_for_reading(conn);
+ if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
+ __db_err(env, ret, DB_STR("3653", "set_nonblock channel"));
+ goto out;
+ }
+
+ /*
+ * Turn over the responsibility for reading on this connection to the
+ * select() thread.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if ((ret = __repmgr_wake_main_thread(env)) != 0)
+ goto out;
+
+ /*
+ * Share this new connection with the select thread, which will
+ * hereafter own the exclusive right to read input from it. Once we get
+ * past this point, we can't unilaterally close and destroy the
+ * connection if a retryable connection error happens. Fortunately,
+ * we're now at the point where everything has succeeded; so there will
+ * be no more errors.
+ */
+ TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
+ conn->ref_count++;
+ *connp = conn;
+
+out:
+ if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if (ret != 0) {
+ /*
+ * Since we can't have given the connection to the select()
+ * thread yet, clean-up is as simple as this:
+ */
+ (void)__repmgr_close_connection(env, conn);
+ (void)__repmgr_destroy_conn(env, conn);
+ }
+
+ if (alloc) {
+ DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0);
+ __os_free(env, conn->input.repmgr_msg.cntrl.data);
+ DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0);
+ __os_free(env, conn->input.repmgr_msg.rec.data);
+ }
+ return (ret);
+}
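+
+/*
+ * In summary, the connection dance above is: connect; read the peer's
+ * handshake (header, then body); check that its version is at least
+ * CHANNEL_MIN_VERSION; send our own handshake back; switch the socket
+ * to non-blocking mode; and finally hand the connection to the select()
+ * thread, which owns all reading on it from then on.
+ */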
+
+/*
+ * PUBLIC: int __repmgr_set_msg_dispatch __P((DB_ENV *,
+ * PUBLIC: void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
+ * PUBLIC: u_int32_t));
+ */
+int
+__repmgr_set_msg_dispatch(dbenv, dispatch, flags)
+ DB_ENV *dbenv;
+ void (*dispatch) __P((DB_ENV *,
+ DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbenv->env;
+ if ((ret = __db_fchk(env,
+ "DB_ENV->repmgr_msg_dispatch", flags, 0)) != 0)
+ return (ret);
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_msg_dispatch"));
+
+ db_rep = env->rep_handle;
+ db_rep->msg_dispatch = dispatch;
+ APP_SET_REPMGR(env);
+ return (0);
+}
+
+/*
+ * Implementation of DB_CHANNEL->send_msg() method for use in a normal channel
+ * explicitly created by the message-originator application.
+ *
+ * PUBLIC: int __repmgr_send_msg __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_send_msg(db_channel, msg, nmsg, flags)
+ DB_CHANNEL *db_channel;
+ DBT *msg;
+ u_int32_t nmsg;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ int ret;
+
+ channel = db_channel->channel;
+ env = channel->env;
+ if ((ret = __db_fchk(env,
+ "DB_CHANNEL->send_msg", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ if ((ret = get_channel_connection(channel, &conn)) == 0)
+ ret = send_msg_conn(env, conn, msg, nmsg);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Sends an async msg on the given connection (or just copies it locally if conn
+ * is NULL, since that means we're "sending to the master" when we ourselves are
+ * the master).
+ */
+static int
+send_msg_conn(env, conn, msg, nmsg)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ DBT *msg;
+ u_int32_t nmsg;
+{
+ DB_REP *db_rep;
+ REPMGR_IOVECS *iovecs;
+ __repmgr_msg_metadata_args meta;
+ int ret;
+
+ db_rep = env->rep_handle;
+ memset(&meta, 0, sizeof(meta));
+ if (conn == NULL) {
+ /* Sending to DB_EID_MASTER when we ourselves are master. */
+ if ((ret = __repmgr_build_data_out(env,
+ msg, nmsg, &meta, &iovecs)) != 0)
+ return (ret);
+ ret = send_msg_self(env, iovecs, nmsg);
+ } else {
+ if ((ret = __repmgr_build_msg_out(env,
+ msg, nmsg, &meta, &iovecs)) != 0)
+ return (ret);
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, iovecs, 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+
+ __os_free(env, iovecs);
+ return (ret);
+}
+
+/*
+ * Simulate sending by simply copying the message into a msg struct to be
+ * queued.  On input, iovecs is ready to "send", with its first slot set
+ * aside for the message header.
+ */
+static int
+send_msg_self(env, iovecs, nmsg)
+ ENV *env;
+ REPMGR_IOVECS *iovecs;
+ u_int32_t nmsg;
+{
+ REPMGR_MESSAGE *msg;
+ size_t align, bodysize, structsize;
+ u_int8_t *membase;
+ int ret;
+
+ align = sizeof(double);
+ bodysize = iovecs->total_bytes - __REPMGR_MSG_HDR_SIZE;
+ structsize = (size_t)DB_ALIGN((size_t)(sizeof(REPMGR_MESSAGE) +
+ nmsg * sizeof(DBT)), align);
+ if ((ret = __os_malloc(env, structsize + bodysize, &membase)) != 0)
+ return (ret);
+
+ msg = (void*)membase;
+ membase += structsize;
+
+ /*
+ * Build a msg struct that looks like what would be received in the
+ * usual case.
+ */
+ msg->msg_hdr.type = REPMGR_APP_MESSAGE;
+ APP_MSG_BUFFER_SIZE(msg->msg_hdr) = (u_int32_t)bodysize;
+ APP_MSG_SEGMENT_COUNT(msg->msg_hdr) = nmsg;
+
+ msg->v.appmsg.conn = NULL;
+
+ /*
+ * The "buf" is the message body (as [if] transmitted); i.e., it
+ * excludes the header (which we've just constructed separately). So,
+ * skip over slot 0 in the iovecs, which had been reserved for the hdr.
+ */
+ DB_INIT_DBT(msg->v.appmsg.buf, membase, bodysize);
+ copy_body(membase, iovecs);
+
+ return (__repmgr_queue_put(env, msg));
+}
+
+/*
+ * Copies a message body into a single contiguous buffer. The given iovecs is
+ * assumed to have the first slot reserved for a message header, and we skip
+ * that part.
+ */
+static void
+copy_body(membase, iovecs)
+ u_int8_t *membase;
+ REPMGR_IOVECS *iovecs;
+{
+ size_t sz;
+ int i;
+
+ for (i = 1; i < iovecs->count; i++) {
+ if ((sz = (size_t)iovecs->vectors[i].iov_len) > 0) {
+ memcpy(membase, iovecs->vectors[i].iov_base, sz);
+ membase += sz;
+ }
+ }
+}
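+
+/*
+ * Pictorially, the iovecs consumed by copy_body() and send_msg_self()
+ * look like this (segment sizes are whatever the builders chose):
+ *
+ *	vectors[0]: message header (__REPMGR_MSG_HDR_SIZE bytes, skipped)
+ *	vectors[1]: first body segment   \
+ *	    ...                           > copied contiguously to membase
+ *	vectors[n-1]: last body segment  /
+ */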
+
+/*
+ * Gets a connection to be used for sending, either an async message or a
+ * request. On a DB_EID_MASTER channel this entails checking the current
+ * master, and possibly opening a new connection if the master has changed.
+ * Allow an old connection to stay intact, because responses to previous
+ * requests could still be arriving (though often the connection will have died
+ * anyway, if the master changed due to failure of the old master).
+ *
+ * If the local site is currently master, then for a master channel we return
+ * (via connp) a NULL pointer.
+ */
+static int
+get_channel_connection(channel, connp)
+ CHANNEL *channel;
+ REPMGR_CONNECTION **connp;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ DB_CHANNEL *db_channel;
+ int eid, ret;
+
+ env = channel->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ db_channel = channel->db_channel;
+
+ /*
+ * On a specific-EID channel it's very simple, because there is only
+ * ever one connection, which was established when the channel was
+ * created.
+ */
+ if (db_channel->eid >= 0) {
+ *connp = channel->c.conn;
+ return (0);
+ }
+
+ /*
+ * For now we only support one connection at a time. When we support
+ * DB_EID_BROADCAST channels in the future, we will have to loop through
+ * all connected sites.
+ */
+ DB_ASSERT(env, db_channel->eid == DB_EID_MASTER);
+ eid = rep->master_id;
+ if (eid == db_rep->self_eid) {
+ *connp = NULL;
+ return (0);
+ }
+ if (eid == DB_EID_INVALID)
+ return (DB_REP_UNAVAIL);
+
+ LOCK_MUTEX(channel->c.conns.mutex);
+ if ((u_int)eid >= channel->c.conns.cnt) {
+ /*
+ * Allocate an array big enough such that `eid' is a valid
+ * index; initialize the newly allocated (tail) portion.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Grow master-channel array to accommodate EID %d", eid));
+ if ((ret = __os_realloc(env,
+ sizeof(REPMGR_CONNECTION *) * ((u_int)eid + 1),
+ &channel->c.conns.array)) != 0)
+ goto out;
+ memset(&channel->c.conns.array[channel->c.conns.cnt],
+ 0,
+ sizeof(REPMGR_CONNECTION *) *
+ (((u_int)eid + 1) - channel->c.conns.cnt));
+ channel->c.conns.cnt = (u_int)eid + 1;
+ }
+ DB_ASSERT(env, (u_int)eid < channel->c.conns.cnt);
+
+ if ((conn = channel->c.conns.array[eid]) == NULL) {
+ if ((ret = establish_connection(env, eid, &conn)) != 0)
+ goto out;
+
+ /*
+ * Even though `conn' is a newly created object, by the time we
+ * get here it has already been given out to the select()
+ * thread, so we should hold the mutex while incrementing the
+ * ref count.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ channel->c.conns.array[eid] = conn;
+ conn->ref_count++;
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+
+ *connp = conn;
+ ret = 0;
+out:
+ UNLOCK_MUTEX(channel->c.conns.mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_request __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+ */
+int
+__repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
+ DB_CHANNEL *db_channel;
+ DBT *request;
+ u_int32_t nrequest;
+ DBT *response;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_REP *db_rep;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS *iovecs;
+ REPMGR_RESPONSE *resp;
+ struct response_wait ctx;
+ __repmgr_msg_metadata_args meta;
+ size_t sz;
+ void *dummy;
+ u_int32_t i, n;
+ int ret;
+
+ channel = db_channel->channel;
+ env = channel->env;
+ db_rep = env->rep_handle;
+
+ if ((ret = __db_fchk(env,
+ "DB_CHANNEL->send_request", flags, DB_MULTIPLE)) != 0)
+ return (ret);
+
+ if (db_channel->eid == DB_EID_BROADCAST) {
+ __db_errx(env, DB_STR("3654",
+ "DB_CHANNEL->send_request() not supported on DB_EID_BROADCAST channel"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = get_channel_connection(channel, &conn);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
+ return (ret);
+
+ if (conn == NULL)
+ return (request_self(env, request, nrequest, response, flags));
+
+ /* Find an available array slot, or grow the array if necessary. */
+ LOCK_MUTEX(db_rep->mutex);
+ for (i = 0; i < conn->aresp; i++)
+ if (!(F_ISSET(&conn->responses[i], RESP_IN_USE)))
+ break;
+	if (i == conn->aresp) {
+		n = conn->aresp == 0 ? 1 : conn->aresp * 2;
+		/* Don't touch the (unchanged) array if the realloc failed. */
+		if ((ret = __os_realloc(env,
+		    sizeof(REPMGR_RESPONSE) * n, &conn->responses)) != 0) {
+			UNLOCK_MUTEX(db_rep->mutex);
+			return (ret);
+		}
+		memset(&conn->responses[i], 0,
+		    sizeof(REPMGR_RESPONSE) * (n - i));
+		conn->aresp = n;
+	}
+ resp = &conn->responses[i];
+ resp->flags = RESP_IN_USE | RESP_THREAD_WAITING;
+ resp->dbt = *response;
+ resp->ret = 0;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * The index "i" is stable, but the address in the "resp" pointer could
+ * change while we drop the mutex, if another thread has to grow the
+ * allocated array. So we can't use "resp" again until after we set it
+ * again, from "i", under mutex protection.
+ */
+
+ meta.tag = i;
+ meta.flags = REPMGR_REQUEST_MSG_TYPE |
+ (LF_ISSET(DB_MULTIPLE) ? REPMGR_MULTI_RESP : 0) |
+ (F_ISSET(response, DB_DBT_USERMEM) ? REPMGR_RESPONSE_LIMIT : 0);
+ meta.limit = response->ulen;
+
+ /*
+ * Build an iovecs structure describing the request message, and then
+ * send it.
+ */
+ if ((ret = __repmgr_build_msg_out(env,
+ request, nrequest, &meta, &iovecs)) != 0) {
+ /*
+ * Since we haven't sent the message yet, there's no chance the
+ * select thread has started relying on the REPMGR_RESPONSE, so
+ * it's easy to deallocate it.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ F_CLR(&conn->responses[i], RESP_IN_USE | RESP_THREAD_WAITING);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+ }
+
+ timeout = timeout > 0 ? timeout : db_channel->timeout;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, iovecs, timeout);
+ if (ret == DB_TIMEOUT)
+ F_CLR(&conn->responses[i], RESP_IN_USE | RESP_THREAD_WAITING);
+ UNLOCK_MUTEX(db_rep->mutex);
+ __os_free(env, iovecs);
+ if (ret != 0) {
+ /*
+ * An error while writing will force the connection to be
+ * closed, busted, abandoned. Since there could be a few app
+ * threads waiting, *any* abandoning of a connection will have
+ * to wake up those threads, with a COMPLETE indication and an
+ * error code. That's more than we want to tackle here.
+ */
+ return (ret);
+ }
+
+ /*
+ * Here, we've successfully sent the request. Once we've gotten this
+ * far, the select thread owns the REPMGR_RESPONSE slot until it marks
+ * it complete.
+ */
+ ctx.conn = conn;
+ ctx.index = i;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_await_cond(env,
+ response_complete, &ctx, timeout, &conn->response_waiters);
+
+ resp = &conn->responses[i];
+ if (ret == 0) {
+ DB_ASSERT(env, F_ISSET(resp, RESP_COMPLETE));
+ *response = resp->dbt;
+ if ((ret = resp->ret) == 0 && LF_ISSET(DB_MULTIPLE))
+ adjust_bulk_response(env, response);
+ F_CLR(resp, RESP_IN_USE | RESP_THREAD_WAITING);
+
+ } else {
+ F_CLR(resp, RESP_THREAD_WAITING);
+ if (ret == DB_TIMEOUT && F_ISSET(resp, RESP_READING)) {
+ /*
+ * The select thread is in the midst of reading the
+ * response, but we're about to yank the buffer out from
+ * under it. So, replace it with a dummy buffer.
+ * (There's no way to abort the reading of a message
+ * part-way through.)
+ *
+ * Notice that whatever buffer the user is getting back,
+ * including her own in the case of USERMEM, may already
+ * have some partial data written into it.
+ *
+ * We always read responses in just one single chunk, so
+ * figuring out the needed buffer size is fairly simple.
+ */
+ DB_ASSERT(env, conn->iovecs.offset == 0 &&
+ conn->iovecs.count == 1);
+ sz = conn->iovecs.vectors[0].iov_len;
+
+ if ((ret = __os_malloc(env, sz, &dummy)) != 0)
+ goto out;
+ __repmgr_iovec_init(&conn->iovecs);
+ DB_INIT_DBT(resp->dbt, dummy, sz);
+ __repmgr_add_dbt(&conn->iovecs, &resp->dbt);
+ F_SET(resp, RESP_DUMMY_BUF);
+ }
+ }
+
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
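+
+/*
+ * A hypothetical caller requesting a multi-segment (bulk) response into
+ * its own buffer, with "ch" and "req" as in the sketch following
+ * __repmgr_channel() above (error handling omitted):
+ *
+ *	DBT resp;
+ *	char buf[10 * 1024];
+ *	memset(&resp, 0, sizeof(resp));
+ *	resp.data = buf;
+ *	resp.ulen = sizeof(buf);
+ *	resp.flags = DB_DBT_USERMEM;
+ *	ch->send_request(ch, &req, 1, &resp, 0, DB_MULTIPLE);
+ *
+ * On success the buffer is in standard bulk format and can be walked
+ * with DB_MULTIPLE_INIT and DB_MULTIPLE_NEXT.
+ */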
+
+static int
+response_complete(env, ctx)
+ ENV *env;
+ void *ctx;
+{
+ REPMGR_CONNECTION *conn;
+ struct response_wait *rw;
+
+ COMPQUIET(env, NULL);
+
+ rw = ctx;
+ conn = rw->conn;
+ return (F_ISSET(&conn->responses[rw->index], RESP_COMPLETE) ||
+ conn->state == CONN_DEFUNCT);
+}
+
+/*
+ * "Send" a request to ourselves, by invoking the application's call-back
+ * function directly, in the case where a channel directed to DB_EID_MASTER is
+ * used on a master.
+ */
+static int
+request_self(env, request, nrequest, response, flags)
+ ENV *env;
+ DBT *request;
+ u_int32_t nrequest;
+ DBT *response;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ DB_CHANNEL db_channel;
+ CHANNEL channel;
+ __repmgr_msg_metadata_args meta;
+
+ db_rep = env->rep_handle;
+ if (db_rep->msg_dispatch == NULL) {
+ __db_errx(env, DB_STR("3655",
+ "No message dispatch call-back function has been configured"));
+ return (DB_NOSERVER);
+ }
+
+ db_channel.channel = &channel;
+ db_channel.send_msg = __repmgr_send_response;
+
+	/*
+	 * Supply stub functions for the methods that are inapplicable
+	 * within a message dispatch function.
+	 */
+ db_channel.close = __repmgr_channel_close_inval;
+ db_channel.send_request = __repmgr_send_request_inval;
+ db_channel.set_timeout = __repmgr_channel_timeout_inval;
+
+ channel.env = env;
+ channel.c.conn = NULL;
+ channel.responded = FALSE;
+ channel.meta = &meta;
+ channel.response.dbt = *response;
+
+ meta.flags = REPMGR_REQUEST_MSG_TYPE |
+ (LF_ISSET(DB_MULTIPLE) ? REPMGR_MULTI_RESP : 0) |
+ (F_ISSET(response, DB_DBT_USERMEM) ? REPMGR_RESPONSE_LIMIT : 0);
+ meta.limit = response->ulen;
+
+ (*db_rep->msg_dispatch)(env->dbenv,
+ &db_channel, request, nrequest, DB_REPMGR_NEED_RESPONSE);
+
+ if (!channel.responded) {
+ __db_errx(env, DB_STR("3656",
+ "Application failed to provide a response"));
+ return (DB_KEYEMPTY);
+ } else {
+ response->data = channel.response.dbt.data;
+ response->size = channel.response.dbt.size;
+ if (LF_ISSET(DB_MULTIPLE))
+ adjust_bulk_response(env, response);
+ }
+ return (0);
+}
+
+static void
+adjust_bulk_response(env, response)
+ ENV *env;
+ DBT *response;
+{
+ u_int32_t n, *p;
+
+#ifndef DIAGNOSTIC
+ COMPQUIET(env, NULL);
+#endif
+
+ /*
+ * Convert bulk-buffer segment info to host byte-order, and count
+ * segments. See the definition of DB_MULTIPLE_INIT for a reminder of
+ * the structure of a bulk buffer. Each segment has both an offset and
+ * a length, so "n" ends up as the number of u_int32_t words we (might)
+ * need to shuffle, below.
+ */
+ p = (u_int32_t *)((u_int8_t *)response->data +
+ response->size - sizeof(u_int32_t));
+ for (n = 1; *p != (u_int32_t)-1; p -= 2) {
+ DB_ASSERT(env, p > (u_int32_t *)response->data);
+ p[0] = ntohl(p[0]);
+ p[-1] = ntohl(p[-1]);
+ n += 2;
+ }
+ /*
+ * The bulk pointers appear at the end of the transmitted response, so
+ * unless the buffer happened to be exactly the right size we need to
+ * shuffle them to the end of the buffer.
+ */
+ if (F_ISSET(response, DB_DBT_USERMEM))
+ memmove((u_int8_t *)response->data +
+ response->ulen - n * sizeof(u_int32_t),
+ p, n * sizeof(u_int32_t));
+ else
+ response->ulen = response->size;
+}
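+
+/*
+ * A concrete (made-up) picture of the transformation above: a response
+ * carrying one 100-byte record arrives with the last three u_int32_t
+ * words of the body being, in ascending address order,
+ *
+ *	[... 100 bytes of data ...][-1][length=100][offset=0]
+ *
+ * where the length and offset are in network byte order.  After the
+ * byte-swapping loop, for a DB_DBT_USERMEM buffer with ulen 1000 the
+ * three words are slid to offsets 988..999, where DB_MULTIPLE_NEXT
+ * expects to find them; for an allocated buffer we instead set ulen =
+ * size, so the walking macros see a "full" buffer.
+ */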
+
+/*
+ * Implementation of DB_CHANNEL->send_msg() method for use in recipient's msg
+ * dispatch callback function.
+ *
+ * PUBLIC: int __repmgr_send_response __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_send_response(db_channel, msg, nmsg, flags)
+ DB_CHANNEL *db_channel;
+ DBT *msg;
+ u_int32_t nmsg;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS iovecs, *iovecsp;
+ DBT *dbt;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE], *msg_hdr_buf_p;
+ size_t sz;
+ int alloc, ret;
+
+ COMPQUIET(iovecsp, NULL);
+
+ channel = db_channel->channel;
+ env = channel->env;
+ db_rep = env->rep_handle;
+ conn = channel->c.conn;
+
+ if ((ret = __db_fchk(env,
+ "DB_CHANNEL->send_msg", flags, 0)) != 0)
+ return (ret);
+
+ if (!F_ISSET(channel->meta, REPMGR_REQUEST_MSG_TYPE))
+ return (send_msg_conn(env, conn, msg, nmsg));
+
+ if (channel->responded) {
+ __db_errx(env, DB_STR("3657",
+ "a response has already been sent"));
+ return (EINVAL);
+ }
+
+ alloc = FALSE;
+ if (F_ISSET(channel->meta, REPMGR_MULTI_RESP)) {
+ /*
+ * Originator accepts bulk format: response can be any number of
+ * segments.
+ */
+ if ((ret = __repmgr_build_data_out(env,
+ msg, nmsg, NULL, &iovecsp)) != 0)
+ goto out;
+ alloc = TRUE;
+
+ /*
+ * Set buffer pointer to space we "know" build_data_out reserved
+ * for us.
+ */
+ msg_hdr_buf_p = (u_int8_t *)iovecsp->vectors[0].iov_base;
+ msg_hdr.type = REPMGR_APP_RESPONSE;
+ APP_RESP_TAG(msg_hdr) = channel->meta->tag;
+ APP_RESP_BUFFER_SIZE(msg_hdr) =
+ (u_int32_t)(iovecsp->total_bytes - __REPMGR_MSG_HDR_SIZE);
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf_p);
+ } else if (nmsg > 1) {
+ __db_errx(env, DB_STR("3658",
+ "originator does not accept multi-segment response"));
+ goto small;
+ } else {
+ iovecsp = &iovecs;
+ __repmgr_iovec_init(iovecsp);
+ msg_hdr.type = REPMGR_APP_RESPONSE;
+ APP_RESP_TAG(msg_hdr) = channel->meta->tag;
+ __repmgr_add_buffer(iovecsp,
+ msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ if (nmsg == 0)
+ APP_RESP_BUFFER_SIZE(msg_hdr) = 0;
+ else if ((APP_RESP_BUFFER_SIZE(msg_hdr) = msg->size) > 0)
+ __repmgr_add_dbt(iovecsp, msg);
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf);
+ }
+
+ if (F_ISSET(channel->meta, REPMGR_RESPONSE_LIMIT) &&
+ (APP_RESP_BUFFER_SIZE(msg_hdr) > channel->meta->limit)) {
+ __db_errx(env, DB_STR("3659",
+ "originator's USERMEM buffer too small"));
+small:
+ if (conn == NULL)
+ channel->response.ret = DB_BUFFER_SMALL;
+ else
+ (void)__repmgr_send_err_resp(env,
+ channel, DB_BUFFER_SMALL);
+ ret = EINVAL;
+ } else {
+ if (conn == NULL) {
+ sz = APP_RESP_BUFFER_SIZE(msg_hdr);
+ dbt = &channel->response.dbt;
+ if (F_ISSET(dbt, DB_DBT_MALLOC))
+ ret = __os_umalloc(env, sz, &dbt->data);
+ else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if (dbt->data == NULL || dbt->size < sz)
+ ret = __os_urealloc(env,
+ sz, &dbt->data);
+ else
+ ret = 0;
+ }
+ /* Don't touch the buffer if the allocation above failed. */
+ if (ret != 0)
+ goto out;
+ dbt->size = (u_int32_t)sz;
+ copy_body(dbt->data, iovecsp);
+ channel->response.ret = 0;
+ } else {
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, iovecsp, 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ }
+
+out:
+ if (alloc)
+ __os_free(env, iovecsp);
+
+ /*
+ * Once we've handed the tag back to the originator it becomes
+ * meaningless, so we can't use it again. Note the fact that we've
+ * responded, so that we don't try.
+ */
+ channel->responded = TRUE;
+
+ return (ret);
+}
+
+static int
+__repmgr_build_msg_out(env, msg, nmsg, meta, iovecsp)
+ ENV *env;
+ DBT *msg;
+ u_int32_t nmsg;
+ __repmgr_msg_metadata_args *meta;
+ REPMGR_IOVECS **iovecsp;
+{
+ REPMGR_IOVECS *iovecs;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t *msg_hdr_buf;
+ int ret;
+
+ if ((ret = __repmgr_build_data_out(env, msg, nmsg, meta, &iovecs)) != 0)
+ return (ret);
+
+ /*
+ * The IOVECS holds the entire message to be transmitted, including the
+ * 9-byte header. The header contains the length of the remaining part
+ * of the message. The header buffer area is of course pointed to by
+ * the first of the io vectors.
+ */
+ msg_hdr_buf = (u_int8_t *)iovecs->vectors[0].iov_base;
+ msg_hdr.type = REPMGR_APP_MESSAGE;
+ APP_MSG_BUFFER_SIZE(msg_hdr) =
+ (u_int32_t)(iovecs->total_bytes - __REPMGR_MSG_HDR_SIZE);
+ APP_MSG_SEGMENT_COUNT(msg_hdr) = nmsg;
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf);
+
+ *iovecsp = iovecs;
+ return (0);
+}
+
+/*
+ * Allocate and build most of an outgoing message, leaving it up to the caller
+ * to fill in the header afterwards.
+ */
+static int
+__repmgr_build_data_out(env, msg, nmsg, meta, iovecsp)
+ ENV *env;
+ DBT *msg;
+ u_int32_t nmsg;
+ __repmgr_msg_metadata_args *meta;
+ REPMGR_IOVECS **iovecsp;
+{
+ REPMGR_IOVECS *iovecs;
+ u_int32_t *bulk_base, *bulk_ptr, i, n;
+ u_int8_t *membase, *meta_buf, *msg_hdr_buf, *p, *pad;
+ void *inc_p;
+ size_t align, bulk_area_sz, memsize, segments, sz, offset;
+ int ret;
+
+ COMPQUIET(pad, NULL);
+
+ /*
+ * The actual message as it will be sent on the wire is composed of the
+ * following parts:
+ *
+ * (a) the 9-byte header
+ * (b) for each msg DBT ('nmsg' of them):
+ * (b.1) the data itself, and
+ * (b.2) an alignment pad, if necessary
+ * (c) trailing section for bulk-style pointers (2 words per segment,
+ * plus a -1 end-marker)
+ * (d) message meta-data (optionally)
+ *
+ * Note that nmsg could be 0.
+ */
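+ /*
+ * A layout sketch (illustrative only) for nmsg == 2, where only the
+ * second segment needs padding and meta-data is present:
+ *
+ * [9-byte hdr][seg0][seg1][pad][-1][len1][off1][len0][off0][meta]
+ */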
+
+ /* First, count how many segments need padding. */
+ n = 0;
+ align = sizeof(double);
+ for (i = 0; i < nmsg; i++) {
+ p = msg[i].data;
+ p = &p[msg[i].size];
+ inc_p = ALIGNP_INC(p, align);
+ if ((u_int8_t *)inc_p > p)
+ n++;
+ }
+
+ /*
+ * Here we allocate memory to hold the actual pieces of the message we
+ * will send, plus the iovecs structure that points to those pieces. We
+ * don't include the memory for the user's data (item (b.1) from the
+ * above explanation), since the user is supplying them directly. Also
+ * note that we can reuse just one padding buffer even if we need to
+ * send it (i.e., point to it from an iovec) more than once.
+ *
+ * According to the list of message segments explained above, the total
+ * number of iovec elements we need is (1 + nmsg + n + 1 + f(meta)).
+ */
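+ /*
+ * For example (illustrative only): with nmsg == 2, one segment
+ * needing padding (n == 1) and meta-data present, that is
+ * 1 (hdr) + 2 (data) + 1 (pad) + 1 (bulk ptrs) + 1 (meta) = 6
+ * iovec slots, matching the "segments" computation below.
+ */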
+ segments = nmsg + n + (meta == NULL ? 2 : 3);
+ sz = segments > MIN_IOVEC ? REPMGR_IOVECS_ALLOC_SZ(segments) :
+ sizeof(REPMGR_IOVECS);
+
+ bulk_area_sz = (nmsg * 2 + 1) * sizeof(u_int32_t);
+ memsize = sz + __REPMGR_MSG_HDR_SIZE +
+ bulk_area_sz + (n > 0 ? align : 0) + __REPMGR_MSG_METADATA_SIZE;
+
+ if ((ret = __os_malloc(env, memsize, &membase)) != 0)
+ return (ret);
+ p = membase;
+ iovecs = (REPMGR_IOVECS *)p;
+ p += sz;
+ bulk_base = (u_int32_t *)p;
+ p += bulk_area_sz;
+ if (n > 0) {
+ pad = p;
+ memset(pad, 0, align);
+ p += align;
+ }
+ msg_hdr_buf = p;
+ p += __REPMGR_MSG_HDR_SIZE;
+ meta_buf = p;
+
+ /*
+ * The message header appears first (on the wire), so we have to add its
+ * buffer address to the iovec list first. But we don't actually
+ * compose the content; that's the responsibility of the caller, after
+ * we return.
+ */
+ __repmgr_iovec_init(iovecs);
+ __repmgr_add_buffer(iovecs, msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+
+ offset = 0;
+ bulk_ptr = &bulk_base[2*nmsg + 1]; /* Work backward from the end. */
+ for (i = 0; i < nmsg; i++) {
+ p = msg[i].data;
+ sz = (size_t)msg[i].size;
+
+ /*
+ * Format of bulk pointers is similar to the usage of
+ * DB_MULTIPLE_NEXT, but note that the lengths we pass are of
+ * course for the actual data itself, not including any
+ * padding.
+ */
+ *--bulk_ptr = htonl((u_long)offset);
+ *--bulk_ptr = htonl((u_long)sz);
+
+ __repmgr_add_dbt(iovecs, &msg[i]);
+ offset += sz;
+
+ p = &p[sz];
+ inc_p = ALIGNP_INC(p, align);
+ if ((u_int8_t *)inc_p > p) {
+ DB_ASSERT(env, n > 0);
+ sz = (size_t)((u_int8_t *)inc_p - p);
+ DB_ASSERT(env, sz <= align);
+ __repmgr_add_buffer(iovecs, pad, sz);
+ offset += sz;
+ }
+ }
+ *--bulk_ptr = (u_int32_t)-1;
+ __repmgr_add_buffer(iovecs, bulk_ptr, bulk_area_sz);
+
+ if (meta != NULL) {
+ __repmgr_msg_metadata_marshal(env, meta, meta_buf);
+ __repmgr_add_buffer(iovecs,
+ meta_buf, __REPMGR_MSG_METADATA_SIZE);
+ }
+
+ *iovecsp = iovecs;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_close __P((DB_CHANNEL *, u_int32_t));
+ */
+int
+__repmgr_channel_close(dbchan, flags)
+ DB_CHANNEL *dbchan;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ CHANNEL *channel;
+ u_int32_t i;
+ int ret, t_ret;
+
+ channel = dbchan->channel;
+ env = channel->env;
+ ret = __db_fchk(env, "DB_CHANNEL->close", flags, 0);
+ db_rep = env->rep_handle;
+
+ /*
+ * Disable connection(s) (if not already done due to an error having
+ * occurred previously); release our reference to conn struct(s).
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ if (dbchan->eid >= 0) {
+ conn = channel->c.conn;
+ if (conn->state != CONN_DEFUNCT &&
+ (t_ret = __repmgr_disable_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ } else if (channel->c.conns.cnt > 0) {
+ for (i = 0; i < channel->c.conns.cnt; i++)
+ if ((conn = channel->c.conns.array[i]) != NULL) {
+ if (conn->state != CONN_DEFUNCT &&
+ (t_ret = __repmgr_disable_connection(env,
+ conn)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env,
+ conn)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, channel->c.conns.array);
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if (!IS_VALID_EID(dbchan->eid) && channel->c.conns.mutex != NULL &&
+ (t_ret = __repmgr_destroy_mutex(env,
+ channel->c.conns.mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __repmgr_wake_main_thread(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, channel);
+ __os_free(env, dbchan);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_timeout __P((DB_CHANNEL *, db_timeout_t));
+ */
+int
+__repmgr_channel_timeout(chan, timeout)
+ DB_CHANNEL *chan;
+ db_timeout_t timeout;
+{
+ chan->timeout = timeout;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_request_inval __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+ */
+int
+__repmgr_send_request_inval(dbchan, request, nrequest, response, timeout, flags)
+ DB_CHANNEL *dbchan;
+ DBT *request;
+ u_int32_t nrequest;
+ DBT *response;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(request, NULL);
+ COMPQUIET(nrequest, 0);
+ COMPQUIET(response, NULL);
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (bad_callback_method(dbchan, "send_request"));
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_close_inval __P((DB_CHANNEL *, u_int32_t));
+ */
+int
+__repmgr_channel_close_inval(dbchan, flags)
+ DB_CHANNEL *dbchan;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (bad_callback_method(dbchan, "close"));
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_timeout_inval __P((DB_CHANNEL *, db_timeout_t));
+ */
+int
+__repmgr_channel_timeout_inval(dbchan, timeout)
+ DB_CHANNEL *dbchan;
+ db_timeout_t timeout;
+{
+ COMPQUIET(timeout, 0);
+ return (bad_callback_method(dbchan, "set_timeout"));
+}
+
+static int
+bad_callback_method(chan, method)
+ DB_CHANNEL *chan;
+ const char *method;
+{
+ __db_errx(chan->channel->env, DB_STR_A("3660",
+ "%s() invalid on DB_CHANNEL supplied to msg dispatch function",
+ "%s"), method);
+ return (EINVAL);
+}
+
+static int
+repmgr_only(env, method)
+ ENV *env;
+ const char *method;
+{
+ __db_errx(env, DB_STR_A("3661",
+ "%s: cannot call from base replication application",
+ "%s"), method);
+ return (EINVAL);
+}
+
+/*
+ * Attempts to join the replication group by finding a remote "helper" site
+ * and sending a join request message to it.
+ *
+ * PUBLIC: int __repmgr_join_group __P((ENV *));
+ */
+int
+__repmgr_join_group(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ repmgr_netaddr_t addr;
+ u_int i;
+ int pass, ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Make two passes through the site list. On the first pass, try
+ * joining via an existing, fully "present" site that we've found in the
+ * membership database. If that proves fruitless, on the second pass try
+ * any site marked as a bootstrap helper.
+ *
+ * On the first attempt to join, when we have found no database, the
+ * first pass will produce nothing. On a later attempt to rejoin after
+ * having been removed, it's better to give priority to existing
+ * remaining sites from the database, and only rely on bootstrap helpers
+ * as a last resort.
+ *
+ * pass 0 => present members
+ * pass 1 => helpers
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ for (pass = 0; pass <= 1; pass++) {
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = SITE_FROM_EID(i);
+ if (pass == 0 && site->membership != SITE_PRESENT)
+ continue;
+ if (pass == 1 &&
+ !FLD_ISSET(site->config, DB_BOOTSTRAP_HELPER))
+ continue;
+ addr = site->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if ((ret = join_group_at_site(env,
+ &addr)) == DB_REP_UNAVAIL) {
+ LOCK_MUTEX(db_rep->mutex);
+ continue;
+ }
+ return (ret);
+ }
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (DB_REP_UNAVAIL);
+}
+
+/*
+ * Sends a request message to another site, asking for permission to join the
+ * replication group. Ideally the other site is the master, because only the
+ * master can grant that request. But since we're not currently part of the
+ * group, we generally don't know which site is master. If the target site is
+ * not master, it will respond by telling us who is.
+ */
+static int
+join_group_at_site(env, addrp)
+ ENV *env;
+ repmgr_netaddr_t *addrp;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ SITE_STRING_BUFFER addr_buf;
+ repmgr_netaddr_t addr, myaddr;
+ __repmgr_gm_fwd_args fwd;
+ __repmgr_site_info_args site_info;
+ u_int8_t *p, *response_buf, siteinfo_buf[MAX_MSG_BUF];
+ char host_buf[MAXHOSTNAMELEN + 1], *host;
+ u_int32_t gen, type;
+ size_t len;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ LOCK_MUTEX(db_rep->mutex);
+ myaddr = SITE_FROM_EID(db_rep->self_eid)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ len = strlen(myaddr.host) + 1;
+ DB_INIT_DBT(site_info.host, myaddr.host, len);
+ site_info.port = myaddr.port;
+ site_info.flags = 0;
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
+ DB_ASSERT(env, ret == 0);
+
+ conn = NULL;
+ response_buf = NULL;
+ gen = 0;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "try join request to site %s",
+ __repmgr_format_addr_loc(addrp, addr_buf)));
+retry:
+ if ((ret = make_request_conn(env, addrp, &conn)) != 0)
+ return (ret);
+ if ((ret = __repmgr_send_sync_msg(env, conn,
+ REPMGR_JOIN_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0)
+ goto err;
+
+ if ((ret = read_own_msg(env,
+ conn, &type, &response_buf, &len)) != 0)
+ goto err;
+
+ if (type == REPMGR_GM_FAILURE) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ if (type == REPMGR_GM_FORWARD) {
+ /*
+ * The remote site we thought was master is telling us that some
+ * other site has become master. Retry with the new master.
+ * However, in order to avoid an endless cycle, only continue
+ * retrying as long as the master gen is advancing.
+ */
+ ret = __repmgr_close_connection(env, conn);
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ conn = NULL;
+ if (ret != 0)
+ goto err;
+
+ ret = __repmgr_gm_fwd_unmarshal(env, &fwd,
+ response_buf, len, &p);
+ DB_ASSERT(env, ret == 0);
+ if (fwd.gen > gen) {
+ if (fwd.host.size > MAXHOSTNAMELEN + 1) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ host = fwd.host.data;
+ host[fwd.host.size-1] = '\0'; /* Just to be sure. */
+ (void)strcpy(host_buf, host);
+ addr.host = host_buf;
+ addr.port = fwd.port;
+ addrp = &addr;
+ gen = fwd.gen;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "will retry join request at forwarded master %s, gen %lu",
+ __repmgr_format_addr_loc(addrp, addr_buf),
+ (u_long)gen));
+ __os_free(env, response_buf);
+ response_buf = NULL;
+ goto retry;
+ } else {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ }
+ if (type == REPMGR_JOIN_SUCCESS)
+ ret = __repmgr_refresh_membership(env, response_buf, len);
+ else
+ ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */
+
+err:
+ if (conn != NULL) {
+ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (response_buf != NULL)
+ __os_free(env, response_buf);
+
+ return (ret);
+}
+
+/*
+ * Reads a whole message when we expect to get a REPMGR_OWN_MSG.
+ */
+static int
+read_own_msg(env, conn, typep, bufp, lenp)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int32_t *typep;
+ u_int8_t **bufp;
+ size_t *lenp;
+{
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t *buf;
+ u_int32_t type;
+ size_t size;
+ int ret;
+
+ __repmgr_reset_for_reading(conn);
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto err;
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+
+ if ((conn->msg_type = msg_hdr.type) != REPMGR_OWN_MSG) {
+ ret = DB_REP_UNAVAIL; /* Protocol violation. */
+ goto err;
+ }
+ type = REPMGR_OWN_MSG_TYPE(msg_hdr);
+ if ((size = (size_t)REPMGR_OWN_BUF_SIZE(msg_hdr)) > 0) {
+ conn->reading_phase = DATA_PHASE;
+ __repmgr_iovec_init(&conn->iovecs);
+
+ if ((ret = __os_malloc(env, size, &buf)) != 0)
+ goto err;
+ conn->input.rep_message = NULL;
+
+ __repmgr_add_buffer(&conn->iovecs, buf, size);
+ if ((ret = __repmgr_read_conn(conn)) != 0) {
+ __os_free(env, buf);
+ goto err;
+ }
+ *bufp = buf;
+ }
+
+ *typep = type;
+ *lenp = size;
+
+err:
+ return (ret);
+}
+
+static int
+make_request_conn(env, addr, connp)
+ ENV *env;
+ repmgr_netaddr_t *addr;
+ REPMGR_CONNECTION **connp;
+{
+ DBT vi;
+ __repmgr_msg_hdr_args msg_hdr;
+ __repmgr_version_confirmation_args conf;
+ REPMGR_CONNECTION *conn;
+ int alloc, ret, unused;
+
+ alloc = FALSE;
+ if ((ret = __repmgr_connect(env, addr, &conn, &unused)) != 0)
+ return (ret);
+ conn->type = APP_CONNECTION;
+
+ /* Read a handshake msg to get version confirmation and parameters. */
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto err;
+ /*
+ * We can only get here after having read the full 9 bytes that we
+ * expect, so this can't fail.
+ */
+ DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+ __repmgr_iovec_init(&conn->iovecs);
+ conn->reading_phase = DATA_PHASE;
+
+ if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0)
+ goto err;
+ alloc = TRUE;
+
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto err;
+
+ /*
+ * Analyze the handshake msg, and stash relevant info.
+ */
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ goto err;
+ DB_ASSERT(env, vi.size > 0);
+ if ((ret = __repmgr_version_confirmation_unmarshal(env,
+ &conf, vi.data, vi.size, NULL)) != 0)
+ goto err;
+
+ if (conf.version < GM_MIN_VERSION) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ conn->version = conf.version;
+
+err:
+ if (alloc) {
+ DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0);
+ __os_free(env, conn->input.repmgr_msg.cntrl.data);
+ DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0);
+ __os_free(env, conn->input.repmgr_msg.rec.data);
+ }
+ __repmgr_reset_for_reading(conn);
+ if (ret == 0)
+ *connp = conn;
+ else {
+ (void)__repmgr_close_connection(env, conn);
+ (void)__repmgr_destroy_conn(env, conn);
+ }
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_site __P((DB_ENV *,
+ * PUBLIC: const char *, u_int, DB_SITE **, u_int32_t));
+ */
+int
+__repmgr_site(dbenv, host, port, sitep, flags)
+ DB_ENV *dbenv;
+ const char *host;
+ u_int port;
+ DB_SITE **sitep;
+ u_int32_t flags;
+{
+ int ret;
+
+ if ((ret = __db_fchk(dbenv->env, "repmgr_site", flags, 0)) == 0)
+ ret = site_by_addr(dbenv->env, host, port, sitep);
+
+ return (ret);
+}
+
+static int
+site_by_addr(env, host, port, sitep)
+ ENV *env;
+ const char *host;
+ u_int port;
+ DB_SITE **sitep;
+{
+ DB_THREAD_INFO *ip;
+ DB_REP *db_rep;
+ DB_SITE *dbsite;
+ REPMGR_SITE *site;
+ int eid, locked, ret;
+
+ COMPQUIET(ip, NULL);
+ PANIC_CHECK(env);
+ db_rep = env->rep_handle;
+ ENV_NOT_CONFIGURED(env, db_rep->region, "repmgr_site", DB_INIT_REP);
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_site"));
+ if ((ret = addr_chk(env, host, port)) != 0)
+ return (ret);
+
+ if (REP_ON(env)) {
+ LOCK_MUTEX(db_rep->mutex);
+ ENV_ENTER(env, ip);
+ locked = TRUE;
+ } else
+ locked = FALSE;
+ ret = __repmgr_find_site(env, host, port, &eid);
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ /*
+ * Point to the stable, permanent copy of the host name. That's the one
+ * we want the DB_SITE handle to point to, just as site_by_eid() does.
+ */
+ host = site->net_addr.host;
+ if (locked) {
+ ENV_LEAVE(env, ip);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ if (ret != 0)
+ return (ret);
+
+ if ((ret = init_dbsite(env, eid, host, port, &dbsite)) != 0)
+ return (ret);
+
+ /* Manipulating a site makes this a replication manager application. */
+ APP_SET_REPMGR(env);
+ *sitep = dbsite;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+ */
+int
+__repmgr_site_by_eid(dbenv, eid, sitep)
+ DB_ENV *dbenv;
+ int eid;
+ DB_SITE **sitep;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ DB_SITE *dbsite;
+ int ret;
+
+ env = dbenv->env;
+ PANIC_CHECK(env);
+ db_rep = env->rep_handle;
+
+ if (eid < 0 || eid >= (int)db_rep->site_cnt)
+ return (DB_NOTFOUND);
+ site = SITE_FROM_EID(eid);
+
+ if ((ret = init_dbsite(env, eid,
+ site->net_addr.host, site->net_addr.port, &dbsite)) != 0)
+ return (ret);
+ *sitep = dbsite;
+ return (0);
+}
+
+static int
+init_dbsite(env, eid, host, port, sitep)
+ ENV *env;
+ int eid;
+ const char *host;
+ u_int port;
+ DB_SITE **sitep;
+{
+ DB_SITE *dbsite;
+ int ret;
+
+ if ((ret = __os_calloc(env, 1, sizeof(DB_SITE), &dbsite)) != 0)
+ return (ret);
+
+ dbsite->env = env;
+ dbsite->eid = eid;
+ dbsite->host = host;
+ dbsite->port = port;
+ dbsite->flags = (REP_ON(env) ? 0 : DB_SITE_PREOPEN);
+
+ dbsite->get_address = __repmgr_get_site_address;
+ dbsite->get_config = __repmgr_get_config;
+ dbsite->get_eid = __repmgr_get_eid;
+ dbsite->set_config = __repmgr_site_config;
+ dbsite->remove = __repmgr_remove_site_pp;
+ dbsite->close = __repmgr_site_close;
+
+ *sitep = dbsite;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_site_address __P((DB_SITE *,
+ * PUBLIC: const char **, u_int *));
+ */
+int
+__repmgr_get_site_address(dbsite, hostp, port)
+ DB_SITE *dbsite;
+ const char **hostp;
+ u_int *port;
+{
+ if (hostp != NULL)
+ *hostp = dbsite->host;
+ if (port != NULL)
+ *port = dbsite->port;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_eid __P((DB_SITE *, int *));
+ */
+int
+__repmgr_get_eid(dbsite, eidp)
+ DB_SITE *dbsite;
+ int *eidp;
+{
+ int ret;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+
+ if (F_ISSET(dbsite, DB_SITE_PREOPEN)) {
+ __db_errx(dbsite->env, DB_STR("3662",
+ "Can't determine EID before env open"));
+ return (EINVAL);
+ }
+ *eidp = dbsite->eid;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_config __P((DB_SITE *, u_int32_t, u_int32_t *));
+ */
+int
+__repmgr_get_config(dbsite, which, valuep)
+ DB_SITE *dbsite;
+ u_int32_t which;
+ u_int32_t *valuep;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGINFO *infop;
+ REP *rep;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ int ret;
+
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ site = SITE_FROM_EID(dbsite->eid);
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ sites = R_ADDR(infop, rep->siteinfo_off);
+
+ site->config = sites[dbsite->eid].config;
+
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ }
+ *valuep = FLD_ISSET(site->config, which) ? 1 : 0;
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_site_config(dbsite, which, value)
+ DB_SITE *dbsite;
+ u_int32_t which;
+ u_int32_t value;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGINFO *infop;
+ REP *rep;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ int ret;
+
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+ switch (which) {
+ case DB_BOOTSTRAP_HELPER:
+ case DB_REPMGR_PEER:
+ if (dbsite->eid == db_rep->self_eid) {
+ __db_errx(env, DB_STR("3663",
+ "Site config value not applicable to local site"));
+ return (EINVAL);
+ }
+ break;
+ case DB_GROUP_CREATOR:
+ /*
+ * Ignore if this is set on a remote site. Users will often
+ * copy and edit a DB_CONFIG for all sites.
+ */
+ break;
+ case DB_LEGACY:
+ /* Applicable to either local or remote site. */
+ break;
+ case DB_LOCAL_SITE:
+ /*
+ * This special case needs extra processing, to set the
+ * "self_eid" index in addition to the flag bit.
+ */
+ if ((ret = set_local_site(dbsite, value)) != 0)
+ return (ret);
+ break;
+ default:
+ __db_errx(env,
+ DB_STR("3665", "Unrecognized site config value"));
+ return (EINVAL);
+ }
+
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ LOCK_MUTEX(db_rep->mutex);
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ sites = R_ADDR(infop, rep->siteinfo_off);
+ site = SITE_FROM_EID(dbsite->eid);
+
+ /*
+ * Make sure we're up to date with the shared-memory version. After
+ * env open, we never set private without also updating shared.
+ * But another process could have set the shared one, so shared
+ * is always "best."
+ */
+ site->config = sites[dbsite->eid].config;
+ if (value)
+ FLD_SET(site->config, which);
+ else
+ FLD_CLR(site->config, which);
+ if (site->config != sites[dbsite->eid].config) {
+ sites[dbsite->eid].config = site->config;
+ rep->siteinfo_seq++;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else {
+ site = SITE_FROM_EID(dbsite->eid);
+ if (value)
+ FLD_SET(site->config, which);
+ else
+ FLD_CLR(site->config, which);
+ }
+ return (0);
+}
+
+static int
+set_local_site(dbsite, value)
+ DB_SITE *dbsite;
+ u_int32_t value;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ REPMGR_SITE *site;
+ int locked, ret;
+
+ COMPQUIET(rep, NULL);
+ COMPQUIET(ip, NULL);
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ locked = FALSE;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ locked = TRUE;
+ /* Make sure we're in sync first. */
+ if (IS_VALID_EID(rep->self_eid))
+ db_rep->self_eid = rep->self_eid;
+ }
+ if (!value && db_rep->self_eid == dbsite->eid) {
+ __db_errx(env, DB_STR("3666",
+ "A previously given local site may not be unset"));
+ ret = EINVAL;
+ } else if (IS_VALID_EID(db_rep->self_eid) &&
+ db_rep->self_eid != dbsite->eid) {
+ __db_errx(env, DB_STR("3667",
+ "A (different) local site has already been set"));
+ ret = EINVAL;
+ } else {
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ site = SITE_FROM_EID(dbsite->eid);
+ if (FLD_ISSET(site->config,
+ DB_BOOTSTRAP_HELPER | DB_REPMGR_PEER)) {
+ __db_errx(env, DB_STR("3668",
+ "Local site cannot have HELPER or PEER attributes"));
+ ret = EINVAL;
+ }
+ }
+ if (ret == 0) {
+ db_rep->self_eid = dbsite->eid;
+ if (locked) {
+ rep->self_eid = dbsite->eid;
+ rep->siteinfo_seq++;
+ }
+ }
+ if (locked) {
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ return (ret);
+}
+
+/*
+ * Brings the dbsite's EID up to date, in case it got shuffled around across an
+ * env open.
+ */
+static int
+refresh_site(dbsite)
+ DB_SITE *dbsite;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REPMGR_SITE *site;
+
+ env = dbsite->env;
+ PANIC_CHECK(env);
+ if (F_ISSET(dbsite, DB_SITE_PREOPEN) && REP_ON(env)) {
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ site = __repmgr_lookup_site(env, dbsite->host, dbsite->port);
+ DB_ASSERT(env, site != NULL);
+ dbsite->eid = EID_FROM_SITE(site);
+ F_CLR(dbsite, DB_SITE_PREOPEN);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ return (0);
+}
+
+static int
+__repmgr_remove_site_pp(dbsite)
+ DB_SITE *dbsite;
+{
+ int ret, t_ret;
+
+ ret = __repmgr_remove_site(dbsite);
+ /*
+ * The remove() method is documented as a destructor, which means that
+ * absolutely all calls must deallocate the handle, including error
+ * cases, even mutex failures.
+ */
+ if ((t_ret = __repmgr_site_close(dbsite)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__repmgr_remove_site(dbsite)
+ DB_SITE *dbsite;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ repmgr_netaddr_t addr;
+ __repmgr_site_info_args site_info;
+ u_int8_t *response_buf, siteinfo_buf[MAX_MSG_BUF];
+ size_t len;
+ u_int32_t type;
+ int master, ret, t_ret;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (db_rep->repmgr_status != running || !SELECTOR_RUNNING(db_rep)) {
+ __db_errx(env, DB_STR("3669", "repmgr is not running"));
+ return (EINVAL);
+ }
+
+ if (!IS_VALID_EID((master = rep->master_id)))
+ return (DB_REP_UNAVAIL);
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_VALID_EID(master));
+ addr = SITE_FROM_EID(master)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ len = strlen(dbsite->host) + 1;
+ DB_INIT_DBT(site_info.host, dbsite->host, len);
+ site_info.port = dbsite->port;
+ site_info.flags = 0;
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
+ DB_ASSERT(env, ret == 0);
+
+ conn = NULL;
+ response_buf = NULL;
+ if ((ret = make_request_conn(env, &addr, &conn)) != 0)
+ return (ret);
+ if ((ret = __repmgr_send_sync_msg(env, conn,
+ REPMGR_REMOVE_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0)
+ goto err;
+ if ((ret = read_own_msg(env,
+ conn, &type, &response_buf, &len)) != 0)
+ goto err;
+ ret = type == REPMGR_REMOVE_SUCCESS ? 0 : DB_REP_UNAVAIL;
+err:
+ if (conn != NULL) {
+ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (response_buf != NULL)
+ __os_free(env, response_buf);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_close __P((DB_SITE *));
+ */
+int
+__repmgr_site_close(dbsite)
+ DB_SITE *dbsite;
+{
+ __os_free(dbsite->env, dbsite);
+ return (0);
+}
diff --git a/src/repmgr/repmgr_msg.c b/src/repmgr/repmgr_msg.c
new file mode 100644
index 00000000..13537823
--- /dev/null
+++ b/src/repmgr/repmgr_msg.c
@@ -0,0 +1,1655 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+static int dispatch_app_message __P((ENV *, REPMGR_MESSAGE *));
+static int finish_gmdb_update __P((ENV *,
+ DB_THREAD_INFO *, DBT *, u_int32_t, u_int32_t, __repmgr_member_args *));
+static int incr_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *));
+static void marshal_site_data __P((ENV *, u_int32_t, u_int8_t *, DBT *));
+static void marshal_site_key __P((ENV *,
+ repmgr_netaddr_t *, u_int8_t *, DBT *, __repmgr_member_args *));
+static int message_loop __P((ENV *, REPMGR_RUNNABLE *));
+static int process_message __P((ENV *, DBT *, DBT *, int));
+static int reject_fwd __P((ENV *, REPMGR_CONNECTION *));
+static int rescind_pending __P((ENV *,
+ DB_THREAD_INFO *, int, u_int32_t, u_int32_t));
+static int resolve_limbo_int __P((ENV *, DB_THREAD_INFO *));
+static int resolve_limbo_wrapper __P((ENV *, DB_THREAD_INFO *));
+static int send_permlsn __P((ENV *, u_int32_t, DB_LSN *));
+static int send_permlsn_conn __P((ENV *,
+ REPMGR_CONNECTION *, u_int32_t, DB_LSN *));
+static int serve_join_request __P((ENV *,
+ DB_THREAD_INFO *, REPMGR_MESSAGE *));
+static int serve_remove_request __P((ENV *,
+ DB_THREAD_INFO *, REPMGR_MESSAGE *));
+static int serve_repmgr_request __P((ENV *, REPMGR_MESSAGE *));
+
+/*
+ * Map one of the phase-1/provisional membership status values to its
+ * corresponding ultimate goal status: if "adding", the goal is to be fully
+ * "present". Otherwise ("deleting") the goal is to not even appear in the
+ * database at all (0).
+ */
+#define NEXT_STATUS(s) (u_int32_t)((s) == SITE_ADDING ? SITE_PRESENT : 0)
+
+/*
+ * PUBLIC: void *__repmgr_msg_thread __P((void *));
+ */
+void *
+__repmgr_msg_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *th;
+ ENV *env;
+ int ret;
+
+ th = argsp;
+ env = th->env;
+
+ if ((ret = message_loop(env, th)) != 0) {
+ __db_err(env, ret, "message thread failed");
+ (void)__repmgr_thread_failure(env, ret);
+ }
+ return (NULL);
+}
+
+static int
+message_loop(env, th)
+ ENV *env;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_MESSAGE *msg;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ __repmgr_permlsn_args permlsn;
+ int incremented, ret, t_ret;
+ u_int32_t membership;
+
+ COMPQUIET(membership, 0);
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ while ((ret = __repmgr_queue_get(env, &msg, th)) == 0) {
+ incremented = FALSE;
+ if (IS_DEFERRABLE(msg->msg_hdr.type)) {
+ /*
+ * Count threads currently processing channel requests
+ * or GMDB operations, so that we can limit the number
+ * of them, in order to avoid starving more important
+ * rep messages.
+ */
+ db_rep->non_rep_th++;
+ incremented = TRUE;
+ }
+ if (msg->msg_hdr.type == REPMGR_REP_MESSAGE) {
+ DB_ASSERT(env,
+ IS_VALID_EID(msg->v.repmsg.originating_eid));
+ site = SITE_FROM_EID(msg->v.repmsg.originating_eid);
+ membership = site->membership;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ switch (msg->msg_hdr.type) {
+ case REPMGR_REP_MESSAGE:
+ if (membership != SITE_PRESENT)
+ break;
+ while ((ret = process_message(env,
+ &msg->v.repmsg.control, &msg->v.repmsg.rec,
+ msg->v.repmsg.originating_eid)) == DB_LOCK_DEADLOCK)
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "repmgr deadlock retry"));
+ break;
+ case REPMGR_APP_MESSAGE:
+ ret = dispatch_app_message(env, msg);
+ conn = msg->v.appmsg.conn;
+ if (conn != NULL) {
+ LOCK_MUTEX(db_rep->mutex);
+ t_ret = __repmgr_decr_conn_ref(env, conn);
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+ break;
+ case REPMGR_OWN_MSG:
+ ret = serve_repmgr_request(env, msg);
+ break;
+ case REPMGR_HEARTBEAT:
+ if ((ret = __repmgr_permlsn_unmarshal(env,
+ &permlsn, msg->v.repmsg.control.data,
+ msg->v.repmsg.control.size, NULL)) != 0)
+ ret = DB_REP_UNAVAIL;
+ else if (rep->master_id == db_rep->self_eid) {
+ /*
+ * If a master receives a heartbeat, there
+ * may be a dupmaster. Resend latest log
+ * message to prompt base replication to
+ * detect it without the need for application
+ * activity.
+ */
+ ret = __rep_flush(env->dbenv);
+ } else {
+ /*
+ * Use heartbeat message to initiate rerequest
+ * processing.
+ */
+ ret = __rep_check_missing(env,
+ permlsn.generation, &permlsn.lsn);
+ }
+ break;
+ default:
+ ret = __db_unknown_path(env, "message loop");
+ break;
+ }
+
+ __os_free(env, msg);
+ LOCK_MUTEX(db_rep->mutex);
+ if (incremented)
+ db_rep->non_rep_th--;
+ if (ret != 0)
+ goto out;
+ }
+ /*
+ * A return of DB_REP_UNAVAIL from __repmgr_queue_get() merely means we
+ * should finish gracefully.
+ */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+static int
+dispatch_app_message(env, msg)
+ ENV *env;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+ DB_CHANNEL db_channel;
+ CHANNEL channel;
+ __repmgr_msg_metadata_args meta;
+ DBT *dbt, *segment;
+ u_int32_t flags, i, size, *uiptr;
+ u_int8_t *data;
+ void *ptr;
+ int ret;
+
+ COMPQUIET(size, 0);
+
+ db_rep = env->rep_handle;
+
+ db_channel.channel = &channel;
+ db_channel.send_msg = __repmgr_send_response;
+
+ /* Supply stub functions for methods inapplicable in a msg dispatch func. */
+ db_channel.close = __repmgr_channel_close_inval;
+ db_channel.send_request = __repmgr_send_request_inval;
+ db_channel.set_timeout = __repmgr_channel_timeout_inval;
+
+ channel.msg = msg;
+ channel.env = env;
+ channel.c.conn = msg->v.appmsg.conn;
+ channel.responded = FALSE;
+ channel.meta = &meta;
+
+ /*
+ * The user data is in a form similar to that of a bulk buffer.
+ * However, there's also our meta-data tacked on to the end of it.
+ * Fortunately, the meta-data is fixed length, so it's easy to peel it
+ * off.
+ *
+ * The user data "bulk buffer" lacks the usual "-1" end-marker. But
+ * that's OK, because we already know how many segments there are (from
+ * the message header). Convert this information into the DBT array
+ * that we will pass to the user's function.
+ *
+ * (See the definition of DB_MULTIPLE_INIT for a reminder of the format
+ * of a bulk buffer.)
+ */
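+ /*
+ * A sketch (illustrative only) of the received buffer for a
+ * two-segment message:
+ *
+ * [seg0][seg1]...[len1][off1][len0][off0][meta-data]
+ *
+ * Peeling the fixed-size meta-data off the end leaves the
+ * (offset, length) word pairs to be read back-to-front, one
+ * pair per segment.
+ */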
+ dbt = &msg->v.appmsg.buf;
+ data = dbt->data;
+ dbt->size -= __REPMGR_MSG_METADATA_SIZE;
+ ret = __repmgr_msg_metadata_unmarshal(env,
+ &meta, &data[dbt->size], __REPMGR_MSG_METADATA_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+
+ dbt->ulen = dbt->size;
+ DB_MULTIPLE_INIT(ptr, dbt);
+ for (i = 0; i < APP_MSG_SEGMENT_COUNT(msg->msg_hdr); i++) {
+ segment = &msg->v.appmsg.segments[i];
+ uiptr = ptr;
+ *uiptr = ntohl(*uiptr);
+ uiptr[-1] = ntohl(uiptr[-1]);
+ DB_MULTIPLE_NEXT(ptr, dbt, data, size);
+ DB_ASSERT(env, data != NULL);
+ DB_INIT_DBT(*segment, data, size);
+ }
+
+ flags = F_ISSET(&meta, REPMGR_REQUEST_MSG_TYPE) ?
+ DB_REPMGR_NEED_RESPONSE : 0;
+
+ if (db_rep->msg_dispatch == NULL) {
+ __db_errx(env, DB_STR("3670",
+ "No message dispatch call-back function has been configured"));
+ if (F_ISSET(channel.meta, REPMGR_REQUEST_MSG_TYPE))
+ return (__repmgr_send_err_resp(env,
+ &channel, DB_NOSERVER));
+ else
+ return (0);
+ }
+
+ (*db_rep->msg_dispatch)(env->dbenv,
+ &db_channel, &msg->v.appmsg.segments[0],
+ APP_MSG_SEGMENT_COUNT(msg->msg_hdr), flags);
+
+ if (F_ISSET(channel.meta, REPMGR_REQUEST_MSG_TYPE) &&
+ !channel.responded) {
+ __db_errx(env, DB_STR("3671",
+ "Application failed to provide a response"));
+ return (__repmgr_send_err_resp(env, &channel, DB_KEYEMPTY));
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_err_resp __P((ENV *, CHANNEL *, int));
+ */
+int
+__repmgr_send_err_resp(env, channel, err)
+ ENV *env;
+ CHANNEL *channel;
+ int err;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS iovecs;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ int ret;
+
+ db_rep = env->rep_handle;
+ msg_hdr.type = REPMGR_RESP_ERROR;
+
+ /* Make it non-negative, so we can send it on the wire without worry. */
+ DB_ASSERT(env, err < 0);
+ RESP_ERROR_CODE(msg_hdr) = (u_int32_t)(-err);
+
+ RESP_ERROR_TAG(msg_hdr) = channel->meta->tag;
+
+ __repmgr_iovec_init(&iovecs);
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf);
+ __repmgr_add_buffer(&iovecs, msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+
+ conn = channel->c.conn;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, &iovecs, 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ return (ret);
+}
+
+static int
+process_message(env, control, rec, eid)
+ ENV *env;
+ DBT *control, *rec;
+ int eid;
+{
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ REP *rep;
+ int dirty, ret, t_ret;
+ u_int32_t generation;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Save initial generation number, in case it changes in a close race
+ * with a NEWMASTER.
+ */
+ generation = rep->gen;
+
+ ret = 0;
+ switch (t_ret =
+ __rep_process_message_int(env, control, rec, eid, &lsn)) {
+ case 0:
+ if (db_rep->takeover_pending)
+ ret = __repmgr_claim_victory(env);
+ break;
+
+ case DB_REP_HOLDELECTION:
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_init_election(env,
+ ELECT_F_IMMED | ELECT_F_INVITEE);
+ UNLOCK_MUTEX(db_rep->mutex);
+ break;
+
+ case DB_REP_DUPMASTER:
+ /*
+ * Initiate an election if we're configured to be using
+ * elections, but only if we're *NOT* using leases. When using
+ * leases, there is never any uncertainty over which site is the
+ * rightful master, and only the loser gets the DUPMASTER return
+ * code.
+ */
+ if ((ret = __repmgr_become_client(env)) == 0 &&
+ FLD_ISSET(rep->config, REP_C_LEASE | REP_C_ELECTIONS)
+ == REP_C_ELECTIONS) {
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_init_election(env, ELECT_F_IMMED);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ DB_EVENT(env, DB_EVENT_REP_DUPMASTER, NULL);
+ break;
+
+ case DB_REP_ISPERM:
+#ifdef CONFIG_TEST
+ if (env->test_abort == DB_TEST_REPMGR_PERM)
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ISPERM: Test hook. Skip ACK for permlsn [%lu][%lu]",
+ (u_long)lsn.file, (u_long)lsn.offset));
+#endif
+ DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_PERM);
+ ret = send_permlsn(env, generation, &lsn);
+DB_TEST_RECOVERY_LABEL
+ break;
+
+ case DB_LOCK_DEADLOCK:
+ case DB_REP_IGNORE:
+ case DB_REP_NEWSITE:
+ case DB_REP_NOTPERM:
+ break;
+
+ case DB_REP_JOIN_FAILURE:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "repmgr fires join failure event"));
+ DB_EVENT(env, DB_EVENT_REP_JOIN_FAILURE, NULL);
+ break;
+
+ case DB_REP_WOULDROLLBACK:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "repmgr fires would-rollback event"));
+ DB_EVENT(env, DB_EVENT_REP_WOULD_ROLLBACK, &lsn);
+ break;
+
+ default:
+ __db_err(env, t_ret, "DB_ENV->rep_process_message");
+ ret = t_ret;
+ }
+
+ if (ret != 0)
+ goto err;
+ LOCK_MUTEX(db_rep->mutex);
+ dirty = db_rep->gmdb_dirty;
+ db_rep->gmdb_dirty = FALSE;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (dirty) {
+ if ((ret = __op_rep_enter(env, FALSE, FALSE)) != 0)
+ goto err;
+ ret = __repmgr_reload_gmdb(env);
+ t_ret = __op_rep_exit(env);
+ if (ret == ENOENT)
+ ret = 0;
+ else if (ret == DB_DELETED)
+ ret = __repmgr_bow_out(env);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+err:
+ return (ret);
+}
+
+/*
+ * Handle replication-related events. Returns only 0 or DB_EVENT_NOT_HANDLED;
+ * no other error returns are tolerated.
+ *
+ * PUBLIC: int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+ */
+int
+__repmgr_handle_event(env, event, info)
+ ENV *env;
+ u_int32_t event;
+ void *info;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+
+ if (db_rep->selector == NULL) {
+ /* Repmgr is not in use, so all events go to application. */
+ return (DB_EVENT_NOT_HANDLED);
+ }
+
+ switch (event) {
+ case DB_EVENT_REP_ELECTED:
+ DB_ASSERT(env, info == NULL);
+ db_rep->takeover_pending = TRUE;
+
+ /*
+ * The application doesn't really need to see this, because the
+ * purpose of this event is to tell the winning site that it
+ * should call rep_start(MASTER), and in repmgr we do that
+ * automatically. Still, they could conceivably be curious, and
+ * it doesn't hurt anything to let them know.
+ */
+ break;
+ case DB_EVENT_REP_INIT_DONE:
+ db_rep->gmdb_dirty = TRUE;
+ break;
+ case DB_EVENT_REP_NEWMASTER:
+ DB_ASSERT(env, info != NULL);
+
+ /* Application still needs to see this. */
+ break;
+ default:
+ break;
+ }
+ return (DB_EVENT_NOT_HANDLED);
+}
+
+static int
+send_permlsn(env, generation, lsn)
+ ENV *env;
+ u_int32_t generation;
+ DB_LSN *lsn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ int ack, bcast, eid, master, policy, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+ master = rep->master_id;
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * If the file number has changed, send it to everyone, regardless of
+ * anything else. Otherwise, send it to the master if we know a master,
+ * and that master's ack policy requires it.
+ */
+ bcast = FALSE;
+ if (LOG_COMPARE(lsn, &db_rep->perm_lsn) > 0) {
+ if (lsn->file > db_rep->perm_lsn.file) {
+ bcast = TRUE;
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "send_permlsn: broadcast [%lu][%lu]",
+ (u_long)lsn->file, (u_long)lsn->offset));
+ }
+ db_rep->perm_lsn = *lsn;
+ }
+ if (IS_KNOWN_REMOTE_SITE(master)) {
+ site = SITE_FROM_EID(master);
+ /*
+ * Use master's ack policy if we know it; use our own if the
+ * master is too old (down-rev) to have told us its policy.
+ */
+ policy = site->ack_policy > 0 ?
+ site->ack_policy : rep->perm_policy;
+ if (policy == DB_REPMGR_ACKS_NONE ||
+ (IS_PEER_POLICY(policy) && rep->priority == 0))
+ ack = FALSE;
+ else
+ ack = TRUE;
+ } else {
+ site = NULL;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "dropping ack with no known master"));
+ ack = FALSE;
+ }
+
+ /*
+ * Send to master first, since we need to send to all its connections.
+ */
+ if (site != NULL && (bcast || ack)) {
+ if (site->state == SITE_CONNECTED) {
+ if ((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ if ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ }
+ TAILQ_FOREACH(conn, &site->sub_conns, entries) {
+ if ((ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ }
+ }
+ if (bcast) {
+ /*
+ * Send to everyone except the master (since we've already done
+ * that, above).
+ */
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ if (eid == master)
+ continue;
+ site = SITE_FROM_EID(eid);
+ /*
+ * Send the ack out on primary connections only.
+ */
+ if (site->state == SITE_CONNECTED) {
+ if ((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ if ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ }
+ }
+ }
+
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Sends a perm LSN message on one connection, if it needs it.
+ *
+ * !!! Called with mutex held.
+ */
+static int
+send_permlsn_conn(env, conn, generation, lsn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int32_t generation;
+ DB_LSN *lsn;
+{
+ DBT control2, rec2;
+ __repmgr_permlsn_args permlsn;
+ u_int8_t buf[__REPMGR_PERMLSN_SIZE];
+ int ret;
+
+ ret = 0;
+
+ if (conn->state == CONN_READY) {
+ DB_ASSERT(env, conn->version > 0);
+ permlsn.generation = generation;
+ memcpy(&permlsn.lsn, lsn, sizeof(DB_LSN));
+ if (conn->version == 1) {
+ control2.data = &permlsn;
+ control2.size = sizeof(permlsn);
+ } else {
+ __repmgr_permlsn_marshal(env, &permlsn, buf);
+ control2.data = buf;
+ control2.size = __REPMGR_PERMLSN_SIZE;
+ }
+ rec2.size = 0;
+ /*
+ * It's hard to imagine anyone would care about a lost ack if
+ * the path to the master is so congested as to need blocking,
+ * so pass the "maxblock" argument as 0.
+ */
+ if ((ret = __repmgr_send_one(env, conn, REPMGR_PERMLSN,
+ &control2, &rec2, 0)) == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ }
+ return (ret);
+}
+
+static int
+serve_repmgr_request(env, msg)
+ ENV *env;
+ REPMGR_MESSAGE *msg;
+{
+ DB_THREAD_INFO *ip;
+ DBT *dbt;
+ REPMGR_CONNECTION *conn;
+ int ret, t_ret;
+
+ ENV_ENTER(env, ip);
+ switch (REPMGR_OWN_MSG_TYPE(msg->msg_hdr)) {
+ case REPMGR_JOIN_REQUEST:
+ ret = serve_join_request(env, ip, msg);
+ break;
+ case REPMGR_REJOIN:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "One try at rejoining group automatically"));
+ if ((ret = __repmgr_join_group(env)) == DB_REP_UNAVAIL)
+ ret = __repmgr_bow_out(env);
+ break;
+ case REPMGR_REMOVE_REQUEST:
+ ret = serve_remove_request(env, ip, msg);
+ break;
+ case REPMGR_RESOLVE_LIMBO:
+ ret = resolve_limbo_wrapper(env, ip);
+ break;
+ case REPMGR_SHARING:
+ dbt = &msg->v.gmdb_msg.request;
+ ret = __repmgr_refresh_membership(env, dbt->data, dbt->size);
+ break;
+ default:
+ ret = __db_unknown_path(env, "serve_repmgr_request");
+ break;
+ }
+ if ((conn = msg->v.gmdb_msg.conn) != NULL) {
+ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Attempts to fulfill a remote site's request to join the replication group.
+ * Only the master can grant this request, so if we receive it while we're not
+ * the master, we send an appropriate failure message instead.
+ */
+static int
+serve_join_request(env, ip, msg)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ DBT *dbt;
+ __repmgr_site_info_args site_info;
+ u_int8_t *buf;
+ char *host;
+ size_t len;
+ u_int32_t status;
+ int eid, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ COMPQUIET(status, 0);
+
+ conn = msg->v.gmdb_msg.conn;
+ dbt = &msg->v.gmdb_msg.request;
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, dbt->data, dbt->size, NULL);
+
+ host = site_info.host.data;
+ host[site_info.host.size - 1] = '\0';
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Request to join group from %s:%u", host, (u_int)site_info.port));
+
+ if ((ret = __repmgr_hold_master_role(env, conn)) == DB_REP_UNAVAIL)
+ return (0);
+ if (ret != 0)
+ return (ret);
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_find_site(env, host, site_info.port, &eid)) == 0) {
+ DB_ASSERT(env, eid != db_rep->self_eid);
+ status = SITE_FROM_EID(eid)->membership;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (ret != 0)
+ goto err;
+
+ switch (status) {
+ case 0:
+ case SITE_ADDING:
+ ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING);
+ break;
+ case SITE_PRESENT:
+ /* Already in desired state. */
+ break;
+ case SITE_DELETING:
+ ret = rescind_pending(env,
+ ip, eid, SITE_DELETING, SITE_PRESENT);
+ break;
+ default:
+ ret = __db_unknown_path(env, "serve_join_request");
+ break;
+ }
+ if (ret != 0)
+ goto err;
+
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_marshal_member_list(env, &buf, &len);
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (ret != 0)
+ goto err;
+ ret = __repmgr_send_sync_msg(env, conn, REPMGR_JOIN_SUCCESS,
+ buf, (u_int32_t)len);
+ __os_free(env, buf);
+
+err:
+
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_send_sync_msg(env, conn,
+ REPMGR_GM_FAILURE, NULL, 0);
+
+ return (ret);
+}
+
+static int
+serve_remove_request(env, ip, msg)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ DBT *dbt;
+ __repmgr_site_info_args site_info;
+ char *host;
+ u_int32_t status, type;
+ int eid, ret, t_ret;
+
+ COMPQUIET(status, 0);
+ db_rep = env->rep_handle;
+
+ conn = msg->v.gmdb_msg.conn;
+ dbt = &msg->v.gmdb_msg.request;
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, dbt->data, dbt->size, NULL);
+
+ host = site_info.host.data;
+ host[site_info.host.size - 1] = '\0';
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Request to remove %s:%u from group", host, (u_int)site_info.port));
+
+ if ((ret = __repmgr_hold_master_role(env, conn)) == DB_REP_UNAVAIL)
+ return (0);
+ if (ret != 0)
+ return (ret);
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((site = __repmgr_lookup_site(env, host, site_info.port)) == NULL)
+ eid = DB_EID_INVALID;
+ else {
+ eid = EID_FROM_SITE(site);
+ status = site->membership;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (eid == DB_EID_INVALID) {
+ /* Doesn't exist: already been removed. */
+ ret = 0;
+ goto err;
+ } else if (eid == db_rep->self_eid) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Reject request to remove current master"));
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+
+ switch (status) {
+ case 0:
+ /* Already in desired state. */
+ break;
+ case SITE_ADDING:
+ ret = rescind_pending(env, ip, eid, SITE_ADDING, 0);
+ break;
+ case SITE_PRESENT:
+ case SITE_DELETING:
+ ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING);
+ break;
+ default:
+ ret = __db_unknown_path(env, "serve_remove_request");
+ break;
+ }
+err:
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+ switch (ret) {
+ case 0:
+ type = REPMGR_REMOVE_SUCCESS;
+ break;
+ case DB_REP_UNAVAIL:
+ type = REPMGR_GM_FAILURE;
+ break;
+ default:
+ return (ret);
+ }
+ return (__repmgr_send_sync_msg(env, conn, type, NULL, 0));
+}
+
+/*
+ * Runs a limbo resolution on a message processing thread, upon request from the
+ * send() function when it notices that a user transaction has gotten a perm
+ * success. (It wouldn't work for the user thread to do it in-line.)
+ */
+static int
+resolve_limbo_wrapper(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ int do_close, ret, t_ret;
+
+ if ((ret = __repmgr_hold_master_role(env, NULL)) == DB_REP_UNAVAIL)
+ return (0);
+ if (ret != 0)
+ return (ret);
+retry:
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) != 0)
+ goto rlse;
+
+ /*
+ * A limbo resolution request is merely a "best effort" attempt to
+ * shorten the duration of a pending change. So if it fails for lack of
+ * acks again, no one really cares.
+ */
+ if ((ret = resolve_limbo_int(env, ip)) == DB_REP_UNAVAIL) {
+ do_close = FALSE;
+ ret = 0;
+ } else
+ do_close = TRUE;
+
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+rlse:
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * Checks for the need to resolve limbo (failure of a previous GMDB update to
+ * get enough acks), and does so if necessary. No-op if none is needed.
+ *
+ * Must be called within setup_gmdb_op/cleanup_gmdb_op context.
+ */
+static int
+resolve_limbo_int(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ DB_REP *db_rep;
+ DB_TXN *txn;
+ REPMGR_SITE *site;
+ DB_LSN orig_lsn;
+ DBT key_dbt, data_dbt;
+ __repmgr_member_args logrec;
+ repmgr_netaddr_t addr;
+ u_int32_t orig_status, status;
+ int eid, locked, ret, t_ret;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ u_int8_t key_buf[MAX_MSG_BUF];
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ /*
+ * Is there a previous GMDB update failure currently pending? If not,
+ * there's nothing for us to do.
+ */
+ eid = db_rep->limbo_victim;
+ if (!IS_VALID_EID(eid))
+ goto out;
+ site = SITE_FROM_EID(eid);
+ addr = site->net_addr;
+ marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
+ orig_status = site->membership;
+ if (orig_status == SITE_PRESENT || orig_status == 0)
+ goto out;
+
+ if (IS_ZERO_LSN(db_rep->limbo_failure))
+ goto out;
+
+ /*
+ * There are potentially two parts: the self-update of the existing
+ * limbo record, and then the finishing-off if the first is successful.
+ * We might only have to do the finishing-off, if some unrelated
+ * txn triggered a limbo resolution request on a msg processing thread.
+ */
+ if (LOG_COMPARE(&db_rep->durable_lsn, &db_rep->limbo_failure) > 0) {
+ /*
+ * Nice! Limbo has been resolved by an arbitrary other txn
+ * succeeding subsequently. So we don't have to do the
+ * "self-update" part.
+ */
+ } else {
+ /*
+ * Do a self-update, to try to trigger a "durable". Since
+ * nothing in the database is changing, we need neither an ASL
+ * hint nor a bump in the version sequence.
+ */
+ orig_lsn = db_rep->limbo_failure;
+ db_rep->active_gmdb_update = gmdb_primary;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if ((ret = __txn_begin(env,
+ ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto out;
+
+ marshal_site_data(env, orig_status, data_buf, &data_dbt);
+
+ ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0);
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Check to see whether we got another PERM failure. This is
+ * quite possible in the case where a GMDB request is being
+ * retried by a requestor, but unlikely if we had a resolution
+ * via an "arbitrary" txn.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if (LOG_COMPARE(&db_rep->limbo_failure, &orig_lsn) > 0) {
+ db_rep->limbo_resolution_needed = TRUE;
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+ }
+ DB_ASSERT(env, locked);
+
+ /*
+ * Here, either we didn't need to do the self-update, or we did it and
+ * it succeeded. So now we're ready to do the second phase update.
+ */
+ db_rep->limbo_victim = DB_EID_INVALID;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+ status = NEXT_STATUS(orig_status);
+ if ((ret = finish_gmdb_update(env,
+ ip, &key_dbt, orig_status, status, &logrec)) != 0)
+ goto out;
+
+ /* Track modified membership status in our in-memory sites array. */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if ((ret = __repmgr_set_membership(env,
+ addr.host, addr.port, status)) != 0)
+ goto out;
+ __repmgr_set_sites(env);
+
+out:
+ if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Update a specific record in the Group Membership database. The record to be
+ * updated is implied by "eid"; "pstatus" is the provisional status (ADDING or
+ * DELETING) to be used in the first phase of the update. The ultimate goal
+ * status is inferred (ADDING -> PRESENT, or DELETING -> 0).
+ *
+ * PUBLIC: int __repmgr_update_membership __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, u_int32_t));
+ */
+int
+__repmgr_update_membership(env, ip, eid, pstatus)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ u_int32_t pstatus; /* Provisional status. */
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ DB_TXN *txn;
+ DB_LSN lsn, orig_lsn;
+ DBT key_dbt, data_dbt;
+ __repmgr_member_args logrec;
+ repmgr_netaddr_t addr;
+ u_int32_t orig_status, ult_status;
+ int do_close, locked, ret, t_ret;
+ u_int8_t key_buf[MAX_MSG_BUF];
+ u_int8_t status_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+
+ DB_ASSERT(env, pstatus == SITE_ADDING || pstatus == SITE_DELETING);
+
+ db_rep = env->rep_handle;
+ COMPQUIET(orig_status, 0);
+ COMPQUIET(addr.host, NULL);
+ COMPQUIET(addr.port, 0);
+
+retry:
+ txn = NULL;
+ locked = FALSE;
+ DB_ASSERT(env, db_rep->gmdb_busy);
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) != 0)
+ return (ret);
+
+ /*
+ * Usually we'll keep the GMDB closed, to conserve resources, since
+ * changes should be rare. However, if a PERM_FAIL puts us in limbo, we
+ * expect to clean that up as soon as we can; so leave it open for now
+ * in that case.
+ */
+ do_close = TRUE;
+
+ /*
+ * Before attempting any fresh updates, resolve any lingering incomplete
+ * updates from the past (i.e., those that resulted in PERM_FAIL). If
+ * we can't, then we mustn't proceed with any more updates. Getting an
+ * additional perm failure would increase the dissonance between the
+ * effective group size and the number of sites from which we can safely
+ * accept acks. Besides, if we can't clear the previous failure,
+ * there's practically no hope that a new update would fare any better.
+ */
+ if ((ret = resolve_limbo_int(env, ip)) != 0) {
+ if (ret == DB_REP_UNAVAIL)
+ do_close = FALSE;
+ goto err;
+ }
+
+ /*
+ * If there was a successful limbo resolution, it could have either been
+ * for some unrelated change, or it could have been the same change our
+ * caller is now (re-)trying to perform. In the latter case, we have
+ * nothing more to do -- resolve_limbo() has done it all for us! To
+ * find out, compare the site's current status with the ultimate goal
+ * status associated with the provisional status that was passed to us
+ * as input.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid));
+ site = SITE_FROM_EID(eid);
+ if ((orig_status = site->membership) == NEXT_STATUS(pstatus))
+ goto err;
+ addr = site->net_addr;
+
+ /*
+ * Anticipate modified membership status in our in-memory sites array.
+ * This forces us into an awkward rescission, below, if our transaction
+ * suffers a hard failure and must be aborted. But it's necessary
+ * because of the requirement that, on additions, the quorum computation
+ * must be based on the incremented nsites value. An alternative might
+ * possibly be to increment nsites separately from adding the new site
+ * to the array, or even having a special epicycle at the point where
+ * send() counts acks (we'd have to make active_gmdb_update richer), but
+ * those seem even more confusing.
+ */
+ if ((ret = __repmgr_set_membership(env,
+ addr.host, addr.port, pstatus)) != 0)
+ goto err;
+ __repmgr_set_sites(env);
+
+ /*
+ * Hint to our send() function that we want to know the result of ack
+ * counting.
+ */
+ orig_lsn = db_rep->limbo_failure;
+ db_rep->active_gmdb_update = gmdb_primary;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+ marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
+ marshal_site_data(env, pstatus, status_buf, &data_dbt);
+ if ((ret = __db_put(db_rep->gmdb,
+ ip, txn, &key_dbt, &data_dbt, 0)) != 0)
+ goto err;
+ if ((ret = incr_gm_version(env, ip, txn)) != 0)
+ goto err;
+
+ /*
+ * Add some information to the log for this txn. This is an annotation,
+ * for the sole purpose of enabling the client to notice whenever a
+ * change has occurred in this database. It has nothing to do with
+ * local recovery.
+ */
+ ZERO_LSN(lsn);
+ if ((ret = __repmgr_member_log(env,
+ txn, &lsn, 0, db_rep->membership_version,
+ orig_status, pstatus, &logrec.host, logrec.port)) != 0)
+ goto err;
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+ if (ret != 0)
+ goto err;
+
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ if (LOG_COMPARE(&db_rep->limbo_failure, &orig_lsn) > 0) {
+ /*
+ * Failure LSN advanced, meaning this update wasn't acked by
+ * enough clients.
+ */
+ db_rep->limbo_resolution_needed = TRUE;
+ db_rep->limbo_victim = eid;
+ ret = DB_REP_UNAVAIL;
+ do_close = FALSE;
+ goto err;
+ }
+
+ /* Now we'll complete the status change. */
+ ult_status = NEXT_STATUS(pstatus);
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if ((ret = finish_gmdb_update(env, ip,
+ &key_dbt, pstatus, ult_status, &logrec)) != 0)
+ goto err;
+
+ /* Track modified membership status in our in-memory sites array. */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ ret = __repmgr_set_membership(env, addr.host, addr.port, ult_status);
+ __repmgr_set_sites(env);
+
+err:
+ if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (txn != NULL) {
+ DB_ASSERT(env, ret != 0);
+ (void)__txn_abort(txn);
+ /*
+ * We've just aborted the txn which moved the site info from
+ * orig_status to something else, so restore that value now so
+ * that we keep in sync.
+ */
+ (void)__repmgr_set_membership(env,
+ addr.host, addr.port, orig_status);
+ }
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+ return (ret);
+}
+
+/*
+ * Rescind a partially completed membership DB change, setting the new status to
+ * the value given.
+ */
+static int
+rescind_pending(env, ip, eid, cur_status, new_status)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ u_int32_t cur_status, new_status;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ DBT key_dbt;
+ __repmgr_member_args logrec;
+ repmgr_netaddr_t addr;
+ u_int8_t key_buf[MAX_MSG_BUF];
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+retry:
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) != 0)
+ return (ret);
+
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid));
+ site = SITE_FROM_EID(eid);
+ addr = site->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
+ if ((ret = finish_gmdb_update(env,
+ ip, &key_dbt, cur_status, new_status, &logrec)) != 0)
+ goto err;
+
+ /* Track modified membership status in our in-memory sites array. */
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_set_membership(env, addr.host, addr.port, new_status);
+ __repmgr_set_sites(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+
+err:
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, TRUE)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+ return (ret);
+}
+
+/*
+ * Caller must have already taken care of serializing this operation
+ * (hold_master_role(), setup_gmdb_op()).
+ */
+static int
+incr_gm_version(env, ip, txn)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DB_REP *db_rep;
+ u_int32_t version;
+ int ret;
+
+ db_rep = env->rep_handle;
+ version = db_rep->membership_version + 1;
+ if ((ret = __repmgr_set_gm_version(env, ip, txn, version)) == 0)
+ db_rep->membership_version = version;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_set_gm_version __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+ */
+int
+__repmgr_set_gm_version(env, ip, txn, version)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t version;
+{
+ DB_REP *db_rep;
+ DBT key_dbt, data_dbt;
+ __repmgr_membership_key_args key;
+ __repmgr_member_metadata_args metadata;
+ u_int8_t key_buf[__REPMGR_MEMBERSHIP_KEY_SIZE + 1];
+ u_int8_t metadata_buf[__REPMGR_MEMBER_METADATA_SIZE];
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ metadata.format = REPMGR_GMDB_FMT_VERSION;
+ metadata.version = version;
+ __repmgr_member_metadata_marshal(env, &metadata, metadata_buf);
+ DB_INIT_DBT(data_dbt, metadata_buf, __REPMGR_MEMBER_METADATA_SIZE);
+
+ DB_INIT_DBT(key.host, NULL, 0);
+ key.port = 0;
+ ret = __repmgr_membership_key_marshal(env,
+ &key, key_buf, sizeof(key_buf), &len);
+ DB_ASSERT(env, ret == 0);
+ DB_INIT_DBT(key_dbt, key_buf, len);
+
+ if ((ret = __db_put(db_rep->gmdb,
+ ip, txn, &key_dbt, &data_dbt, 0)) != 0)
+ return (ret);
+ return (0);
+}
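+
+/*
+ * A sketch of the resulting GMDB layout (using names from this file;
+ * the on-disk sort order is an implementation detail):
+ *
+ *    key {NULL host, port 0}  ->  {REPMGR_GMDB_FMT_VERSION, version}
+ *    key {"host", port}       ->  {membership status flags}
+ *
+ * The NULL-host key written above can't collide with any real site
+ * record, since real sites always have a non-empty host name, so the
+ * metadata occupies its own reserved slot in the database.
+ */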
+
+/*
+ * Performs the second phase of a 2-phase membership DB operation: an "adding"
+ * site becomes fully "present" in the group; a "deleting" site is finally
+ * really deleted.
+ */
+static int
+finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DBT *key_dbt;
+ u_int32_t prev_status, status;
+ __repmgr_member_args *logrec;
+{
+ DB_REP *db_rep;
+ DB_LSN lsn;
+ DB_TXN *txn;
+ DBT data_dbt;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ db_rep->active_gmdb_update = gmdb_secondary;
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ return (ret);
+
+ if (status == 0)
+ ret = __db_del(db_rep->gmdb, ip, txn, key_dbt, 0);
+ else {
+ marshal_site_data(env, status, data_buf, &data_dbt);
+ ret = __db_put(db_rep->gmdb, ip, txn, key_dbt, &data_dbt, 0);
+ }
+ if (ret != 0)
+ goto err;
+
+ if ((ret = incr_gm_version(env, ip, txn)) != 0)
+ goto err;
+
+ ZERO_LSN(lsn);
+ if ((ret = __repmgr_member_log(env,
+ txn, &lsn, 0, db_rep->membership_version,
+ prev_status, status, &logrec->host, logrec->port)) != 0)
+ goto err;
+
+err:
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
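+
+/*
+ * To make the two-phase flow concrete, a sketch of the intended
+ * sequence (not literal code; __repmgr_update_membership above is the
+ * real driver):
+ *
+ *    phase 1: __db_put() of the provisional status (e.g. SITE_ADDING)
+ *             plus incr_gm_version(), then wait for acks;
+ *    phase 2: finish_gmdb_update() moves SITE_ADDING -> SITE_PRESENT.
+ *
+ * Deletion runs SITE_DELETING -> 0, in which case the second phase
+ * turns the put into a __db_del(), as coded above.
+ */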
+
+/*
+ * Set up everything we need to update the Group Membership database. This may
+ * or may not include providing a transaction in which to do the updates
+ * (depending on whether the caller wants the creation of the database to be in
+ * the same transaction as the updates).
+ *
+ * PUBLIC: int __repmgr_setup_gmdb_op __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN **, u_int32_t));
+ */
+int
+__repmgr_setup_gmdb_op(env, ip, txnp, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN **txnp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ DB_TXN *txn;
+ DB *dbp;
+ int ret, was_open;
+
+ db_rep = env->rep_handle;
+
+ dbp = NULL;
+ txn = NULL;
+
+ /*
+ * If the caller provided a place to return a txn handle, create it and
+ * perform any open operation as part of that txn. The caller is
+ * responsible for disposing of the txn. Otherwise, only begin a txn if
+ * we need to do the open and in that case commit it right after the
+ * open.
+ */
+ DB_ASSERT(env, db_rep->gmdb_busy);
+ was_open = db_rep->gmdb != NULL;
+ if ((txnp != NULL || !was_open) &&
+ (ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+
+ if (!was_open) {
+ DB_ASSERT(env, txn != NULL);
+ /*
+ * Opening the membership database is like a secondary GMDB
+ * operation, in the sense that we don't care how many clients
+ * ack it, yet we don't want the application to see any perm
+ * failure events.
+ */
+ DB_ASSERT(env, db_rep->active_gmdb_update == none);
+ db_rep->active_gmdb_update = gmdb_secondary;
+ ret = __rep_open_sysdb(env,
+ ip, txn, REPMEMBERSHIP, flags, &dbp);
+ if (ret == 0 && txnp == NULL) {
+ /* The txn was just for the open operation. */
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+ }
+ db_rep->active_gmdb_update = none;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Lock out normal API operations, again because we need to know that
+ * if a PERM_FAIL occurs, it was associated with our txn, and so that
+ * we avoid confusing the application with a PERM_FAIL event for our own
+ * txn.
+ */
+ if ((ret = __rep_take_apilockout(env)) != 0)
+ goto err;
+
+ /*
+ * Here, all steps have succeeded. Stash and/or pass back the fruits of
+ * our labor.
+ */
+ if (!was_open) {
+ DB_ASSERT(env, dbp != NULL);
+ db_rep->gmdb = dbp;
+ }
+ if (txnp != NULL) {
+ DB_ASSERT(env, txn != NULL);
+ *txnp = txn;
+ }
+ /*
+ * In the successful case, a later call to cleanup_gmdb_op will
+ * clear the API lockout.
+ */
+ return (0);
+
+err:
+ DB_ASSERT(env, ret != 0);
+ if (dbp != NULL)
+ (void)__db_close(dbp, txn, DB_NOSYNC);
+ if (txn != NULL)
+ (void)__txn_abort(txn);
+ return (ret);
+}
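+
+/*
+ * A sketch of the typical call pattern around these helpers (the real
+ * callers are __repmgr_update_membership and friends):
+ *
+ *    if ((ret = __repmgr_hold_master_role(env, conn)) != 0)
+ *        return (ret);
+ *    if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) == 0) {
+ *        ... __db_put()/__db_del() against db_rep->gmdb ...
+ *        ret = __repmgr_cleanup_gmdb_op(env, TRUE);
+ *    }
+ *    (void)__repmgr_rlse_master_role(env);
+ */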
+
+/*
+ * PUBLIC: int __repmgr_cleanup_gmdb_op __P((ENV *, int));
+ */
+int
+__repmgr_cleanup_gmdb_op(env, do_close)
+ ENV *env;
+ int do_close;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ db_rep->active_gmdb_update = none;
+ ret = __rep_clear_apilockout(env);
+
+ if (do_close && db_rep->gmdb != NULL) {
+ if ((t_ret = __db_close(db_rep->gmdb, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ db_rep->gmdb = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * Check whether we're currently master, and if so hold that role so that we can
+ * perform a Group Membership database operation. After a successful call, the
+ * caller must call rlse_master_role to release the hold.
+ *
+ * If we can't guarantee that we can remain master, send an appropriate failure
+ * message on the given connection (unless NULL).
+ *
+ * We also ensure that only one GMDB operation will take place at a time, for a
+ * couple of reasons: if we get a PERM_FAIL it means the fate of the change is
+ * indeterminate, so we have to assume the worst. We must assume the higher
+ * value of nsites, yet we can't accept acks from the questionable site. If we
+ * allowed concurrent operations, this could lead to more than one questionable
+ * site, which would be even worse. Also, when we get a PERM_FAIL we want to
+ * know which txn failed, and that would be messy if there could be several.
+ *
+ * Of course we can't simply take the mutex for the duration, because
+ * the mutex needs to be available in order to send out the log
+ * records.
+ *
+ * PUBLIC: int __repmgr_hold_master_role __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_hold_master_role(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_await_gmdbop(env)) == 0) {
+ /*
+ * If we're currently master, but client_intent is set, it means
+ * that another thread is on the way to becoming master, so we
+ * can't promise to hold the master role for the caller: we've
+ * lost a close race.
+ */
+ if (rep->master_id != db_rep->self_eid ||
+ db_rep->client_intent)
+ ret = DB_REP_UNAVAIL;
+ else
+ db_rep->gmdb_busy = TRUE;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (conn != NULL && ret == DB_REP_UNAVAIL &&
+ (t_ret = reject_fwd(env, conn)) != 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * Releases the "master role" lock once we're finished performing a membership
+ * DB operation.
+ *
+ * PUBLIC: int __repmgr_rlse_master_role __P((ENV *));
+ */
+int
+__repmgr_rlse_master_role(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->gmdb_busy = FALSE;
+ ret = __repmgr_signal(&db_rep->gmdb_idle);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Responds to a membership change request in the case where we're not
+ * currently master. If we know the master, responds with a "forward" message,
+ * to tell the requestor who the master is. Otherwise rejects it outright.
+ */
+static int
+reject_fwd(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ SITE_STRING_BUFFER site_string;
+ __repmgr_gm_fwd_args fwd;
+ repmgr_netaddr_t addr;
+ u_int8_t buf[MAX_MSG_BUF];
+ u_int32_t msg_type;
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (IS_KNOWN_REMOTE_SITE(rep->master_id)) {
+ msg_type = REPMGR_GM_FORWARD;
+ LOCK_MUTEX(db_rep->mutex);
+ addr = SITE_FROM_EID(rep->master_id)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Forwarding request to master %s",
+ __repmgr_format_addr_loc(&addr, site_string)));
+ fwd.host.data = addr.host;
+ fwd.host.size = (u_int32_t)strlen(fwd.host.data) + 1;
+ fwd.port = addr.port;
+ fwd.gen = rep->mgen;
+ ret = __repmgr_gm_fwd_marshal(env,
+ &fwd, buf, sizeof(buf), &len);
+ DB_ASSERT(env, ret == 0);
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Rejecting membership request with no known master"));
+ msg_type = REPMGR_GM_FAILURE;
+ len = 0;
+ }
+
+ return (__repmgr_send_sync_msg(env, conn,
+ msg_type, buf, (u_int32_t)len));
+}
+
+/*
+ * The length of "buf" must be at least MAX_GMDB_KEY.
+ */
+static void
+marshal_site_key(env, addr, buf, dbt, logrec)
+ ENV *env;
+ repmgr_netaddr_t *addr;
+ u_int8_t *buf;
+ DBT *dbt;
+ __repmgr_member_args *logrec;
+{
+ __repmgr_membership_key_args key;
+ size_t len;
+ int ret;
+
+ DB_INIT_DBT(key.host, addr->host, strlen(addr->host) + 1);
+ logrec->host = key.host;
+ key.port = addr->port;
+ logrec->port = key.port;
+ ret = __repmgr_membership_key_marshal(env,
+ &key, buf, MAX_MSG_BUF, &len);
+ DB_ASSERT(env, ret == 0);
+ DB_INIT_DBT(*dbt, buf, len);
+}
+
+static void
+marshal_site_data(env, status, buf, dbt)
+ ENV *env;
+ u_int32_t status;
+ u_int8_t *buf;
+ DBT *dbt;
+{
+ __repmgr_membership_data_args member_status;
+
+ member_status.flags = status;
+ __repmgr_membership_data_marshal(env, &member_status, buf);
+ DB_INIT_DBT(*dbt, buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
+}
+
+/*
+ * PUBLIC: void __repmgr_set_sites __P((ENV *));
+ *
+ * Caller must hold mutex.
+ */
+void
+__repmgr_set_sites(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+ u_int32_t n;
+ u_int i;
+
+ db_rep = env->rep_handle;
+
+ for (i = 0, n = 0; i < db_rep->site_cnt; i++) {
+ if (db_rep->sites[i].membership > 0)
+ n++;
+ }
+ ret = __rep_set_nsites_int(env, n);
+ DB_ASSERT(env, ret == 0);
+}
diff --git a/src/repmgr/repmgr_net.c b/src/repmgr/repmgr_net.c
new file mode 100644
index 00000000..54e3d066
--- /dev/null
+++ b/src/repmgr/repmgr_net.c
@@ -0,0 +1,2043 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+
+/*
+ * The functions in this module implement a simple wire protocol for
+ * transmitting messages of various types. Every message consists of a 9-byte
+ * header followed by a body (though the body could be 0-length). The header is
+ * the marshaled form of the "msg_hdr" structure defined in repmgr.src. The
+ * interpretation of header fields depends on message type, and is defined in
+ * repmgr.h. But as a general principle, in all cases there is enough
+ * information in the header for us to know the total size of the body, and the
+ * total amount of memory we need to allocate for storing and processing the
+ * message.
+ */
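+
+/*
+ * For illustration, a receiver would consume one message from this
+ * protocol roughly as follows (a sketch, not the literal receive path;
+ * read_full() is a hypothetical helper that loops until the requested
+ * byte count has arrived):
+ *
+ *    u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ *    __repmgr_msg_hdr_args msg_hdr;
+ *
+ *    read_full(fd, hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ *    __repmgr_msg_hdr_unmarshal(env, &msg_hdr, hdr_buf,
+ *        __REPMGR_MSG_HDR_SIZE, NULL);
+ *    (the body size is then computed from msg_hdr according to the
+ *    message type, per the rules in repmgr.h, and read the same way)
+ */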
+
+/*
+ * In sending a message, we first try to send it in-line, in the sending thread,
+ * without copying the message, using scatter/gather I/O with iovecs pointing
+ * to the various pieces of the message. If that all works without blocking,
+ * that's optimal.
+ * If we find that, for a particular connection, we can't send without
+ * blocking, then we must copy the message for sending later in the select()
+ * thread. In the course of doing that, we might as well "flatten" the message,
+ * forming one single buffer, to simplify life. Not only that, once we've gone
+ * to the trouble of doing that, other sites to which we also want to send the
+ * message (in the case of a broadcast), may as well take advantage of the
+ * simplified structure also.
+ * The sending_msg structure below holds it all. Note that this structure,
+ * and the "flat_msg" structure, are allocated separately, because (1) the
+ * flat_msg version is usually not needed; and (2) when a flat_msg is needed, it
+ * will need to live longer than the wrapping sending_msg structure.
+ * Note that, for the broadcast case, where we're going to use this
+ * repeatedly, the iovecs structure is a template that must be copied, since in
+ * normal use the iovecs pointers and lengths get adjusted after every partial
+ * write.
+ */
+struct sending_msg {
+ REPMGR_IOVECS *iovecs;
+ REPMGR_FLAT *fmsg;
+};
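+
+/*
+ * Illustration (a sketch, not code that runs here): a broadcast reuses
+ * one sending_msg for every connection, but each write works on its
+ * own scratch copy of the iovecs template, because partial writes
+ * consume the copy in place:
+ *
+ *    struct sending_msg msg;
+ *    REPMGR_IOVECS iovecs;
+ *
+ *    msg.iovecs = &iovecs;
+ *    setup_sending_msg(env, &msg, hdr_buf, type, control, rec);
+ *    (__repmgr_write_iovecs() below then copies the template into
+ *    scratch storage before each attempt to write)
+ */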
+
+/*
+ * Context for a thread waiting for client acks for PERM message. Passed from
+ * the send() function to the got_acks() predicate function, via
+ * __repmgr_await_cond(). The got_acks() function computes two potentially
+ * independent results: (1) do we have enough acks to stop waiting for more (the
+ * function return value, which triggers the behavior of await_cond()); and (2)
+ * whether the PERM message should be considered durable.
+ */
+struct repmgr_permanence {
+ DB_LSN lsn; /* LSN whose ack this thread is waiting for. */
+ u_int threshold; /* Number of client acks to wait for. */
+ u_int quorum; /* Durability threshold for QUORUM policy. */
+ int policy; /* Ack policy to be used for this txn. */
+ int is_durable; /* Result flag. */
+};
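+
+/*
+ * For instance, __repmgr_send() below fills one of these on the stack
+ * (perm.lsn = *lsnp, perm.threshold = needed, and so on) and passes it
+ * to __repmgr_await_cond(env, got_acks, &perm, ...); got_acks() is then
+ * re-evaluated, with the mutex held, each time the waiter is signaled,
+ * until it returns TRUE or the ack timeout expires.
+ */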
+
+#ifdef CONFIG_TEST
+static u_int fake_port __P((ENV *, u_int));
+#endif
+static int final_cleanup __P((ENV *, REPMGR_CONNECTION *, void *));
+static int flatten __P((ENV *, struct sending_msg *));
+static int got_acks __P((ENV *, void *));
+static int __repmgr_finish_connect
+ __P((ENV *, socket_t s, REPMGR_CONNECTION **));
+static int __repmgr_propose_version __P((ENV *, REPMGR_CONNECTION *));
+static int __repmgr_start_connect __P((ENV*, socket_t *, ADDRINFO *, int *));
+static void setup_sending_msg __P((ENV *,
+ struct sending_msg *, u_int8_t *, u_int, const DBT *, const DBT *));
+static int __repmgr_send_internal
+ __P((ENV *, REPMGR_CONNECTION *, struct sending_msg *, db_timeout_t));
+static int enqueue_msg
+ __P((ENV *, REPMGR_CONNECTION *, struct sending_msg *, size_t));
+static REPMGR_SITE *connected_site __P((ENV *, int));
+static REPMGR_SITE *__repmgr_find_available_peer __P((ENV *));
+static int send_connection __P((ENV *, u_int,
+ REPMGR_CONNECTION *, struct sending_msg *, int *));
+
+/*
+ * Connects to the given network address, using blocking operations. Any thread
+ * synchronization is the responsibility of the caller.
+ *
+ * PUBLIC: int __repmgr_connect __P((ENV *,
+ * PUBLIC: repmgr_netaddr_t *, REPMGR_CONNECTION **, int *));
+ */
+int
+__repmgr_connect(env, netaddr, connp, errp)
+ ENV *env;
+ repmgr_netaddr_t *netaddr;
+ REPMGR_CONNECTION **connp;
+ int *errp;
+{
+ REPMGR_CONNECTION *conn;
+ ADDRINFO *ai0, *ai;
+ socket_t sock;
+ int err, ret;
+ u_int port;
+
+ COMPQUIET(err, 0);
+#ifdef CONFIG_TEST
+ port = fake_port(env, netaddr->port);
+#else
+ port = netaddr->port;
+#endif
+ if ((ret = __repmgr_getaddr(env, netaddr->host, port, 0, &ai0)) != 0)
+ return (ret);
+
+ /*
+ * Try each address on the list, until success. Note that if several
+ * addresses on the list produce a retryable error, we can only pass back
+ * to our caller the last one.
+ */
+ for (ai = ai0; ai != NULL; ai = ai->ai_next) {
+ switch ((ret = __repmgr_start_connect(env, &sock, ai, &err))) {
+ case 0:
+ if ((ret = __repmgr_finish_connect(env,
+ sock, &conn)) == 0)
+ *connp = conn;
+ else
+ (void)closesocket(sock);
+ goto out;
+ case DB_REP_UNAVAIL:
+ continue;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ __os_freeaddrinfo(env, ai0);
+ if (ret == DB_REP_UNAVAIL) {
+ __repmgr_print_conn_err(env, netaddr, err);
+ *errp = err;
+ }
+ return (ret);
+}
+
+static int
+__repmgr_start_connect(env, socket_result, ai, err)
+ ENV *env;
+ socket_t *socket_result;
+ ADDRINFO *ai;
+ int *err;
+{
+ socket_t s;
+ int ret;
+
+ if ((s = socket(ai->ai_family,
+ ai->ai_socktype, ai->ai_protocol)) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, "create socket");
+ return (ret);
+ }
+
+ if (connect(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) != 0) {
+ *err = net_errno;
+ (void)closesocket(s);
+ return (DB_REP_UNAVAIL);
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connection established"));
+
+ *socket_result = s;
+ return (0);
+}
+
+static int
+__repmgr_finish_connect(env, s, connp)
+ ENV *env;
+ socket_t s;
+ REPMGR_CONNECTION **connp;
+{
+ REPMGR_CONNECTION *conn;
+ int ret;
+
+ if ((ret = __repmgr_new_connection(env, &conn, s, CONN_CONNECTED)) != 0)
+ return (ret);
+
+ if ((ret = __repmgr_set_keepalive(env, conn)) == 0 &&
+ (ret = __repmgr_propose_version(env, conn)) == 0)
+ *connp = conn;
+ else
+ (void)__repmgr_destroy_conn(env, conn);
+ return (ret);
+}
+
+static int
+__repmgr_propose_version(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ __repmgr_version_proposal_args versions;
+ repmgr_netaddr_t *my_addr;
+ size_t hostname_len, rec_length;
+ u_int8_t *buf, *p;
+ int ret;
+
+ db_rep = env->rep_handle;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ /*
+ * In repmgr wire protocol version 1, a handshake message had a rec part
+ * that looked like this:
+ *
+ * +-----------------+----+
+ * | host name ... | \0 |
+ * +-----------------+----+
+ *
+ * To ensure its own sanity, the old repmgr would write a NUL into the
+ * last byte of a received message, and then use normal C library string
+ * operations (e.g., strlen, strcpy).
+ *
+ * Now, a version proposal has a rec part that looks like this:
+ *
+ * +-----------------+----+------------------+------+
+ * | host name ... | \0 | extra info ... | \0 |
+ * +-----------------+----+------------------+------+
+ *
+ * The "extra info" contains the version parameters, in marshaled form.
+ */
+
+ hostname_len = strlen(my_addr->host);
+ rec_length = hostname_len + 1 +
+ __REPMGR_VERSION_PROPOSAL_SIZE + 1;
+ if ((ret = __os_malloc(env, rec_length, &buf)) != 0)
+ goto out;
+ p = buf;
+ (void)strcpy((char*)p, my_addr->host);
+
+ p += hostname_len + 1;
+ versions.min = DB_REPMGR_MIN_VERSION;
+ versions.max = DB_REPMGR_VERSION;
+ __repmgr_version_proposal_marshal(env, &versions, p);
+
+ ret = __repmgr_send_v1_handshake(env, conn, buf, rec_length);
+ __os_free(env, buf);
+out:
+ return (ret);
+}
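+
+/*
+ * Concretely, for a host named "site1" the rec part built above would
+ * look like this (a sketch of the byte layout):
+ *
+ *    's' 'i' 't' 'e' '1' '\0' <marshaled {min, max}> '\0'
+ *
+ * A version-1 peer that NUL-terminates the buffer and applies strlen()
+ * still sees just the host name, which is what keeps this extension
+ * backward compatible.
+ */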
+
+/*
+ * __repmgr_send --
+ * The send function for DB_ENV->rep_set_transport.
+ *
+ * PUBLIC: int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *,
+ * PUBLIC: const DB_LSN *, int, u_int32_t));
+ */
+int
+__repmgr_send(dbenv, control, rec, lsnp, eid, flags)
+ DB_ENV *dbenv;
+ const DBT *control, *rec;
+ const DB_LSN *lsnp;
+ int eid;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ struct repmgr_permanence perm;
+ db_timeout_t maxblock;
+ u_int available, nclients, needed, npeers_sent, nsites_sent, quorum;
+ int missed_peer, policy, ret, t_ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * If we're already "stopped", we can't send anything. This covers the
+ * case where a bulk buffer is flushed at env close, or perhaps an
+ * unexpected __repmgr_thread_failure.
+ */
+ if (db_rep->repmgr_status == stopped) {
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+
+ /*
+ * Check whether we need to refresh our site address information with
+ * more recent updates from shared memory.
+ */
+ if (rep->siteinfo_seq > db_rep->siteinfo_seq &&
+ (ret = __repmgr_sync_siteaddr(env)) != 0)
+ goto out;
+
+ if (eid == DB_EID_BROADCAST) {
+ if ((ret = __repmgr_send_broadcast(env,
+ REPMGR_REP_MESSAGE, control, rec,
+ &nsites_sent, &npeers_sent, &missed_peer)) != 0)
+ goto out;
+ } else {
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid));
+
+ /*
+ * Since repmgr's simple c2c implementation doesn't truly manage
+ * staged synchronization it doesn't work well with master
+ * leases. So, disable it during the time when a new master may
+ * be trying to establish its first set of lease grants.
+ */
+ if (IS_USING_LEASES(env) && !rep->stat.st_startup_complete)
+ LF_CLR(DB_REP_ANYWHERE);
+ /*
+ * If this is a request that can be sent anywhere, then see if
+ * we can send it to our peer (to save load on the master), but
+ * not if it's a rerequest, because that likely means we tried this
+ * already and failed.
+ */
+ if ((flags & (DB_REP_ANYWHERE | DB_REP_REREQUEST)) ==
+ DB_REP_ANYWHERE &&
+ (site = __repmgr_find_available_peer(env)) != NULL) {
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "sending request to peer"));
+ } else if ((site = connected_site(env, eid)) == NULL) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ignoring message sent to unavailable site"));
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+
+ /*
+ * In case the connection is clogged up and we have to wait for
+ * space on the output queue, how long shall we wait? We could
+ * of course create a new timeout configuration type, so that
+ * the application could set it directly. But that would start
+ * to overwhelm the user with too many choices to think about.
+ * We already have an ACK timeout, which is the user's estimate
+ * of how long it should take to send a message to the client,
+ * have it be processed, and return a message back to us. We
+ * multiply that by the queue size, because that's how many
+ * messages have to be swallowed up by the client before we're
+ * able to start sending again (at least to a rough
+ * approximation).
+ */
+ maxblock = OUT_QUEUE_LIMIT *
+ (rep->ack_timeout == 0 ?
+ DB_REPMGR_DEFAULT_ACK_TIMEOUT : rep->ack_timeout);
+
+ /*
+ * Assign the conn struct pointer to a local variable ("conn"),
+ * because the pointer in the site struct (ref.conn.in or
+ * ref.conn.out) could get clobbered if the connection gets
+ * busted in another thread during our send_one() call. That
+ * could happen if the outgoing half of the connection is
+ * clogged and we decide to await_drain().
+ */
+#undef SEND_ONE_CONNECTION
+#define SEND_ONE_CONNECTION(c) \
+ do { \
+ if ((conn = (c)) != NULL && \
+ IS_READY_STATE(conn->state) && \
+ (ret = __repmgr_send_one(env, \
+ conn, REPMGR_REP_MESSAGE, \
+ control, rec, maxblock)) == DB_REP_UNAVAIL && \
+ (t_ret = \
+ __repmgr_bust_connection(env, conn)) != 0) \
+ ret = t_ret; \
+ } while (0)
+
+ SEND_ONE_CONNECTION(site->ref.conn.in);
+ if (ret != 0 && ret != DB_REP_UNAVAIL)
+ goto out;
+ SEND_ONE_CONNECTION(site->ref.conn.out);
+ if (ret != 0)
+ goto out;
+#undef SEND_ONE_CONNECTION
+
+ nsites_sent = 1;
+ npeers_sent = F_ISSET(site, SITE_ELECTABLE) ? 1 : 0;
+ missed_peer = FALSE;
+ }
+
+ /*
+ * Traditionally, each ack policy determines how many acks are needed to
+ * constitute successful durability. We would simply wait until we
+ * collected that many acks, and if we got them it was success, or if we
+ * timed out it was failure. And if we knew from the start that we
+ * hadn't even sent the message to enough sites to meet the "needed"
+ * threshold, then there was no point in waiting.
+ * It's a different story for the ALL_AVAILABLE policy. There the
+ * decision to continue awaiting more acks is decoupled from the
+ * durability question: we want to wait until we get acks from all sites
+ * we sent to (though still within the timeout limit).
+ * So now we have to think of "needed" in a slightly more general
+ * way: it's the threshold that controls how many acks we keep waiting
+ * for. It usually still controls the determination of the durability
+ * result as well; the exception is ALL_AVAILABLE.
+ */
+ if (LF_ISSET(DB_REP_PERMANENT)) {
+ /* Adjust so as not to count the local site, which is master. */
+ nclients = db_rep->region->config_nsites - 1;
+
+ /*
+ * When doing membership DB changes, avoid some impossible
+ * situations.
+ */
+ policy = rep->perm_policy;
+ switch (db_rep->active_gmdb_update) {
+ case gmdb_primary:
+ if (policy == DB_REPMGR_ACKS_ALL ||
+ policy == DB_REPMGR_ACKS_ALL_PEERS)
+ policy = DB_REPMGR_ACKS_ALL_AVAILABLE;
+ else if (policy == DB_REPMGR_ACKS_QUORUM &&
+ nclients == 1)
+ nclients = 0;
+ else if ((policy == DB_REPMGR_ACKS_ONE ||
+ policy == DB_REPMGR_ACKS_ONE_PEER) &&
+ nclients == 1) {
+ nclients = 0;
+ policy = DB_REPMGR_ACKS_QUORUM;
+ }
+ break;
+ case gmdb_secondary:
+ policy = DB_REPMGR_ACKS_NONE;
+ break;
+ case none:
+ break;
+ }
+ quorum = 0;
+ switch (policy) {
+ case DB_REPMGR_ACKS_NONE:
+ needed = 0;
+ COMPQUIET(available, 0);
+ break;
+
+ case DB_REPMGR_ACKS_ONE:
+ needed = 1;
+ available = nsites_sent;
+ break;
+
+ case DB_REPMGR_ACKS_ALL:
+ /* Number of sites in the group besides myself. */
+ needed = nclients;
+ available = nsites_sent;
+ break;
+
+ case DB_REPMGR_ACKS_ONE_PEER:
+ needed = 1;
+ available = npeers_sent;
+ break;
+
+ case DB_REPMGR_ACKS_ALL_PEERS:
+ /*
+ * Too hard to figure out "needed", since we're not
+ * keeping track of how many peers we have; so just skip
+ * the optimization in this case.
+ */
+ needed = 1;
+ available = npeers_sent;
+ break;
+
+ case DB_REPMGR_ACKS_QUORUM:
+ case DB_REPMGR_ACKS_ALL_AVAILABLE:
+ /*
+ * The minimum number of acks necessary to ensure that
+ * the transaction is durable if an election is held.
+ *
+ * Unless instructed otherwise, our special handling for
+ * 2-site groups means that a client that loses contact
+ * with the master elects itself master (even though
+ * that doesn't constitute a majority). In order to
+ * provide the expected guarantee implied by the
+ * definition of "quorum" we have to fudge the ack
+ * calculation in this case: specifically, we need to
+ * make sure that the client has received it in order
+ * for us to consider it "perm". Thus, if nclients is
+ * 1, needed should be 1.
+ *
+ * While we're at it, if nclients is 0 (a nascent
+ * "group" consisting of nothing but a master), surely
+ * the number of acks we need should be 0.
+ *
+ * Note that turning the usual strict behavior back on
+ * in a 2-site group results in "0" as the number of
+ * clients needed to ack a txn in order for it to have
+ * arrived at a quorum. This is the correct result,
+ * strange as it may seem! This may well mean that in a
+ * 2-site group the QUORUM policy is rarely the right
+ * choice.
+ *
+ * When a GMDB update adds the second site, force
+ * "strict" behavior: in that case nsites is 2, but the
+ * new site is not yet allowed to contribute an ack.
+ */
+ if (nclients > 1 ||
+ FLD_ISSET(db_rep->region->config,
+ REP_C_2SITE_STRICT) ||
+ db_rep->active_gmdb_update == gmdb_primary)
+ quorum = nclients / 2;
+ else
+ quorum = nclients;
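+
+ /*
+ * Worked examples (sketches): with nclients == 4, quorum ==
+ * 4 / 2 == 2, so two client acks suffice for durability. In
+ * a non-strict 2-site group, nclients == 1 takes the "else"
+ * branch and quorum == 1, so the lone client must ack; with
+ * REP_C_2SITE_STRICT set, quorum == 1 / 2 == 0 instead.
+ */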
+
+ if (policy == DB_REPMGR_ACKS_ALL_AVAILABLE) {
+ if (nsites_sent > 0)
+ needed = available = nsites_sent;
+ else {
+ ret = quorum > 0 ? DB_REP_UNAVAIL : 0;
+ goto out;
+ }
+ } else {
+ DB_ASSERT(env, policy == DB_REPMGR_ACKS_QUORUM);
+ needed = quorum;
+ available = npeers_sent;
+ if (npeers_sent < quorum && !missed_peer) {
+ /*
+ * If we sent to all peers, it doesn't
+ * matter how few there were. This
+ * derives from the definition of the
+ * QUORUM policy: no possible subsequent
+ * election can fail to include the
+ * transaction. If all electable sites
+ * have the transaction, then it can't
+ * be lost in an election, no matter how
+ * few there are.
+ */
+ needed = npeers_sent;
+ }
+ }
+ break;
+
+ default:
+ ret = __db_unknown_path(env, "__repmgr_send");
+ goto out;
+ }
+ if (policy != DB_REPMGR_ACKS_ALL_AVAILABLE) {
+ /*
+ * Skip the waiting if it is unnecessary, or if it would
+ * be futile. For most ack policies, these decisions
+ * are straightforward, and can be computed in the
+ * following generic way. For ALL_AVAILABLE, skipping
+ * is also possible, but it is decided earlier (above,
+ * inside the "switch" statement).
+ *
+ * Note that for ALL, there is a surprising side-effect
+ * if even one client is down. It will not wait for
+ * any acks and the running clients can fall further
+ * and further behind the master.
+ */
+ if (needed == 0)
+ goto out;
+ if (available < needed) {
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+ }
+
+ /* In the ALL_PEERS case, display of "needed" might be confusing. */
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "will await acknowledgement: need %u", needed));
+ perm.lsn = *lsnp;
+ perm.threshold = needed;
+ perm.policy = policy;
+ perm.quorum = quorum;
+ perm.is_durable = FALSE;
+ ret = __repmgr_await_cond(env, got_acks,
+ &perm, rep->ack_timeout, &db_rep->ack_waiters);
+ if (ret == 0 || ret == DB_TIMEOUT)
+ ret = perm.is_durable ? 0 : DB_REP_UNAVAIL;
+ }
+
+out: UNLOCK_MUTEX(db_rep->mutex);
+ if (LF_ISSET(DB_REP_PERMANENT)) {
+ if (ret != 0) {
+ switch (db_rep->active_gmdb_update) {
+ case none:
+ /*
+ * Fire perm-failed event to the application as
+ * usual; no other bookkeeping needed here.
+ */
+ STAT(db_rep->region->mstat.st_perm_failed++);
+ DB_EVENT(env, DB_EVENT_REP_PERM_FAILED, NULL);
+ break;
+ case gmdb_primary:
+ /*
+ * Since this is a membership DB operation,
+ * refrain from bothering the application about
+ * it (with an event that it wouldn't be
+ * expecting), and make a note of the failure so
+ * we can resolve it later.
+ */
+ db_rep->limbo_failure = *lsnp;
+ /* FALLTHROUGH */
+ case gmdb_secondary:
+ /* Merely refrain from firing event. */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "GMDB perm failure %d at [%lu][%lu]",
+ (int)db_rep->active_gmdb_update,
+ (u_long)lsnp->file, (u_long)lsnp->offset));
+ break;
+ }
+ } else if (db_rep->limbo_resolution_needed) {
+ /*
+ * A previous membership DB operation failed, leaving us
+ * "in limbo", but now some perm operation has completed
+ * successfully. Since the ack of any txn implies ack
+ * of all txns that occur before it (in LSN order), we
+ * now know that the previous failure can be resolved.
+ * We can't do it here in this thread, so put a request
+ * on the message processing queue to have it handled
+ * later.
+ */
+ db_rep->durable_lsn = *lsnp;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "perm success [%lu][%lu] with limbo resolution needed",
+ (u_long)lsnp->file, (u_long)lsnp->offset));
+ db_rep->limbo_resolution_needed = FALSE;
+
+ /* Don't trump ret, even if it's zero. */
+ LOCK_MUTEX(db_rep->mutex);
+ if ((t_ret = __repmgr_defer_op(env,
+ REPMGR_RESOLVE_LIMBO)) != 0)
+ __db_err(env, t_ret, "repmgr_defer_op");
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ }
+ return (ret);
+}
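+
+/*
+ * Putting the limbo bookkeeping together (a summary, not a new
+ * mechanism): a gmdb_primary perm failure records its LSN in
+ * db_rep->limbo_failure; a later perm success records
+ * db_rep->durable_lsn and defers a REPMGR_RESOLVE_LIMBO request; the
+ * limbo resolution code in the membership module above then compares
+ * the two LSNs to decide whether the stranded GMDB update can be
+ * finished off without a fresh self-update.
+ */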
+
+static REPMGR_SITE *
+connected_site(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ if (site->state == SITE_CONNECTED)
+ return (site);
+ return (NULL);
+}
+
+/*
+ * Synchronize our list of sites with new information that has been added to the
+ * list in the shared region.
+ *
+ * PUBLIC: int __repmgr_sync_siteaddr __P((ENV *));
+ */
+int
+__repmgr_sync_siteaddr(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int added;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ if (!IS_VALID_EID(db_rep->self_eid))
+ db_rep->self_eid = rep->self_eid;
+
+ added = db_rep->site_cnt;
+ if ((ret = __repmgr_copy_in_added_sites(env)) == 0)
+ ret = __repmgr_init_new_sites(env, (int)added,
+ (int)db_rep->site_cnt);
+
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ return (ret);
+}
+
+/*
+ * Sends a message to all sites with which we currently have an active
+ * connection. Sets result parameters according to how many sites we attempted
+ * to begin sending to, even if we did nothing more than queue it for later
+ * delivery.
+ *
+ * !!!
+ * Caller must hold env->mutex.
+ * PUBLIC: int __repmgr_send_broadcast __P((ENV *, u_int,
+ * PUBLIC: const DBT *, const DBT *, u_int *, u_int *, int *));
+ */
+int
+__repmgr_send_broadcast(env, type, control, rec, nsitesp, npeersp, missingp)
+ ENV *env;
+ u_int type;
+ const DBT *control, *rec;
+ u_int *nsitesp, *npeersp;
+ int *missingp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ struct sending_msg msg;
+ REPMGR_SITE *site;
+ REPMGR_IOVECS iovecs;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ u_int nsites, npeers;
+ int eid, full_member, has_missing_peer, ret, sent1, sent2;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Sending a broadcast is quick, because we allow no blocking, so the
+ * exact placement of the timestamp shouldn't much matter. But just in
+ * case, take it before sending, so that if anything we err on the side
+ * of keeping clients placated (i.e., possibly sending a heartbeat
+ * slightly more frequently than necessary).
+ */
+ __os_gettime(env, &db_rep->last_bcast, 1);
+
+ msg.iovecs = &iovecs;
+ setup_sending_msg(env, &msg, msg_hdr_buf, type, control, rec);
+ nsites = npeers = 0;
+ has_missing_peer = FALSE;
+
+ /* Send to (only the main connection with) every site. */
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ sent1 = sent2 = FALSE;
+ site = SITE_FROM_EID(eid);
+
+ /*
+ * Exclude non-member sites, unless we're the master, since it's
+ * useful to keep letting a removed site see updates so that it
+ * learns of its own removal, and will know to rejoin at its
+ * next reboot.
+ */
+ if (site->membership == SITE_PRESENT)
+ full_member = TRUE;
+ else {
+ full_member = FALSE;
+ if (rep->master_id != db_rep->self_eid)
+ goto next;
+ }
+
+ /*
+ * Send message on either or both main connections, as
+ * available.
+ */
+ if ((ret = send_connection(env, type,
+ site->ref.conn.in, &msg, &sent1)) != 0)
+ return (ret);
+ if ((ret = send_connection(env, type,
+ site->ref.conn.out, &msg, &sent2)) != 0)
+ return (ret);
+next:
+ /*
+ * Count how many full-fledged member sites we sent to, and how
+ * many of those were electable peers. These values will be
+ * used by the caller to manage waiting for acks. Ignore
+ * non-full-fledged member sites because we don't accept acks
+ * from them.
+ */
+ if (full_member) {
+ if (sent1 || sent2) {
+ nsites++;
+ if (F_ISSET(site, SITE_ELECTABLE))
+ npeers++;
+ } else {
+ /*
+ * Keep track of whether any of the sites we
+ * failed to send to was an electable peer. If
+ * we don't know a site's electability yet, we
+ * assume the worst in order to be safe.
+ */
+ if (!F_ISSET(site, SITE_HAS_PRIO) ||
+ F_ISSET(site, SITE_ELECTABLE))
+ has_missing_peer = TRUE;
+ }
+ }
+ }
+
+ *nsitesp = nsites;
+ *npeersp = npeers;
+ *missingp = has_missing_peer;
+ return (0);
+}
+
+static int
+send_connection(env, type, conn, msg, sent)
+ ENV *env;
+ u_int type;
+ REPMGR_CONNECTION *conn;
+ struct sending_msg *msg;
+ int *sent;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ static const u_int version_max_msg_type[] = {
+ 0,
+ REPMGR_MAX_V1_MSG_TYPE,
+ REPMGR_MAX_V2_MSG_TYPE,
+ REPMGR_MAX_V3_MSG_TYPE,
+ REPMGR_MAX_V4_MSG_TYPE
+ };
+
+ db_rep = env->rep_handle;
+ *sent = FALSE;
+ if (conn == NULL || !IS_READY_STATE(conn->state))
+ return (0);
+
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(conn->eid) &&
+ conn->version > 0 &&
+ conn->version <= DB_REPMGR_VERSION);
+
+ /*
+ * Skip if the type of message we're sending is beyond the range
+ * of known message types for this connection's version.
+ *
+ * !!!
+ * Don't be misled by the apparent generality of this simple
+ * test. It works currently, because the only kinds of messages
+ * that we broadcast are REP_MESSAGE and HEARTBEAT. But in the
+ * future other kinds of messages might require more intricate
+ * per-connection-version customization (for example,
+ * per-version message format conversion, addition of new
+ * fields, etc.).
+ */
+ if (type > version_max_msg_type[conn->version])
+ return (0);
+
+ /*
+ * Broadcast messages either come from application threads
+ * committing transactions, or are replication status messages
+ * that we can afford to lose. So don't allow blocking for them
+ * (pass the maxblock argument as 0).
+ */
+ if ((ret = __repmgr_send_internal(env, conn, msg, 0)) == 0)
+ *sent = TRUE;
+ else if (ret == DB_TIMEOUT) {
+ /*
+ * Couldn't send because of a full output queue.
+ * Indicating that we sent it would be wrong, but it's
+ * otherwise OK in the sense that the connection isn't
+ * definitively known to be broken, and rep protocol
+ * always allows us to drop a message if we have to.
+ */
+ ret = 0;
+ } else if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ return (ret);
+}
+
+/*
+ * __repmgr_send_one --
+ * Send a message to a site, or if you can't just yet, make a copy of it
+ * and arrange to have it sent later. 'rec' may be NULL, in which case we send
+ * a zero length and no data.
+ *
+ * !!!
+ * Note that the mutex should be held through this call.
+ * It doubles as a synchronizer to make sure that two threads don't
+ * intersperse writes that are part of two single messages.
+ *
+ * PUBLIC: int __repmgr_send_one __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: u_int, const DBT *, const DBT *, db_timeout_t));
+ */
+int
+__repmgr_send_one(env, conn, msg_type, control, rec, maxblock)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int msg_type;
+ const DBT *control, *rec;
+ db_timeout_t maxblock;
+{
+ struct sending_msg msg;
+ REPMGR_IOVECS iovecs;
+ u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ int ret;
+
+ msg.iovecs = &iovecs;
+ setup_sending_msg(env, &msg, hdr_buf, msg_type, control, rec);
+ if ((ret =
+ __repmgr_send_internal(env, conn, &msg, maxblock)) == DB_TIMEOUT &&
+ maxblock == 0)
+ ret = 0;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_many __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, REPMGR_IOVECS *, db_timeout_t));
+ */
+int
+__repmgr_send_many(env, conn, iovecs, maxblock)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS *iovecs;
+ db_timeout_t maxblock;
+{
+ struct sending_msg msg;
+ int ret;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (DB_REP_UNAVAIL);
+ msg.iovecs = iovecs;
+ msg.fmsg = NULL;
+ if ((ret =
+ __repmgr_send_internal(env, conn, &msg, maxblock)) == DB_TIMEOUT &&
+ maxblock == 0)
+ ret = 0;
+ if (ret != 0 && ret != DB_TIMEOUT)
+ (void)__repmgr_disable_connection(env, conn);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_own_msg __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
+ */
+int
+__repmgr_send_own_msg(env, conn, type, buf, len)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int8_t *buf;
+ u_int32_t len, type;
+{
+ REPMGR_IOVECS iovecs;
+ struct sending_msg msg;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+
+ if (conn->version < OWN_MIN_VERSION)
+ return (0);
+ msg_hdr.type = REPMGR_OWN_MSG;
+ REPMGR_OWN_BUF_SIZE(msg_hdr) = len;
+ REPMGR_OWN_MSG_TYPE(msg_hdr) = type;
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, hdr_buf);
+
+ __repmgr_iovec_init(&iovecs);
+ __repmgr_add_buffer(&iovecs, hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ if (len > 0)
+ __repmgr_add_buffer(&iovecs, buf, len);
+
+ msg.iovecs = &iovecs;
+ msg.fmsg = NULL;
+ return (__repmgr_send_internal(env, conn, &msg, 0));
+}
+
+/*
+ * Attempts a "best effort" to send a message on the given site. If there is an
+ * excessive backlog of messages already queued on the connection, what shall we
+ * do? If the caller doesn't mind blocking, we'll wait (a limited amount of
+ * time) for the queue to drain. Otherwise we'll simply drop the message. This
+ * is always allowed by the replication protocol. But in the case of a
+ * multi-message response to a request like PAGE_REQ, LOG_REQ or ALL_REQ we
+ * almost always get a flood of messages that instantly fills our queue, so
+ * blocking improves performance (by avoiding the need for the client to
+ * re-request).
+ */
+static int
+__repmgr_send_internal(env, conn, msg, maxblock)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ struct sending_msg *msg;
+ db_timeout_t maxblock;
+{
+ DB_REP *db_rep;
+ SITE_STRING_BUFFER buffer;
+ int ret;
+ size_t total_written;
+
+ db_rep = env->rep_handle;
+
+ DB_ASSERT(env, conn->state != CONN_DEFUNCT);
+ if (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ /*
+ * Output to this site is currently owned by the select()
+ * thread, so we can't try sending in-line here. We can only
+ * queue the msg for later.
+ */
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "msg to %s to be queued",
+ __repmgr_format_eid_loc(db_rep, conn, buffer)));
+ if (conn->out_queue_length >= OUT_QUEUE_LIMIT &&
+ maxblock > 0 && conn->state != CONN_CONGESTED) {
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "block thread, awaiting output queue space"));
+ conn->ref_count++;
+ ret = __repmgr_await_drain(env, conn, maxblock);
+ conn->ref_count--;
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "drain returned %d (%d,%d)", ret,
+ db_rep->repmgr_status, conn->out_queue_length));
+ if (db_rep->repmgr_status == stopped)
+ return (DB_TIMEOUT);
+ if (ret != 0)
+ return (ret);
+ if (STAILQ_EMPTY(&conn->outbound_queue))
+ goto empty;
+ }
+ if (conn->out_queue_length < OUT_QUEUE_LIMIT)
+ return (enqueue_msg(env, conn, msg, 0));
+ else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "queue limit exceeded"));
+ STAT(env->rep_handle->
+ region->mstat.st_msgs_dropped++);
+ return (DB_TIMEOUT);
+ }
+ }
+empty:
+ if ((ret = __repmgr_write_iovecs(env,
+ conn, msg->iovecs, &total_written)) == 0)
+ return (0);
+ switch (ret) {
+ case WOULDBLOCK:
+#if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK
+ case DB_REPMGR_EAGAIN:
+#endif
+ break;
+ default:
+#ifdef EBADF
+ DB_ASSERT(env, ret != EBADF);
+#endif
+ __repmgr_fire_conn_err_event(env, conn, ret);
+ STAT(env->rep_handle->region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+ }
+
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC, "wrote only %lu bytes to %s",
+ (u_long)total_written,
+ __repmgr_format_eid_loc(db_rep, conn, buffer)));
+ /*
+ * We can't send any more without blocking: queue (a pointer to) a
+ * "flattened" copy of the message, so that the select() thread will
+ * finish sending it later.
+ */
+ if ((ret = enqueue_msg(env, conn, msg, total_written)) != 0)
+ return (ret);
+
+ STAT(env->rep_handle->region->mstat.st_msgs_queued++);
+
+ /*
+ * Wake the main select thread so that it can discover that it has
+ * received ownership of this connection. Note that we didn't have to
+ * do this in the previous case (above), because the non-empty queue
+ * implies that the select() thread is already managing ownership of
+ * this connection.
+ */
+ return (__repmgr_wake_main_thread(env));
+}
+
+/*
+ * PUBLIC: int __repmgr_write_iovecs __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: REPMGR_IOVECS *, size_t *));
+ */
+int
+__repmgr_write_iovecs(env, conn, iovecs, writtenp)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS *iovecs;
+ size_t *writtenp;
+{
+ REPMGR_IOVECS iovec_buf, *v;
+ size_t nw, sz, total_written;
+ int ret;
+
+ /*
+ * Send as much data to the site as we can, without blocking. Keep
+ * writing as long as we're making some progress.
+ *
+ * Make a scratch copy of iovecs for our use, since we destroy it in the
+ * process of adjusting pointers after each partial I/O. The minimal
+ * REPMGR_IOVECS struct template is usually enough. But for app
+ * messages that need more than 3 segments we allocate a separate
+ * buffer.
+ */
+ if (iovecs->count <= MIN_IOVEC) {
+ v = &iovec_buf;
+ sz = sizeof(iovec_buf);
+ } else {
+ sz = (size_t)REPMGR_IOVECS_ALLOC_SZ((u_int)iovecs->count);
+ if ((ret = __os_malloc(env, sz, &v)) != 0)
+ return (ret);
+ }
+ memcpy(v, iovecs, sz);
+
+ total_written = 0;
+ while ((ret = __repmgr_writev(conn->fd, &v->vectors[v->offset],
+ v->count-v->offset, &nw)) == 0) {
+ total_written += nw;
+ if (__repmgr_update_consumed(v, nw)) /* all written */
+ break;
+ }
+ *writtenp = total_written;
+ if (v != &iovec_buf)
+ __os_free(env, v);
+ return (ret);
+}
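+
+/*
+ * A worked example of the consume-in-place loop above: if the scratch
+ * copy holds segments of 9, 100 and 4000 bytes and writev() reports
+ * nw == 150, __repmgr_update_consumed() advances past the first two
+ * segments entirely and trims 41 bytes off the front of the third, so
+ * the next writev() resumes exactly at the unsent remainder.
+ */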
+
+/*
+ * Count up how many sites have ack'ed the given LSN.
+ *
+ * Computes two results: the main result (function's return code) is a boolean
+ * flag indicating whether we've gotten all the acks we need and can therefore
+ * stop waiting for more. The perm->is_durable field determines whether we got
+ * enough acks to consider the transaction durably replicated. These two
+ * results are almost always the same, except when using the ALL_AVAILABLE
+ * policy.
+ *
+ * !!!
+ * Caller must hold the mutex.
+ */
+static int
+got_acks(env, context)
+ ENV *env;
+ void *context;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ struct repmgr_permanence *perm;
+ u_int sites_acked, peers_acked;
+ int done, eid, has_unacked_peer, is_perm, policy;
+
+ db_rep = env->rep_handle;
+ perm = context;
+ policy = perm->policy;
+
+ sites_acked = peers_acked = 0;
+ has_unacked_peer = FALSE;
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+ if (site->membership != SITE_PRESENT)
+ continue;
+ if (!F_ISSET(site, SITE_HAS_PRIO)) {
+ /*
+ * Never connected to this site: since we can't know
+ * whether it's a peer, assume the worst.
+ */
+ has_unacked_peer = TRUE;
+ continue;
+ }
+
+ if (LOG_COMPARE(&site->max_ack, &perm->lsn) >= 0) {
+ sites_acked++;
+ if (F_ISSET(site, SITE_ELECTABLE))
+ peers_acked++;
+ } else {
+ /* This site hasn't ack'ed the message. */
+ if (F_ISSET(site, SITE_ELECTABLE))
+ has_unacked_peer = TRUE;
+ }
+ }
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "checking perm result, %lu, %lu, %d",
+ (u_long)sites_acked, (u_long)peers_acked, has_unacked_peer));
+
+ switch (policy) {
+ case DB_REPMGR_ACKS_ALL:
+ case DB_REPMGR_ACKS_ONE:
+ is_perm = (sites_acked >= perm->threshold);
+ break;
+ case DB_REPMGR_ACKS_ONE_PEER:
+ is_perm = (peers_acked >= perm->threshold);
+ break;
+ case DB_REPMGR_ACKS_QUORUM:
+ case DB_REPMGR_ACKS_ALL_AVAILABLE:
+ is_perm = (peers_acked >= perm->quorum) || !has_unacked_peer;
+ break;
+ case DB_REPMGR_ACKS_ALL_PEERS:
+ is_perm = !has_unacked_peer;
+ break;
+ default:
+ is_perm = FALSE;
+ (void)__db_unknown_path(env, "got_acks");
+ }
+ if (is_perm)
+ perm->is_durable = TRUE;
+ if (policy == DB_REPMGR_ACKS_ALL_AVAILABLE)
+ done = sites_acked >= perm->threshold;
+ else
+ done = is_perm;
+ return (done);
+}
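+
+/*
+ * A sketch of how the two results can diverge under ALL_AVAILABLE:
+ * suppose we sent to three sites (perm->threshold == 3) and
+ * perm->quorum == 2. Once two electable peers have acked, is_durable
+ * is latched TRUE, but "done" stays FALSE until all three sites have
+ * acked or the timeout expires; __repmgr_send() then reports success
+ * because is_durable held.
+ */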
+
+/*
+ * Abandons a connection, to recover from an error. Takes necessary recovery
+ * action. Note that we don't actually close and clean up the connection here;
+ * that happens later, in the select() thread main loop. See further
+ * explanation at function __repmgr_disable_connection().
+ *
+ * Idempotent.
+ *
+ * PUBLIC: int __repmgr_bust_connection __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * !!!
+ * Caller holds mutex.
+ */
+int
+__repmgr_bust_connection(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ u_int32_t flags;
+ int ret, eid;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (0);
+ eid = conn->eid;
+ if ((ret = __repmgr_disable_connection(env, conn)) != 0)
+ return (ret);
+
+ /*
+ * When we have lost the connection to another site, take any/all
+ * appropriate recovery steps. But what does it mean to lose "the"
+ * connection, now that we actually have various different kinds of
+ * connection?
+ *
+ * 1. We're only talking about "rep" connections. Connections backing
+ * user channels aren't of concern here.
+ * 2. Subordinate connections are also not of concern here.
+ * 3. If we have two "main" connections with a given remote site (one
+ * incoming and the other outgoing), then if we lose one we still
+ * have the other. So, we still "have a connection" with the remote
+ * site.
+ *
+ * Finally, the appropriate recovery steps also depend on the current
+ * replication role (master/client) of both the local site and the
+ * remote site.
+ */
+ if (conn->type != REP_CONNECTION || !IS_KNOWN_REMOTE_SITE(eid))
+ goto out;
+
+ site = SITE_FROM_EID(eid);
+ /*
+ * When closing one of our main connections ("in" or "out"), if we still
+ * have the other one present, then we still consider ourselves to be
+ * connected, so there's nothing more to do. But if we have now become
+ * "not connected", we have some recovery steps to do. (Note that we
+ * don't care at all about subordinate connections, for the purposes of
+ * recovery steps.)
+ */
+ if (conn == site->ref.conn.in) {
+ site->ref.conn.in = NULL;
+ if (site->ref.conn.out != NULL) /* We're still connected. */
+ goto out;
+ } else if (conn == site->ref.conn.out) {
+ site->ref.conn.out = NULL;
+ if (site->ref.conn.in != NULL)
+ goto out;
+ } else /* Subordinate connection. */
+ goto out;
+
+ if ((ret = __repmgr_schedule_connection_attempt(env, eid, FALSE)) != 0)
+ goto out;
+
+ /*
+ * If the failed connection was the one between us and the
+ * master, assume that the master may have failed, and call for
+ * an election. But only do this for the connection to the main
+ * master process, not a subordinate one. And only do it if
+ * we're our site's main process, not a subordinate one. And
+ * skip it if the application has configured us not to do
+ * elections.
+ */
+ if (!IS_SUBORDINATE(db_rep) && eid == rep->master_id) {
+ /*
+ * Even if we're not doing elections, defer the event
+ * notification to later execution in the election
+ * thread. We don't want to fire an event in the select
+ * thread, and certainly not while holding the mutex.
+ */
+ flags = ELECT_F_EVENT_NOTIFY;
+ if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS))
+ LF_SET(ELECT_F_IMMED | ELECT_F_FAST);
+ else
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Master failure, but no elections"));
+
+ if ((ret = __repmgr_init_election(env, flags)) != 0)
+ goto out;
+ }
+
+ /*
+ * If we're the master site, and we lose a main connection to a
+ * client (whether we're the main replication process or a
+ * subordinate process), then the client is going to have
+ * trouble receiving live log records from us. So, set the
+ * temporary log archive block timer, to give the client a
+ * fighting chance to restart/recover/reconnect. (We don't care
+ * about the client's subordinate connections to us -- i.e.,
+ * connections with a subordinate process at the client site --
+ * because those sites can only be reading, not applying updates
+ * from us.)
+ */
+ if (rep->master_id == db_rep->self_eid) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Repmgr: bust connection. Block archive"));
+ MASTER_UPDATE(env, (REGENV *)env->reginfo->primary);
+ }
+out:
+ return (ret);
+}
+
+/*
+ * Removes a connection from any further activity, making sure it ends up on
+ * the main connections list so that it will be cleaned up at the next
+ * opportunity in the select() thread.
+ *
+ * Various threads write onto TCP/IP sockets, and an I/O error could occur at
+ * any time. However, only the dedicated "select()" thread may close the socket
+ * file descriptor, because under POSIX we have to drop our mutex and then call
+ * select() as two distinct (non-atomic) operations.
+ *
+ * To simplify matters, there is a single place in the select thread where we
+ * close and clean up after any defunct connection. Even if the I/O error
+ * happens in the select thread we follow this convention.
+ *
+ * When an error occurs, we disable the connection (mark it defunct so that no
+ * one else will try to use it, and so that the select thread will find it and
+ * clean it up), and then usually take some additional recovery action: schedule
+ * a connection retry for later, and possibly call for an election if it was a
+ * connection to the master. (This happens in the function
+ * __repmgr_bust_connection.) But sometimes we don't want to do the recovery
+ * part; just the disabling part.
+ *
+ * PUBLIC: int __repmgr_disable_connection __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_disable_connection(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RESPONSE *resp;
+ u_int32_t i;
+ int eid, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ conn->state = CONN_DEFUNCT;
+ if (conn->type == REP_CONNECTION) {
+ eid = conn->eid;
+ if (IS_VALID_EID(eid)) {
+ site = SITE_FROM_EID(eid);
+ if (conn != site->ref.conn.in &&
+ conn != site->ref.conn.out)
+ /* It's a subordinate connection. */
+ TAILQ_REMOVE(&site->sub_conns, conn, entries);
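+			/*
+			 * Move the conn onto the main connections list and
+			 * take a reference on behalf of that list; the
+			 * select() thread drops it again in
+			 * __repmgr_cleanup_defunct().
+			 */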
+ TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
+ conn->ref_count++;
+ }
+ conn->eid = -1;
+ } else if (conn->type == APP_CONNECTION) {
+ for (i = 0; i < conn->aresp; i++) {
+ resp = &conn->responses[i];
+ if (F_ISSET(resp, RESP_IN_USE) &&
+ F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+ resp->ret = DB_REP_UNAVAIL;
+ }
+ }
+ ret = __repmgr_wake_waiters(env, &conn->response_waiters);
+ }
+ if ((t_ret = __repmgr_signal(&conn->drained)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_wake_main_thread(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_cleanup_defunct __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * Caller should hold mutex, since we remove connection from main list.
+ */
+int
+__repmgr_cleanup_defunct(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ ret = __repmgr_close_connection(env, conn);
+
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_close_connection __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_close_connection(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ int ret;
+#ifdef DB_WIN32
+ int t_ret;
+#endif
+
+ ret = 0;
+ if (conn->fd != INVALID_SOCKET &&
+ closesocket(conn->fd) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3582", "closing socket"));
+ }
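+	/* Reset the fd unconditionally, so we never try to close it twice. */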
+ conn->fd = INVALID_SOCKET;
+#ifdef DB_WIN32
+ if (conn->event_object != WSA_INVALID_EVENT &&
+ !WSACloseEvent(conn->event_object)) {
+ t_ret = net_errno;
+ __db_err(env, t_ret, DB_STR("3583",
+ "releasing WSA event object"));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ conn->event_object = WSA_INVALID_EVENT;
+#endif
+ return (ret);
+}
+
+/*
+ * Decrements a connection's ref count; destroys the connection when the ref
+ * count reaches zero.
+ *
+ * PUBLIC: int __repmgr_decr_conn_ref __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_decr_conn_ref(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_ASSERT(env, conn->ref_count > 0);
+ return (--conn->ref_count > 0 ? 0 :
+ __repmgr_destroy_conn(env, conn));
+}
+
+/*
+ * Destroys a conn struct, by freeing all memory and associated resources.
+ * (This is a destructor, so it must always run to completion, and of course the
+ * passed-in object no longer exists upon return.)
+ *
+ * PUBLIC: int __repmgr_destroy_conn __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * Caller is responsible for holding mutex if necessary; we make no assumption
+ * here, since we operate only on the given connection, in isolation. (However,
+ * note that if this conn has messages on its outbound queue, those are shared
+ * objects, and we decrement the ref count. So in that case the mutex will need
+ * to be held.)
+ */
+int
+__repmgr_destroy_conn(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ QUEUED_OUTPUT *out;
+ REPMGR_FLAT *msg;
+ REPMGR_RESPONSE *resp;
+ DBT *dbt;
+ int ret, t_ret;
+
+ ret = 0;
+
+ DB_ASSERT(env, conn->ref_count == 0);
+ /*
+ * Deallocate any input and output buffers we may have.
+ */
+ if (conn->reading_phase == DATA_PHASE) {
+ switch (conn->msg_type) {
+ case REPMGR_OWN_MSG:
+ if (conn->input.rep_message == NULL)
+ break;
+ /* FALLTHROUGH */
+ case REPMGR_APP_MESSAGE:
+ case REPMGR_HEARTBEAT:
+ case REPMGR_REP_MESSAGE:
+ __os_free(env, conn->input.rep_message);
+ break;
+
+ case REPMGR_APP_RESPONSE:
+ /*
+ * DATA_PHASE of an APP_RESPONSE is another way of
+ * saying there must be a cur_resp, and it must be
+ * READING.
+ */
+ DB_ASSERT(env, conn->cur_resp < conn->aresp &&
+ conn->responses != NULL);
+ resp = &conn->responses[conn->cur_resp];
+ DB_ASSERT(env, F_ISSET(resp, RESP_READING));
+ if (F_ISSET(resp, RESP_DUMMY_BUF))
+ __os_free(env, resp->dbt.data);
+ break;
+
+ case REPMGR_PERMLSN:
+ case REPMGR_HANDSHAKE:
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ dbt = &conn->input.repmgr_msg.rec;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ break;
+
+ case REPMGR_RESP_ERROR:
+ /*
+ * This type doesn't use a DATA_PHASE, so this should be
+ * impossible.
+ */
+ default:
+ ret = __db_unknown_path(env, "destroy_conn");
+ }
+ }
+
+ if (conn->type == APP_CONNECTION && conn->responses != NULL)
+ __os_free(env, conn->responses);
+
+ if ((t_ret = __repmgr_destroy_waiters(env,
+ &conn->response_waiters)) != 0 && ret == 0)
+ ret = t_ret;
+
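+	/*
+	 * Release this conn's references to any messages still queued for
+	 * output, freeing each message when its last reference is dropped.
+	 */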
+ while (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ out = STAILQ_FIRST(&conn->outbound_queue);
+ STAILQ_REMOVE_HEAD(&conn->outbound_queue, entries);
+ msg = out->msg;
+ if (--msg->ref_count <= 0)
+ __os_free(env, msg);
+ __os_free(env, out);
+ }
+ if ((t_ret = __repmgr_free_cond(&conn->drained)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ __os_free(env, conn);
+ return (ret);
+}
+
+static int
+enqueue_msg(env, conn, msg, offset)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ struct sending_msg *msg;
+ size_t offset;
+{
+ QUEUED_OUTPUT *q_element;
+ int ret;
+
+ if (msg->fmsg == NULL && ((ret = flatten(env, msg)) != 0))
+ return (ret);
+ if ((ret = __os_malloc(env, sizeof(QUEUED_OUTPUT), &q_element)) != 0)
+ return (ret);
+ q_element->msg = msg->fmsg;
+ msg->fmsg->ref_count++; /* encapsulation would be sweeter */
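+	/*
+	 * Note: the offset is evidently the number of bytes of this message
+	 * already written to the socket, letting a later write resume where a
+	 * partial send left off.
+	 */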
+ q_element->offset = offset;
+
+ /* Put it on the connection's outbound queue. */
+ STAILQ_INSERT_TAIL(&conn->outbound_queue, q_element, entries);
+ conn->out_queue_length++;
+ return (0);
+}
+
+/*
+ * Either "control" or "rec" (or both) may be NULL, in which case we treat it
+ * like a zero-length DBT.
+ */
+static void
+setup_sending_msg(env, msg, hdr_buf, type, control, rec)
+ ENV *env;
+ struct sending_msg *msg;
+ u_int8_t *hdr_buf;
+ u_int type;
+ const DBT *control, *rec;
+{
+ __repmgr_msg_hdr_args msg_hdr;
+
+ /*
+ * Since we know that the msg hdr is a fixed size, we can add its buffer
+ * to the iovecs before actually marshaling the content. But the
+ * add_buffer and add_dbt calls have to be in the right order.
+ */
+ __repmgr_iovec_init(msg->iovecs);
+ __repmgr_add_buffer(msg->iovecs, hdr_buf, __REPMGR_MSG_HDR_SIZE);
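+	/*
+	 * On the wire, the fixed-size header is followed by the optional
+	 * control and rec buffers; their sizes travel in the header, so the
+	 * receiver knows how much of each to expect.
+	 */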
+
+ msg_hdr.type = type;
+
+ if ((REP_MSG_CONTROL_SIZE(msg_hdr) =
+ (control == NULL ? 0 : control->size)) > 0)
+ __repmgr_add_dbt(msg->iovecs, control);
+
+ if ((REP_MSG_REC_SIZE(msg_hdr) = (rec == NULL ? 0 : rec->size)) > 0)
+ __repmgr_add_dbt(msg->iovecs, rec);
+
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, hdr_buf);
+ msg->fmsg = NULL;
+}
+
+/*
+ * Converts a message stored as iovec pointers to various pieces into
+ * flattened form by copying all the pieces into one buffer, then points the
+ * iovec at the new, simplified form.
+ */
+static int
+flatten(env, msg)
+ ENV *env;
+ struct sending_msg *msg;
+{
+ u_int8_t *p;
+ size_t msg_size;
+ int i, ret;
+
+ DB_ASSERT(env, msg->fmsg == NULL);
+
+ msg_size = msg->iovecs->total_bytes;
+ if ((ret = __os_malloc(env, sizeof(*msg->fmsg) + msg_size,
+ &msg->fmsg)) != 0)
+ return (ret);
+ msg->fmsg->length = msg_size;
+ msg->fmsg->ref_count = 0;
+ p = &msg->fmsg->data[0];
+
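+	/* Copy each iovec segment, in order, into the single flat buffer. */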
+ for (i = 0; i < msg->iovecs->count; i++) {
+ memcpy(p, msg->iovecs->vectors[i].iov_base,
+ msg->iovecs->vectors[i].iov_len);
+ p = &p[msg->iovecs->vectors[i].iov_len];
+ }
+ __repmgr_iovec_init(msg->iovecs);
+ __repmgr_add_buffer(msg->iovecs, &msg->fmsg->data[0], msg_size);
+ return (0);
+}
+
+/*
+ * Scan the list of remote sites, returning the first one that is a peer,
+ * is not the current master, and is available.
+ */
+static REPMGR_SITE *
+__repmgr_find_available_peer(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = &db_rep->sites[i];
+ if (FLD_ISSET(site->config, DB_REPMGR_PEER) &&
+ EID_FROM_SITE(site) != rep->master_id &&
+ site->state == SITE_CONNECTED &&
+ (((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY) ||
+ ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY)))
+ return (site);
+ }
+ return (NULL);
+}
+
+/*
+ * Copy host/port values into the given netaddr struct. Allocates memory for
+ * the copy of the host name, which becomes the responsibility of the caller.
+ *
+ * PUBLIC: int __repmgr_pack_netaddr __P((ENV *, const char *,
+ * PUBLIC: u_int, repmgr_netaddr_t *));
+ */
+int
+__repmgr_pack_netaddr(env, host, port, addr)
+ ENV *env;
+ const char *host;
+ u_int port;
+ repmgr_netaddr_t *addr;
+{
+ int ret;
+
+ DB_ASSERT(env, host != NULL);
+
+ if ((ret = __os_strdup(env, host, &addr->host)) != 0)
+ return (ret);
+ addr->port = (u_int16_t)port;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_getaddr __P((ENV *,
+ * PUBLIC: const char *, u_int, int, ADDRINFO **));
+ */
+int
+__repmgr_getaddr(env, host, port, flags, result)
+ ENV *env;
+ const char *host;
+ u_int port;
+ int flags; /* Matches struct addrinfo declaration. */
+ ADDRINFO **result;
+{
+ ADDRINFO *answer, hints;
+ char buffer[10]; /* 2**16 fits in 5 digits. */
+
+ /*
+ * Ports are really 16-bit unsigned values, but it's too painful to
+ * push that type through the API.
+ */
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_flags = flags;
+ (void)snprintf(buffer, sizeof(buffer), "%u", port);
+
+ /*
+ * Although it's generally bad to discard error information, the return
+ * code from __os_getaddrinfo is undependable. Our callers at least
+ * would like to be able to distinguish errors in getaddrinfo (which we
+ * want to consider re-tryable) from other kinds of failure (e.g., EINVAL).
+ */
+ if (__os_getaddrinfo(env, host, port, buffer, &hints, &answer) != 0)
+ return (DB_REP_UNAVAIL);
+ *result = answer;
+
+ return (0);
+}
+
+/*
+ * Initialize a socket for listening. Sets a file descriptor for the socket,
+ * ready for an accept() call in a thread that we're happy to let block.
+ *
+ * PUBLIC: int __repmgr_listen __P((ENV *));
+ */
+int
+__repmgr_listen(env)
+ ENV *env;
+{
+ ADDRINFO *ai;
+ DB_REP *db_rep;
+ repmgr_netaddr_t *addrp;
+ char *why;
+ int sockopt, ret;
+ socket_t s;
+
+ db_rep = env->rep_handle;
+
+ /* Use OOB value as sentinel to show no socket open. */
+ s = INVALID_SOCKET;
+
+ addrp = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+ if ((ret = __repmgr_getaddr(env,
+ addrp->host, addrp->port, AI_PASSIVE, &ai)) != 0)
+ return (ret);
+
+ /*
+ * Given that the assert below holds, we execute the loop at least once,
+ * which means 'why' will have been set by the time it's needed. But of
+ * course lint doesn't know about DB_ASSERT.
+ */
+ COMPQUIET(why, "");
+ DB_ASSERT(env, ai != NULL);
+ for (; ai != NULL; ai = ai->ai_next) {
+
+ if ((s = socket(ai->ai_family,
+ ai->ai_socktype, ai->ai_protocol)) == INVALID_SOCKET) {
+ why = DB_STR("3584", "can't create listen socket");
+ continue;
+ }
+
+ /*
+ * When testing, it's common to kill and restart regularly. On
+ * some systems, this causes bind to fail with "address in use"
+ * errors unless this option is set.
+ */
+ sockopt = 1;
+ if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (sockopt_t)&sockopt,
+ sizeof(sockopt)) != 0) {
+ why = DB_STR("3585",
+ "can't set REUSEADDR socket option");
+ break;
+ }
+
+ if (bind(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) != 0) {
+ why = DB_STR("3586",
+ "can't bind socket to listening address");
+ ret = net_errno;
+ (void)closesocket(s);
+ s = INVALID_SOCKET;
+ continue;
+ }
+
+ if (listen(s, 5) != 0) {
+ why = DB_STR("3587", "listen()");
+ break;
+ }
+
+ if ((ret = __repmgr_set_nonblocking(s)) != 0) {
+ __db_err(env, ret, DB_STR("3588",
+ "can't unblock listen socket"));
+ goto clean;
+ }
+
+ db_rep->listen_fd = s;
+ goto out;
+ }
+
+ if (ret == 0)
+ ret = net_errno;
+ __db_err(env, ret, "%s", why);
+clean: if (s != INVALID_SOCKET)
+ (void)closesocket(s);
+out:
+ __os_freeaddrinfo(env, ai);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_net_close __P((ENV *));
+ */
+int
+__repmgr_net_close(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ u_int eid;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __repmgr_each_connection(env, final_cleanup, NULL,
+ FALSE)) == 0) {
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+ site->ref.conn.in = NULL;
+ site->ref.conn.out = NULL;
+ }
+ }
+
+ if (db_rep->listen_fd != INVALID_SOCKET) {
+ if (closesocket(db_rep->listen_fd) == SOCKET_ERROR && ret == 0)
+ ret = net_errno;
+ db_rep->listen_fd = INVALID_SOCKET;
+ rep->listener = 0;
+ }
+ return (ret);
+}
+
+/* Called only from env->close(), so we know we're single threaded. */
+static int
+final_cleanup(env, conn, unused)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *unused;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+ db_rep = env->rep_handle;
+
+ ret = __repmgr_close_connection(env, conn);
+ /* Remove the connection from whatever list it's on, if any. */
+ if (conn->type == REP_CONNECTION && IS_VALID_EID(conn->eid)) {
+ site = SITE_FROM_EID(conn->eid);
+
+ if (site->state == SITE_CONNECTED &&
+ (conn == site->ref.conn.in || conn == site->ref.conn.out)) {
+ /* Not on any list, so no need to do anything. */
+ } else
+ TAILQ_REMOVE(&site->sub_conns, conn, entries);
+ t_ret = __repmgr_destroy_conn(env, conn);
+
+ } else {
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ t_ret = __repmgr_decr_conn_ref(env, conn);
+ }
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: void __repmgr_net_destroy __P((ENV *, DB_REP *));
+ */
+void
+__repmgr_net_destroy(env, db_rep)
+ ENV *env;
+ DB_REP *db_rep;
+{
+ REPMGR_RETRY *retry;
+
+ while (!TAILQ_EMPTY(&db_rep->retries)) {
+ retry = TAILQ_FIRST(&db_rep->retries);
+ TAILQ_REMOVE(&db_rep->retries, retry, entries);
+ __os_free(env, retry);
+ }
+
+ DB_ASSERT(env, TAILQ_EMPTY(&db_rep->connections));
+}
+
+#ifdef CONFIG_TEST
+/*
+ * Substitute a fake target port instead of the port actually configured, for
+ * certain types of testing, if desired.
+ *
+ * When a DB_TEST_FAKE_PORT environment variable is present, it names a TCP/IP
+ * port on which a "port arbiter" service may be running. If it is indeed
+ * running, we should send it a request to ask it what "fake" port to use in
+ * place of the given "real" port. (The "real" port is the port normally
+ * configured, and present in the membership database.) The arbiter is not
+ * always running for all tests, so if it's not present it simply means we
+ * should not substitute a fake port. Also, even if it is running, in some
+ * tests we don't want to substitute a fake port: in that case, the arbiter's
+ * response could name the same port as the "real" port we sent it.
+ *
+ * !!! This is only used for testing.
+ */
+static u_int
+fake_port(env, port)
+ ENV *env;
+ u_int port;
+{
+#define MIN_PORT 1
+#define MAX_PORT 65535
+ ADDRINFO *ai0, *ai;
+ db_iovec_t iovec;
+ char *arbiter, buf[100], *end, *p;
+ socket_t s;
+ long result;
+ size_t count;
+ int ret;
+ u_int arbiter_port;
+
+ if ((arbiter = getenv("DB_TEST_FAKE_PORT")) == NULL)
+ return (port);
+ if (__db_getlong(env->dbenv, "repmgr_net.c:fake_port",
+ arbiter, MIN_PORT, MAX_PORT, &result) != 0)
+ return (port);
+ arbiter_port = (u_int)result;
+
+ /*
+ * Send a message of the form "{config,Port}" onto a connection to
+ * arbiter_port.
+ */
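+	/* For example, for port 6000 the request is "{config,6000}\r\n". */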
+ if ((ret = __repmgr_getaddr(env,
+ "localhost", arbiter_port, 0, &ai0)) != 0) {
+ __db_err(env, ret, "fake_port:getaddr");
+ return (port);
+ }
+ s = INVALID_SOCKET;
+ for (ai = ai0; ai != NULL; ai = ai->ai_next) {
+ if ((s = socket(ai->ai_family,
+ ai->ai_socktype, ai->ai_protocol)) == INVALID_SOCKET) {
+ ret = net_errno;
+ s = INVALID_SOCKET;
+ __db_err(env, ret, "fake_port:socket");
+ goto err;
+ }
+ /*
+ * Note that port substitution is used in only a small number of
+ * tests. When there is no "port arbiter" running, it's not an
+ * error; it just means we should use the normal configured port
+ * as is.
+ */
+ if (connect(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) == 0) {
+ ret = 0; /* Connected; use this socket. */
+ break;
+ }
+ ret = net_errno;
+ (void)closesocket(s);
+ s = INVALID_SOCKET;
+ }
+ if (ret != 0)
+ goto err;
+ (void)snprintf(buf, sizeof(buf), "{config,%u}\r\n", port);
+ iovec.iov_base = buf;
+ iovec.iov_len = (u_long)strlen(buf);
+ while ((ret = __repmgr_writev(s, &iovec, 1, &count)) == 0) {
+ iovec.iov_base = (u_int8_t *)iovec.iov_base + count;
+ if ((iovec.iov_len -= (u_long)count) == 0)
+ break;
+ }
+ if (ret != 0) {
+ __db_err(env, ret, "fake_port:writev");
+ goto err;
+ }
+
+ /* The response should be a line telling us what port to use. */
+ iovec.iov_base = buf;
+ iovec.iov_len = sizeof(buf);
+ p = buf;
+ while ((ret = __repmgr_readv(s, &iovec, 1, &count)) == 0) {
+ if (count == 0) {
+ __db_errx(env, "fake_port: premature EOF");
+ goto err;
+ }
+ /* Keep reading until we get a line end. */
+ for (p = iovec.iov_base, end = &p[count]; p < end; p++)
+ if (*p == '\r' || *p == '\n')
+ break;
+ if (p < end) {
+ *p = '\0';
+ break;
+ }
+ iovec.iov_base = (u_int8_t *)iovec.iov_base + count;
+ iovec.iov_len -= (u_long)count;
+ DB_ASSERT(env, iovec.iov_len > 0);
+ }
+ if (ret != 0)
+ goto err;
+
+ if (__db_getlong(env->dbenv, "repmgr_net.c:fake_port",
+ buf, MIN_PORT, MAX_PORT, &result) == 0)
+ port = (u_int)result;
+
+err:
+ /*
+ * Note that we always return some port value, even if an error happens.
+ * Since this is just test code, an error that prevented proper fake port
+ * substitution should simply show up as a test failure.
+ */
+ if (s != INVALID_SOCKET)
+ (void)closesocket(s);
+ __os_freeaddrinfo(env, ai0);
+ return (port);
+}
+#endif
diff --git a/src/repmgr/repmgr_posix.c b/src/repmgr/repmgr_posix.c
new file mode 100644
index 00000000..0687681a
--- /dev/null
+++ b/src/repmgr/repmgr_posix.c
@@ -0,0 +1,804 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Invalid file descriptor value that can be used as an out-of-band sentinel
+ * to mark our signalling pipe as unopened.
+ */
+#define NO_SUCH_FILE_DESC (-1)
+
+/* Aggregated control info needed for preparing for select() call. */
+struct io_info {
+ fd_set *reads, *writes;
+ int maxfd;
+};
+
+static int __repmgr_conn_work __P((ENV *, REPMGR_CONNECTION *, void *));
+static int prepare_io __P((ENV *, REPMGR_CONNECTION *, void *));
+
+/*
+ * Starts the thread described in the argument, and stores the resulting thread
+ * ID therein.
+ *
+ * PUBLIC: int __repmgr_thread_start __P((ENV *, REPMGR_RUNNABLE *));
+ */
+int
+__repmgr_thread_start(env, runnable)
+ ENV *env;
+ REPMGR_RUNNABLE *runnable;
+{
+ pthread_attr_t *attrp;
+#if defined(_POSIX_THREAD_ATTR_STACKSIZE) && defined(DB_STACKSIZE)
+ pthread_attr_t attributes;
+ size_t size;
+ int ret;
+
+ attrp = &attributes;
+ if ((ret = pthread_attr_init(&attributes)) != 0) {
+ __db_err(env, ret, DB_STR("3630",
+ "pthread_attr_init in repmgr_thread_start"));
+ return (ret);
+ }
+
+ size = DB_STACKSIZE;
+
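+	/*
+	 * Use the configured stack size, but never less than the system's
+	 * minimum thread stack size, where one is defined.
+	 */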
+#ifdef PTHREAD_STACK_MIN
+ if (size < PTHREAD_STACK_MIN)
+ size = PTHREAD_STACK_MIN;
+#endif
+ if ((ret = pthread_attr_setstacksize(&attributes, size)) != 0) {
+ __db_err(env, ret, DB_STR("3631",
+ "pthread_attr_setstacksize in repmgr_thread_start"));
+ return (ret);
+ }
+#else
+ attrp = NULL;
+#endif
+
+ runnable->finished = FALSE;
+ runnable->quit_requested = FALSE;
+ runnable->env = env;
+
+ return (pthread_create(&runnable->thread_id, attrp,
+ runnable->run, runnable));
+}
+
+/*
+ * PUBLIC: int __repmgr_thread_join __P((REPMGR_RUNNABLE *));
+ */
+int
+__repmgr_thread_join(thread)
+ REPMGR_RUNNABLE *thread;
+{
+ return (pthread_join(thread->thread_id, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_set_nonblock_conn __P((REPMGR_CONNECTION *));
+ */
+int
+__repmgr_set_nonblock_conn(conn)
+ REPMGR_CONNECTION *conn;
+{
+ return (__repmgr_set_nonblocking(conn->fd));
+}
+
+/*
+ * PUBLIC: int __repmgr_set_nonblocking __P((socket_t));
+ */
+int
+__repmgr_set_nonblocking(fd)
+ socket_t fd;
+{
+ int flags;
+
+ if ((flags = fcntl(fd, F_GETFL, 0)) < 0)
+ return (errno);
+ if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0)
+ return (errno);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_wake_waiters __P((ENV *, waiter_t *));
+ *
+ * Wake any "waiter" threads (either sending threads waiting for acks, or
+ * channel users waiting for response to request).
+ *
+ * !!!
+ * Caller must hold the db_rep->mutex, if this thread synchronization is to work
+ * properly.
+ */
+int
+__repmgr_wake_waiters(env, waiter)
+ ENV *env;
+ waiter_t *waiter;
+{
+ COMPQUIET(env, NULL);
+ return (pthread_cond_broadcast(waiter));
+}
+
+/*
+ * Waits a limited time for a condition to become true. (If the limit is 0 we
+ * wait forever.) All calls share just the one db_rep->mutex, but use whatever
+ * waiter_t the caller passes us.
+ *
+ * PUBLIC: int __repmgr_await_cond __P((ENV *,
+ * PUBLIC: PREDICATE, void *, db_timeout_t, waiter_t *));
+ */
+int
+__repmgr_await_cond(env, pred, ctx, timeout, wait_condition)
+ ENV *env;
+ PREDICATE pred;
+ void *ctx;
+ db_timeout_t timeout;
+ waiter_t *wait_condition;
+{
+ DB_REP *db_rep;
+ struct timespec deadline;
+ int ret, timed;
+
+ db_rep = env->rep_handle;
+ if ((timed = (timeout > 0)))
+ __repmgr_compute_wait_deadline(env, &deadline, timeout);
+ else
+ COMPQUIET(deadline.tv_sec, 0);
+
+ while (!(*pred)(env, ctx)) {
+ if (timed)
+ ret = pthread_cond_timedwait(wait_condition,
+ db_rep->mutex, &deadline);
+ else
+ ret = pthread_cond_wait(wait_condition, db_rep->mutex);
+ if (db_rep->repmgr_status == stopped)
+ return (DB_REP_UNAVAIL);
+ if (ret == ETIMEDOUT)
+ return (DB_TIMEOUT);
+ if (ret != 0)
+ return (ret);
+ }
+ return (0);
+}
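+
+/*
+ * A caller packages its wake-up test as a PREDICATE function and invokes us
+ * while holding db_rep->mutex.  A hypothetical sketch (not code from this
+ * file):
+ *
+ *	static int
+ *	queue_nonempty(env, ctx)
+ *		ENV *env;
+ *		void *ctx;
+ *	{
+ *		return (!STAILQ_EMPTY(&env->rep_handle->input_queue.header));
+ *	}
+ *
+ *	ret = __repmgr_await_cond(env, queue_nonempty, NULL,
+ *	    timeout, &db_rep->ack_waiters);
+ */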
+
+/*
+ * Waits for an in-progress membership DB operation (if any) to complete.
+ *
+ * PUBLIC: int __repmgr_await_gmdbop __P((ENV *));
+ *
+ * Caller holds mutex; we drop it while waiting.
+ */
+int
+__repmgr_await_gmdbop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ while (db_rep->gmdb_busy)
+ if ((ret = pthread_cond_wait(&db_rep->gmdb_idle,
+ db_rep->mutex)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * __repmgr_compute_wait_deadline --
+ * Computes a deadline time a certain distance into the future.
+ *
+ * PUBLIC: void __repmgr_compute_wait_deadline __P((ENV*,
+ * PUBLIC: struct timespec *, db_timeout_t));
+ */
+void
+__repmgr_compute_wait_deadline(env, result, wait)
+ ENV *env;
+ struct timespec *result;
+ db_timeout_t wait;
+{
+ /*
+ * The result is suitable for the pthread_cond_timedwait call. (That
+ * call uses nano-second resolution; elsewhere we use microseconds.)
+ *
+ * Start with "now"; then add the "wait" offset.
+ *
+ * A db_timespec is the same as a "struct timespec" so we can pass
+ * result directly to the underlying Berkeley DB OS routine.
+ *
+ * !!!
+ * We use the system clock for the pthread_cond_timedwait call, but
+ * that's not optimal on systems with monotonic timers. Instead,
+ * we should call pthread_condattr_setclock on systems where it and
+ * monotonic timers are available, and then configure both this call
+ * and the subsequent pthread_cond_timedwait call to use a monotonic
+ * timer.
+ */
+ __os_gettime(env, (db_timespec *)result, 0);
+ TIMESPEC_ADD_DB_TIMEOUT(result, wait);
+}
+
+/*
+ * PUBLIC: int __repmgr_await_drain __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, db_timeout_t));
+ *
+ * Waits for space to become available on the connection's output queue.
+ * Various ways we can exit:
+ *
+ * 1. queue becomes non-full
+ * 2. exceed time limit
+ * 3. connection becomes defunct (due to error in another thread)
+ * 4. repmgr is shutting down
+ * 5. any unexpected system resource failure
+ *
+ * In cases #3 and #5 we return an error code. Caller is responsible for
+ * distinguishing the remaining cases if desired, though we do help with #2 by
+ * showing the connection as congested.
+ *
+ * !!!
+ * Caller must hold repmgr->mutex.
+ */
+int
+__repmgr_await_drain(env, conn, timeout)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ struct timespec deadline;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ __repmgr_compute_wait_deadline(env, &deadline, timeout);
+
+ ret = 0;
+ while (conn->out_queue_length >= OUT_QUEUE_LIMIT) {
+ ret = pthread_cond_timedwait(&conn->drained,
+ db_rep->mutex, &deadline);
+ switch (ret) {
+ case 0:
+ if (db_rep->repmgr_status == stopped)
+ goto out; /* #4. */
+ /*
+ * Another thread could have stumbled into an error on
+ * the socket while we were waiting.
+ */
+ if (conn->state == CONN_DEFUNCT) {
+ ret = DB_REP_UNAVAIL; /* #3. */
+ goto out;
+ }
+ break;
+ case ETIMEDOUT:
+ conn->state = CONN_CONGESTED;
+ ret = 0;
+ goto out; /* #2. */
+ default:
+ goto out; /* #5. */
+ }
+ }
+ /* #1. */
+
+out:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_alloc_cond __P((cond_var_t *));
+ *
+ * Initialize a condition variable (in allocated space).
+ */
+int
+__repmgr_alloc_cond(c)
+ cond_var_t *c;
+{
+ return (pthread_cond_init(c, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_free_cond __P((cond_var_t *));
+ *
+ * Clean up a previously initialized condition variable.
+ */
+int
+__repmgr_free_cond(c)
+ cond_var_t *c;
+{
+ return (pthread_cond_destroy(c));
+}
+
+/*
+ * PUBLIC: void __repmgr_env_create_pf __P((DB_REP *));
+ */
+void
+__repmgr_env_create_pf(db_rep)
+ DB_REP *db_rep;
+{
+ db_rep->read_pipe = db_rep->write_pipe = NO_SUCH_FILE_DESC;
+}
+
+/*
+ * "Platform"-specific mutex creation function.
+ *
+ * PUBLIC: int __repmgr_create_mutex_pf __P((mgr_mutex_t *));
+ */
+int
+__repmgr_create_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_init(mutex, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_destroy_mutex_pf __P((mgr_mutex_t *));
+ */
+int
+__repmgr_destroy_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_destroy(mutex));
+}
+
+/*
+ * PUBLIC: int __repmgr_init __P((ENV *));
+ */
+int
+__repmgr_init(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ struct sigaction sigact;
+ int ack_inited, elect_inited, file_desc[2], gmdb_inited, queue_inited;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Make sure we're not ignoring SIGPIPE, because otherwise we'd be killed
+ * just for trying to write onto a socket that had been reset. Note
+ * that we don't undo this in case of a later error, since we document
+ * that we leave the signal handling state like this, even after env
+ * close.
+ */
+ if (sigaction(SIGPIPE, NULL, &sigact) == -1) {
+ ret = errno;
+ __db_err(env, ret, DB_STR("3632",
+ "can't access signal handler"));
+ return (ret);
+ }
+ if (sigact.sa_handler == SIG_DFL) {
+ sigact.sa_handler = SIG_IGN;
+ sigact.sa_flags = 0;
+ if (sigaction(SIGPIPE, &sigact, NULL) == -1) {
+ ret = errno;
+ __db_err(env, ret, DB_STR("3633",
+ "can't access signal handler"));
+ return (ret);
+ }
+ }
+
+ ack_inited = elect_inited = gmdb_inited = queue_inited = FALSE;
+ if ((ret = __repmgr_init_waiters(env, &db_rep->ack_waiters)) != 0)
+ goto err;
+ ack_inited = TRUE;
+
+ if ((ret = pthread_cond_init(&db_rep->check_election, NULL)) != 0)
+ goto err;
+ elect_inited = TRUE;
+
+ if ((ret = pthread_cond_init(&db_rep->gmdb_idle, NULL)) != 0)
+ goto err;
+ gmdb_inited = TRUE;
+
+ if ((ret = pthread_cond_init(&db_rep->msg_avail, NULL)) != 0)
+ goto err;
+ queue_inited = TRUE;
+
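+	/*
+	 * The pipe is the select() thread's wake-up mechanism: writing a byte
+	 * to write_pipe (see __repmgr_wake_main_thread) makes the select()
+	 * call return so the thread can notice new work.
+	 */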
+ if ((ret = pipe(file_desc)) == -1) {
+ ret = errno;
+ goto err;
+ }
+
+ db_rep->read_pipe = file_desc[0];
+ db_rep->write_pipe = file_desc[1];
+ return (0);
+err:
+ if (queue_inited)
+ (void)pthread_cond_destroy(&db_rep->msg_avail);
+ if (gmdb_inited)
+ (void)pthread_cond_destroy(&db_rep->gmdb_idle);
+ if (elect_inited)
+ (void)pthread_cond_destroy(&db_rep->check_election);
+ if (ack_inited)
+ (void)__repmgr_destroy_waiters(env, &db_rep->ack_waiters);
+ db_rep->read_pipe = db_rep->write_pipe = NO_SUCH_FILE_DESC;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_deinit __P((ENV *));
+ */
+int
+__repmgr_deinit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ if (!(REPMGR_INITED(db_rep)))
+ return (0);
+
+ ret = pthread_cond_destroy(&db_rep->msg_avail);
+
+ if ((t_ret = pthread_cond_destroy(&db_rep->gmdb_idle)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = pthread_cond_destroy(&db_rep->check_election)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __repmgr_destroy_waiters(env,
+ &db_rep->ack_waiters)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (close(db_rep->read_pipe) == -1 && ret == 0)
+ ret = errno;
+ if (close(db_rep->write_pipe) == -1 && ret == 0)
+ ret = errno;
+
+ db_rep->read_pipe = db_rep->write_pipe = NO_SUCH_FILE_DESC;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_init_waiters __P((ENV *, waiter_t *));
+ */
+int
+__repmgr_init_waiters(env, waiters)
+ ENV *env;
+ waiter_t *waiters;
+{
+ COMPQUIET(env, NULL);
+ return (pthread_cond_init(waiters, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_destroy_waiters __P((ENV *, waiter_t *));
+ */
+int
+__repmgr_destroy_waiters(env, waiters)
+ ENV *env;
+ waiter_t *waiters;
+{
+ COMPQUIET(env, NULL);
+ return (pthread_cond_destroy(waiters));
+}
+
+/*
+ * PUBLIC: int __repmgr_lock_mutex __P((mgr_mutex_t *));
+ */
+int
+__repmgr_lock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_lock(mutex));
+}
+
+/*
+ * PUBLIC: int __repmgr_unlock_mutex __P((mgr_mutex_t *));
+ */
+int
+__repmgr_unlock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_unlock(mutex));
+}
+
+/*
+ * Signals a condition variable.
+ *
+ * !!!
+ * Caller must hold mutex.
+ *
+ * PUBLIC: int __repmgr_signal __P((cond_var_t *));
+ */
+int
+__repmgr_signal(v)
+ cond_var_t *v;
+{
+ return (pthread_cond_broadcast(v));
+}
+
+/*
+ * Wake repmgr message processing threads, expressly for the purpose of shutting
+ * some subset of them down.
+ *
+ * !!!
+ * Caller must hold mutex.
+ *
+ * PUBLIC: int __repmgr_wake_msngers __P((ENV*, u_int));
+ */
+int
+__repmgr_wake_msngers(env, n)
+ ENV *env;
+ u_int n;
+{
+ DB_REP *db_rep;
+
+ COMPQUIET(n, 0);
+
+ db_rep = env->rep_handle;
+ return (__repmgr_signal(&db_rep->msg_avail));
+}
+
+/*
+ * PUBLIC: int __repmgr_wake_main_thread __P((ENV*));
+ *
+ * Can be called either with or without the mutex being held.
+ */
+int
+__repmgr_wake_main_thread(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ u_int8_t any_value;
+
+ COMPQUIET(any_value, 0);
+ db_rep = env->rep_handle;
+
+ /*
+ * It doesn't matter what byte value we write. Just the appearance of a
+ * byte in the stream is enough to wake up the select() thread reading
+ * the pipe.
+ */
+ if (write(db_rep->write_pipe, VOID_STAR_CAST &any_value, 1) == -1)
+ return (errno);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_writev __P((socket_t, db_iovec_t *, int, size_t *));
+ */
+int
+__repmgr_writev(fd, iovec, buf_count, byte_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *byte_count_p;
+{
+ int result;
+ ssize_t nw;
+
+ if ((nw = writev(fd, iovec, buf_count)) == -1) {
+ /* Why? See note at __repmgr_readv(). */
+ result = errno;
+ DB_ASSERT(NULL, result != 0);
+ return (result);
+ }
+ *byte_count_p = (size_t)nw;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_readv __P((socket_t, db_iovec_t *, int, size_t *));
+ */
+int
+__repmgr_readv(fd, iovec, buf_count, byte_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *byte_count_p;
+{
+ int result;
+ ssize_t nw;
+
+ if ((nw = readv(fd, iovec, buf_count)) == -1) {
+ /*
+ * Why bother to assert this obvious "truth"? On some systems
+ * when the library is loaded into a single-threaded Tcl
+ * configuration the differing errno mechanisms apparently
+ * conflict, and we occasionally "see" a 0 value here! And that
+ * turns out to be painful to debug.
+ */
+ result = errno;
+ DB_ASSERT(NULL, result != 0);
+ return (result);
+ }
+ *byte_count_p = (size_t)nw;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_select_loop __P((ENV *));
+ */
+int
+__repmgr_select_loop(env)
+ ENV *env;
+{
+ struct timeval select_timeout, *select_timeout_p;
+ DB_REP *db_rep;
+ db_timespec timeout;
+ fd_set reads, writes;
+ struct io_info io_info;
+ int ret;
+ u_int8_t buf[10]; /* arbitrary size */
+
+ db_rep = env->rep_handle;
+ /*
+ * Almost this entire thread operates while holding the mutex. But note
+ * that it never blocks, except in the call to select() (which is the
+ * one place we relinquish the mutex).
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_first_try_connections(env)) != 0)
+ goto out;
+ for (;;) {
+ FD_ZERO(&reads);
+ FD_ZERO(&writes);
+
+ /*
+ * Figure out which sockets to ask for input and output. It's
+ * simple for the signalling pipe and listen socket; but depends
+ * on backlog states for the connections to other sites.
+ */
+ FD_SET((u_int)db_rep->read_pipe, &reads);
+ io_info.maxfd = db_rep->read_pipe;
+
+ if (!IS_SUBORDINATE(db_rep)) {
+ FD_SET((u_int)db_rep->listen_fd, &reads);
+ if (db_rep->listen_fd > io_info.maxfd)
+ io_info.maxfd = db_rep->listen_fd;
+ }
+
+ io_info.reads = &reads;
+ io_info.writes = &writes;
+ if ((ret = __repmgr_each_connection(env,
+ prepare_io, &io_info, TRUE)) != 0)
+ goto out;
+
+ if (__repmgr_compute_timeout(env, &timeout)) {
+ /* Convert the timespec to a timeval. */
+ select_timeout.tv_sec = timeout.tv_sec;
+ select_timeout.tv_usec = timeout.tv_nsec / NS_PER_US;
+ select_timeout_p = &select_timeout;
+ } else {
+ /* No time-based events, so wait only for I/O. */
+ select_timeout_p = NULL;
+ }
+
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if ((ret = select(io_info.maxfd + 1,
+ &reads, &writes, NULL, select_timeout_p)) == -1) {
+ switch (ret = errno) {
+ case EINTR:
+ case EWOULDBLOCK:
+ LOCK_MUTEX(db_rep->mutex);
+ continue; /* simply retry */
+ default:
+ __db_err(env, ret, DB_STR("3634",
+ "select"));
+ return (ret);
+ }
+ }
+ LOCK_MUTEX(db_rep->mutex);
+ if (db_rep->repmgr_status == stopped) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Timer expiration events include retrying of lost connections.
+ * Obviously elements can be added to the connection list there.
+ */
+ if ((ret = __repmgr_check_timeouts(env)) != 0)
+ goto out;
+
+ if ((ret = __repmgr_each_connection(env,
+ __repmgr_conn_work, &io_info, TRUE)) != 0)
+ goto out;
+
+ /*
+ * Read any bytes in the signalling pipe. Note that we don't
+ * actually need to do anything with them; they're just there to
+ * wake us up when necessary.
+ */
+ if (FD_ISSET((u_int)db_rep->read_pipe, &reads) &&
+ read(db_rep->read_pipe, VOID_STAR_CAST buf,
+ sizeof(buf)) <= 0) {
+ ret = errno;
+ goto out;
+ }
+ /*
+ * Obviously elements can be added to the connection list here.
+ */
+ if (!IS_SUBORDINATE(db_rep) &&
+ FD_ISSET((u_int)db_rep->listen_fd, &reads) &&
+ (ret = __repmgr_accept(env)) != 0)
+ goto out;
+ }
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (ret == DB_DELETED)
+ ret = __repmgr_bow_out(env);
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_net_close(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Examines a connection to see what sort of I/O to ask for. Clean up defunct
+ * connections.
+ */
+static int
+prepare_io(env, conn, info_)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *info_;
+{
+ struct io_info *info;
+
+ info = info_;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (__repmgr_cleanup_defunct(env, conn));
+
+ if (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ FD_SET((u_int)conn->fd, info->writes);
+ if (conn->fd > info->maxfd)
+ info->maxfd = conn->fd;
+ }
+ /*
+ * For now we always accept incoming data. If we ever implement some
+ * kind of flow control, we should override it for fledgling connections
+ * (!IS_VALID_EID(conn->eid)) -- in other words, allow reading such a
+ * connection even during flow control duress.
+ */
+ FD_SET((u_int)conn->fd, info->reads);
+ if (conn->fd > info->maxfd)
+ info->maxfd = conn->fd;
+
+ return (0);
+}
+
+/*
+ * Examine a connection, to see what work needs to be done.
+ */
+static int
+__repmgr_conn_work(env, conn, info_)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *info_;
+{
+ struct io_info *info;
+ int ret;
+ u_int fd;
+
+ ret = 0;
+ fd = (u_int)conn->fd;
+ info = info_;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (0);
+
+ if (FD_ISSET(fd, info->writes))
+ ret = __repmgr_write_some(env, conn);
+
+ if (ret == 0 && FD_ISSET(fd, info->reads))
+ ret = __repmgr_read_from_site(env, conn);
+
+ if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_queue.c b/src/repmgr/repmgr_queue.c
new file mode 100644
index 00000000..6a381acf
--- /dev/null
+++ b/src/repmgr/repmgr_queue.c
@@ -0,0 +1,180 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static REPMGR_MESSAGE *available_work __P((ENV *));
+
+/*
+ * Deallocates memory used by all messages on the queue.
+ *
+ * PUBLIC: int __repmgr_queue_destroy __P((ENV *));
+ */
+int
+__repmgr_queue_destroy(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_MESSAGE *m;
+ REPMGR_CONNECTION *conn;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ ret = 0;
+ while (!STAILQ_EMPTY(&db_rep->input_queue.header)) {
+ m = STAILQ_FIRST(&db_rep->input_queue.header);
+ STAILQ_REMOVE_HEAD(&db_rep->input_queue.header, entries);
+ if (m->msg_hdr.type == REPMGR_APP_MESSAGE) {
+ if ((conn = m->v.appmsg.conn) != NULL &&
+ (t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, m);
+ }
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_queue_get __P((ENV *,
+ * PUBLIC: REPMGR_MESSAGE **, REPMGR_RUNNABLE *));
+ *
+ * Get the first input message from the queue and return it to the caller. The
+ * caller hereby takes responsibility for the entire message buffer, and should
+ * free it when done.
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_queue_get(env, msgp, th)
+ ENV *env;
+ REPMGR_MESSAGE **msgp;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REPMGR_MESSAGE *m;
+#ifdef DB_WIN32
+ HANDLE wait_events[2];
+#endif
+ int ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+
+ while ((m = available_work(env)) == NULL &&
+ db_rep->repmgr_status == running && !th->quit_requested) {
+#ifdef DB_WIN32
+ /*
+ * On Windows, msg_avail means either there's something in the
+ * queue, or we're all finished. So, reset the event if that is
+ * not true.
+ */
+ if (STAILQ_EMPTY(&db_rep->input_queue.header) &&
+ db_rep->repmgr_status == running &&
+ !ResetEvent(db_rep->msg_avail)) {
+ ret = GetLastError();
+ goto err;
+ }
+ wait_events[0] = db_rep->msg_avail;
+ wait_events[1] = th->quit_event;
+ UNLOCK_MUTEX(db_rep->mutex);
+ ret = WaitForMultipleObjects(2, wait_events, FALSE, INFINITE);
+ LOCK_MUTEX(db_rep->mutex);
+ if (ret == WAIT_FAILED) {
+ ret = GetLastError();
+ goto err;
+ }
+
+#else
+ if ((ret = pthread_cond_wait(&db_rep->msg_avail,
+ db_rep->mutex)) != 0)
+ goto err;
+#endif
+ }
+ if (db_rep->repmgr_status == stopped || th->quit_requested)
+ ret = DB_REP_UNAVAIL;
+ else {
+ STAILQ_REMOVE(&db_rep->input_queue.header,
+ m, __repmgr_message, entries);
+ db_rep->input_queue.size--;
+ *msgp = m;
+ }
+
+err:
+ return (ret);
+}
+
+/*
+ * Gets an "available" item of work (i.e., a message) from the input queue. If
+ * there are plenty of message threads currently available, then we simply
+ * return the first thing on the queue, regardless of what type of message it
+ * is. Otherwise, skip over any message type that might turn out to
+ * be "long-running", so that we avoid starving out the important rep message
+ * processing.
+ */
+static REPMGR_MESSAGE *
+available_work(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_MESSAGE *m;
+
+ db_rep = env->rep_handle;
+ if (STAILQ_EMPTY(&db_rep->input_queue.header))
+ return (NULL);
+ /*
+ * The "non_rep_th" field is the dynamically varying count of threads
+ * currently processing non-replication messages (a.k.a. possibly
+ * long-running messages, a.k.a. "deferrable"). We always ensure that
+ * db_rep->nthreads exceeds the reserved count, RESERVED_MSG_TH(env).
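+ *
+ * For example (hypothetical numbers): with nthreads == 4, non_rep_th == 3,
+ * and RESERVED_MSG_TH(env) == 1, the test 4 > 3 + 1 fails, so we hand out
+ * only non-deferrable (replication) messages until a long-running thread
+ * finishes.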
+ */
+ if (db_rep->nthreads > db_rep->non_rep_th + RESERVED_MSG_TH(env))
+ return (STAILQ_FIRST(&db_rep->input_queue.header));
+ STAILQ_FOREACH(m, &db_rep->input_queue.header, entries) {
+ if (!IS_DEFERRABLE(m->msg_hdr.type))
+ return (m);
+ }
+ return (NULL);
+}
+
+/*
+ * PUBLIC: int __repmgr_queue_put __P((ENV *, REPMGR_MESSAGE *));
+ *
+ * !!!
+ * Caller must hold repmgr->mutex.
+ */
+int
+__repmgr_queue_put(env, msg)
+ ENV *env;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+
+ STAILQ_INSERT_TAIL(&db_rep->input_queue.header, msg, entries);
+ db_rep->input_queue.size++;
+
+ return (__repmgr_signal(&db_rep->msg_avail));
+}
+
+/*
+ * PUBLIC: int __repmgr_queue_size __P((ENV *));
+ *
+ * !!!
+ * Caller must hold repmgr->mutex.
+ */
+int
+__repmgr_queue_size(env)
+ ENV *env;
+{
+ return (env->rep_handle->input_queue.size);
+}
diff --git a/src/repmgr/repmgr_rec.c b/src/repmgr/repmgr_rec.c
new file mode 100644
index 00000000..41827aff
--- /dev/null
+++ b/src/repmgr/repmgr_rec.c
@@ -0,0 +1,45 @@
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+/*
+ * __repmgr_member_recover --
+ * Recovery function for member.
+ *
+ * PUBLIC: int __repmgr_member_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__repmgr_member_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __repmgr_member_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+ COMPQUIET(op, DB_TXN_APPLY);
+
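+	/*
+	 * Standard recovery-function preamble: REC_NOOP_INTRO reads the log
+	 * record into argp, and REC_NOOP_CLOSE below releases it again (as
+	 * these macros are conventionally defined in the db_am headers).
+	 */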
+ REC_PRINT(__repmgr_member_print);
+ REC_NOOP_INTRO(__repmgr_member_read);
+
+ /*
+ * The annotation log record describes the update in enough detail for
+ * us to be able to optimize our tracking of it at client sites.
+ * However, for now we simply reread the whole (small) database
+ * each time, since changes happen so seldom (and we need to have the
+ * code for reading the whole thing anyway, for other cases).
+ */
+ env->rep_handle->gmdb_dirty = TRUE;
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
diff --git a/src/repmgr/repmgr_sel.c b/src/repmgr/repmgr_sel.c
new file mode 100644
index 00000000..ba14368f
--- /dev/null
+++ b/src/repmgr/repmgr_sel.c
@@ -0,0 +1,2096 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+typedef int (*HEARTBEAT_ACTION) __P((ENV *));
+
+static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
+static int accept_v1_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
+static void check_min_log_file __P((ENV *));
+static int dispatch_msgin __P((ENV *, REPMGR_CONNECTION *));
+static int prepare_input __P((ENV *, REPMGR_CONNECTION *));
+static int process_own_msg __P((ENV *, REPMGR_CONNECTION *));
+static int process_parameters __P((ENV *,
+ REPMGR_CONNECTION *, char *, u_int, u_int32_t, int, u_int32_t));
+static int read_version_response __P((ENV *, REPMGR_CONNECTION *));
+static int record_permlsn __P((ENV *, REPMGR_CONNECTION *));
+static int __repmgr_call_election __P((ENV *));
+static int __repmgr_connector_main __P((ENV *, REPMGR_RUNNABLE *));
+static void *__repmgr_connector_thread __P((void *));
+static int __repmgr_next_timeout __P((ENV *,
+ db_timespec *, HEARTBEAT_ACTION *));
+static int __repmgr_retry_connections __P((ENV *));
+static int __repmgr_send_heartbeat __P((ENV *));
+static int __repmgr_try_one __P((ENV *, int));
+static int resolve_collision __P((ENV *, REPMGR_SITE *, REPMGR_CONNECTION *));
+static int send_version_response __P((ENV *, REPMGR_CONNECTION *));
+
+#define ONLY_HANDSHAKE(env, conn) do { \
+ if (conn->msg_type != REPMGR_HANDSHAKE) { \
+ __db_errx(env, DB_STR_A("3613", \
+ "unexpected msg type %d in state %d", "%d %d"), \
+ (int)conn->msg_type, conn->state); \
+ return (DB_REP_UNAVAIL); \
+ } \
+} while (0)
+
+/*
+ * PUBLIC: void *__repmgr_select_thread __P((void *));
+ */
+void *
+__repmgr_select_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *args;
+ ENV *env;
+ int ret;
+
+ args = argsp;
+ env = args->env;
+
+ if ((ret = __repmgr_select_loop(env)) != 0) {
+ __db_err(env, ret, DB_STR("3614", "select loop failed"));
+ (void)__repmgr_thread_failure(env, ret);
+ }
+ return (NULL);
+}
+
+/*
+ * PUBLIC: int __repmgr_bow_out __P((ENV *));
+ */
+int
+__repmgr_bow_out(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ DB_EVENT(env, DB_EVENT_REP_LOCAL_SITE_REMOVED, NULL);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_accept __P((ENV *));
+ */
+int
+__repmgr_accept(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ ACCEPT_ADDR siaddr;
+ socklen_t addrlen;
+ socket_t s;
+ int ret;
+
+ db_rep = env->rep_handle;
+ addrlen = sizeof(siaddr);
+ if ((s = accept(db_rep->listen_fd, (struct sockaddr *)&siaddr,
+ &addrlen)) == -1) {
+ /*
+ * Some errors are innocuous and so should be ignored. MSDN
+ * Library documents the Windows ones; the Unix ones are
+ * advocated in Stevens' UNPv1, section 16.6; and Linux
+ * Application Development, p. 416.
+ */
+ switch (ret = net_errno) {
+#ifdef DB_WIN32
+ case WSAECONNRESET:
+ case WSAEWOULDBLOCK:
+#else
+ case EINTR:
+ case EWOULDBLOCK:
+ case ECONNABORTED:
+ case ENETDOWN:
+#ifdef EPROTO
+ case EPROTO:
+#endif
+ case ENOPROTOOPT:
+ case EHOSTDOWN:
+#ifdef ENONET
+ case ENONET:
+#endif
+ case EHOSTUNREACH:
+ case EOPNOTSUPP:
+ case ENETUNREACH:
+#endif
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "accept error %d considered innocuous", ret));
+ return (0);
+ default:
+ __db_err(env, ret, DB_STR("3615", "accept error"));
+ return (ret);
+ }
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "accepted a new connection"));
+
+ if ((ret =
+ __repmgr_new_connection(env, &conn, s, CONN_NEGOTIATE)) != 0) {
+ (void)closesocket(s);
+ return (ret);
+ }
+ if ((ret = __repmgr_set_keepalive(env, conn)) != 0) {
+ (void)__repmgr_destroy_conn(env, conn);
+ return (ret);
+ }
+ if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
+ __db_err(env, ret, DB_STR("3616",
+ "can't set nonblock after accept"));
+ (void)__repmgr_destroy_conn(env, conn);
+ return (ret);
+ }
+
+ /*
+ * We don't yet know which site this connection is coming from. So for
+ * now, put it on the "orphans" list; we'll move it to the appropriate
+ * site struct later when we discover who we're talking with, and what
+ * type of connection it is.
+ */
+ conn->eid = -1;
+ TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
+ conn->ref_count++;
+
+ return (0);
+}
+
+/*
+ * Computes how long we should wait for input, in other words how long until we
+ * have to wake up and do something. Returns TRUE if timeout is set; FALSE if
+ * there is nothing to wait for.
+ *
+ * Note that the resulting timeout could be zero; but it can't be negative.
+ *
+ * PUBLIC: int __repmgr_compute_timeout __P((ENV *, db_timespec *));
+ */
+int
+__repmgr_compute_timeout(env, timeout)
+ ENV *env;
+ db_timespec *timeout;
+{
+ DB_REP *db_rep;
+ REPMGR_RETRY *retry;
+ db_timespec now, t;
+ int have_timeout;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * There are two factors to consider: are heartbeats in use? and, do we
+ * have any sites with broken connections that we ought to retry?
+ */
+ have_timeout = __repmgr_next_timeout(env, &t, NULL);
+
+ /* List items are in order, so we only have to examine the first one. */
+ if (!TAILQ_EMPTY(&db_rep->retries)) {
+ retry = TAILQ_FIRST(&db_rep->retries);
+ if (have_timeout) {
+ /* Choose earliest timeout deadline. */
+ t = timespeccmp(&retry->time, &t, <) ? retry->time : t;
+ } else {
+ t = retry->time;
+ have_timeout = TRUE;
+ }
+ }
+
+ if (have_timeout) {
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&now, &t, >=))
+ timespecclear(timeout);
+ else {
+ *timeout = t;
+ timespecsub(timeout, &now);
+ }
+ }
+
+ return (have_timeout);
+}
+
+/*
+ * Figures out the next heartbeat-related thing to be done, and when it should
+ * be done. The code is factored this way because this computation needs to be
+ * done both before each select() call, and after (when we're checking for timer
+ * expiration).
+ */
+static int
+__repmgr_next_timeout(env, deadline, action)
+ ENV *env;
+ db_timespec *deadline;
+ HEARTBEAT_ACTION *action;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ HEARTBEAT_ACTION my_action;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *master;
+ db_timespec t;
+ u_int32_t version;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->master_id == db_rep->self_eid &&
+ rep->heartbeat_frequency > 0) {
+ t = db_rep->last_bcast;
+ TIMESPEC_ADD_DB_TIMEOUT(&t, rep->heartbeat_frequency);
+ my_action = __repmgr_send_heartbeat;
+ } else if ((master = __repmgr_connected_master(env)) != NULL &&
+ !IS_SUBORDINATE(db_rep) &&
+ rep->heartbeat_monitor_timeout > 0) {
+ version = 0;
+ if ((conn = master->ref.conn.in) != NULL &&
+ IS_READY_STATE(conn->state))
+ version = conn->version;
+ if ((conn = master->ref.conn.out) != NULL &&
+ IS_READY_STATE(conn->state) &&
+ conn->version > version)
+ version = conn->version;
+ if (version >= HEARTBEAT_MIN_VERSION) {
+ /*
+ * If we have a working connection to a heartbeat-aware
+ * master, let's monitor it. Otherwise there's really
+ * nothing we can do.
+ */
+ t = master->last_rcvd_timestamp;
+ TIMESPEC_ADD_DB_TIMEOUT(&t,
+ rep->heartbeat_monitor_timeout);
+ my_action = __repmgr_call_election;
+ } else
+ return (FALSE);
+ } else
+ return (FALSE);
+
+ *deadline = t;
+ if (action != NULL)
+ *action = my_action;
+ return (TRUE);
+}
+
+/*
+ * Sends a heartbeat message.
+ *
+ * repmgr also uses the heartbeat facility to manage rerequests. We
+ * send the master's current generation and max_perm_lsn with the heartbeat
+ * message to help a client determine whether it has all master transactions.
+ * When a client receives a heartbeat message, it also checks whether it
+ * needs to rerequest anything. Note that heartbeats must be enabled for
+ * this rerequest processing to occur.
+ */
+static int
+__repmgr_send_heartbeat(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DBT control, rec;
+ __repmgr_permlsn_args permlsn;
+ u_int8_t buf[__REPMGR_PERMLSN_SIZE];
+ u_int unused1, unused2;
+ int ret, unused3;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ permlsn.generation = rep->gen;
+ if ((ret = __rep_get_maxpermlsn(env, &permlsn.lsn)) != 0)
+ return (ret);
+ __repmgr_permlsn_marshal(env, &permlsn, buf);
+ control.data = buf;
+ control.size = __REPMGR_PERMLSN_SIZE;
+
+ DB_INIT_DBT(rec, NULL, 0);
+ return (__repmgr_send_broadcast(env,
+ REPMGR_HEARTBEAT, &control, &rec, &unused1, &unused2, &unused3));
+}
+
+/*
+ * PUBLIC: REPMGR_SITE *__repmgr_connected_master __P((ENV *));
+ */
+REPMGR_SITE *
+__repmgr_connected_master(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *master;
+ int master_id;
+
+ db_rep = env->rep_handle;
+ master_id = db_rep->region->master_id;
+
+ if (!IS_KNOWN_REMOTE_SITE(master_id))
+ return (NULL);
+ master = SITE_FROM_EID(master_id);
+ if (master->state == SITE_CONNECTED)
+ return (master);
+ return (NULL);
+}
+
+static int
+__repmgr_call_election(env)
+ ENV *env;
+{
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *master;
+ int ret;
+
+ master = __repmgr_connected_master(env);
+ if (master == NULL)
+ return (0);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "heartbeat monitor timeout expired"));
+ STAT(env->rep_handle->region->mstat.st_connection_drop++);
+ if ((conn = master->ref.conn.in) != NULL &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ if ((conn = master->ref.conn.out) != NULL &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_check_timeouts __P((ENV *));
+ *
+ * !!!
+ * Assumes caller holds the mutex.
+ */
+int
+__repmgr_check_timeouts(env)
+ ENV *env;
+{
+ db_timespec when, now;
+ HEARTBEAT_ACTION action;
+ int ret;
+
+ /*
+ * Figure out the next heartbeat-related thing to be done. Then, if
+ * it's time to do it, do so.
+ */
+ if (__repmgr_next_timeout(env, &when, &action)) {
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&when, &now, <=) &&
+ (ret = (*action)(env)) != 0)
+ return (ret);
+ }
+
+ return (__repmgr_retry_connections(env));
+}
+
+/*
+ * Initiates connection attempts for any sites on the idle list whose retry
+ * times have expired.
+ */
+static int
+__repmgr_retry_connections(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RETRY *retry;
+ db_timespec now;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ __os_gettime(env, &now, 1);
+
+ while (!TAILQ_EMPTY(&db_rep->retries)) {
+ retry = TAILQ_FIRST(&db_rep->retries);
+ if (timespeccmp(&retry->time, &now, >=))
+ break; /* since items are in time order */
+
+ TAILQ_REMOVE(&db_rep->retries, retry, entries);
+
+ eid = retry->eid;
+ __os_free(env, retry);
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ DB_ASSERT(env, site->state == SITE_PAUSING);
+
+ if (site->membership == SITE_PRESENT) {
+ if ((ret = __repmgr_try_one(env, eid)) != 0)
+ return (ret);
+ } else
+ site->state = SITE_IDLE;
+ }
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_first_try_connections __P((ENV *));
+ *
+ * !!!
+ * Assumes caller holds the mutex.
+ */
+int
+__repmgr_first_try_connections(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+ /*
+ * Normally all sites would be IDLE here. But if a user thread
+ * triggered an auto-start in a subordinate process, our send()
+ * function may have found new sites when it sync'ed site
+ * addresses, and that action causes connection attempts to be
+ * scheduled (resulting in PAUSING state here, or conceivably
+ * even CONNECTING or CONNECTED).
+ */
+ if (site->state == SITE_IDLE &&
+ site->membership == SITE_PRESENT &&
+ (ret = __repmgr_try_one(env, eid)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * Starts a thread to open a connection to the site at the given EID.
+ */
+static int
+__repmgr_try_one(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RUNNABLE *th;
+ int ret;
+
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ th = site->connector;
+ if (th == NULL) {
+ if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &th)) != 0)
+ return (ret);
+ site->connector = th;
+ } else if (th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "eid %lu previous connector thread still running; will retry",
+ (u_long)eid));
+ return (__repmgr_schedule_connection_attempt(env,
+ eid, FALSE));
+ }
+
+ site->state = SITE_CONNECTING;
+
+ th->run = __repmgr_connector_thread;
+ th->args.eid = eid;
+ if ((ret = __repmgr_thread_start(env, th)) != 0) {
+ __os_free(env, th);
+ site->connector = NULL;
+ }
+ return (ret);
+}
+
+static void *
+__repmgr_connector_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *th;
+ ENV *env;
+ int ret;
+
+ th = argsp;
+ env = th->env;
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "starting connector thread, eid %u", th->args.eid));
+ if ((ret = __repmgr_connector_main(env, th)) != 0) {
+ __db_err(env, ret, DB_STR("3617", "connector thread failed"));
+ (void)__repmgr_thread_failure(env, ret);
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connector thread is exiting"));
+
+ th->finished = TRUE;
+ return (NULL);
+}
+
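+/*
+ * Body of the connector thread: performs the potentially blocking connect
+ * without holding the mutex, then, if the site is still in CONNECTING state,
+ * installs the result as the site's outgoing connection.  A retryable
+ * failure simply reschedules another attempt.
+ */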
+static int
+__repmgr_connector_main(env, th)
+ ENV *env;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_CONNECTION *conn;
+ DB_REPMGR_CONN_ERR info;
+ repmgr_netaddr_t netaddr;
+ SITE_STRING_BUFFER site_string;
+ int err, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_VALID_EID(th->args.eid));
+ site = SITE_FROM_EID(th->args.eid);
+ if (site->state != SITE_CONNECTING || db_rep->repmgr_status == stopped)
+ goto unlock;
+
+ /*
+ * Drop the mutex during operations that could block. During those
+ * times, the site struct could move (if we had to grow the sites
+ * array), but host wouldn't.
+ *
+ * Also, during those times we might receive an incoming connection from
+ * the site, which would change its state. So, check state each time we
+ * reacquire the mutex, and quit if the state of the world changed while
+ * we were away.
+ */
+ netaddr = site->net_addr;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connecting to %s",
+ __repmgr_format_site_loc(site, site_string)));
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if ((ret = __repmgr_connect(env, &netaddr, &conn, &err)) == 0) {
+ DB_EVENT(env, DB_EVENT_REP_CONNECT_ESTD, &th->args.eid);
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
+ __db_err(env, ret, DB_STR("3618",
+ "set_nonblock in connect thread"));
+ goto cleanup;
+ }
+ conn->type = REP_CONNECTION;
+ site = SITE_FROM_EID(th->args.eid);
+ if (site->state != SITE_CONNECTING ||
+ db_rep->repmgr_status == stopped)
+ goto cleanup;
+
+ conn->eid = th->args.eid;
+ site = SITE_FROM_EID(th->args.eid);
+ site->ref.conn.out = conn;
+ site->state = SITE_CONNECTED;
+ __os_gettime(env, &site->last_rcvd_timestamp, 1);
+ ret = __repmgr_wake_main_thread(env);
+ } else if (ret == DB_REP_UNAVAIL) {
+ /* Retryable error while trying to connect: retry later. */
+ info.eid = th->args.eid;
+ info.error = err;
+ DB_EVENT(env, DB_EVENT_REP_CONNECT_TRY_FAILED, &info);
+ STAT(db_rep->region->mstat.st_connect_fail++);
+
+ LOCK_MUTEX(db_rep->mutex);
+ site = SITE_FROM_EID(th->args.eid);
+ if (site->state != SITE_CONNECTING ||
+ db_rep->repmgr_status == stopped) {
+ ret = 0;
+ goto unlock;
+ }
+ ret = __repmgr_schedule_connection_attempt(env,
+ th->args.eid, FALSE);
+ } else
+ goto out;
+
+ if (0) {
+cleanup:
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+out:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_v1_handshake __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, void *, size_t));
+ */
+int
+__repmgr_send_v1_handshake(env, conn, buf, len)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *buf;
+ size_t len;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ repmgr_netaddr_t *my_addr;
+ DB_REPMGR_V1_HANDSHAKE buffer;
+ DBT cntrl, rec;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ /*
+ * We're about to send from a structure that has padding holes in it.
+ * Initializing it keeps Valgrind happy, plus we really shouldn't be
+ * sending out random garbage anyway (pro forma privacy issue).
+ */
+ memset(&buffer, 0, sizeof(buffer));
+ buffer.version = 1;
+ buffer.priority = htonl(rep->priority);
+ buffer.port = my_addr->port;
+ cntrl.data = &buffer;
+ cntrl.size = sizeof(buffer);
+
+ rec.data = buf;
+ rec.size = (u_int32_t)len;
+
+ /*
+ * It would of course be disastrous to block the select() thread, so
+ * pass the "maxblock" argument as 0. Fortunately blocking should
+ * never be necessary here, because the hand-shake is always the first
+ * thing we send. Which is a good thing, because it would be almost as
+ * disastrous if we allowed ourselves to drop a handshake.
+ */
+ return (__repmgr_send_one(env,
+ conn, REPMGR_HANDSHAKE, &cntrl, &rec, 0));
+}
+
+/*
+ * PUBLIC: int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * !!!
+ * Caller is assumed to hold repmgr->mutex, because we call queue_put() from here.
+ */
+int
+__repmgr_read_from_site(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Loop, just in case we get EINTR and need to restart the I/O. (All
+ * other branches return.)
+ */
+ for (;;) {
+ switch ((ret = __repmgr_read_conn(conn))) {
+#ifndef DB_WIN32
+ case EINTR:
+ continue;
+#endif
+
+#if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK
+ case DB_REPMGR_EAGAIN:
+#endif
+ case WOULDBLOCK:
+ return (0);
+
+ case DB_REP_UNAVAIL:
+ /* Error 0 is understood to mean EOF. */
+ __repmgr_fire_conn_err_event(env, conn, 0);
+ STAT(env->rep_handle->
+ region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+
+ case 0:
+ if (IS_VALID_EID(conn->eid)) {
+ site = SITE_FROM_EID(conn->eid);
+ __os_gettime(env,
+ &site->last_rcvd_timestamp, 1);
+ }
+ return (conn->reading_phase == SIZES_PHASE ?
+ prepare_input(env, conn) :
+ dispatch_msgin(env, conn));
+
+ default:
+#ifdef EBADF
+ DB_ASSERT(env, ret != EBADF);
+#endif
+ __repmgr_fire_conn_err_event(env, conn, ret);
+ STAT(db_rep->region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+ }
+ }
+}
+
+/*
+ * Reads in the current input phase, as defined by the connection's IOVECS
+ * struct.
+ *
+ * Returns DB_REP_UNAVAIL for EOF.
+ *
+ * Makes no assumption about synchronization: it's up to the caller to hold
+ * mutex if necessary.
+ *
+ * PUBLIC: int __repmgr_read_conn __P((REPMGR_CONNECTION *));
+ */
+int
+__repmgr_read_conn(conn)
+ REPMGR_CONNECTION *conn;
+{
+ size_t nr;
+ int ret;
+
+ /*
+ * Keep reading pieces as long as we're making some progress, or until
+ * we complete the current read phase as defined in iovecs.
+ */
+ for (;;) {
+ if ((ret = __repmgr_readv(conn->fd,
+ &conn->iovecs.vectors[conn->iovecs.offset],
+ conn->iovecs.count - conn->iovecs.offset, &nr)) != 0)
+ return (ret);
+
+ if (nr == 0)
+ return (DB_REP_UNAVAIL);
+
+ if (__repmgr_update_consumed(&conn->iovecs, nr)) {
+ /* We've fully read as much as we wanted. */
+ return (0);
+ }
+ }
+}
+
+/*
+ * Having finished reading the 9-byte message header, figure out what kind of
+ * message we're about to receive, and prepare input buffers accordingly. The
+ * header includes enough information for us to figure out how much buffer space
+ * we need to allocate (though in some cases we need to do a bit of computation
+ * to arrive at the answer).
+ *
+ * Caller must hold mutex.
+ */
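+/*
+ * Header layout, as implied by the unmarshaling code: a 1-byte message type
+ * followed by two 4-byte words whose interpretation depends on that type
+ * (control/rec sizes for rep messages, buffer size and segment count for
+ * app messages, and so on).
+ */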
+static int
+prepare_input(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+#define MEM_ALIGN sizeof(double)
+ DBT *dbt;
+ __repmgr_msg_hdr_args msg_hdr;
+ REPMGR_RESPONSE *resp;
+ u_int32_t control_size, rec_size, size;
+ size_t memsize, control_offset, rec_offset;
+ void *membase;
+ int ret, skip;
+
+ DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+
+ /*
+ * We can only get here after having read the full 9 bytes that we
+ * expect, so this can't fail.
+ */
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+
+ __repmgr_iovec_init(&conn->iovecs);
+ skip = FALSE;
+
+ switch ((conn->msg_type = msg_hdr.type)) {
+ case REPMGR_HEARTBEAT:
+ /*
+ * The underlying byte-receiving mechanism will already have
+ * noted the fact that we got some traffic on this connection,
+ * which is all that is needed to monitor the heartbeat. But
+ * we also put the heartbeat message on the message queue so
+ * that it will perform rerequest processing.
+ */
+ case REPMGR_REP_MESSAGE:
+ env->rep_handle->seen_repmsg = TRUE;
+ control_size = REP_MSG_CONTROL_SIZE(msg_hdr);
+ rec_size = REP_MSG_REC_SIZE(msg_hdr);
+ if (control_size == 0) {
+ if (conn->msg_type == REPMGR_HEARTBEAT) {
+ /*
+ * Got an old-style heartbeat without payload,
+ * nothing to do.
+ */
+ skip = TRUE;
+ break;
+ } else {
+ __db_errx(env, DB_STR("3619",
+ "illegal size for rep msg"));
+ return (DB_REP_UNAVAIL);
+ }
+ }
+ /*
+ * Allocate a block of memory large enough to hold a
+ * DB_REPMGR_MESSAGE wrapper, plus the (one or) two DBT
+ * data areas that it points to. Start by calculating
+ * the total memory needed.
+ */
+ memsize = DB_ALIGN(sizeof(REPMGR_MESSAGE), MEM_ALIGN);
+ control_offset = memsize;
+ memsize += control_size;
+ if (rec_size > 0) {
+ memsize = DB_ALIGN(memsize, MEM_ALIGN);
+ rec_offset = memsize;
+ memsize += rec_size;
+ } else
+ COMPQUIET(rec_offset, 0);
+ if ((ret = __os_malloc(env, memsize, &membase)) != 0)
+ return (ret);
+ conn->input.rep_message = membase;
+ conn->input.rep_message->msg_hdr = msg_hdr;
+ conn->input.rep_message->v.repmsg.originating_eid = conn->eid;
+
+ DB_INIT_DBT(conn->input.rep_message->v.repmsg.control,
+ (u_int8_t*)membase + control_offset, control_size);
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.repmsg.control);
+
+ if (rec_size > 0) {
+ DB_INIT_DBT(conn->input.rep_message->v.repmsg.rec,
+ (u_int8_t*)membase + rec_offset, rec_size);
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.repmsg.rec);
+ } else
+ DB_INIT_DBT(conn->input.rep_message->v.repmsg.rec,
+ NULL, 0);
+ break;
+
+ case REPMGR_APP_MESSAGE:
+ /*
+ * We need a buffer big enough to hold the REPMGR_MESSAGE struct
+ * and the data that we expect to receive on the wire. We must
+ * extend the struct size for the variable-length DBT array at
+ * the end.
+ */
+ size = DB_ALIGN((size_t)(sizeof(REPMGR_MESSAGE) +
+ APP_MSG_SEGMENT_COUNT(msg_hdr) * sizeof(DBT)),
+ MEM_ALIGN);
+ memsize = size + APP_MSG_BUFFER_SIZE(msg_hdr);
+ if ((ret = __os_malloc(env, memsize, &membase)) != 0)
+ return (ret);
+ conn->input.rep_message = membase;
+ conn->input.rep_message->msg_hdr = msg_hdr;
+ conn->input.rep_message->v.appmsg.conn = conn;
+
+ DB_INIT_DBT(conn->input.rep_message->v.appmsg.buf,
+ (u_int8_t*)membase + size,
+ APP_MSG_BUFFER_SIZE(msg_hdr));
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.appmsg.buf);
+ break;
+
+ case REPMGR_OWN_MSG:
+ size = sizeof(REPMGR_MESSAGE) + REPMGR_OWN_BUF_SIZE(msg_hdr);
+ if ((ret = __os_malloc(env, size, &membase)) != 0)
+ return (ret);
+ conn->input.rep_message = membase;
+ conn->input.rep_message->msg_hdr = msg_hdr;
+
+ /*
+ * The "conn" pointer is saved into the message later, in
+ * dispatch_msgin(). An OWN msg that arrives in PARAMETERS state has
+ * bypassed the final handshake, implying that this connection is to
+ * be used for a one-shot GMDB request; in that case the message keeps
+ * a reference to the connection, otherwise the pointer is cleared.
+ */
+ if (REPMGR_OWN_BUF_SIZE(msg_hdr) == 0) {
+ __db_errx(env, DB_STR_A("3680",
+ "invalid own buf size %lu in prepare_input", "%lu"),
+ (u_long)REPMGR_OWN_BUF_SIZE(msg_hdr));
+ return (DB_REP_UNAVAIL);
+ }
+ DB_INIT_DBT(conn->input.rep_message->v.gmdb_msg.request,
+ (u_int8_t*)membase + sizeof(REPMGR_MESSAGE),
+ REPMGR_OWN_BUF_SIZE(msg_hdr));
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.gmdb_msg.request);
+ break;
+
+ case REPMGR_APP_RESPONSE:
+ size = APP_RESP_BUFFER_SIZE(msg_hdr);
+ conn->cur_resp = APP_RESP_TAG(msg_hdr);
+ if (conn->cur_resp >= conn->aresp) {
+ __db_errx(env, DB_STR_A("3681",
+ "invalid cur resp %lu in prepare_input", "%lu"),
+ (u_long)conn->cur_resp);
+ return (DB_REP_UNAVAIL);
+ }
+ resp = &conn->responses[conn->cur_resp];
+ DB_ASSERT(env, F_ISSET(resp, RESP_IN_USE));
+
+ dbt = &resp->dbt;
+
+ /*
+ * Prepare to read message body into either the user-supplied
+ * buffer, or one we allocate here.
+ */
+ ret = 0;
+ if (!F_ISSET(resp, RESP_THREAD_WAITING)) {
+ /* Caller already timed out; allocate dummy buffer. */
+ if (size > 0) {
+ memset(dbt, 0, sizeof(*dbt));
+ ret = __os_malloc(env, size, &dbt->data);
+ F_SET(resp, RESP_DUMMY_BUF);
+ } else
+ F_CLR(resp, RESP_IN_USE);
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC))
+ ret = __os_umalloc(env, size, &dbt->data);
+ else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if (dbt->data == NULL || dbt->size < size)
+ ret = __os_urealloc(env, size, &dbt->data);
+ } else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ /* Recipient should have checked size limit. */
+ DB_ASSERT(env, size <= dbt->ulen);
+ }
+ dbt->size = size;
+ if (ret != 0)
+ return (ret);
+
+ if (size > 0) {
+ __repmgr_add_dbt(&conn->iovecs, dbt);
+ F_SET(resp, RESP_READING);
+ } else {
+ skip = TRUE;
+ if (F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+ if ((ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0)
+ return (ret);
+ }
+ }
+ break;
+
+ case REPMGR_RESP_ERROR:
+ DB_ASSERT(env, RESP_ERROR_TAG(msg_hdr) < conn->aresp &&
+ conn->responses != NULL);
+ resp = &conn->responses[RESP_ERROR_TAG(msg_hdr)];
+ DB_ASSERT(env, !F_ISSET(resp, RESP_READING));
+ if (F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+
+ /*
+ * DB errors are always negative, but we only send
+ * unsigned values on the wire.
+ */
+ resp->ret = -((int)RESP_ERROR_CODE(msg_hdr));
+ if ((ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0)
+ return (ret);
+ } else
+ F_CLR(resp, RESP_IN_USE);
+ skip = TRUE;
+ break;
+
+ case REPMGR_HANDSHAKE:
+ case REPMGR_PERMLSN:
+ if ((ret = __repmgr_prepare_simple_input(env,
+ conn, &msg_hdr)) != 0)
+ return (ret);
+ break;
+
+ default:
+ __db_errx(env, DB_STR_A("3676",
+ "unexpected msg type %lu in prepare_input", "%lu"),
+ (u_long)conn->msg_type);
+ return (DB_REP_UNAVAIL);
+ }
+
+ if (skip) {
+ /*
+ * We can skip the DATA_PHASE, because the current message type
+ * only has a header, no following data.
+ */
+ __repmgr_reset_for_reading(conn);
+ } else
+ conn->reading_phase = DATA_PHASE;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_prepare_simple_input __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, __repmgr_msg_hdr_args *));
+ */
+int
+__repmgr_prepare_simple_input(env, conn, msg_hdr)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ __repmgr_msg_hdr_args *msg_hdr;
+{
+ DBT *dbt;
+ u_int32_t control_size, rec_size;
+ int ret;
+
+ control_size = REP_MSG_CONTROL_SIZE(*msg_hdr);
+ rec_size = REP_MSG_REC_SIZE(*msg_hdr);
+
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if ((dbt->size = control_size) > 0) {
+ if ((ret = __os_malloc(env,
+ dbt->size, &dbt->data)) != 0)
+ return (ret);
+ __repmgr_add_dbt(&conn->iovecs, dbt);
+ }
+
+ dbt = &conn->input.repmgr_msg.rec;
+ if ((dbt->size = rec_size) > 0) {
+ if ((ret = __os_malloc(env,
+ dbt->size, &dbt->data)) != 0) {
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ return (ret);
+ }
+ __repmgr_add_dbt(&conn->iovecs, dbt);
+ }
+ return (0);
+}
+
+/*
+ * Processes an incoming message, depending on our current state.
+ *
+ * Caller must hold mutex.
+ */
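+/*
+ * Connection states, as used below: CONN_CONNECTED is an outgoing connection
+ * awaiting the version response; CONN_NEGOTIATE is an incoming connection
+ * awaiting the initial proposal; in CONN_PARAMETERS a version has been agreed
+ * and we await the parameters handshake (or a one-shot GMDB request);
+ * CONN_READY and CONN_CONGESTED are fully established.
+ */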
+static int
+dispatch_msgin(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RUNNABLE *th;
+ REPMGR_RESPONSE *resp;
+ DBT *dbt;
+ char *hostname;
+ int eid, ret;
+
+ DB_ASSERT(env, conn->reading_phase == DATA_PHASE);
+ db_rep = env->rep_handle;
+
+ switch (conn->state) {
+ case CONN_CONNECTED:
+ /*
+ * In this state, we know we're working with an outgoing
+ * connection. We've sent a version proposal, and now expect
+ * the response (which could be a dumb old V1 handshake).
+ */
+ ONLY_HANDSHAKE(env, conn);
+
+ /*
+ * This is a good opportunity to clean up this site's connector
+ * thread: we generally come through here soon after making an
+ * outgoing connection, and since this handshake processing happens
+ * at most once per connection, the cost of the join is rarely
+ * incurred.
+ */
+ eid = conn->eid;
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(conn->eid));
+ site = SITE_FROM_EID(eid);
+ th = site->connector;
+ if (th != NULL && th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ __os_free(env, th);
+ site->connector = NULL;
+ }
+
+ if ((ret = read_version_response(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case CONN_NEGOTIATE:
+ /*
+ * Since we're in this state, we know we're working with an
+ * incoming connection, and this is the first message we've
+ * received. So it must be a version negotiation proposal (or a
+ * legacy V1 handshake). (We'll verify this of course.)
+ */
+ ONLY_HANDSHAKE(env, conn);
+ if ((ret = send_version_response(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case CONN_PARAMETERS:
+ /*
+ * We've previously agreed on a (>1) version, so we expect
+ * either the other side's parameters handshake, or possibly a
+ * GMDB request on a one-shot, dedicated connection.
+ */
+ switch (conn->msg_type) {
+ case REPMGR_HANDSHAKE:
+ dbt = &conn->input.repmgr_msg.rec;
+ hostname = dbt->data;
+ hostname[dbt->size-1] = '\0';
+ if ((ret = accept_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ conn->state = CONN_READY;
+ break;
+ case REPMGR_OWN_MSG:
+ /*
+ * GM change requests arrive in their own dedicated
+ * connections, and when they're served the entire
+ * connection isn't needed any more. So the message
+ * processing thread will do the entire job of serving
+ * the request and finishing off the connection; so we
+ * don't have to read it any more. Note that normally
+ * whenever we remove a connection from our list we
+ * decrement the reference count; but we also increment
+ * it whenever we pass a reference over to the message
+ * processing threads' queue. So in this case it's a
+ * wash.
+ */
+ conn->input.rep_message->v.gmdb_msg.conn = conn;
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ if ((ret = __repmgr_queue_put(env,
+ conn->input.rep_message)) != 0)
+ return (ret);
+ break;
+
+ default:
+ __db_errx(env, DB_STR_A("3620",
+ "unexpected msg type %d in PARAMETERS state", "%d"),
+ (int)conn->msg_type);
+ return (DB_REP_UNAVAIL);
+ }
+
+ break;
+
+ case CONN_READY:
+ case CONN_CONGESTED:
+ /*
+ * We have a complete message, so process it. Acks and
+ * handshakes get processed here, in line. Regular rep messages
+ * get posted to a queue, to be handled by a thread from the
+ * message thread pool.
+ */
+ switch (conn->msg_type) {
+ case REPMGR_PERMLSN:
+ if ((ret = record_permlsn(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case REPMGR_HEARTBEAT:
+ case REPMGR_APP_MESSAGE:
+ case REPMGR_REP_MESSAGE:
+ if ((ret = __repmgr_queue_put(env,
+ conn->input.rep_message)) != 0)
+ return (ret);
+ /*
+ * The queue has taken over responsibility for the
+ * rep_message buffer, and will free it later.
+ */
+ if (conn->msg_type == REPMGR_APP_MESSAGE)
+ conn->ref_count++;
+ break;
+
+ case REPMGR_OWN_MSG:
+ /*
+ * Since we're in one of the "ready" states we know this
+ * isn't a one-shot request, so we are not giving
+ * ownership of this connection over to the message
+ * thread queue; we're going to keep reading on it
+ * ourselves. The message thread that processes this
+ * request has no need for a connection anyway, since
+ * there is no response that needs to be returned.
+ */
+ conn->input.rep_message->v.gmdb_msg.conn = NULL;
+ if ((ret = process_own_msg(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case REPMGR_APP_RESPONSE:
+ DB_ASSERT(env, conn->cur_resp < conn->aresp &&
+ conn->responses != NULL);
+ resp = &conn->responses[conn->cur_resp];
+ DB_ASSERT(env, F_ISSET(resp, RESP_READING));
+ F_CLR(resp, RESP_READING);
+ if (F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+ if ((ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0)
+ return (ret);
+ } else {
+ /*
+ * If the calling thread is no longer with us,
+ * yet we're reading, it can only mean we're
+ * reading into a dummy buffer, so free it now.
+ */
+ DB_ASSERT(env, F_ISSET(resp, RESP_DUMMY_BUF));
+ __os_free(env, resp->dbt.data);
+ F_CLR(resp, RESP_IN_USE);
+ }
+ break;
+
+ case REPMGR_RESP_ERROR:
+ default:
+ __db_errx(env, DB_STR_A("3621",
+ "unexpected msg type rcvd in ready state: %d",
+ "%d"), (int)conn->msg_type);
+ return (DB_REP_UNAVAIL);
+ }
+ break;
+
+ case CONN_DEFUNCT:
+ break;
+
+ default:
+ DB_ASSERT(env, FALSE);
+ }
+
+ switch (conn->msg_type) {
+ case REPMGR_HANDSHAKE:
+ case REPMGR_PERMLSN:
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ dbt = &conn->input.repmgr_msg.rec;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ break;
+ default:
+ /*
+ * For the remaining message types buffer ownership has already
+ * been settled above: rep and app messages were handed to the
+ * message queue, OWN messages were queued or freed in
+ * process_own_msg(), and responses were read directly into the
+ * waiting thread's buffer (or a dummy buffer).
+ */
+ break;
+ }
+ __repmgr_reset_for_reading(conn);
+ return (0);
+}
+
+/*
+ * Process one of repmgr's "own" message types, and one that occurs on a regular
+ * (not one-shot) connection.
+ */
+static int
+process_own_msg(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ DBT *dbt;
+ REPMGR_SITE *site;
+ REPMGR_MESSAGE *msg;
+ __repmgr_connect_reject_args reject;
+ __repmgr_parm_refresh_args parms;
+ int ret;
+
+ ret = 0;
+ /*
+ * Set "msg" to point to the message struct. If we do all necessary
+ * processing here now, leave it set so that it can be freed. On the
+ * other hand, if we pass it off to the message queue for later
+ * processing by a message thread, we want to avoid freeing the memory
+ * here, so clear the pointer in such a case.
+ */
+ switch (REPMGR_OWN_MSG_TYPE((msg = conn->input.rep_message)->msg_hdr)) {
+ case REPMGR_CONNECT_REJECT:
+ dbt = &msg->v.gmdb_msg.request;
+ if ((ret = __repmgr_connect_reject_unmarshal(env,
+ &reject, dbt->data, dbt->size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+
+ /*
+ * If we're being rejected by someone who has more up-to-date
+ * membership information than we do, it means we have been
+ * removed from the group. If we've just gotten started, we can
+ * make one attempt at automatically rejoining; otherwise we bow
+ * out gracefully.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "got rejection msg citing version %lu/%lu",
+ (u_long)reject.gen, (u_long)reject.version));
+
+ if (__repmgr_gmdb_version_cmp(env,
+ reject.gen, reject.version) > 0) {
+ if (env->rep_handle->seen_repmsg)
+ ret = DB_DELETED;
+ else if ((ret = __repmgr_defer_op(env,
+ REPMGR_REJOIN)) == 0)
+ ret = DB_REP_UNAVAIL;
+ } else
+ ret = DB_REP_UNAVAIL;
+ DB_ASSERT(env, ret != 0);
+ return (ret);
+
+ case REPMGR_SHARING:
+ if ((ret = __repmgr_queue_put(env, msg)) != 0)
+ return (ret);
+ /* Show that we no longer own this memory. */
+ msg = NULL;
+ break;
+
+ case REPMGR_PARM_REFRESH:
+ dbt = &conn->input.rep_message->v.gmdb_msg.request;
+ if ((ret = __repmgr_parm_refresh_unmarshal(env,
+ &parms, dbt->data, dbt->size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, conn->type == REP_CONNECTION &&
+ IS_KNOWN_REMOTE_SITE(conn->eid));
+ site = SITE_FROM_EID(conn->eid);
+ site->ack_policy = (int)parms.ack_policy;
+ if (F_ISSET(&parms, ELECTABLE_SITE))
+ F_SET(site, SITE_ELECTABLE);
+ else
+ F_CLR(site, SITE_ELECTABLE);
+ F_SET(site, SITE_HAS_PRIO);
+ break;
+
+ case REPMGR_GM_FAILURE:
+ case REPMGR_GM_FORWARD:
+ case REPMGR_JOIN_REQUEST:
+ case REPMGR_JOIN_SUCCESS:
+ case REPMGR_REMOVE_REQUEST:
+ case REPMGR_RESOLVE_LIMBO:
+ default:
+ __db_errx(env, DB_STR_A("3677",
+ "unexpected msg type %lu in process_own_msg", "%lu"),
+ (u_long)REPMGR_OWN_MSG_TYPE(msg->msg_hdr));
+ return (DB_REP_UNAVAIL);
+ }
+ /*
+ * If we haven't given ownership of the msg buffer to another thread,
+ * free it now.
+ */
+ if (msg != NULL)
+ __os_free(env, msg);
+ return (ret);
+}
+
+/*
+ * Examine and verify the incoming version proposal message, and send an
+ * appropriate response.
+ */
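+/*
+ * The proposal carries the [min, max] range of protocol versions the
+ * initiator can speak.  We confirm the highest mutually supported version:
+ * our own DB_REPMGR_VERSION if it falls within the proposed range, otherwise
+ * the proposal's max if we can still speak it.
+ */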
+static int
+send_version_response(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ __repmgr_version_proposal_args versions;
+ __repmgr_version_confirmation_args conf;
+ repmgr_netaddr_t *my_addr;
+ char *hostname;
+ u_int8_t buf[__REPMGR_VERSION_CONFIRMATION_SIZE+1];
+ DBT vi;
+ int ret;
+
+ db_rep = env->rep_handle;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ return (ret);
+ if (vi.size == 0) {
+ /* No version info, so we must be talking to a v1 site. */
+ hostname = conn->input.repmgr_msg.rec.data;
+ if ((ret = accept_v1_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ if ((ret = __repmgr_send_v1_handshake(env,
+ conn, my_addr->host, strlen(my_addr->host) + 1)) != 0)
+ return (ret);
+ conn->state = CONN_READY;
+ } else {
+ if ((ret = __repmgr_version_proposal_unmarshal(env,
+ &versions, vi.data, vi.size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+
+ if (DB_REPMGR_VERSION >= versions.min &&
+ DB_REPMGR_VERSION <= versions.max)
+ conf.version = DB_REPMGR_VERSION;
+ else if (versions.max >= DB_REPMGR_MIN_VERSION &&
+ versions.max <= DB_REPMGR_VERSION)
+ conf.version = versions.max;
+ else {
+ /*
+ * User must have wired up a combination of versions
+ * exceeding what we said we'd support.
+ */
+ __db_errx(env, DB_STR_A("3622",
+ "No available version between %lu and %lu",
+ "%lu %lu"), (u_long)versions.min,
+ (u_long)versions.max);
+ return (DB_REP_UNAVAIL);
+ }
+ conn->version = conf.version;
+
+ __repmgr_version_confirmation_marshal(env, &conf, buf);
+ buf[__REPMGR_VERSION_CONFIRMATION_SIZE] = '\0';
+ DB_ASSERT(env, !IS_SUBORDINATE(db_rep));
+ if ((ret = __repmgr_send_handshake(env,
+ conn, buf, sizeof(buf), 0)) != 0)
+ return (ret);
+
+ conn->state = CONN_PARAMETERS;
+ }
+ return (ret);
+}
+
+/*
+ * Sends a version-aware handshake to the remote site, only after we've verified
+ * that it is indeed version-aware. We can send a v2, v3 or v4 handshake,
+ * depending on the connection's version.
+ *
+ * PUBLIC: int __repmgr_send_handshake __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, void *, size_t, u_int32_t));
+ */
+int
+__repmgr_send_handshake(env, conn, opt, optlen, flags)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *opt;
+ size_t optlen;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DBT cntrl, rec;
+ __repmgr_handshake_args hs;
+ __repmgr_v2handshake_args v2hs;
+ __repmgr_v3handshake_args v3hs;
+ repmgr_netaddr_t *my_addr;
+ size_t hostname_len, rec_len;
+ void *buf;
+ u_int8_t *p;
+ u_int32_t cntrl_len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ /*
+ * The cntrl part has various parameters (varies by version). The rec
+ * part has the host name, followed by whatever optional extra data was
+ * passed to us.
+ *
+ * Version awareness was introduced with protocol version 2 (so version
+ * 1 is handled elsewhere).
+ */
+ switch (conn->version) {
+ case 2:
+ cntrl_len = __REPMGR_V2HANDSHAKE_SIZE;
+ break;
+ case 3:
+ cntrl_len = __REPMGR_V3HANDSHAKE_SIZE;
+ break;
+ case 4:
+ cntrl_len = __REPMGR_HANDSHAKE_SIZE;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("3678",
+ "unexpected conn version %lu in send_handshake", "%lu"),
+ (u_long)conn->version);
+ return (DB_REP_UNAVAIL);
+ }
+ hostname_len = strlen(my_addr->host);
+ rec_len = hostname_len + 1 +
+ (opt == NULL ? 0 : optlen);
+
+ if ((ret = __os_malloc(env, cntrl_len + rec_len, &buf)) != 0)
+ return (ret);
+
+ cntrl.data = p = buf;
+ switch (conn->version) {
+ case 2:
+ /* Not allowed to use multi-process feature in v2 group. */
+ DB_ASSERT(env, !IS_SUBORDINATE(db_rep));
+ v2hs.port = my_addr->port;
+ v2hs.priority = rep->priority;
+ __repmgr_v2handshake_marshal(env, &v2hs, p);
+ break;
+ case 3:
+ v3hs.port = my_addr->port;
+ v3hs.priority = rep->priority;
+ v3hs.flags = flags;
+ __repmgr_v3handshake_marshal(env, &v3hs, p);
+ break;
+ case 4:
+ hs.port = my_addr->port;
+ hs.alignment = MEM_ALIGN;
+ hs.ack_policy = (u_int32_t)rep->perm_policy;
+ hs.flags = flags;
+ if (rep->priority > 0)
+ F_SET(&hs, ELECTABLE_SITE);
+ __repmgr_handshake_marshal(env, &hs, p);
+ break;
+ default:
+ DB_ASSERT(env, FALSE);
+ break;
+ }
+ cntrl.size = cntrl_len;
+
+ p = rec.data = &p[cntrl_len];
+ (void)strcpy((char*)p, my_addr->host);
+ p += hostname_len + 1;
+ if (opt != NULL) {
+ memcpy(p, opt, optlen);
+ p += optlen;
+ }
+ rec.size = (u_int32_t)(p - (u_int8_t*)rec.data);
+
+ /* Never block on select thread: pass maxblock as 0. */
+ ret = __repmgr_send_one(env,
+ conn, REPMGR_HANDSHAKE, &cntrl, &rec, 0);
+ __os_free(env, buf);
+ return (ret);
+}
+
+static int
+read_version_response(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ __repmgr_version_confirmation_args conf;
+ DBT vi;
+ char *hostname;
+ u_int32_t flags;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ return (ret);
+ hostname = conn->input.repmgr_msg.rec.data;
+ if (vi.size == 0) {
+ if ((ret = accept_v1_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __repmgr_version_confirmation_unmarshal(env,
+ &conf, vi.data, vi.size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ if (conf.version >= DB_REPMGR_MIN_VERSION &&
+ conf.version <= DB_REPMGR_VERSION)
+ conn->version = conf.version;
+ else {
+ /*
+ * Remote site "confirmed" a version outside of the
+ * range we proposed. It should never do that.
+ */
+ __db_errx(env, DB_STR_A("3623",
+ "Can't support confirmed version %lu", "%lu"),
+ (u_long)conf.version);
+ return (DB_REP_UNAVAIL);
+ }
+
+ if ((ret = accept_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ flags = IS_SUBORDINATE(db_rep) ? REPMGR_SUBORDINATE : 0;
+ if ((ret = __repmgr_send_handshake(env,
+ conn, NULL, 0, flags)) != 0)
+ return (ret);
+ }
+ conn->state = CONN_READY;
+ return (ret);
+}
+
+/*
+ * Examine the rec part of a handshake message to see if it has any version
+ * information in it. This is the magic that allows version-aware sites
+ * to exchange information, and yet avoids tripping up v1 sites, which don't
+ * know how to look for it.
+ *
+ * PUBLIC: int __repmgr_find_version_info __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, DBT *));
+ */
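+/*
+ * rec layout, as implied by the parsing below:
+ *
+ *	host name  '\0'  [marshaled version info]  [pad byte]
+ *
+ * A v1 site sends only the NUL-terminated host name; version-aware sites
+ * append marshaled version data plus a trailing pad byte, which this
+ * function overwrites when it NUL-terminates the buffer.
+ */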
+int
+__repmgr_find_version_info(env, conn, vi)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ DBT *vi;
+{
+ DBT *dbt;
+ char *hostname;
+ u_int32_t hostname_len;
+
+ dbt = &conn->input.repmgr_msg.rec;
+ if (dbt->size == 0) {
+ __db_errx(env, DB_STR("3624",
+ "handshake is missing rec part"));
+ return (DB_REP_UNAVAIL);
+ }
+ hostname = dbt->data;
+ hostname[dbt->size-1] = '\0';
+ hostname_len = (u_int32_t)strlen(hostname);
+ if (hostname_len + 1 == dbt->size) {
+ /*
+ * The rec DBT held only the host name. This is a simple legacy
+ * V1 handshake; it contains no version information.
+ */
+ vi->size = 0;
+ } else {
+ /*
+ * There's more data than just the host name. The remainder is
+ * available to be treated as a normal byte buffer (and read in
+ * by one of the unmarshal functions). Note that the remaining
+ * length should not include the padding byte that we have
+ * already clobbered.
+ */
+ vi->data = &((u_int8_t *)dbt->data)[hostname_len + 1];
+ vi->size = (dbt->size - (hostname_len+1)) - 1;
+ }
+ return (0);
+}
+
+static int
+accept_handshake(env, conn, hostname)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ char *hostname;
+{
+ __repmgr_handshake_args hs;
+ __repmgr_v2handshake_args hs2;
+ __repmgr_v3handshake_args hs3;
+ u_int port;
+ u_int32_t ack, flags;
+ int electable;
+
+ switch (conn->version) {
+ case 2:
+ if (__repmgr_v2handshake_unmarshal(env, &hs2,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL) != 0)
+ return (DB_REP_UNAVAIL);
+ port = hs2.port;
+ electable = hs2.priority > 0;
+ ack = flags = 0;
+ break;
+ case 3:
+ if (__repmgr_v3handshake_unmarshal(env, &hs3,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL) != 0)
+ return (DB_REP_UNAVAIL);
+ port = hs3.port;
+ electable = hs3.priority > 0;
+ flags = hs3.flags;
+ ack = 0;
+ break;
+ case 4:
+ if (__repmgr_handshake_unmarshal(env, &hs,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL) != 0)
+ return (DB_REP_UNAVAIL);
+ port = hs.port;
+ electable = F_ISSET(&hs, ELECTABLE_SITE);
+ flags = hs.flags;
+ ack = hs.ack_policy;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("3679",
+ "unexpected conn version %lu in accept_handshake", "%lu"),
+ (u_long)conn->version);
+ return (DB_REP_UNAVAIL);
+ }
+
+ return (process_parameters(env,
+ conn, hostname, port, ack, electable, flags));
+}
+
+static int
+accept_v1_handshake(env, conn, hostname)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ char *hostname;
+{
+ DB_REPMGR_V1_HANDSHAKE *handshake;
+ u_int32_t prio;
+ int electable;
+
+ handshake = conn->input.repmgr_msg.cntrl.data;
+ if (conn->input.repmgr_msg.cntrl.size != sizeof(*handshake) ||
+ handshake->version != 1) {
+ __db_errx(env, DB_STR("3625", "malformed V1 handshake"));
+ return (DB_REP_UNAVAIL);
+ }
+
+ conn->version = 1;
+ prio = ntohl(handshake->priority);
+ electable = prio > 0;
+ return (process_parameters(env,
+ conn, hostname, handshake->port, 0, electable, 0));
+}
+
+/* Caller must hold mutex. */
+static int
+process_parameters(env, conn, host, port, ack, electable, flags)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ char *host;
+ u_int port;
+ int electable;
+ u_int32_t ack, flags;
+{
+ DB_REP *db_rep;
+ REPMGR_RETRY *retry;
+ REPMGR_SITE *site;
+ __repmgr_connect_reject_args reject;
+ u_int8_t reject_buf[__REPMGR_CONNECT_REJECT_SIZE];
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+
+ /* Connection state can be used to discern incoming versus outgoing. */
+ if (conn->state == CONN_CONNECTED) {
+ /*
+ * Since we initiated this as an outgoing connection, we
+ * obviously already know the host, port and site. We just need
+ * the other site's electability flag (which we'll grab below,
+ * after the big "else" clause).
+ */
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(conn->eid));
+ site = SITE_FROM_EID(conn->eid);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from connection to %s:%lu EID %u",
+ site->net_addr.host,
+ (u_long)site->net_addr.port, conn->eid));
+ } else {
+ DB_ASSERT(env, conn->state == CONN_NEGOTIATE ||
+ conn->state == CONN_PARAMETERS);
+ /*
+ * Incoming connection: until now we haven't known what kind of
+ * connection we're dealing with (and in the case of a
+ * REP_CONNECTION, what its EID is); so it must be on the
+ * "orphans" list. But now that we've received the parameters
+ * we'll be able to figure all that out.
+ */
+ if (LF_ISSET(APP_CHANNEL_CONNECTION)) {
+ conn->type = APP_CONNECTION;
+ return (0);
+ } else
+ conn->type = REP_CONNECTION;
+
+ /*
+ * Now that we've been given the host and port, use them to find
+ * the site.
+ */
+ if ((site = __repmgr_lookup_site(env, host, port)) != NULL &&
+ site->membership == SITE_PRESENT) {
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ conn->ref_count--;
+
+ eid = EID_FROM_SITE(site);
+ if (LF_ISSET(REPMGR_SUBORDINATE)) {
+ /*
+ * Accept it, as a supplementary source of
+ * input, but nothing else.
+ */
+ TAILQ_INSERT_TAIL(&site->sub_conns,
+ conn, entries);
+ conn->eid = eid;
+ } else {
+ DB_EVENT(env,
+ DB_EVENT_REP_CONNECT_ESTD, &eid);
+ switch (site->state) {
+ case SITE_PAUSING:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from paused site %s:%u EID %u",
+ host, port, eid));
+ retry = site->ref.retry;
+ TAILQ_REMOVE(&db_rep->retries,
+ retry, entries);
+ __os_free(env, retry);
+ break;
+ case SITE_CONNECTED:
+ /*
+ * We got an incoming connection for a
+ * site we were already connected to; at
+ * least we thought we were.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "connection from %s:%u EID %u while already connected",
+ host, port, eid));
+ if ((ret = resolve_collision(env,
+ site, conn)) != 0)
+ return (ret);
+ break;
+ case SITE_CONNECTING:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from connecting site %s:%u EID %u",
+ host, port, eid));
+ /*
+ * Connector thread will give up when it
+ * sees this site's state change, so we
+ * don't have to do anything else here.
+ */
+ break;
+ default:
+ DB_ASSERT(env, FALSE);
+ }
+ conn->eid = eid;
+ site->state = SITE_CONNECTED;
+ site->ref.conn.in = conn;
+ __os_gettime(env,
+ &site->last_rcvd_timestamp, 1);
+ }
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "rejecting connection from unknown or provisional site %s:%u",
+ host, port));
+ reject.version = db_rep->membership_version;
+ reject.gen = db_rep->member_version_gen;
+ __repmgr_connect_reject_marshal(env,
+ &reject, reject_buf);
+
+ if ((ret = __repmgr_send_own_msg(env, conn,
+ REPMGR_CONNECT_REJECT, reject_buf,
+ __REPMGR_CONNECT_REJECT_SIZE)) != 0)
+ return (ret);
+
+ /*
+ * Since we haven't set conn->eid, bust_connection will
+ * not schedule a retry for this "failure", which is
+ * exactly what we want.
+ */
+ return (DB_REP_UNAVAIL);
+ }
+ }
+
+ if (electable)
+ F_SET(site, SITE_ELECTABLE);
+ else
+ F_CLR(site, SITE_ELECTABLE);
+ F_SET(site, SITE_HAS_PRIO);
+ site->ack_policy = (int)ack;
+
+ /*
+ * If we're moping around wishing we knew who the master was, then
+ * getting in touch with another site might finally provide sufficient
+ * connectivity to find out.
+ */
+ if (!IS_SUBORDINATE(db_rep) && /* us */
+ !__repmgr_master_is_known(env) &&
+ !LF_ISSET(REPMGR_SUBORDINATE)) { /* the remote site */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake with no known master to wake election thread"));
+ db_rep->new_connection = TRUE;
+ if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+static int
+resolve_collision(env, site, conn)
+ ENV *env;
+ REPMGR_SITE *site;
+ REPMGR_CONNECTION *conn;
+{
+ int ret;
+
+ /*
+ * No need for site-oriented recovery, since we now have a replacement
+ * connection; so skip bust_connection() and call disable_conn()
+ * directly.
+ *
+ * If we already had an incoming connection, this new one always
+ * replaces it. Whether it also/alternatively replaces an outgoing
+ * connection depends on whether we're client or server (so as to avoid
+ * connection collisions resulting in no remaining connections). (If
+ * it's an older version that doesn't know about our collision
+ * resolution protocol, it will behave like a client.)
+ */
+ if (site->ref.conn.in != NULL) {
+ ret = __repmgr_disable_connection(env, site->ref.conn.in);
+ site->ref.conn.in = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+ if (site->ref.conn.out != NULL &&
+ conn->version >= CONN_COLLISION_VERSION &&
+ __repmgr_is_server(env, site)) {
+ ret = __repmgr_disable_connection(env, site->ref.conn.out);
+ site->ref.conn.out = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+ return (0);
+}
+
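+/*
+ * Handles an incoming PERMLSN (ack) message: records the acknowledged LSN if
+ * it improves on the site's previous ack for the current generation, and
+ * wakes any threads waiting for acknowledgments.
+ */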
+static int
+record_permlsn(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ __repmgr_permlsn_args *ackp, ack;
+ SITE_STRING_BUFFER location;
+ u_int32_t gen;
+ int ret;
+ u_int do_log_check;
+
+ db_rep = env->rep_handle;
+ do_log_check = 0;
+
+ if (conn->version == 0 ||
+ !IS_READY_STATE(conn->state) || !IS_VALID_EID(conn->eid)) {
+ __db_errx(env, DB_STR("3682",
+ "unexpected connection info in record_permlsn"));
+ return (DB_REP_UNAVAIL);
+ }
+ site = SITE_FROM_EID(conn->eid);
+
+ /*
+ * Extract the LSN. Save it only if it is an improvement over what the
+ * site has already ack'ed.
+ */
+ if (conn->version == 1) {
+ ackp = conn->input.repmgr_msg.cntrl.data;
+ if (conn->input.repmgr_msg.cntrl.size != sizeof(ack) ||
+ conn->input.repmgr_msg.rec.size != 0) {
+ __db_errx(env, DB_STR("3627", "bad ack msg size"));
+ return (DB_REP_UNAVAIL);
+ }
+ } else {
+ ackp = &ack;
+ if ((ret = __repmgr_permlsn_unmarshal(env, ackp,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ }
+
+ /* Ignore stale acks. */
+ gen = db_rep->region->gen;
+ if (ackp->generation < gen) {
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ignoring stale ack (%lu<%lu), from %s",
+ (u_long)ackp->generation, (u_long)gen,
+ __repmgr_format_site_loc(site, location)));
+ return (0);
+ }
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "got ack [%lu][%lu](%lu) from %s", (u_long)ackp->lsn.file,
+ (u_long)ackp->lsn.offset, (u_long)ackp->generation,
+ __repmgr_format_site_loc(site, location)));
+
+ if (ackp->generation == gen &&
+ LOG_COMPARE(&ackp->lsn, &site->max_ack) == 1) {
+ /*
+ * If file number for this site changed, check lowest log
+ * file needed after recording new permlsn for this site.
+ */
+ if (ackp->lsn.file > site->max_ack.file)
+ do_log_check = 1;
+ memcpy(&site->max_ack, &ackp->lsn, sizeof(DB_LSN));
+ if (do_log_check)
+ check_min_log_file(env);
+ if ((ret = __repmgr_wake_waiters(env,
+ &db_rep->ack_waiters)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * Maintains lowest log file still needed by the repgroup. This is stored
+ * in shared rep region so that it is accessible to repmgr subordinate
+ * processes that may not themselves have connections to other sites
+ * (e.g. a separate db_archive process.)
+ */
+static void
+check_min_log_file(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ u_int32_t min_log;
+ int eid;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ min_log = 0;
+
+ /*
+ * Record the lowest log file number from all connected sites. If this
+ * is a client, ignore the master, because the master does not maintain
+ * nor send out its repmgr perm LSN in this way. Only consider connected
+ * sites, so that a site that has been down a long time cannot
+ * indefinitely prevent log archiving.
+ */
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ if (eid == rep->master_id)
+ continue;
+ site = SITE_FROM_EID(eid);
+ if (site->state == SITE_CONNECTED &&
+ (((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY) ||
+ ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY)) &&
+ !IS_ZERO_LSN(site->max_ack) &&
+ (min_log == 0 || site->max_ack.file < min_log))
+ min_log = site->max_ack.file;
+ }
+ /*
+ * During normal operation min_log should increase over time, but if a
+ * site returns after being disconnected for a while, min_log can
+ * decrease.
+ */
+ if (min_log != 0 && min_log != rep->min_log_file)
+ rep->min_log_file = min_log;
+}
+
+/*
+ * PUBLIC: int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *));
+ */
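+/*
+ * Drains as much of the connection's outbound queue as the non-blocking
+ * socket will accept.  A short write leaves the remainder queued, with
+ * output->offset recording how far into the current message we got.
+ */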
+int
+__repmgr_write_some(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ QUEUED_OUTPUT *output;
+ REPMGR_FLAT *msg;
+ int bytes, ret;
+
+ while (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ output = STAILQ_FIRST(&conn->outbound_queue);
+ msg = output->msg;
+ if ((bytes = sendsocket(conn->fd, &msg->data[output->offset],
+ msg->length - output->offset, 0)) == SOCKET_ERROR) {
+ switch (ret = net_errno) {
+ case WOULDBLOCK:
+#if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK
+ case DB_REPMGR_EAGAIN:
+#endif
+ return (0);
+ default:
+ __repmgr_fire_conn_err_event(env, conn, ret);
+ STAT(env->rep_handle->
+ region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+ }
+ }
+
+ if ((output->offset += (size_t)bytes) >= msg->length) {
+ STAILQ_REMOVE_HEAD(&conn->outbound_queue, entries);
+ __os_free(env, output);
+ conn->out_queue_length--;
+ if (--msg->ref_count <= 0)
+ __os_free(env, msg);
+
+ /*
+ * We've achieved enough movement to free up at least
+ * one space in the outgoing queue. Wake any message
+ * threads that may be waiting for space. Leave
+ * CONGESTED state so that when the queue reaches the
+ * high-water mark again, the filling thread will be
+ * allowed to try waiting again.
+ */
+ conn->state = CONN_READY;
+ if ((ret = __repmgr_signal(&conn->drained)) != 0)
+ return (ret);
+ }
+ }
+
+ return (0);
+}
diff --git a/src/repmgr/repmgr_stat.c b/src/repmgr/repmgr_stat.c
new file mode 100644
index 00000000..fd6dabd3
--- /dev/null
+++ b/src/repmgr/repmgr_stat.c
@@ -0,0 +1,363 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_STATISTICS
+static int __repmgr_print_all __P((ENV *, u_int32_t));
+static int __repmgr_print_sites __P((ENV *));
+static int __repmgr_print_stats __P((ENV *, u_int32_t));
+static int __repmgr_stat __P((ENV *, DB_REPMGR_STAT **, u_int32_t));
+
+/*
+ * __repmgr_stat_pp --
+ * DB_ENV->repmgr_stat pre/post processing.
+ *
+ * PUBLIC: int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+ */
+int
+__repmgr_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->repmgr_stat", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->repmgr_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ return (__repmgr_stat(env, statp, flags));
+}
+
+/*
+ * __repmgr_stat --
+ * ENV->repmgr_stat.
+ */
+static int
+__repmgr_stat(env, statp, flags)
+ ENV *env;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ DB_REPMGR_STAT *copy, *stats;
+ uintmax_t tmp;
+ int ret;
+
+ db_rep = env->rep_handle;
+ stats = &db_rep->region->mstat;
+
+ *statp = NULL;
+
+ /* Allocate a stat struct to return to the user. */
+ if ((ret = __os_umalloc(env, sizeof(DB_REPMGR_STAT), &copy)) != 0)
+ return (ret);
+
+ memcpy(copy, stats, sizeof(*stats));
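+ /*
+ * st_max_elect_threads describes reserved space rather than a running
+ * counter, so preserve it across a DB_STAT_CLEAR.
+ */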
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ tmp = stats->st_max_elect_threads;
+ memset(stats, 0, sizeof(DB_REPMGR_STAT));
+ stats->st_max_elect_threads = tmp;
+ }
+
+ *statp = copy;
+ return (0);
+}
+
+/*
+ * __repmgr_stat_print_pp --
+ * DB_ENV->repmgr_stat_print pre/post processing.
+ *
+ * PUBLIC: int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__repmgr_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->repmgr_stat_print", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env, "DB_ENV->repmgr_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ return (__repmgr_stat_print(env, flags));
+}
+
+/*
+ * PUBLIC: int __repmgr_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__repmgr_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ if ((ret = __repmgr_print_stats(env, orig_flags)) == 0)
+ ret = __repmgr_print_sites(env);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __repmgr_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+static int
+__repmgr_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REPMGR_STAT *sp;
+ int ret;
+
+ if ((ret = __repmgr_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ __db_dl(env, "Number of PERM messages not acknowledged",
+ (u_long)sp->st_perm_failed);
+ __db_dl(env, "Number of messages queued due to network delay",
+ (u_long)sp->st_msgs_queued);
+ __db_dl(env, "Number of messages discarded due to queue length",
+ (u_long)sp->st_msgs_dropped);
+ __db_dl(env, "Number of existing connections dropped",
+ (u_long)sp->st_connection_drop);
+ __db_dl(env, "Number of failed new connection attempts",
+ (u_long)sp->st_connect_fail);
+ __db_dl(env, "Number of currently active election threads",
+ (u_long)sp->st_elect_threads);
+ __db_dl(env, "Election threads for which space is reserved",
+ (u_long)sp->st_max_elect_threads);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+static int
+__repmgr_print_sites(env)
+ ENV *env;
+{
+ DB_REPMGR_SITE *list;
+ DB_MSGBUF mb;
+ u_int count, i;
+ int ret;
+
+ if ((ret = __repmgr_site_list(env->dbenv, &count, &list)) != 0)
+ return (ret);
+
+ if (count == 0)
+ return (0);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_REPMGR site information:");
+
+ DB_MSGBUF_INIT(&mb);
+ for (i = 0; i < count; ++i) {
+ __db_msgadd(env, &mb, "%s (eid: %d, port: %u",
+ list[i].host, list[i].eid, list[i].port);
+ if (list[i].status != 0)
+ __db_msgadd(env, &mb, ", %sconnected",
+ list[i].status == DB_REPMGR_CONNECTED ? "" : "dis");
+ __db_msgadd(env, &mb, ", %speer",
+ F_ISSET(&list[i], DB_REPMGR_ISPEER) ? "" : "non-");
+ __db_msgadd(env, &mb, ")");
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+
+ __os_ufree(env, list);
+
+ return (0);
+}
+
+/*
+ * __repmgr_print_all --
+ * Display debugging replication manager statistics.
+ */
+static int
+__repmgr_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__repmgr_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__repmgr_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
+
+/*
+ * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ */
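+/*
+ * Returns a snapshot of the remote sites in a single allocation: an array of
+ * DB_REPMGR_SITE structs followed by the host-name strings they point into,
+ * so the caller can release everything with one free.  A minimal caller
+ * sketch, assuming an opened and configured DB_ENV *dbenv:
+ *
+ *	DB_REPMGR_SITE *list;
+ *	u_int count, i;
+ *	if (dbenv->repmgr_site_list(dbenv, &count, &list) == 0) {
+ *		for (i = 0; i < count; i++)
+ *			printf("%s:%u\n", list[i].host, list[i].port);
+ *		free(list);
+ *	}
+ */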
+int
+__repmgr_site_list(dbenv, countp, listp)
+ DB_ENV *dbenv;
+ u_int *countp;
+ DB_REPMGR_SITE **listp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DB_REPMGR_SITE *status;
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_SITE *site;
+ size_t array_size, total_size;
+ int eid, locked, ret;
+ u_int count, i;
+ char *name;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->repmgr_site_list", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ ENV_ENTER(env, ip);
+ if (rep->siteinfo_seq > db_rep->siteinfo_seq)
+ ret = __repmgr_sync_siteaddr(env);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
+ goto err;
+ } else {
+ rep = NULL;
+ locked = FALSE;
+ }
+
+ /* Initialize for empty list or error return. */
+ *countp = 0;
+ *listp = NULL;
+
+ /*
+ * First, add up how much memory we need for the host names, excluding
+ * the local site.
+ */
+ for (i = 0, count = 0, total_size = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+
+ if ((int)i == db_rep->self_eid || site->membership == 0)
+ continue;
+
+ /* Make room for the NUL terminating byte. */
+ total_size += strlen(site->net_addr.host) + 1;
+ count++;
+ }
+ if (count == 0)
+ goto err;
+ array_size = sizeof(DB_REPMGR_SITE) * count;
+ total_size += array_size;
+
+ if ((ret = __os_umalloc(env, total_size, &status)) != 0)
+ goto err;
+
+ /*
+ * Put the storage for the host names after the array of structs. This
+ * way, the caller can free the whole thing in one single operation.
+ */
+ name = (char *)((u_int8_t *)status + array_size);
+ for (eid = 0, i = 0; eid < (int)db_rep->site_cnt; eid++) {
+ site = &db_rep->sites[eid];
+ if (eid == db_rep->self_eid || site->membership == 0)
+ continue;
+
+ /* If we don't have rep, we can't really know EID yet. */
+ status[i].eid = rep ? eid : DB_EID_INVALID;
+
+ status[i].host = name;
+ (void)strcpy(name, site->net_addr.host);
+ name += strlen(name) + 1;
+
+ status[i].port = site->net_addr.port;
+
+ status[i].flags = 0;
+
+ if (FLD_ISSET(site->config, DB_REPMGR_PEER))
+ F_SET(&status[i], DB_REPMGR_ISPEER);
+
+ /*
+ * If we haven't started a communications thread, connection
+ * status is kind of meaningless. This distinction is useful
+ * for calls from the db_stat utility: it could be useful for
+ * db_stat to display known sites with EID; but would be
+ * confusing for it to display "disconnected" if another process
+ * does indeed have a connection established (db_stat can't know
+ * that).
+ */
+ if (db_rep->selector == NULL)
+ status[i].status = 0;
+ else if (site->state != SITE_CONNECTED)
+ status[i].status = DB_REPMGR_DISCONNECTED;
+ else if ((site->ref.conn.in != NULL &&
+ IS_READY_STATE(site->ref.conn.in->state)) ||
+ (site->ref.conn.out != NULL &&
+ IS_READY_STATE(site->ref.conn.out->state)))
+ status[i].status = DB_REPMGR_CONNECTED;
+ else
+ status[i].status = DB_REPMGR_DISCONNECTED;
+
+ i++;
+ }
+
+ *countp = count;
+ *listp = status;
+
+err: if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_stub.c b/src/repmgr/repmgr_stub.c
new file mode 100644
index 00000000..734c2240
--- /dev/null
+++ b/src/repmgr/repmgr_stub.c
@@ -0,0 +1,262 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_REPLICATION_THREADS
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * If the library wasn't compiled with replication support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_norepmgr __P((DB_ENV *));
+
+/*
+ * __db_norepmgr --
+ * Error when a Berkeley DB build doesn't include replication mgr support.
+ */
+static int
+__db_norepmgr(dbenv)
+ DB_ENV *dbenv;
+{
+ __db_errx(dbenv->env, DB_STR("3628",
+ "library build did not include support for the Replication Manager"));
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_close __P((ENV *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_close(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_get_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int *policy;
+{
+ COMPQUIET(policy, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_set_ack_policy __P((DB_ENV *, int));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_set_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int policy;
+{
+ COMPQUIET(policy, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_site
+ * PUBLIC: __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_site(dbenv, host, port, dbsitep, flags)
+ DB_ENV *dbenv;
+ const char *host;
+ u_int port;
+ DB_SITE **dbsitep;
+ u_int32_t flags;
+{
+ COMPQUIET(host, NULL);
+ COMPQUIET(port, 0);
+ COMPQUIET(dbsitep, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_site_by_eid(dbenv, eid, dbsitep)
+ DB_ENV *dbenv;
+ int eid;
+ DB_SITE **dbsitep;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(dbsitep, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_local_site
+ * PUBLIC: __P((DB_ENV *, DB_SITE **));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_local_site(dbenv, dbsitep)
+ DB_ENV *dbenv;
+ DB_SITE **dbsitep;
+{
+ COMPQUIET(dbsitep, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_site_list(dbenv, countp, listp)
+ DB_ENV *dbenv;
+ u_int *countp;
+ DB_REPMGR_SITE **listp;
+{
+ COMPQUIET(countp, NULL);
+ COMPQUIET(listp, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_start(dbenv, nthreads, flags)
+ DB_ENV *dbenv;
+ int nthreads;
+ u_int32_t flags;
+{
+ COMPQUIET(nthreads, 0);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_handle_event(env, event, info)
+ ENV *env;
+ u_int32_t event;
+ void *info;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(event, 0);
+ COMPQUIET(info, NULL);
+
+ /*
+ * It's not an error for this function to be called. Replication calls
+ * this to let repmgr handle events. If repmgr isn't part of the build,
+ * all replication events should be forwarded to the application.
+ */
+ return (DB_EVENT_NOT_HANDLED);
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_channel(dbenv, eid, dbchannelp, flags)
+ DB_ENV *dbenv;
+ int eid;
+ DB_CHANNEL **dbchannelp;
+ u_int32_t flags;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(dbchannelp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_set_msg_dispatch __P((DB_ENV *,
+ * PUBLIC: void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
+ * PUBLIC: u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_set_msg_dispatch(dbenv, dispatch, flags)
+ DB_ENV *dbenv;
+ void (*dispatch) __P((DB_ENV *,
+ DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+ u_int32_t flags;
+{
+ COMPQUIET(dispatch, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+#endif /* !HAVE_REPLICATION_THREADS */
diff --git a/src/repmgr/repmgr_util.c b/src/repmgr/repmgr_util.c
new file mode 100644
index 00000000..c2439436
--- /dev/null
+++ b/src/repmgr/repmgr_util.c
@@ -0,0 +1,2086 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
+#define INITIAL_SITES_ALLOCATION 3 /* Arbitrary guess. */
+
+static int get_eid __P((ENV *, const char *, u_int, int *));
+static int __repmgr_addrcmp __P((repmgr_netaddr_t *, repmgr_netaddr_t *));
+static int read_gmdb __P((ENV *, DB_THREAD_INFO *, u_int8_t **, size_t *));
+
+/*
+ * Schedules a future attempt to re-establish a connection with the given site.
+ * Usually, we wait the configured retry_wait period. But if the "immediate"
+ * parameter is given as TRUE, we'll make the wait time 0, and put the request
+ * at the _beginning_ of the retry queue.
+ *
+ * PUBLIC: int __repmgr_schedule_connection_attempt __P((ENV *, int, int));
+ *
+ * !!!
+ * Caller should hold mutex.
+ *
+ * Unless an error occurs, we always attempt to wake the main thread;
+ * __repmgr_bust_connection relies on this behavior.
+ */
+int
+__repmgr_schedule_connection_attempt(env, eid, immediate)
+ ENV *env;
+ int eid;
+ int immediate;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_RETRY *retry, *target;
+ REPMGR_SITE *site;
+ db_timespec t;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if ((ret = __os_malloc(env, sizeof(*retry), &retry)) != 0)
+ return (ret);
+
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ __os_gettime(env, &t, 1);
+ if (immediate)
+ TAILQ_INSERT_HEAD(&db_rep->retries, retry, entries);
+ else {
+ TIMESPEC_ADD_DB_TIMEOUT(&t, rep->connection_retry_wait);
+ /*
+ * Insert the new "retry" on the (time-ordered) list in its
+ * proper position. To do so, find the list entry ("target")
+ * with a later time; insert the new entry just before that.
+ */
+ TAILQ_FOREACH(target, &db_rep->retries, entries) {
+ if (timespeccmp(&target->time, &t, >))
+ break;
+ }
+ if (target == NULL)
+ TAILQ_INSERT_TAIL(&db_rep->retries, retry, entries);
+ else
+ TAILQ_INSERT_BEFORE(target, retry, entries);
+ }
+ retry->eid = eid;
+ retry->time = t;
+
+ site->state = SITE_PAUSING;
+ site->ref.retry = retry;
+
+ return (__repmgr_wake_main_thread(env));
+}
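+
+/*
+ * Editor's note: a hypothetical timeline, not part of the original source.
+ * With connection_retry_wait == 5s, attempts scheduled at t=0 and t=1 land
+ * on the queue due at t=5 and t=6, keeping it time-ordered; an "immediate"
+ * request gets a due time of "now" and jumps straight to the head.
+ */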
+
+/*
+ * Determines whether a remote site should be considered a "server" to us as a
+ * "client" (in typical client/server terminology, not to be confused with our
+ * usual use of the term "client" as in the master/client replication role), or
+ * vice versa.
+ *
+ * PUBLIC: int __repmgr_is_server __P((ENV *, REPMGR_SITE *));
+ */
+int
+__repmgr_is_server(env, site)
+ ENV *env;
+ REPMGR_SITE *site;
+{
+ DB_REP *db_rep;
+ int cmp;
+
+ db_rep = env->rep_handle;
+ cmp = __repmgr_addrcmp(&site->net_addr,
+ &SITE_FROM_EID(db_rep->self_eid)->net_addr);
+ DB_ASSERT(env, cmp != 0);
+
+ /*
+ * The mnemonic here is that a server conventionally has a
+ * small well-known port number, while clients typically use a port
+ * number from the higher ephemeral range. So, for the remote site to
+ * be considered a server, its address should have compared as lower
+ * than ours.
+ */
+ return (cmp == -1);
+}
+
+/*
+ * Compare two network addresses (lexicographically), and return -1, 0, or 1, as
+ * the first is less than, equal to, or greater than the second.
+ */
+static int
+__repmgr_addrcmp(addr1, addr2)
+ repmgr_netaddr_t *addr1, *addr2;
+{
+ int cmp;
+
+ cmp = strcmp(addr1->host, addr2->host);
+ if (cmp != 0)
+ return (cmp < 0 ? -1 : 1);
+
+ if (addr1->port < addr2->port)
+ return (-1);
+ else if (addr1->port > addr2->port)
+ return (1);
+ return (0);
+}
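+
+/*
+ * Editor's note: a hypothetical worked example, not part of the original
+ * source.  Comparing "alpha:9001" against "beta:8000", strcmp() decides on
+ * the host names alone, so "alpha:9001" compares lower and its site would
+ * play the "server" role; the ports only break ties between equal hosts:
+ *
+ *	repmgr_netaddr_t a, b;
+ *	a.host = "alpha"; a.port = 9001;
+ *	b.host = "beta";  b.port = 8000;
+ *	cmp = __repmgr_addrcmp(&a, &b);		(yields -1)
+ */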
+
+/*
+ * Initialize the necessary control structures to begin reading a new input
+ * message.
+ *
+ * PUBLIC: void __repmgr_reset_for_reading __P((REPMGR_CONNECTION *));
+ */
+void
+__repmgr_reset_for_reading(con)
+ REPMGR_CONNECTION *con;
+{
+ con->reading_phase = SIZES_PHASE;
+ __repmgr_iovec_init(&con->iovecs);
+ __repmgr_add_buffer(&con->iovecs,
+ con->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+}
+
+/*
+ * Constructs a DB_REPMGR_CONNECTION structure.
+ *
+ * PUBLIC: int __repmgr_new_connection __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION **, socket_t, int));
+ */
+int
+__repmgr_new_connection(env, connp, s, state)
+ ENV *env;
+ REPMGR_CONNECTION **connp;
+ socket_t s;
+ int state;
+{
+ REPMGR_CONNECTION *c;
+ int ret;
+
+ if ((ret = __os_calloc(env, 1, sizeof(REPMGR_CONNECTION), &c)) != 0)
+ return (ret);
+ if ((ret = __repmgr_alloc_cond(&c->drained)) != 0) {
+ __os_free(env, c);
+ return (ret);
+ }
+ if ((ret = __repmgr_init_waiters(env, &c->response_waiters)) != 0) {
+ (void)__repmgr_free_cond(&c->drained);
+ __os_free(env, c);
+ return (ret);
+ }
+
+ c->fd = s;
+ c->state = state;
+ c->type = UNKNOWN_CONN_TYPE;
+#ifdef DB_WIN32
+ c->event_object = WSA_INVALID_EVENT;
+#endif
+
+ STAILQ_INIT(&c->outbound_queue);
+ c->out_queue_length = 0;
+
+ __repmgr_reset_for_reading(c);
+ *connp = c;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_set_keepalive __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_set_keepalive(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ int ret, sockopt;
+
+ ret = 0;
+#ifdef SO_KEEPALIVE
+ sockopt = 1;
+ if (setsockopt(conn->fd, SOL_SOCKET,
+ SO_KEEPALIVE, (sockopt_t)&sockopt, sizeof(sockopt)) != 0) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3626",
+ "can't set KEEPALIVE socket option"));
+ (void)__repmgr_destroy_conn(env, conn);
+ }
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_new_site __P((ENV *, REPMGR_SITE**,
+ * PUBLIC: const char *, u_int));
+ *
+ * Manipulates the process-local copy of the sites list. So, callers should
+ * hold the db_rep->mutex (except for single-threaded, pre-open configuration).
+ */
+int
+__repmgr_new_site(env, sitep, host, port)
+ ENV *env;
+ REPMGR_SITE **sitep;
+ const char *host;
+ u_int port;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site, *sites;
+ char *p;
+ u_int i, new_site_max;
+ int ret;
+
+ db_rep = env->rep_handle;
+ if (db_rep->site_cnt >= db_rep->site_max) {
+ new_site_max = db_rep->site_max == 0 ?
+ INITIAL_SITES_ALLOCATION : db_rep->site_max * 2;
+ if ((ret = __os_malloc(env,
+ sizeof(REPMGR_SITE) * new_site_max, &sites)) != 0)
+ return (ret);
+ if (db_rep->site_max > 0) {
+ /*
+ * For each site in the array, copy the old struct to
+ * the space allocated for the new struct. But the
+ * sub_conns list header (and one of the conn structs on
+ * the list, if any) contain pointers to the address of
+ * the old list header; so we have to move them
+ * explicitly. If not for that, we could use a simple
+ * __os_realloc() call.
+ */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ sites[i] = db_rep->sites[i];
+ TAILQ_INIT(&sites[i].sub_conns);
+ while (!TAILQ_EMPTY(
+ &db_rep->sites[i].sub_conns)) {
+ conn = TAILQ_FIRST(
+ &db_rep->sites[i].sub_conns);
+ TAILQ_REMOVE(
+ &db_rep->sites[i].sub_conns,
+ conn, entries);
+ TAILQ_INSERT_TAIL(&sites[i].sub_conns,
+ conn, entries);
+ }
+ }
+ __os_free(env, db_rep->sites);
+ }
+ db_rep->sites = sites;
+ db_rep->site_max = new_site_max;
+ }
+ if ((ret = __os_strdup(env, host, &p)) != 0) {
+ /* No harm in leaving the increased site_max intact. */
+ return (ret);
+ }
+ site = &db_rep->sites[db_rep->site_cnt++];
+
+ site->net_addr.host = p;
+ site->net_addr.port = (u_int16_t)port;
+
+ ZERO_LSN(site->max_ack);
+ site->ack_policy = 0;
+ site->alignment = 0;
+ site->flags = 0;
+ timespecclear(&site->last_rcvd_timestamp);
+ TAILQ_INIT(&site->sub_conns);
+ site->connector = NULL;
+ site->ref.conn.in = site->ref.conn.out = NULL;
+ site->state = SITE_IDLE;
+
+ site->membership = 0;
+ site->config = 0;
+
+ *sitep = site;
+ return (0);
+}
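+
+/*
+ * Editor's note, not part of the original source: the reason a plain
+ * __os_realloc() is unsafe above is that an empty TAILQ head's tqh_last
+ * field points at its own tqh_first field, and a non-empty list's first
+ * element points back into the head the same way; after the array is
+ * copied, those pointers would still reference the old, freed storage.
+ * TAILQ_INIT() plus re-inserting each connection rebuilds the pointers
+ * at the struct's new address.
+ */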
+
+/*
+ * PUBLIC: int __repmgr_create_mutex __P((ENV *, mgr_mutex_t **));
+ */
+int
+__repmgr_create_mutex(env, mtxp)
+ ENV *env;
+ mgr_mutex_t **mtxp;
+{
+ mgr_mutex_t *mtx;
+ int ret;
+
+ if ((ret = __os_malloc(env, sizeof(mgr_mutex_t), &mtx)) == 0 &&
+ (ret = __repmgr_create_mutex_pf(mtx)) != 0) {
+ __os_free(env, mtx);
+ }
+ if (ret == 0)
+ *mtxp = mtx;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_destroy_mutex __P((ENV *, mgr_mutex_t *));
+ */
+int
+__repmgr_destroy_mutex(env, mtx)
+ ENV *env;
+ mgr_mutex_t *mtx;
+{
+ int ret;
+
+ ret = __repmgr_destroy_mutex_pf(mtx);
+ __os_free(env, mtx);
+ return (ret);
+}
+
+/*
+ * Kind of like a destructor for a repmgr_netaddr_t: cleans up any subordinate
+ * allocated memory pointed to by the addr, though it does not free the struct
+ * itself.
+ *
+ * PUBLIC: void __repmgr_cleanup_netaddr __P((ENV *, repmgr_netaddr_t *));
+ */
+void
+__repmgr_cleanup_netaddr(env, addr)
+ ENV *env;
+ repmgr_netaddr_t *addr;
+{
+ if (addr->host != NULL) {
+ __os_free(env, addr->host);
+ addr->host = NULL;
+ }
+}
+
+/*
+ * PUBLIC: void __repmgr_iovec_init __P((REPMGR_IOVECS *));
+ */
+void
+__repmgr_iovec_init(v)
+ REPMGR_IOVECS *v;
+{
+ v->offset = v->count = 0;
+ v->total_bytes = 0;
+}
+
+/*
+ * PUBLIC: void __repmgr_add_buffer __P((REPMGR_IOVECS *, void *, size_t));
+ *
+ * !!!
+ * There is no checking for overflow of the vectors[5] array.
+ */
+void
+__repmgr_add_buffer(v, address, length)
+ REPMGR_IOVECS *v;
+ void *address;
+ size_t length;
+{
+ if (length > 0) {
+ v->vectors[v->count].iov_base = address;
+ v->vectors[v->count++].iov_len = (u_long)length;
+ v->total_bytes += length;
+ }
+}
+
+/*
+ * PUBLIC: void __repmgr_add_dbt __P((REPMGR_IOVECS *, const DBT *));
+ */
+void
+__repmgr_add_dbt(v, dbt)
+ REPMGR_IOVECS *v;
+ const DBT *dbt;
+{
+ if (dbt->size > 0) {
+ v->vectors[v->count].iov_base = dbt->data;
+ v->vectors[v->count++].iov_len = dbt->size;
+ v->total_bytes += dbt->size;
+ }
+}
+
+/*
+ * Update a set of iovecs to reflect the number of bytes transferred in an I/O
+ * operation, so that the iovecs can be used to continue transferring where we
+ * left off.
+ * Returns TRUE if the set of buffers is now fully consumed, FALSE if more
+ * remains.
+ *
+ * PUBLIC: int __repmgr_update_consumed __P((REPMGR_IOVECS *, size_t));
+ */
+int
+__repmgr_update_consumed(v, byte_count)
+ REPMGR_IOVECS *v;
+ size_t byte_count;
+{
+ db_iovec_t *iov;
+ int i;
+
+ for (i = v->offset; ; i++) {
+ DB_ASSERT(NULL, i < v->count && byte_count > 0);
+ iov = &v->vectors[i];
+ if (byte_count > iov->iov_len) {
+ /*
+ * We've consumed (more than) this vector's worth.
+ * Adjust count and continue.
+ */
+ byte_count -= iov->iov_len;
+ } else {
+ /*
+ * Adjust length of remaining portion of vector.
+ * byte_count can never be greater than iov_len, or we
+ * would not be in this section of the if clause.
+ */
+ iov->iov_len -= (u_int32_t)byte_count;
+ if (iov->iov_len > 0) {
+ /*
+ * Still some left in this vector. Adjust base
+ * address too, and leave offset pointing here.
+ */
+ iov->iov_base = (void *)
+ ((u_int8_t *)iov->iov_base + byte_count);
+ v->offset = i;
+ } else {
+ /*
+ * Consumed exactly to a vector boundary.
+ * Advance to next vector for next time.
+ */
+ v->offset = i+1;
+ }
+ /*
+ * If offset has reached count, the entire thing is
+ * consumed.
+ */
+ return (v->offset >= v->count);
+ }
+ }
+}
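+
+/*
+ * Editor's note: a minimal usage sketch, not part of the original source,
+ * assuming a writev()-style call that can stop at any byte boundary, with
+ * declarations and error handling elided:
+ *
+ *	do
+ *		nw = writev(fd, &v->vectors[v->offset],
+ *		    (int)(v->count - v->offset));
+ *	while (nw > 0 && !__repmgr_update_consumed(v, (size_t)nw));
+ *
+ * Each call only advances v->offset and trims the partially consumed
+ * vector, so the next transfer resumes exactly where the last one stopped.
+ */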
+
+/*
+ * Builds a buffer containing our network address information, suitable for
+ * publishing as cdata via a call to rep_start, and sets up the given DBT to
+ * point to it. The buffer is dynamically allocated memory, and the caller must
+ * assume responsibility for it.
+ *
+ * PUBLIC: int __repmgr_prepare_my_addr __P((ENV *, DBT *));
+ */
+int
+__repmgr_prepare_my_addr(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ DB_REP *db_rep;
+ repmgr_netaddr_t addr;
+ size_t size, hlen;
+ u_int16_t port_buffer;
+ u_int8_t *ptr;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ addr = SITE_FROM_EID(db_rep->self_eid)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ /*
+ * The cdata message consists of the 2-byte port number, in network byte
+ * order, followed by the null-terminated host name string.
+ */
+ port_buffer = htons(addr.port);
+ size = sizeof(port_buffer) + (hlen = strlen(addr.host) + 1);
+ if ((ret = __os_malloc(env, size, &ptr)) != 0)
+ return (ret);
+
+ DB_INIT_DBT(*dbt, ptr, size);
+
+ memcpy(ptr, &port_buffer, sizeof(port_buffer));
+ ptr = &ptr[sizeof(port_buffer)];
+ memcpy(ptr, addr.host, hlen);
+
+ return (0);
+}
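+
+/*
+ * Editor's note: a sketch of how a receiver would take the buffer built
+ * above apart again; editorial, not part of the original source:
+ *
+ *	u_int16_t port;
+ *	char *host;
+ *
+ *	memcpy(&port, dbt->data, sizeof(port));
+ *	port = ntohs(port);			(undo the htons() above)
+ *	host = (char *)dbt->data + sizeof(port);	(NUL-terminated)
+ */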
+
+/*
+ * !!!
+ * This may only be called after threads have been started, because we don't
+ * know the answer until we have established group membership (e.g., reading the
+ * membership database). That should be OK, because we only need this
+ * for starting an election, or counting acks after sending a PERM message.
+ *
+ * PUBLIC: int __repmgr_get_nsites __P((ENV *, u_int32_t *));
+ */
+int
+__repmgr_get_nsites(env, nsitesp)
+ ENV *env;
+ u_int32_t *nsitesp;
+{
+ DB_REP *db_rep;
+ u_int32_t nsites;
+
+ db_rep = env->rep_handle;
+
+ if ((nsites = db_rep->region->config_nsites) == 0) {
+ __db_errx(env, DB_STR("3672",
+ "Nsites unknown before repmgr_start()"));
+ return (EINVAL);
+ }
+ *nsitesp = nsites;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_thread_failure __P((ENV *, int));
+ */
+int
+__repmgr_thread_failure(env, why)
+ ENV *env;
+ int why;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (__env_panic(env, why));
+}
+
+/*
+ * Format a printable representation of a site location, suitable for inclusion
+ * in an error message. The buffer must be at least as big as
+ * MAX_SITE_LOC_STRING.
+ *
+ * PUBLIC: char *__repmgr_format_eid_loc __P((DB_REP *,
+ * PUBLIC: REPMGR_CONNECTION *, char *));
+ *
+ * Caller must hold mutex.
+ */
+char *
+__repmgr_format_eid_loc(db_rep, conn, buffer)
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ char *buffer;
+{
+ int eid;
+
+ if (conn->type == APP_CONNECTION)
+ snprintf(buffer,
+ MAX_SITE_LOC_STRING, "(application channel)");
+ else if (conn->type == REP_CONNECTION &&
+ IS_VALID_EID(eid = conn->eid))
+ (void)__repmgr_format_site_loc(SITE_FROM_EID(eid), buffer);
+ else
+ snprintf(buffer, MAX_SITE_LOC_STRING, "(unidentified site)");
+ return (buffer);
+}
+
+/*
+ * PUBLIC: char *__repmgr_format_site_loc __P((REPMGR_SITE *, char *));
+ */
+char *
+__repmgr_format_site_loc(site, buffer)
+ REPMGR_SITE *site;
+ char *buffer;
+{
+ return (__repmgr_format_addr_loc(&site->net_addr, buffer));
+}
+
+/*
+ * PUBLIC: char *__repmgr_format_addr_loc __P((repmgr_netaddr_t *, char *));
+ */
+char *
+__repmgr_format_addr_loc(addr, buffer)
+ repmgr_netaddr_t *addr;
+ char *buffer;
+{
+ snprintf(buffer, MAX_SITE_LOC_STRING, "site %s:%lu",
+ addr->host, (u_long)addr->port);
+ return (buffer);
+}
+
+/*
+ * PUBLIC: int __repmgr_repstart __P((ENV *, u_int32_t));
+ */
+int
+__repmgr_repstart(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DBT my_addr;
+ int ret;
+
+ /* Include "cdata" in case sending to old-version site. */
+ if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
+ return (ret);
+ ret = __rep_start_int(env, &my_addr, flags);
+ __os_free(env, my_addr.data);
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("3673", "rep_start"));
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_become_master __P((ENV *));
+ */
+int
+__repmgr_become_master(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB *dbp;
+ DB_TXN *txn;
+ REPMGR_SITE *site;
+ DBT key_dbt, data_dbt;
+ __repmgr_membership_key_args key;
+ __repmgr_membership_data_args member_status;
+ repmgr_netaddr_t addr;
+ u_int32_t status;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ u_int8_t key_buf[MAX_MSG_BUF];
+ size_t len;
+ u_int i;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ dbp = NULL;
+ txn = NULL;
+
+ /* Examine membership list to see if we have a victim in limbo. */
+ LOCK_MUTEX(db_rep->mutex);
+ ZERO_LSN(db_rep->limbo_failure);
+ ZERO_LSN(db_rep->durable_lsn);
+ db_rep->limbo_victim = DB_EID_INVALID;
+ db_rep->limbo_resolution_needed = FALSE;
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = SITE_FROM_EID(i);
+ if (site->membership == SITE_ADDING ||
+ site->membership == SITE_DELETING) {
+ db_rep->limbo_victim = (int)i;
+ db_rep->limbo_resolution_needed = TRUE;
+
+ /*
+ * Since there can never be more than one limbo victim,
+ * when we find one we don't have to continue looking
+ * for others.
+ */
+ break;
+ }
+ }
+ db_rep->client_intent = FALSE;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if ((ret = __repmgr_repstart(env, DB_REP_MASTER)) != 0)
+ return (ret);
+
+ if (db_rep->have_gmdb)
+ return (0);
+
+ db_rep->member_version_gen = db_rep->region->gen;
+ ENV_ENTER(env, ip);
+ if ((ret = __repmgr_hold_master_role(env, NULL)) != 0)
+ goto leave;
+retry:
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, &txn, DB_CREATE)) != 0)
+ goto err;
+
+ DB_ASSERT(env, txn != NULL);
+ dbp = db_rep->gmdb;
+ DB_ASSERT(env, dbp != NULL);
+
+ /* Write the meta-data record. */
+ if ((ret = __repmgr_set_gm_version(env, ip, txn, 1)) != 0)
+ goto err;
+
+ /* Write a record representing each site in the group. */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ LOCK_MUTEX(db_rep->mutex);
+ site = SITE_FROM_EID(i);
+ addr = site->net_addr;
+ status = site->membership;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (status == 0)
+ continue;
+ DB_INIT_DBT(key.host, addr.host, strlen(addr.host) + 1);
+ key.port = addr.port;
+ ret = __repmgr_membership_key_marshal(env,
+ &key, key_buf, sizeof(key_buf), &len);
+ DB_ASSERT(env, ret == 0);
+ DB_INIT_DBT(key_dbt, key_buf, len);
+ member_status.flags = status;
+ __repmgr_membership_data_marshal(env, &member_status, data_buf);
+ DB_INIT_DBT(data_dbt, data_buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
+ if ((ret = __db_put(dbp, ip, txn, &key_dbt, &data_dbt, 0)) != 0)
+ goto err;
+ }
+
+err:
+ if (txn != NULL) {
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, TRUE)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+leave:
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Visits all the connections we know about, performing the desired action.
+ * "err_quit" determines whether we give up, or soldier on, in case of an
+ * error.
+ *
+ * PUBLIC: int __repmgr_each_connection __P((ENV *,
+ * PUBLIC: CONNECTION_ACTION, void *, int));
+ *
+ * !!!
+ * Caller must hold mutex.
+ */
+int
+__repmgr_each_connection(env, callback, info, err_quit)
+ ENV *env;
+ CONNECTION_ACTION callback;
+ void *info;
+ int err_quit;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn, *next;
+ REPMGR_SITE *site;
+ int eid, ret, t_ret;
+
+#define HANDLE_ERROR \
+ do { \
+ if (err_quit) \
+ return (t_ret); \
+ if (ret == 0) \
+ ret = t_ret; \
+ } while (0)
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ /*
+ * We might have used TAILQ_FOREACH here, except that in some cases we
+ * need to unlink an element along the way.
+ */
+ for (conn = TAILQ_FIRST(&db_rep->connections);
+ conn != NULL;
+ conn = next) {
+ next = TAILQ_NEXT(conn, entries);
+
+ if ((t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ }
+
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+
+ if (site->state == SITE_CONNECTED) {
+ if ((conn = site->ref.conn.in) != NULL &&
+ (t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ if ((conn = site->ref.conn.out) != NULL &&
+ (t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ }
+
+ for (conn = TAILQ_FIRST(&site->sub_conns);
+ conn != NULL;
+ conn = next) {
+ next = TAILQ_NEXT(conn, entries);
+ if ((t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Initialize repmgr's portion of the shared region area. Note that we can't
+ * simply get the REP* address from the env as we usually do, because at the
+ * time of this call it hasn't yet been linked into the env handle.
+ *
+ * This function is only called during creation of the region. If anything
+ * fails, our caller will panic and remove the region. So, if we have any
+ * failure, we don't have to clean up any partial allocation.
+ *
+ * PUBLIC: int __repmgr_open __P((ENV *, void *));
+ */
+int
+__repmgr_open(env, rep_)
+ ENV *env;
+ void *rep_;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = rep_;
+
+ if ((ret = __mutex_alloc(env, MTX_REPMGR, 0, &rep->mtx_repmgr)) != 0)
+ return (ret);
+
+ DB_ASSERT(env, rep->siteinfo_seq == 0 && db_rep->siteinfo_seq == 0);
+ rep->siteinfo_off = INVALID_ROFF;
+ rep->siteinfo_seq = 0;
+ if ((ret = __repmgr_share_netaddrs(env, rep, 0, db_rep->site_cnt)) != 0)
+ return (ret);
+
+ rep->self_eid = db_rep->self_eid;
+ rep->perm_policy = db_rep->perm_policy;
+ rep->ack_timeout = db_rep->ack_timeout;
+ rep->connection_retry_wait = db_rep->connection_retry_wait;
+ rep->election_retry_wait = db_rep->election_retry_wait;
+ rep->heartbeat_monitor_timeout = db_rep->heartbeat_monitor_timeout;
+ rep->heartbeat_frequency = db_rep->heartbeat_frequency;
+ return (ret);
+}
+
+/*
+ * Join an existing environment, by setting up our local site info structures
+ * from shared network address configuration in the region.
+ *
+ * As with __repmgr_open(), note that we can't simply get the REP* address from
+ * the env as we usually do, because at the time of this call it hasn't yet
+ * been linked into the env handle.
+ *
+ * PUBLIC: int __repmgr_join __P((ENV *, void *));
+ */
+int
+__repmgr_join(env, rep_)
+ ENV *env;
+ void *rep_;
+{
+ DB_REP *db_rep;
+ REGINFO *infop;
+ REP *rep;
+ SITEINFO *p;
+ REPMGR_SITE *site, temp;
+ repmgr_netaddr_t *addrp;
+ char *host;
+ u_int i, j;
+ int ret;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ rep = rep_;
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ /*
+ * Merge local and shared lists of remote sites. Note that the
+ * placement of entries in the shared array must not change. To
+ * accomplish the merge, pull in entries from the shared list, into the
+ * proper position, shuffling not-yet-resolved local entries if
+ * necessary. Then add any remaining locally known entries to the
+ * shared list.
+ */
+ i = 0;
+ if (rep->siteinfo_off != INVALID_ROFF) {
+ p = R_ADDR(infop, rep->siteinfo_off);
+
+ /* For each address in the shared list ... */
+ for (; i < rep->site_cnt; i++) {
+ host = R_ADDR(infop, p[i].addr.host);
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Site %s:%lu found at EID %u",
+ host, (u_long)p[i].addr.port, i));
+ /*
+ * Find it in the local list. Everything before 'i'
+ * already matches the shared list, and is therefore in
+ * the right place. So we only need to search starting
+ * from 'i'. When found, local config values will be
+ * used because they are assumed to be "fresher". But
+ * membership status is not, since this process hasn't
+ * been active (running) yet.
+ */
+ for (j = i; j < db_rep->site_cnt; j++) {
+ site = &db_rep->sites[j];
+ addrp = &site->net_addr;
+ if (strcmp(host, addrp->host) == 0 &&
+ p[i].addr.port == addrp->port) {
+ p[i].config = site->config;
+ site->membership = p[i].status;
+ break;
+ }
+ }
+
+ /*
+ * When not found in local list, copy peer values
+ * from shared list.
+ */
+ if (j == db_rep->site_cnt) {
+ if ((ret = __repmgr_new_site(env,
+ &site, host, p[i].addr.port)) != 0)
+ goto unlock;
+ site->config = p[i].config;
+ site->membership = p[i].status;
+ }
+ DB_ASSERT(env, j < db_rep->site_cnt);
+
+ /* Found or added at 'j', but belongs at 'i': swap. */
+ if (i != j) {
+ temp = db_rep->sites[j];
+ db_rep->sites[j] = db_rep->sites[i];
+ db_rep->sites[i] = temp;
+ /*
+ * If we're moving the entry that self_eid
+ * points to, then adjust self_eid to match.
+ * For now this is still merely our original,
+ * in-process pointer; we have yet to make sure
+ * it matches the one from shared memory.
+ */
+ if (db_rep->self_eid == (int)j)
+ db_rep->self_eid = (int)i;
+ }
+ }
+ }
+ if ((ret = __repmgr_share_netaddrs(env, rep, i, db_rep->site_cnt)) != 0)
+ goto unlock;
+ if (db_rep->self_eid == DB_EID_INVALID)
+ db_rep->self_eid = rep->self_eid;
+ else if (rep->self_eid == DB_EID_INVALID)
+ rep->self_eid = db_rep->self_eid;
+ else if (db_rep->self_eid != rep->self_eid) {
+ __db_errx(env, DB_STR("3674",
+ "A mismatching local site address has been set in the environment"));
+ ret = EINVAL;
+ goto unlock;
+ }
+
+ db_rep->siteinfo_seq = rep->siteinfo_seq;
+unlock:
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ return (ret);
+}
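+
+/*
+ * Editor's note: a hypothetical worked example of the merge above, not
+ * part of the original source.  Shared list [A, B], local list [B, C]:
+ * i=0 finds no A locally, appends it (local [B, C, A]) and swaps it into
+ * place (local [A, C, B]); i=1 finds B at j=2 and swaps (local [A, B, C]);
+ * finally __repmgr_share_netaddrs() publishes C into the shared list.
+ */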
+
+/*
+ * PUBLIC: int __repmgr_env_refresh __P((ENV *env));
+ */
+int
+__repmgr_env_refresh(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ SITEINFO *shared_array;
+ u_int i;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ ret = 0;
+ COMPQUIET(i, 0);
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ ret = __mutex_free(env, &rep->mtx_repmgr);
+ if (rep->siteinfo_off != INVALID_ROFF) {
+ shared_array = R_ADDR(infop, rep->siteinfo_off);
+ for (i = 0; i < db_rep->site_cnt; i++)
+ __env_alloc_free(infop, R_ADDR(infop,
+ shared_array[i].addr.host));
+ __env_alloc_free(infop, shared_array);
+ rep->siteinfo_off = INVALID_ROFF;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Copies new remote site information from the indicated private array slots
+ * into the shared region. The corresponding shared array slots do not exist
+ * yet; they must be allocated.
+ *
+ * PUBLIC: int __repmgr_share_netaddrs __P((ENV *, void *, u_int, u_int));
+ *
+ * !!! The rep pointer is passed, because it may not yet have been installed
+ * into the env handle.
+ *
+ * !!! Assumes caller holds mtx_repmgr lock.
+ */
+int
+__repmgr_share_netaddrs(env, rep_, start, limit)
+ ENV *env;
+ void *rep_;
+ u_int start, limit;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ REGENV *renv;
+ SITEINFO *orig, *shared_array;
+ char *host, *hostbuf;
+ size_t sz;
+ u_int i, n;
+ int eid, ret, touched;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ rep = rep_;
+ ret = 0;
+ touched = FALSE;
+
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ for (i = start; i < limit; i++) {
+ if (rep->site_cnt >= rep->site_max) {
+ /* Table is full, we need more space. */
+ if (rep->siteinfo_off == INVALID_ROFF) {
+ n = INITIAL_SITES_ALLOCATION;
+ sz = n * sizeof(SITEINFO);
+ if ((ret = __env_alloc(infop,
+ sz, &shared_array)) != 0)
+ goto out;
+ } else {
+ n = 2 * rep->site_max;
+ sz = n * sizeof(SITEINFO);
+ if ((ret = __env_alloc(infop,
+ sz, &shared_array)) != 0)
+ goto out;
+ orig = R_ADDR(infop, rep->siteinfo_off);
+ memcpy(shared_array, orig,
+ sizeof(SITEINFO) * rep->site_cnt);
+ __env_alloc_free(infop, orig);
+ }
+ rep->siteinfo_off = R_OFFSET(infop, shared_array);
+ rep->site_max = n;
+ } else
+ shared_array = R_ADDR(infop, rep->siteinfo_off);
+
+ DB_ASSERT(env, rep->site_cnt < rep->site_max &&
+ rep->siteinfo_off != INVALID_ROFF);
+
+ host = db_rep->sites[i].net_addr.host;
+ sz = strlen(host) + 1;
+ if ((ret = __env_alloc(infop, sz, &hostbuf)) != 0)
+ goto out;
+ eid = (int)rep->site_cnt++;
+ (void)strcpy(hostbuf, host);
+ shared_array[eid].addr.host = R_OFFSET(infop, hostbuf);
+ shared_array[eid].addr.port = db_rep->sites[i].net_addr.port;
+ shared_array[eid].config = db_rep->sites[i].config;
+ shared_array[eid].status = db_rep->sites[i].membership;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "EID %d is assigned for site %s:%lu",
+ eid, host, (u_long)shared_array[eid].addr.port));
+ touched = TRUE;
+ }
+
+out:
+ if (touched)
+ db_rep->siteinfo_seq = ++rep->siteinfo_seq;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+}
+
+/*
+ * Copy into our local list any newly added/changed remote site
+ * configuration information.
+ *
+ * !!! Caller must hold db_rep->mutex and mtx_repmgr locks.
+ *
+ * PUBLIC: int __repmgr_copy_in_added_sites __P((ENV *));
+ */
+int
+__repmgr_copy_in_added_sites(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ SITEINFO *base, *p;
+ REPMGR_SITE *site;
+ char *host;
+ int ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->siteinfo_off == INVALID_ROFF)
+ goto out;
+
+ infop = env->reginfo;
+ base = R_ADDR(infop, rep->siteinfo_off);
+
+ /* Create private array slots for new sites. */
+ for (i = db_rep->site_cnt; i < rep->site_cnt; i++) {
+ p = &base[i];
+ host = R_ADDR(infop, p->addr.host);
+ if ((ret = __repmgr_new_site(env,
+ &site, host, p->addr.port)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Site %s:%lu found at EID %u",
+ host, (u_long)p->addr.port, i));
+ }
+
+ /* Make sure info is up to date for all sites, old and new. */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ p = &base[i];
+ site = SITE_FROM_EID(i);
+ site->config = p->config;
+ site->membership = p->status;
+ }
+
+out:
+ /*
+ * We always make sure our local list has been brought up to date with
+ * the shared list before adding to the local list (except before env
+ * open of course). So here there should be nothing on our local list
+ * not yet in shared memory.
+ */
+ DB_ASSERT(env, db_rep->site_cnt == rep->site_cnt);
+ db_rep->siteinfo_seq = rep->siteinfo_seq;
+ return (0);
+}
+
+/*
+ * Initialize a range of sites newly added to our site list array. Process each
+ * array entry x in the range from <= x < limit. Passing from >= limit is
+ * allowed, and is effectively a no-op.
+ *
+ * PUBLIC: int __repmgr_init_new_sites __P((ENV *, int, int));
+ *
+ * !!! Assumes caller holds db_rep->mutex.
+ */
+int
+__repmgr_init_new_sites(env, from, limit)
+ ENV *env;
+ int from, limit;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int i, ret;
+
+ db_rep = env->rep_handle;
+
+ if (db_rep->selector == NULL)
+ return (0);
+
+ DB_ASSERT(env, IS_VALID_EID(from) && IS_VALID_EID(limit) &&
+ from <= limit);
+ for (i = from; i < limit; i++) {
+ site = SITE_FROM_EID(i);
+ if (site->membership == SITE_PRESENT &&
+ (ret = __repmgr_schedule_connection_attempt(env,
+ i, TRUE)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_failchk __P((ENV *));
+ */
+int
+__repmgr_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_REP *db_rep;
+ REP *rep;
+ db_threadid_t unused;
+
+ dbenv = env->dbenv;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ DB_THREADID_INIT(unused);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ /*
+ * Check to see if the main (listener) replication process may have died
+ * without cleaning up the flag. If so, we only have to clear it, and
+ * another process should then be able to come along and become the
+ * listener. So in either case we can return success.
+ */
+ if (rep->listener != 0 && !dbenv->is_alive(dbenv,
+ rep->listener, unused, DB_MUTEX_PROCESS_ONLY))
+ rep->listener = 0;
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_master_is_known __P((ENV *));
+ */
+int
+__repmgr_master_is_known(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *master;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * We are the master, or we know of a master and have a healthy
+ * connection to it.
+ */
+ if (db_rep->region->master_id == db_rep->self_eid)
+ return (TRUE);
+ if ((master = __repmgr_connected_master(env)) == NULL)
+ return (FALSE);
+ if ((conn = master->ref.conn.in) != NULL &&
+ IS_READY_STATE(conn->state))
+ return (TRUE);
+ if ((conn = master->ref.conn.out) != NULL &&
+ IS_READY_STATE(conn->state))
+ return (TRUE);
+ return (FALSE);
+}
+
+/*
+ * PUBLIC: int __repmgr_stable_lsn __P((ENV *, DB_LSN *));
+ *
+ * This function may be called before any of repmgr's threads have
+ * been started, but it must not be called before the environment is open.
+ * Currently the latter is impossible, since its only caller is log_archive,
+ * which itself cannot be called before the environment is open.
+ */
+int
+__repmgr_stable_lsn(env, stable_lsn)
+ ENV *env;
+ DB_LSN *stable_lsn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->min_log_file != 0 && rep->min_log_file < stable_lsn->file) {
+ /*
+ * Returning an LSN to be consistent with the rest of the
+ * log archiving processing. Construct LSN of format
+ * [filenum][0].
+ */
+ stable_lsn->file = rep->min_log_file;
+ stable_lsn->offset = 0;
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Repmgr_stable_lsn: Returning stable_lsn[%lu][%lu]",
+ (u_long)stable_lsn->file, (u_long)stable_lsn->offset));
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: u_int32_t, u_int8_t *, u_int32_t));
+ */
+int
+__repmgr_send_sync_msg(env, conn, type, buf, len)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int8_t *buf;
+ u_int32_t len, type;
+{
+ REPMGR_IOVECS iovecs;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ size_t unused;
+
+ msg_hdr.type = REPMGR_OWN_MSG;
+ REPMGR_OWN_BUF_SIZE(msg_hdr) = len;
+ REPMGR_OWN_MSG_TYPE(msg_hdr) = type;
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, hdr_buf);
+
+ __repmgr_iovec_init(&iovecs);
+ __repmgr_add_buffer(&iovecs, hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ if (len > 0)
+ __repmgr_add_buffer(&iovecs, buf, len);
+
+ return (__repmgr_write_iovecs(env, conn, &iovecs, &unused));
+}
+
+/*
+ * Produce a membership list from the known info currently in memory.
+ *
+ * PUBLIC: int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *));
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_marshal_member_list(env, bufp, lenp)
+ ENV *env;
+ u_int8_t **bufp;
+ size_t *lenp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ __repmgr_membr_vers_args membr_vers;
+ __repmgr_site_info_args site_info;
+ u_int8_t *buf, *p;
+ size_t bufsize, len;
+ u_int i;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /* Compute a (generous) upper bound on needed buffer size. */
+ bufsize = __REPMGR_MEMBR_VERS_SIZE +
+ db_rep->site_cnt * (__REPMGR_SITE_INFO_SIZE + MAXHOSTNAMELEN + 1);
+ if ((ret = __os_malloc(env, bufsize, &buf)) != 0)
+ return (ret);
+ p = buf;
+
+ membr_vers.version = db_rep->membership_version;
+ membr_vers.gen = rep->gen;
+ __repmgr_membr_vers_marshal(env, &membr_vers, p);
+ p += __REPMGR_MEMBR_VERS_SIZE;
+
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (site->membership == 0)
+ continue;
+
+ site_info.host.data = site->net_addr.host;
+ site_info.host.size =
+ (u_int32_t)strlen(site->net_addr.host) + 1;
+ site_info.port = site->net_addr.port;
+ site_info.flags = site->membership;
+
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ DB_ASSERT(env, ret == 0);
+ p += len;
+ }
+ len = (size_t)(p - buf);
+
+ *bufp = buf;
+ *lenp = len;
+ DB_ASSERT(env, ret == 0);
+ return (0);
+}
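+
+/*
+ * Editor's note, not part of the original source: the marshaled buffer
+ * produced above (and consumed again by __repmgr_refresh_membership()) is
+ * laid out as
+ *
+ *	[ membr_vers: version, gen ][ site_info 0 ][ site_info 1 ] ...
+ *
+ * Removed sites (membership == 0) are skipped when marshaling, so the
+ * receiver recovers the record count simply by unmarshaling site_info
+ * records until the buffer is exhausted.
+ */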
+
+/*
+ * Produce a membership list by reading the database.
+ */
+static int
+read_gmdb(env, ip, bufp, lenp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int8_t **bufp;
+ size_t *lenp;
+{
+ DB_TXN *txn;
+ DB *dbp;
+ DBC *dbc;
+ DBT key_dbt, data_dbt;
+ __repmgr_membership_key_args key;
+ __repmgr_membership_data_args member_status;
+ __repmgr_member_metadata_args metadata;
+ __repmgr_membr_vers_args membr_vers;
+ __repmgr_site_info_args site_info;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ u_int8_t key_buf[MAX_MSG_BUF];
+ u_int8_t metadata_buf[__REPMGR_MEMBER_METADATA_SIZE];
+ char *host;
+ size_t bufsize, len;
+ u_int8_t *buf, *p;
+ u_int32_t gen;
+ int ret, t_ret;
+
+ txn = NULL;
+ dbp = NULL;
+ dbc = NULL;
+ buf = NULL;
+ COMPQUIET(len, 0);
+
+ if ((ret = __rep_get_datagen(env, &gen)) != 0)
+ return (ret);
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+ if ((ret = __rep_open_sysdb(env, ip, txn, REPMEMBERSHIP, 0, &dbp)) != 0)
+ goto err;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = key_buf;
+ key_dbt.ulen = sizeof(key_buf);
+ F_SET(&key_dbt, DB_DBT_USERMEM);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ data_dbt.data = metadata_buf;
+ data_dbt.ulen = sizeof(metadata_buf);
+ F_SET(&data_dbt, DB_DBT_USERMEM);
+
+ /* Get metadata record, make sure key looks right. */
+ if ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, DB_NEXT)) != 0)
+ goto err;
+ ret = __repmgr_membership_key_unmarshal(env,
+ &key, key_buf, key_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, key.host.size == 0);
+ DB_ASSERT(env, key.port == 0);
+ ret = __repmgr_member_metadata_unmarshal(env,
+ &metadata, metadata_buf, data_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, metadata.format == REPMGR_GMDB_FMT_VERSION);
+ DB_ASSERT(env, metadata.version > 0);
+
+ bufsize = 1000; /* Initial guess. */
+ if ((ret = __os_malloc(env, bufsize, &buf)) != 0)
+ goto err;
+ membr_vers.version = metadata.version;
+ membr_vers.gen = gen;
+ __repmgr_membr_vers_marshal(env, &membr_vers, buf);
+ p = &buf[__REPMGR_MEMBR_VERS_SIZE];
+
+ data_dbt.data = data_buf;
+ data_dbt.ulen = sizeof(data_buf);
+ while ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, DB_NEXT)) == 0) {
+ ret = __repmgr_membership_key_unmarshal(env,
+ &key, key_buf, key_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, key.host.size <= MAXHOSTNAMELEN + 1 &&
+ key.host.size > 1);
+ host = (char*)key.host.data;
+ DB_ASSERT(env, host[key.host.size-1] == '\0');
+ DB_ASSERT(env, key.port > 0);
+
+ ret = __repmgr_membership_data_unmarshal(env,
+ &member_status, data_buf, data_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, member_status.flags != 0);
+
+ site_info.host = key.host;
+ site_info.port = key.port;
+ site_info.flags = member_status.flags;
+ if ((ret = __repmgr_site_info_marshal(env, &site_info,
+ p, (size_t)(&buf[bufsize]-p), &len)) == ENOMEM) {
+ bufsize *= 2;
+ len = (size_t)(p - buf);
+ if ((ret = __os_realloc(env, bufsize, &buf)) != 0)
+ goto err;
+ p = &buf[len];
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ DB_ASSERT(env, ret == 0);
+ }
+ p += len;
+ }
+ len = (size_t)(p - buf);
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err:
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ if (txn != NULL &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0) {
+ *bufp = buf;
+ *lenp = len;
+ } else if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
+
+/*
+ * Refresh our sites array from the given membership list.
+ *
+ * PUBLIC: int __repmgr_refresh_membership __P((ENV *,
+ * PUBLIC: u_int8_t *, size_t));
+ */
+int
+__repmgr_refresh_membership(env, buf, len)
+ ENV *env;
+ u_int8_t *buf;
+ size_t len;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ __repmgr_membr_vers_args membr_vers;
+ __repmgr_site_info_args site_info;
+ char *host;
+ u_int8_t *p;
+ u_int16_t port;
+ u_int32_t i, n;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Membership list consists of membr_vers followed by a number of
+ * site_info structs.
+ */
+ ret = __repmgr_membr_vers_unmarshal(env, &membr_vers, buf, len, &p);
+ DB_ASSERT(env, ret == 0);
+
+ if (db_rep->repmgr_status == stopped)
+ return (0);
+ /* Ignore obsolete versions. */
+ if (__repmgr_gmdb_version_cmp(env,
+ membr_vers.gen, membr_vers.version) <= 0)
+ return (0);
+
+ LOCK_MUTEX(db_rep->mutex);
+
+ db_rep->membership_version = membr_vers.version;
+ db_rep->member_version_gen = membr_vers.gen;
+
+ for (i = 0; i < db_rep->site_cnt; i++)
+ F_CLR(SITE_FROM_EID(i), SITE_TOUCHED);
+
+ for (n = 0; p < &buf[len]; ++n) {
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, p, (size_t)(&buf[len] - p), &p);
+ DB_ASSERT(env, ret == 0);
+
+ host = site_info.host.data;
+ DB_ASSERT(env,
+ (u_int8_t*)site_info.host.data + site_info.host.size <= p);
+ host[site_info.host.size-1] = '\0';
+ port = site_info.port;
+
+ if ((ret = __repmgr_set_membership(env,
+ host, port, site_info.flags)) != 0)
+ goto err;
+
+ if ((ret = __repmgr_find_site(env, host, port, &eid)) != 0)
+ goto err;
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ F_SET(SITE_FROM_EID(eid), SITE_TOUCHED);
+ }
+ ret = __rep_set_nsites_int(env, n);
+ DB_ASSERT(env, ret == 0);
+
+ /* Scan "touched" flags so as to notice sites that have been removed. */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (F_ISSET(site, SITE_TOUCHED))
+ continue;
+ host = site->net_addr.host;
+ port = site->net_addr.port;
+ if ((ret = __repmgr_set_membership(env, host, port, 0)) != 0)
+ goto err;
+ }
+
+err:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_reload_gmdb __P((ENV *));
+ */
+int
+__repmgr_reload_gmdb(env)
+ ENV *env;
+{
+ DB_THREAD_INFO *ip;
+ u_int8_t *buf;
+ size_t len;
+ int ret;
+
+ ENV_ENTER(env, ip);
+ if ((ret = read_gmdb(env, ip, &buf, &len)) == 0) {
+ env->rep_handle->have_gmdb = TRUE;
+ ret = __repmgr_refresh_membership(env, buf, len);
+ __os_free(env, buf);
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Return 1, 0, or -1, as the given gen/version combination is >, =, or < our
+ * currently known version.
+ *
+ * PUBLIC: int __repmgr_gmdb_version_cmp __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_gmdb_version_cmp(env, gen, version)
+ ENV *env;
+ u_int32_t gen, version;
+{
+ DB_REP *db_rep;
+ u_int32_t g, v;
+
+ db_rep = env->rep_handle;
+ g = db_rep->member_version_gen;
+ v = db_rep->membership_version;
+
+ if (gen == g)
+ return (version == v ? 0 :
+ (version < v ? -1 : 1));
+ return (gen < g ? -1 : 1);
+}
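+
+/*
+ * Editor's note: a hypothetical worked example, not part of the original
+ * source.  The comparison is lexicographic on (gen, version): if our
+ * current state is gen 3, version 5, then (3, 7) compares as 1 (newer),
+ * (3, 5) as 0, and (2, 9) as -1, since a higher gen outranks any version.
+ */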
+
+/*
+ * PUBLIC: int __repmgr_init_save __P((ENV *, DBT *));
+ */
+int
+__repmgr_init_save(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ DB_REP *db_rep;
+ u_int8_t *buf;
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ if (db_rep->site_cnt == 0) {
+ dbt->data = NULL;
+ dbt->size = 0;
+ ret = 0;
+ } else if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) == 0) {
+ dbt->data = buf;
+ dbt->size = (u_int32_t)len;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_init_restore __P((ENV *, DBT *));
+ */
+int
+__repmgr_init_restore(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+ db_rep->restored_list = dbt->data;
+ db_rep->restored_list_length = dbt->size;
+ return (0);
+}
+
+/*
+ * Generates an internal request for a deferred operation, to be performed on a
+ * separate thread (conveniently, a message-processing thread).
+ *
+ * PUBLIC: int __repmgr_defer_op __P((ENV *, u_int32_t));
+ *
+ * Caller should hold mutex.
+ */
+int
+__repmgr_defer_op(env, op)
+ ENV *env;
+ u_int32_t op;
+{
+ REPMGR_MESSAGE *msg;
+ int ret;
+
+ /*
+ * Overload REPMGR_MESSAGE to convey the type of operation being
+ * requested. For now "op" is all we need; plenty of room for expansion
+ * if needed in the future.
+ *
+ * Leave msg->v.gmdb_msg.conn NULL to show no conn to be cleaned up.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(*msg), &msg)) != 0)
+ return (ret);
+ msg->msg_hdr.type = REPMGR_OWN_MSG;
+ REPMGR_OWN_MSG_TYPE(msg->msg_hdr) = op;
+ ret = __repmgr_queue_put(env, msg);
+ return (ret);
+}
+
+/*
+ * PUBLIC: void __repmgr_fire_conn_err_event __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, int));
+ */
+void
+__repmgr_fire_conn_err_event(env, conn, err)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ int err;
+{
+ DB_REP *db_rep;
+ DB_REPMGR_CONN_ERR info;
+
+ db_rep = env->rep_handle;
+ if (conn->type == REP_CONNECTION && IS_VALID_EID(conn->eid)) {
+ __repmgr_print_conn_err(env,
+ &SITE_FROM_EID(conn->eid)->net_addr, err);
+ info.eid = conn->eid;
+ info.error = err;
+ DB_EVENT(env, DB_EVENT_REP_CONNECT_BROKEN, &info);
+ }
+}
+
+/*
+ * PUBLIC: void __repmgr_print_conn_err __P((ENV *, repmgr_netaddr_t *, int));
+ */
+void
+__repmgr_print_conn_err(env, netaddr, err)
+ ENV *env;
+ repmgr_netaddr_t *netaddr;
+ int err;
+{
+ SITE_STRING_BUFFER site_loc_buf;
+ char msgbuf[200]; /* Arbitrary size. */
+
+ (void)__repmgr_format_addr_loc(netaddr, site_loc_buf);
+ /* TCP/IP sockets API convention: 0 indicates "end-of-file". */
+ if (err == 0)
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "EOF on connection to %s", site_loc_buf));
+ else
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "`%s' (%d) on connection to %s",
+ __os_strerror(err, msgbuf, sizeof(msgbuf)),
+ err, site_loc_buf));
+}
+
+/*
+ * Change role from master to client, but if a GMDB operation is in progress,
+ * wait for it to finish first.
+ *
+ * PUBLIC: int __repmgr_become_client __P((ENV *));
+ */
+int
+__repmgr_become_client(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_await_gmdbop(env)) == 0)
+ db_rep->client_intent = TRUE;
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret == 0 ? __repmgr_repstart(env, DB_REP_CLIENT) : ret);
+}
+
+/*
+ * Looks up a site from our local (in-process) list, or returns NULL if not
+ * found.
+ *
+ * PUBLIC: REPMGR_SITE *__repmgr_lookup_site __P((ENV *, const char *, u_int));
+ */
+REPMGR_SITE *
+__repmgr_lookup_site(env, host, port)
+ ENV *env;
+ const char *host;
+ u_int port;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+
+ if (strcmp(site->net_addr.host, host) == 0 &&
+ site->net_addr.port == port)
+ return (site);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Look up a site, or add it if it doesn't already exist.
+ *
+ * Caller must hold db_rep mutex and be within ENV_ENTER context, unless this is
+ * a pre-open call.
+ *
+ * PUBLIC: int __repmgr_find_site __P((ENV *, const char *, u_int, int *));
+ */
+int
+__repmgr_find_site(env, host, port, eidp)
+ ENV *env;
+ const char *host;
+ u_int port;
+ int *eidp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ ret = get_eid(env, host, port, &eid);
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ } else {
+ if ((site = __repmgr_lookup_site(env, host, port)) == NULL &&
+ (ret = __repmgr_new_site(env, &site, host, port)) != 0)
+ return (ret);
+ eid = EID_FROM_SITE(site);
+ }
+ if (ret == 0)
+ *eidp = eid;
+ return (ret);
+}
+
+/*
+ * Get the EID of the named remote site, even if it means creating a new entry
+ * in our table if it doesn't already exist.
+ *
+ * Caller must hold both db_rep mutex and mtx_repmgr.
+ */
+static int
+get_eid(env, host, port, eidp)
+ ENV *env;
+ const char *host;
+ u_int port;
+ int *eidp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __repmgr_copy_in_added_sites(env)) != 0)
+ return (ret);
+ if ((site = __repmgr_lookup_site(env, host, port)) == NULL) {
+ /*
+ * Store both locally and in shared region.
+ */
+ if ((ret = __repmgr_new_site(env, &site, host, port)) != 0)
+ return (ret);
+
+ eid = EID_FROM_SITE(site);
+ DB_ASSERT(env, (u_int)eid == db_rep->site_cnt - 1);
+ if ((ret = __repmgr_share_netaddrs(env,
+ rep, (u_int)eid, db_rep->site_cnt)) == 0) {
+ /* Show that a change was made. */
+ db_rep->siteinfo_seq = ++rep->siteinfo_seq;
+ } else {
+ /*
+ * Rescind the local slot we just added, so that we at
+ * least keep the two lists in sync.
+ */
+ db_rep->site_cnt--;
+ __repmgr_cleanup_netaddr(env, &site->net_addr);
+ }
+ } else
+ eid = EID_FROM_SITE(site);
+ if (ret == 0)
+ *eidp = eid;
+ return (ret);
+}
+
+/*
+ * Sets the named remote site's group membership status to the given value,
+ * creating it first if it doesn't already exist. Adjusts connections
+ * accordingly.
+ *
+ * PUBLIC: int __repmgr_set_membership __P((ENV *,
+ * PUBLIC: const char *, u_int, u_int32_t));
+ *
+ * Caller must hold db_rep mutex, and be in ENV_ENTER context.
+ */
+int
+__repmgr_set_membership(env, host, port, status)
+ ENV *env;
+ const char *host;
+ u_int port;
+ u_int32_t status;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ u_int32_t orig;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ COMPQUIET(orig, 0);
+ COMPQUIET(site, NULL);
+ DB_ASSERT(env, REP_ON(env));
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ if ((ret = get_eid(env, host, port, &eid)) == 0) {
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ orig = site->membership;
+ sites = R_ADDR(infop, rep->siteinfo_off);
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "set membership for %s:%lu %lu (was %lu)",
+ host, (u_long)port, (u_long)status, (u_long)orig));
+ if (status != sites[eid].status) {
+ /*
+ * Show that a change is occurring.
+ *
+ * The call to get_eid() might have also bumped the
+ * sequence number, and since this is all happening
+ * within a single critical section it would be possible
+ * to avoid "wasting" a sequence number. But it's
+ * hardly worth the trouble and mental complexity: the
+ * sequence number counts changes that occur within an
+ * env region lifetime, so there should be plenty.
+ * We'll run out of membership DB version numbers long
+ * before this becomes a problem.
+ */
+ db_rep->siteinfo_seq = ++rep->siteinfo_seq;
+ }
+
+ /* Set both private and shared copies of the info. */
+ site->membership = status;
+ sites[eid].status = status;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+
+ /*
+ * If our notion of the site's membership changed, we may need to create
+ * or kill a connection.
+ */
+ if (ret == 0 && db_rep->repmgr_status == running &&
+ SELECTOR_RUNNING(db_rep)) {
+ if (eid == db_rep->self_eid && status != SITE_PRESENT)
+ ret = DB_DELETED;
+ else if (orig != SITE_PRESENT && status == SITE_PRESENT &&
+ site->state == SITE_IDLE) {
+ /*
+ * Here we might have just joined a group, or we might
+ * be an existing site and we've just learned of another
+ * site joining the group. In the former case, we
+ * certainly want to connect right away; in the latter
+ * case it might be better to wait, because the new site
+ * probably isn't quite ready to accept our connection.
+ * But deciding which case we're in here would be messy,
+ * so for now we just keep it simple and always try
+ * connecting immediately. The resulting connection
+ * failure shouldn't hurt anything, because we'll just
+ * naturally try again later.
+ */
+ ret = __repmgr_schedule_connection_attempt(env,
+ eid, TRUE);
+ if (eid != db_rep->self_eid)
+ DB_EVENT(env, DB_EVENT_REP_SITE_ADDED, &eid);
+ } else if (orig != 0 && status == 0)
+ DB_EVENT(env, DB_EVENT_REP_SITE_REMOVED, &eid);
+
+ /*
+ * Callers are responsible for adjusting nsites, even though in
+ * a way it would make sense to do it here. It's awkward to do
+ * it here at start-up/join time, when we load up starting from
+ * an empty array. Then we would get rep_set_nsites()
+ * repeatedly, and when leases were in use that would thrash the
+ * lease table adjustment.
+ */
+ }
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_bcast_parm_refresh __P((ENV *));
+ */
+int
+__repmgr_bcast_parm_refresh(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __repmgr_parm_refresh_args parms;
+ u_int8_t buf[__REPMGR_PARM_REFRESH_SIZE];
+ int ret;
+
+ DB_ASSERT(env, REP_ON(env));
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ parms.ack_policy = (u_int32_t)rep->perm_policy;
+ if (rep->priority == 0)
+ parms.flags = 0;
+ else
+ parms.flags = SITE_ELECTABLE;
+ __repmgr_parm_refresh_marshal(env, &parms, buf);
+ ret = __repmgr_bcast_own_msg(env,
+ REPMGR_PARM_REFRESH, buf, __REPMGR_PARM_REFRESH_SIZE);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_chg_prio __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_chg_prio(env, prev, cur)
+ ENV *env;
+ u_int32_t prev, cur;
+{
+ if ((prev == 0 && cur != 0) ||
+ (prev != 0 && cur == 0))
+ return (__repmgr_bcast_parm_refresh(env));
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_bcast_own_msg __P((ENV *,
+ * PUBLIC: u_int32_t, u_int8_t *, size_t));
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_bcast_own_msg(env, type, buf, len)
+ ENV *env;
+ u_int32_t type;
+ u_int8_t *buf;
+ size_t len;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ int ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ if (!SELECTOR_RUNNING(db_rep))
+ return (0);
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = SITE_FROM_EID(i);
+ if (site->state != SITE_CONNECTED)
+ continue;
+ if ((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = __repmgr_send_own_msg(env,
+ conn, type, buf, (u_int32_t)len)) != 0 &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ if ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = __repmgr_send_own_msg(env,
+ conn, type, buf, (u_int32_t)len)) != 0 &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ }
+ return (0);
+}
diff --git a/src/repmgr/repmgr_windows.c b/src/repmgr/repmgr_windows.c
new file mode 100644
index 00000000..d9c2a03d
--- /dev/null
+++ b/src/repmgr/repmgr_windows.c
@@ -0,0 +1,849 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/* Convert time-out from microseconds to milliseconds, rounding up. */
+#define DB_TIMEOUT_TO_WINDOWS_TIMEOUT(t) (((t) + (US_PER_MS - 1)) / US_PER_MS)
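+
+/*
+ * Editor's note, not part of the original source: with US_PER_MS == 1000,
+ * a 1500-microsecond time-out becomes (1500 + 999) / 1000 == 2ms, so any
+ * non-zero time-out rounds up rather than down to a zero-length (i.e.,
+ * non-blocking) Windows wait.
+ */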
+
+typedef struct __cond_waiter {
+ HANDLE event;
+ PREDICATE pred;
+ void *ctx;
+ int next_free;
+} COND_WAITER;
+
+#define WAITER_SLOT_IN_USE(w) ((w)->pred != NULL)
+
+/*
+ * Array slots [0:next_avail-1] are initialized, and either in use or on the
+ * free list. Slots beyond that are virgin territory, whose memory contents
+ * could be garbage. In particular, note that slots [0:next_avail-1] have a
+ * Win32 Event Object created for them, each of which has to be freed when
+ * cleaning up this data structure.
+ *
+ * "first_free" points to a list of not-in-use slots threaded through the first
+ * section of the array.
+ */
+struct __cond_waiters_table {
+ struct __cond_waiter *array;
+ int size;
+ int next_avail;
+ int first_free;
+};
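+
+/*
+ * Editor's note: a hypothetical illustration, not part of the original
+ * source, assuming free_wait_slot() pushes released slots LIFO onto the
+ * list.  With size == 4 and next_avail == 3, releasing slots 0 and then 2
+ * leaves
+ *
+ *	first_free == 2, array[2].next_free == 0, array[0].next_free == -1
+ *
+ * so the next allocation pops slot 2, and virgin slot 3 is touched only
+ * once the free list is empty again (first_free == -1).
+ */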
+
+/*
+ * Aggregated control info needed for preparing for WSAWaitForMultipleEvents()
+ * call.
+ */
+struct io_info {
+ REPMGR_CONNECTION **connections;
+ WSAEVENT *events;
+ DWORD nevents;
+};
+
+static int allocate_wait_slot __P((ENV *, int *, COND_WAITERS_TABLE *));
+static void free_wait_slot __P((ENV *, int, COND_WAITERS_TABLE *));
+static int handle_completion __P((ENV *, REPMGR_CONNECTION *));
+static int prepare_io __P((ENV *, REPMGR_CONNECTION *, void *));
+
+int
+__repmgr_thread_start(env, runnable)
+ ENV *env;
+ REPMGR_RUNNABLE *runnable;
+{
+ HANDLE event, thread_id;
+
+ runnable->finished = FALSE;
+ runnable->quit_requested = FALSE;
+ runnable->env = env;
+
+ if ((event = CreateEvent(NULL, TRUE, FALSE, NULL)) == NULL)
+ return (GetLastError());
+ thread_id = CreateThread(NULL, 0,
+ (LPTHREAD_START_ROUTINE)runnable->run, runnable, 0, NULL);
+ if (thread_id == NULL) {
+ CloseHandle(event);
+ return (GetLastError());
+ }
+ runnable->thread_id = thread_id;
+ runnable->quit_event = event;
+ return (0);
+}
+
+int
+__repmgr_thread_join(thread)
+ REPMGR_RUNNABLE *thread;
+{
+ int ret;
+
+ ret = 0;
+ if (WaitForSingleObject(thread->thread_id, INFINITE) != WAIT_OBJECT_0)
+ ret = GetLastError();
+ if (!CloseHandle(thread->thread_id) && ret == 0)
+ ret = GetLastError();
+ if (!CloseHandle(thread->quit_event) && ret == 0)
+ ret = GetLastError();
+
+ return (ret);
+}
+
+int
+__repmgr_set_nonblocking(s)
+ SOCKET s;
+{
+ int ret;
+ u_long onoff;
+
+ onoff = 1; /* any non-zero value */
+ if ((ret = ioctlsocket(s, FIONBIO, &onoff)) == SOCKET_ERROR)
+ return (WSAGetLastError());
+ return (0);
+}
+
+int
+__repmgr_set_nonblock_conn(conn)
+ REPMGR_CONNECTION *conn;
+{
+ int ret;
+
+ if ((ret = __repmgr_set_nonblocking(conn->fd)) != 0)
+ return (ret);
+
+ if ((conn->event_object = WSACreateEvent()) == WSA_INVALID_EVENT) {
+ ret = net_errno;
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * !!!
+ * Caller must hold the repmgr->mutex for this thread synchronization to
+ * work properly.
+ */
+int
+__repmgr_wake_waiters(env, w)
+ ENV *env;
+ waiter_t *w;
+{
+ DB_REP *db_rep;
+ COND_WAITERS_TABLE *waiters;
+ COND_WAITER *slot;
+ int i, ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ waiters = *w;
+ for (i = 0; i < waiters->next_avail; i++) {
+ slot = &waiters->array[i];
+ if (!WAITER_SLOT_IN_USE(slot))
+ continue;
+ if ((*slot->pred)(env, slot->ctx) ||
+ db_rep->repmgr_status == stopped)
+ if (!SetEvent(slot->event) && ret == 0)
+ ret = GetLastError();
+ }
+ return (ret);
+}
+
+/*
+ * !!!
+ * Caller must hold mutex.
+ */
+int
+__repmgr_await_cond(env, pred, ctx, timeout, waiters_p)
+ ENV *env;
+ PREDICATE pred;
+ void *ctx;
+ db_timeout_t timeout;
+ waiter_t *waiters_p;
+{
+ COND_WAITERS_TABLE *waiters;
+ COND_WAITER *waiter;
+ DB_REP *db_rep;
+ REP *rep;
+ DWORD ret, win_timeout;
+ int i;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ waiters = *waiters_p;
+
+ if ((ret = allocate_wait_slot(env, &i, waiters)) != 0)
+ goto err;
+ waiter = &waiters->array[i];
+
+ win_timeout = timeout > 0 ?
+ DB_TIMEOUT_TO_WINDOWS_TIMEOUT(timeout) : INFINITE;
+ waiter->pred = pred;
+ waiter->ctx = ctx;
+ if ((ret = SignalObjectAndWait(*db_rep->mutex,
+ waiter->event, win_timeout, FALSE)) == WAIT_FAILED) {
+ ret = GetLastError();
+ } else if (ret == WAIT_TIMEOUT)
+ ret = DB_TIMEOUT;
+ else
+ DB_ASSERT(env, ret == WAIT_OBJECT_0);
+
+ LOCK_MUTEX(db_rep->mutex);
+ free_wait_slot(env, i, waiters);
+ if (db_rep->repmgr_status == stopped)
+ ret = DB_REP_UNAVAIL;
+
+err:
+ return (ret);
+}
+
+/*
+ * !!!
+ * Caller must hold the mutex.
+ */
+static int
+allocate_wait_slot(env, resultp, table)
+ ENV *env;
+ int *resultp;
+ COND_WAITERS_TABLE *table;
+{
+ COND_WAITER *w;
+ HANDLE event;
+ int i, ret;
+
+	if (table->first_free == -1) {
+		if (table->next_avail >= table->size) {
+			/*
+			 * Grow the array.  Update "size" and "array" only
+			 * after the reallocation succeeds, so that a failed
+			 * reallocation leaves the table consistent.
+			 */
+			w = table->array;
+			if ((ret = __os_realloc(env,
+			    table->size * 2 * sizeof(*w), &w)) != 0)
+				return (ret);
+			table->array = w;
+			table->size *= 2;
+		}
+ if ((event = CreateEvent(NULL,
+ FALSE, FALSE, NULL)) == NULL) {
+ /* No need to rescind the memory reallocation. */
+ return (GetLastError());
+ }
+
+ /*
+		 * We get here when, one way or another, we're ready to use
+		 * the next slot (for the first time).
+ */
+ i = table->next_avail++;
+ w = &table->array[i];
+ w->event = event;
+ } else {
+ i = table->first_free;
+ w = &table->array[i];
+ table->first_free = w->next_free;
+ }
+ /*
+ * Make sure this event state is nonsignaled. It is possible that
+ * late processing could have signaled this event after the end of
+ * the previous wait but before reacquiring the mutex, and this
+ * extra signal would incorrectly cause the next wait to return
+ * immediately.
+ */
+ (void)WaitForSingleObject(w->event, 0);
+ *resultp = i;
+ return (0);
+}
+
+static void
+free_wait_slot(env, slot_index, table)
+ ENV *env;
+ int slot_index;
+ COND_WAITERS_TABLE *table;
+{
+	COND_WAITER *slot;
+
+	COMPQUIET(env, NULL);
+	slot = &table->array[slot_index];
+
+ slot->pred = NULL; /* show it's not in use */
+ slot->next_free = table->first_free;
+ table->first_free = slot_index;
+}
+
+int
+__repmgr_await_gmdbop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ while (db_rep->gmdb_busy) {
+ if (!ResetEvent(db_rep->gmdb_idle))
+ return (GetLastError());
+ ret = SignalObjectAndWait(*db_rep->mutex,
+ db_rep->gmdb_idle, INFINITE, FALSE);
+ LOCK_MUTEX(db_rep->mutex);
+ if (ret == WAIT_FAILED)
+ return (GetLastError());
+ DB_ASSERT(env, ret == WAIT_OBJECT_0);
+ }
+ return (0);
+}
+
+/* (See requirements described in repmgr_posix.c.) */
+int
+__repmgr_await_drain(env, conn, timeout)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ db_timespec deadline, delta, now;
+ db_timeout_t t;
+ DWORD duration, ret;
+ int round_up;
+
+ db_rep = env->rep_handle;
+
+ __os_gettime(env, &deadline, 1);
+ TIMESPEC_ADD_DB_TIMEOUT(&deadline, timeout);
+
+ while (conn->out_queue_length >= OUT_QUEUE_LIMIT) {
+ if (!ResetEvent(conn->drained))
+ return (GetLastError());
+
+ /* How long until the deadline? */
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&now, &deadline, >=)) {
+ conn->state = CONN_CONGESTED;
+ return (0);
+ }
+ delta = deadline;
+ timespecsub(&delta, &now);
+ round_up = TRUE;
+ DB_TIMESPEC_TO_TIMEOUT(t, &delta, round_up);
+ duration = DB_TIMEOUT_TO_WINDOWS_TIMEOUT(t);
+
+ ret = SignalObjectAndWait(*db_rep->mutex,
+ conn->drained, duration, FALSE);
+ LOCK_MUTEX(db_rep->mutex);
+ if (ret == WAIT_FAILED)
+ return (GetLastError());
+ else if (ret == WAIT_TIMEOUT) {
+ conn->state = CONN_CONGESTED;
+ return (0);
+ } else
+ DB_ASSERT(env, ret == WAIT_OBJECT_0);
+
+ if (db_rep->repmgr_status == stopped)
+ return (0);
+ if (conn->state == CONN_DEFUNCT)
+ return (DB_REP_UNAVAIL);
+ }
+ return (0);
+}
+
+/*
+ * Creates a manual reset event, which is usually our best choice when we may
+ * have multiple threads waiting on a single event: it stays signaled until
+ * explicitly reset, so every waiting thread is released, not just one.
+ */
+int
+__repmgr_alloc_cond(c)
+ cond_var_t *c;
+{
+ HANDLE event;
+
+ if ((event = CreateEvent(NULL, TRUE, FALSE, NULL)) == NULL)
+ return (GetLastError());
+ *c = event;
+ return (0);
+}
+
+int
+__repmgr_free_cond(c)
+ cond_var_t *c;
+{
+ if (CloseHandle(*c))
+ return (0);
+ return (GetLastError());
+}
+
+void
+__repmgr_env_create_pf(db_rep)
+ DB_REP *db_rep;
+{
+}
+
+int
+__repmgr_create_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ if ((*mutex = CreateMutex(NULL, FALSE, NULL)) == NULL)
+ return (GetLastError());
+ return (0);
+}
+
+int
+__repmgr_destroy_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (CloseHandle(*mutex) ? 0 : GetLastError());
+}
+
+int
+__repmgr_init(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ WSADATA wsaData;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ if ((ret = WSAStartup(MAKEWORD(2, 2), &wsaData)) != 0) {
+ __db_err(env, ret, DB_STR("3589",
+ "unable to initialize Windows networking"));
+ return (ret);
+ }
+
+ if ((db_rep->signaler = CreateEvent(NULL, /* security attr */
+ FALSE, /* (not) of the manual reset variety */
+ FALSE, /* (not) initially signaled */
+ NULL)) == NULL) /* name */
+ goto geterr;
+
+ if ((db_rep->msg_avail = CreateEvent(NULL, TRUE, FALSE, NULL))
+ == NULL)
+ goto geterr;
+
+ if ((db_rep->check_election = CreateEvent(NULL, TRUE, FALSE, NULL))
+ == NULL)
+ goto geterr;
+
+ if ((db_rep->gmdb_idle = CreateEvent(NULL, TRUE, FALSE, NULL))
+ == NULL)
+ goto geterr;
+
+ if ((ret = __repmgr_init_waiters(env, &db_rep->ack_waiters)) != 0)
+ goto err;
+ return (0);
+
+geterr:
+ ret = GetLastError();
+err:
+ if (db_rep->gmdb_idle != NULL)
+ CloseHandle(db_rep->gmdb_idle);
+ if (db_rep->check_election != NULL)
+ CloseHandle(db_rep->check_election);
+ if (db_rep->msg_avail != NULL)
+ CloseHandle(db_rep->msg_avail);
+ if (db_rep->signaler != NULL)
+ CloseHandle(db_rep->signaler);
+ db_rep->msg_avail =
+ db_rep->check_election =
+ db_rep->gmdb_idle =
+ db_rep->signaler = NULL;
+ (void)WSACleanup();
+ return (ret);
+}
+
+int
+__repmgr_deinit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ if (!(REPMGR_INITED(db_rep)))
+ return (0);
+
+ ret = 0;
+ if (WSACleanup() == SOCKET_ERROR)
+ ret = WSAGetLastError();
+
+ if ((t_ret = __repmgr_destroy_waiters(env, &db_rep->ack_waiters))
+ != 0 && ret == 0)
+ ret = t_ret;
+
+ if (!CloseHandle(db_rep->gmdb_idle) && ret == 0)
+ ret = GetLastError();
+
+ if (!CloseHandle(db_rep->check_election) && ret == 0)
+ ret = GetLastError();
+
+ if (!CloseHandle(db_rep->msg_avail) && ret == 0)
+ ret = GetLastError();
+
+ if (!CloseHandle(db_rep->signaler) && ret == 0)
+ ret = GetLastError();
+ db_rep->msg_avail =
+ db_rep->check_election =
+ db_rep->gmdb_idle =
+ db_rep->signaler = NULL;
+
+ return (ret);
+}
+
+int
+__repmgr_init_waiters(env, waiters)
+ ENV *env;
+ waiter_t *waiters;
+{
+#define INITIAL_ALLOCATION 5 /* arbitrary size */
+ COND_WAITERS_TABLE *table;
+ int ret;
+
+ table = NULL;
+
+ if ((ret =
+ __os_calloc(env, 1, sizeof(COND_WAITERS_TABLE), &table)) != 0)
+ return (ret);
+
+ if ((ret = __os_calloc(env, INITIAL_ALLOCATION, sizeof(COND_WAITER),
+ &table->array)) != 0) {
+ __os_free(env, table);
+ return (ret);
+ }
+
+ table->size = INITIAL_ALLOCATION;
+ table->first_free = -1;
+ table->next_avail = 0;
+
+ /* There's a restaurant joke in there somewhere. */
+ *waiters = table;
+ return (0);
+}
+
+int
+__repmgr_destroy_waiters(env, waitersp)
+ ENV *env;
+ waiter_t *waitersp;
+{
+ waiter_t waiters;
+ int i, ret;
+
+ waiters = *waitersp;
+ ret = 0;
+ for (i = 0; i < waiters->next_avail; i++) {
+ if (!CloseHandle(waiters->array[i].event) && ret == 0)
+ ret = GetLastError();
+ }
+ __os_free(env, waiters->array);
+ __os_free(env, waiters);
+ return (ret);
+}
+
+int
+__repmgr_lock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ if (WaitForSingleObject(*mutex, INFINITE) == WAIT_OBJECT_0)
+ return (0);
+ return (GetLastError());
+}
+
+int
+__repmgr_unlock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ if (ReleaseMutex(*mutex))
+ return (0);
+ return (GetLastError());
+}
+
+int
+__repmgr_signal(v)
+ cond_var_t *v;
+{
+ return (SetEvent(*v) ? 0 : GetLastError());
+}
+
+int
+__repmgr_wake_msngers(env, n)
+ ENV *env;
+ u_int n;
+{
+ DB_REP *db_rep;
+ u_int i;
+
+ db_rep = env->rep_handle;
+
+ /* Ask all threads beyond index 'n' to shut down. */
+	for (i = n; i < db_rep->nthreads; i++)
+ if (!SetEvent(db_rep->messengers[i]->quit_event))
+ return (GetLastError());
+ return (0);
+}
+
+int
+__repmgr_wake_main_thread(env)
+ ENV *env;
+{
+ if (!SetEvent(env->rep_handle->signaler))
+ return (GetLastError());
+ return (0);
+}
+
+int
+__repmgr_writev(fd, iovec, buf_count, byte_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *byte_count_p;
+{
+ DWORD bytes;
+
+ if (WSASend(fd, iovec,
+ (DWORD)buf_count, &bytes, 0, NULL, NULL) == SOCKET_ERROR)
+ return (net_errno);
+
+ *byte_count_p = (size_t)bytes;
+ return (0);
+}
+
+int
+__repmgr_readv(fd, iovec, buf_count, xfr_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *xfr_count_p;
+{
+ DWORD bytes, flags;
+
+ flags = 0;
+ if (WSARecv(fd, iovec,
+ (DWORD)buf_count, &bytes, &flags, NULL, NULL) == SOCKET_ERROR)
+ return (net_errno);
+
+ *xfr_count_p = (size_t)bytes;
+ return (0);
+}
+
+int
+__repmgr_select_loop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ DWORD ret;
+ DWORD select_timeout;
+ REPMGR_CONNECTION *connections[WSA_MAXIMUM_WAIT_EVENTS];
+ WSAEVENT events[WSA_MAXIMUM_WAIT_EVENTS];
+ db_timespec timeout;
+ WSAEVENT listen_event;
+ WSANETWORKEVENTS net_events;
+ struct io_info io_info;
+ int i;
+
+ db_rep = env->rep_handle;
+ io_info.connections = connections;
+ io_info.events = events;
+
+ if ((listen_event = WSACreateEvent()) == WSA_INVALID_EVENT) {
+ __db_err(env, net_errno, DB_STR("3590",
+ "can't create event for listen socket"));
+ return (net_errno);
+ }
+ if (!IS_SUBORDINATE(db_rep) &&
+ WSAEventSelect(db_rep->listen_fd, listen_event, FD_ACCEPT) ==
+ SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3591",
+ "can't enable event for listener"));
+		goto out;	/* listen_event is closed at "out". */
+ }
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_first_try_connections(env)) != 0)
+ goto unlock;
+ for (;;) {
+ /* Start with the two events that we always wait for. */
+#define SIGNALER_INDEX 0
+#define LISTENER_INDEX 1
+ events[SIGNALER_INDEX] = db_rep->signaler;
+ if (IS_SUBORDINATE(db_rep))
+ io_info.nevents = 1;
+ else {
+ events[LISTENER_INDEX] = listen_event;
+ io_info.nevents = 2;
+ }
+
+ if ((ret = __repmgr_each_connection(env,
+ prepare_io, &io_info, TRUE)) != 0)
+ goto unlock;
+
+ if (__repmgr_compute_timeout(env, &timeout))
+ select_timeout =
+ (DWORD)(timeout.tv_sec * MS_PER_SEC +
+ timeout.tv_nsec / NS_PER_MS);
+ else {
+ /* No time-based events to wake us up. */
+ select_timeout = WSA_INFINITE;
+ }
+
+ UNLOCK_MUTEX(db_rep->mutex);
+ ret = WSAWaitForMultipleEvents(
+ io_info.nevents, events, FALSE, select_timeout, FALSE);
+ if (db_rep->repmgr_status == stopped) {
+ ret = 0;
+ goto out;
+ }
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * !!!
+ * Note that `ret' remains set as the return code from
+ * WSAWaitForMultipleEvents, above.
+ */
+ if (ret >= WSA_WAIT_EVENT_0 &&
+ ret < WSA_WAIT_EVENT_0 + io_info.nevents) {
+ if ((i = ret - WSA_WAIT_EVENT_0) == SIGNALER_INDEX) {
+ /* Another thread woke us. */
+ } else if (!IS_SUBORDINATE(db_rep) &&
+ i == LISTENER_INDEX) {
+ if ((ret = WSAEnumNetworkEvents(
+ db_rep->listen_fd, listen_event,
+ &net_events)) == SOCKET_ERROR) {
+ ret = net_errno;
+ goto unlock;
+ }
+ DB_ASSERT(env,
+ net_events.lNetworkEvents & FD_ACCEPT);
+ if ((ret = net_events.iErrorCode[FD_ACCEPT_BIT])
+ != 0)
+ goto unlock;
+ if ((ret = __repmgr_accept(env)) != 0)
+ goto unlock;
+ } else {
+ if (connections[i]->state != CONN_DEFUNCT &&
+ (ret = handle_completion(env,
+ connections[i])) != 0)
+ goto unlock;
+ }
+ } else if (ret == WSA_WAIT_TIMEOUT) {
+ if ((ret = __repmgr_check_timeouts(env)) != 0)
+ goto unlock;
+ } else if (ret == WSA_WAIT_FAILED) {
+ ret = net_errno;
+ goto unlock;
+ }
+ }
+
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+out:
+ if (!CloseHandle(listen_event) && ret == 0)
+ ret = GetLastError();
+ if (ret == DB_DELETED)
+ ret = __repmgr_bow_out(env);
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_net_close(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+static int
+prepare_io(env, conn, info_)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *info_;
+{
+ struct io_info *info;
+ long desired_events;
+ int ret;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (__repmgr_cleanup_defunct(env, conn));
+
+	/*
+	 * Note that even if we were suffering flow control, we would
+	 * nevertheless still read if we hadn't yet gotten a handshake.
+	 * Why? (1) Handshakes are important; and (2) they don't hurt
+	 * anything flow-control-wise.
+	 */
+ info = info_;
+
+ /*
+ * If we ever implemented flow control, we would have some conditions to
+ * examine here. But as it is, we always are willing to accept I/O on
+ * every connection.
+ *
+ * We can only handle as many connections as the number of events the
+ * WSAWaitForMultipleEvents function allows (minus 2, for our overhead:
+ * the listener and the signaler).
+ */
+ DB_ASSERT(env, info->nevents < WSA_MAXIMUM_WAIT_EVENTS);
+ info->events[info->nevents] = conn->event_object;
+ info->connections[info->nevents++] = conn;
+
+ desired_events = FD_READ | FD_CLOSE;
+ if (!STAILQ_EMPTY(&conn->outbound_queue))
+ desired_events |= FD_WRITE;
+ if (WSAEventSelect(conn->fd,
+ conn->event_object, desired_events) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR_A("3592",
+ "can't set event bits 0x%lx", "%lx"), desired_events);
+ } else
+ ret = 0;
+
+ return (ret);
+}
+
+static int
+handle_completion(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ int error, ret;
+ WSANETWORKEVENTS events;
+
+ if ((ret = WSAEnumNetworkEvents(conn->fd, conn->event_object, &events))
+ == SOCKET_ERROR) {
+ error = net_errno;
+ __db_err(env, error, DB_STR("3593", "EnumNetworkEvents"));
+ goto report;
+ }
+
+ /* Check both writing and reading. */
+ if (events.lNetworkEvents & FD_CLOSE) {
+ error = events.iErrorCode[FD_CLOSE_BIT];
+ goto report;
+ }
+
+ if (events.lNetworkEvents & FD_WRITE) {
+ if (events.iErrorCode[FD_WRITE_BIT] != 0) {
+ error = events.iErrorCode[FD_WRITE_BIT];
+ goto report;
+ } else if ((ret =
+ __repmgr_write_some(env, conn)) != 0)
+ goto err;
+ }
+
+ if (events.lNetworkEvents & FD_READ) {
+ if (events.iErrorCode[FD_READ_BIT] != 0) {
+ error = events.iErrorCode[FD_READ_BIT];
+ goto report;
+ } else if ((ret =
+ __repmgr_read_from_site(env, conn)) != 0)
+ goto err;
+ }
+
+ if (0) {
+report:
+ __repmgr_fire_conn_err_event(env, conn, error);
+ STAT(env->rep_handle->region->mstat.st_connection_drop++);
+ ret = DB_REP_UNAVAIL;
+ }
+err:
+ if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ return (ret);
+}
diff --git a/src/sequence/seq_stat.c b/src/sequence/seq_stat.c
new file mode 100644
index 00000000..d5b9a401
--- /dev/null
+++ b/src/sequence/seq_stat.c
@@ -0,0 +1,275 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#ifdef HAVE_64BIT_TYPES
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/sequence_ext.h"
+
+#ifdef HAVE_STATISTICS
+static int __seq_print_all __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_print_stats __P((DB_SEQUENCE *, u_int32_t));
+
+/*
+ * __seq_stat --
+ * Get statistics from the sequence.
+ *
+ * PUBLIC: int __seq_stat __P((DB_SEQUENCE *, DB_SEQUENCE_STAT **, u_int32_t));
+ */
+int
+__seq_stat(seq, spp, flags)
+ DB_SEQUENCE *seq;
+ DB_SEQUENCE_STAT **spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT data;
+ DB_SEQUENCE_STAT *sp;
+ DB_SEQ_RECORD record;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+	dbp = seq->seq_dbp;
+	env = dbp->env;
+	sp = NULL;
+
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->stat");
+
+ switch (flags) {
+ case DB_STAT_CLEAR:
+ case DB_STAT_ALL:
+ case 0:
+ break;
+ default:
+ return (__db_ferr(env, "DB_SEQUENCE->stat", 0));
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ if (seq->mtx_seq != MUTEX_INVALID) {
+ __mutex_set_wait_info(
+ env, seq->mtx_seq, &sp->st_wait, &sp->st_nowait);
+
+ if (LF_ISSET(DB_STAT_CLEAR))
+ __mutex_clear(env, seq->mtx_seq);
+ }
+ memset(&data, 0, sizeof(data));
+ data.data = &record;
+ data.ulen = sizeof(record);
+ data.flags = DB_DBT_USERMEM;
+retry: if ((ret = __db_get(dbp, ip, NULL, &seq->seq_key, &data, 0)) != 0) {
+ if (ret == DB_BUFFER_SMALL &&
+ data.size > sizeof(seq->seq_record)) {
+ if ((ret = __os_malloc(env,
+ data.size, &data.data)) != 0)
+ goto err;
+ data.ulen = data.size;
+ goto retry;
+ }
+		if (data.data != &record)
+			__os_free(env, data.data);
+		goto err;
+ }
+
+ if (data.data != &record)
+ memcpy(&record, data.data, sizeof(record));
+ sp->st_current = record.seq_value;
+ sp->st_value = seq->seq_record.seq_value;
+ sp->st_last_value = seq->seq_last_value;
+ sp->st_min = seq->seq_record.seq_min;
+ sp->st_max = seq->seq_record.seq_max;
+ sp->st_cache_size = seq->seq_cache_size;
+ sp->st_flags = seq->seq_record.flags;
+
+ *spp = sp;
+ if (data.data != &record)
+ __os_free(env, data.data);
+
+	/* Release replication block. */
+err:	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret != 0 && sp != NULL)
+		__os_ufree(env, sp);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __seq_stat_print --
+ * Print statistics from the sequence.
+ *
+ * PUBLIC: int __seq_stat_print __P((DB_SEQUENCE *, u_int32_t));
+ */
+int
+__seq_stat_print(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->stat_print");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __seq_print_stats(seq, flags)) != 0)
+ goto err;
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __seq_print_all(seq, flags)) != 0)
+ goto err;
+
+ /* Release replication block. */
+err: if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static const FN __db_seq_flags_fn[] = {
+ { DB_SEQ_DEC, "decrement" },
+ { DB_SEQ_INC, "increment" },
+ { DB_SEQ_RANGE_SET, "range set (internal)" },
+ { DB_SEQ_WRAP, "wraparound at end" },
+ { 0, NULL }
+};
+
+/*
+ * __db_get_seq_flags_fn --
+ * Return the __db_seq_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_seq_flags_fn __P((void));
+ */
+const FN *
+__db_get_seq_flags_fn()
+{
+ return (__db_seq_flags_fn);
+}
+
+/*
+ * __seq_print_stats --
+ * Display sequence stat structure.
+ */
+static int
+__seq_print_stats(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB_SEQUENCE_STAT *sp;
+ ENV *env;
+ int ret;
+
+ env = seq->seq_dbp->env;
+
+ if ((ret = __seq_stat(seq, &sp, flags)) != 0)
+ return (ret);
+ __db_dl_pct(env, "The number of sequence locks that required waiting",
+ (u_long)sp->st_wait,
+ DB_PCT(sp->st_wait, sp->st_wait + sp->st_nowait), NULL);
+ STAT_FMT("The current sequence value",
+ INT64_FMT, db_seq_t, sp->st_current);
+ STAT_FMT("The cached sequence value",
+ INT64_FMT, db_seq_t, sp->st_value);
+ STAT_FMT("The last cached sequence value",
+ INT64_FMT, db_seq_t, sp->st_last_value);
+ STAT_FMT("The minimum sequence value",
+ INT64_FMT, db_seq_t, sp->st_min);
+ STAT_FMT("The maximum sequence value",
+ INT64_FMT, db_seq_t, sp->st_max);
+ STAT_ULONG("The cache size", sp->st_cache_size);
+ __db_prflags(env, NULL,
+ sp->st_flags, __db_seq_flags_fn, NULL, "\tSequence flags");
+ __os_ufree(seq->seq_dbp->env, sp);
+ return (0);
+}
+
+/*
+ * __seq_print_all --
+ * Display sequence debugging information - none for now.
+ * (The name seems a bit strange, no?)
+ */
+static int
+__seq_print_all(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ COMPQUIET(seq, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__seq_stat(seq, statp, flags)
+ DB_SEQUENCE *seq;
+ DB_SEQUENCE_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(seq->seq_dbp->env));
+}
+
+int
+__seq_stat_print(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(seq->seq_dbp->env));
+}
+
+/*
+ * __db_get_seq_flags_fn --
+ * Return the __db_seq_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_seq_flags_fn __P((void));
+ */
+const FN *
+__db_get_seq_flags_fn()
+{
+ static const FN __db_seq_flags_fn[] = {
+ { 0, NULL }
+ };
+
+ /*
+ * !!!
+	 * The Tcl API uses this interface; stub it off.
+ */
+ return (__db_seq_flags_fn);
+}
+#endif /* !HAVE_STATISTICS */
+#endif /* HAVE_64BIT_TYPES */
diff --git a/src/sequence/sequence.c b/src/sequence/sequence.c
new file mode 100644
index 00000000..1c19f838
--- /dev/null
+++ b/src/sequence/sequence.c
@@ -0,0 +1,1011 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/sequence_ext.h"
+
+#ifdef HAVE_64BIT_TYPES
+/*
+ * Sequences must be architecture independent but they are stored as user
+ * data in databases so the code here must handle the byte ordering. We
+ * store them in little-endian byte ordering. If we are on a big-endian
+ * machine we swap in and out when we read from the database. seq->seq_rp
+ * always points to the record in native ordering.
+ *
+ * Version 1 always stored things in native format so if we detect this we
+ * upgrade on the fly and write the record back at open time.
+ */
+#define SEQ_SWAP(rp) \
+ do { \
+ M_32_SWAP((rp)->seq_version); \
+ M_32_SWAP((rp)->flags); \
+ M_64_SWAP((rp)->seq_value); \
+ M_64_SWAP((rp)->seq_max); \
+ M_64_SWAP((rp)->seq_min); \
+ } while (0)
+
+#define SEQ_SWAP_IN(env, seq) \
+ do { \
+ if (!F_ISSET((env), ENV_LITTLEENDIAN)) { \
+ memcpy(&seq->seq_record, seq->seq_data.data, \
+ sizeof(seq->seq_record)); \
+ SEQ_SWAP(&seq->seq_record); \
+ } \
+ } while (0)
+
+#define SEQ_SWAP_OUT(env, seq) \
+ do { \
+ if (!F_ISSET((env), ENV_LITTLEENDIAN)) { \
+ memcpy(seq->seq_data.data, \
+ &seq->seq_record, sizeof(seq->seq_record));\
+ SEQ_SWAP((DB_SEQ_RECORD*)seq->seq_data.data); \
+ } \
+ } while (0)
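+/*
+ * Illustrative note on the round trip (a sketch, not normative): on a
+ * big-endian host SEQ_SWAP_IN copies the little-endian on-disk image into
+ * seq->seq_record and swaps it to native order, and SEQ_SWAP_OUT does the
+ * reverse before the record is written back.  On a little-endian host both
+ * macros are no-ops, and seq_data typically points directly at seq_record
+ * (see __seq_open_pp).
+ */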
+
+static int __seq_chk_cachesize __P((ENV *, int32_t, db_seq_t, db_seq_t));
+static int __seq_close __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_close_pp __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_get
+ __P((DB_SEQUENCE *, DB_TXN *, int32_t, db_seq_t *, u_int32_t));
+static int __seq_get_cachesize __P((DB_SEQUENCE *, int32_t *));
+static int __seq_get_db __P((DB_SEQUENCE *, DB **));
+static int __seq_get_flags __P((DB_SEQUENCE *, u_int32_t *));
+static int __seq_get_key __P((DB_SEQUENCE *, DBT *));
+static int __seq_get_range __P((DB_SEQUENCE *, db_seq_t *, db_seq_t *));
+static int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t));
+static int __seq_open_pp __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t));
+static int __seq_remove __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
+static int __seq_set_cachesize __P((DB_SEQUENCE *, int32_t));
+static int __seq_set_flags __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_set_range __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
+static int __seq_update
+ __P((DB_SEQUENCE *, DB_THREAD_INFO *, DB_TXN *, int32_t, u_int32_t));
+
+/*
+ * db_sequence_create --
+ * DB_SEQUENCE constructor.
+ *
+ * EXTERN: int db_sequence_create __P((DB_SEQUENCE **, DB *, u_int32_t));
+ */
+int
+db_sequence_create(seqp, dbp, flags)
+ DB_SEQUENCE **seqp;
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_SEQUENCE *seq;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "db_sequence_create");
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case 0:
+ break;
+ default:
+ return (__db_ferr(env, "db_sequence_create", 0));
+ }
+
+ if (dbp->type == DB_HEAP) {
+ __db_errx(env, DB_STR("4016",
+ "Heap databases may not be used with sequences."));
+		return (EINVAL);
+	}
+
+ /* Allocate the sequence. */
+ if ((ret = __os_calloc(env, 1, sizeof(*seq), &seq)) != 0)
+ return (ret);
+
+ seq->seq_dbp = dbp;
+ seq->close = __seq_close_pp;
+ seq->get = __seq_get;
+ seq->get_cachesize = __seq_get_cachesize;
+ seq->set_cachesize = __seq_set_cachesize;
+ seq->get_db = __seq_get_db;
+ seq->get_flags = __seq_get_flags;
+ seq->get_key = __seq_get_key;
+ seq->get_range = __seq_get_range;
+ seq->initial_value = __seq_initial_value;
+ seq->open = __seq_open_pp;
+ seq->remove = __seq_remove;
+ seq->set_flags = __seq_set_flags;
+ seq->set_range = __seq_set_range;
+ seq->stat = __seq_stat;
+ seq->stat_print = __seq_stat_print;
+ seq->seq_rp = &seq->seq_record;
+ *seqp = seq;
+
+ return (0);
+}
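+/*
+ * Illustrative usage sketch (a sketch only: error handling is omitted, and
+ * the key name and cache size are hypothetical):
+ *
+ *	DB_SEQUENCE *seq;
+ *	DBT key;
+ *	db_seq_t val;
+ *
+ *	(void)db_sequence_create(&seq, dbp, 0);
+ *	(void)seq->initial_value(seq, 1);
+ *	(void)seq->set_cachesize(seq, 100);
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = "my_sequence";
+ *	key.size = sizeof("my_sequence") - 1;
+ *	(void)seq->open(seq, NULL, &key, DB_CREATE | DB_THREAD);
+ *	(void)seq->get(seq, NULL, 1, &val, 0);
+ *	(void)seq->close(seq, 0);
+ *
+ * After the get call, val holds the next value in the sequence.
+ */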
+
+/*
+ * __seq_open --
+ * DB_SEQUENCE->open method.
+ *
+ */
+static int
+__seq_open_pp(seq, txn, keyp, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ DBT *keyp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_SEQ_RECORD *rp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t tflags;
+ int handle_check, txn_local, ret, t_ret;
+#define SEQ_OPEN_FLAGS (DB_CREATE | DB_EXCL | DB_THREAD)
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->open");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_fchk(env,
+ "DB_SEQUENCE->open", flags, SEQ_OPEN_FLAGS)) != 0)
+ goto err;
+
+ if (keyp->size == 0) {
+ __db_errx(env, DB_STR("4001",
+ "Zero length sequence key specified"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if ((ret = __db_get_flags(dbp, &tflags)) != 0)
+ goto err;
+
+ /*
+ * We can let replication clients open sequences, but must
+ * check later that they do not update them.
+ */
+ if (F_ISSET(dbp, DB_AM_RDONLY)) {
+ ret = __db_rdonly(dbp->env, "DB_SEQUENCE->open");
+ goto err;
+ }
+ if (FLD_ISSET(tflags, DB_DUP)) {
+ __db_errx(env, DB_STR("4002",
+ "Sequences not supported in databases configured for duplicate data"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_THREAD)) {
+ if ((ret = __mutex_alloc(env,
+ MTX_SEQUENCE, DB_MUTEX_PROCESS_ONLY, &seq->mtx_seq)) != 0)
+ goto err;
+ }
+
+ memset(&seq->seq_data, 0, sizeof(DBT));
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) {
+ seq->seq_data.data = &seq->seq_record;
+ seq->seq_data.flags = DB_DBT_USERMEM;
+ } else {
+ if ((ret = __os_umalloc(env,
+ sizeof(seq->seq_record), &seq->seq_data.data)) != 0)
+ goto err;
+ seq->seq_data.flags = DB_DBT_REALLOC;
+ }
+
+ seq->seq_data.ulen = seq->seq_data.size = sizeof(seq->seq_record);
+ seq->seq_rp = &seq->seq_record;
+
+ if ((ret = __dbt_usercopy(env, keyp)) != 0)
+ goto err;
+
+ memset(&seq->seq_key, 0, sizeof(DBT));
+ if ((ret = __os_malloc(env, keyp->size, &seq->seq_key.data)) != 0)
+ goto err;
+ memcpy(seq->seq_key.data, keyp->data, keyp->size);
+ seq->seq_key.size = seq->seq_key.ulen = keyp->size;
+ seq->seq_key.flags = DB_DBT_USERMEM;
+
+retry: if ((ret = __db_get(dbp, ip,
+ txn, &seq->seq_key, &seq->seq_data, 0)) != 0) {
+ if (ret == DB_BUFFER_SMALL &&
+ seq->seq_data.size > sizeof(seq->seq_record)) {
+ seq->seq_data.flags = DB_DBT_REALLOC;
+ seq->seq_data.data = NULL;
+ goto retry;
+ }
+ if ((ret != DB_NOTFOUND && ret != DB_KEYEMPTY) ||
+ !LF_ISSET(DB_CREATE))
+ goto err;
+ if (IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ ret = __db_rdonly(env, "DB_SEQUENCE->open");
+ goto err;
+ }
+ ret = 0;
+
+ rp = &seq->seq_record;
+ if (!F_ISSET(rp, DB_SEQ_RANGE_SET)) {
+ rp->seq_max = INT64_MAX;
+ rp->seq_min = INT64_MIN;
+ }
+ /* INC is the default. */
+ if (!F_ISSET(rp, DB_SEQ_DEC))
+ F_SET(rp, DB_SEQ_INC);
+
+ rp->seq_version = DB_SEQUENCE_VERSION;
+
+ if (rp->seq_value > rp->seq_max ||
+ rp->seq_value < rp->seq_min) {
+ __db_errx(env, DB_STR("4003",
+ "Sequence value out of range"));
+ ret = EINVAL;
+ goto err;
+ } else {
+ SEQ_SWAP_OUT(env, seq);
+ /* Create local transaction as necessary. */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret =
+ __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ if ((ret = __db_put(dbp, ip, txn, &seq->seq_key,
+ &seq->seq_data, DB_NOOVERWRITE)) != 0) {
+ __db_errx(env, DB_STR("4004",
+ "Sequence create failed"));
+ goto err;
+ }
+ }
+ } else if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+ ret = EEXIST;
+ goto err;
+ } else if (seq->seq_data.size < sizeof(seq->seq_record)) {
+ __db_errx(env, DB_STR("4005",
+ "Bad sequence record format"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ seq->seq_rp = seq->seq_data.data;
+
+ /*
+ * The first release was stored in native mode.
+ * Check the version number before swapping.
+ */
+ rp = seq->seq_data.data;
+ if (rp->seq_version == DB_SEQUENCE_OLDVER) {
+oldver: if (IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ ret = __db_rdonly(env, "DB_SEQUENCE->open");
+ goto err;
+ }
+ rp->seq_version = DB_SEQUENCE_VERSION;
+ if (!F_ISSET(env, ENV_LITTLEENDIAN)) {
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret =
+ __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ goto retry;
+ }
+ memcpy(&seq->seq_record, rp, sizeof(seq->seq_record));
+ SEQ_SWAP_OUT(env, seq);
+ }
+ if ((ret = __db_put(dbp,
+ ip, txn, &seq->seq_key, &seq->seq_data, 0)) != 0)
+ goto err;
+ }
+ rp = seq->seq_rp;
+
+ SEQ_SWAP_IN(env, seq);
+
+ if (rp->seq_version != DB_SEQUENCE_VERSION) {
+ /*
+		 * The database may have moved from one type of machine to
+		 * another; check here.  If we moved from little-endian to
+		 * big-endian, the swap above will make the version correct.
+		 * If the move was from big-endian to little-endian, we need
+		 * to swap to see if this is an old version.
+ */
+ if (rp->seq_version == DB_SEQUENCE_OLDVER)
+ goto oldver;
+ M_32_SWAP(rp->seq_version);
+ if (rp->seq_version == DB_SEQUENCE_OLDVER) {
+ SEQ_SWAP(rp);
+ goto oldver;
+ }
+ M_32_SWAP(rp->seq_version);
+ __db_errx(env, DB_STR_A("4006",
+ "Unsupported sequence version: %d", "%d"),
+ rp->seq_version);
+ goto err;
+ }
+
+ seq->seq_last_value = seq->seq_prev_value = rp->seq_value;
+ if (F_ISSET(rp, DB_SEQ_INC))
+ seq->seq_last_value--;
+ else
+ seq->seq_last_value++;
+
+ /*
+ * It's an error to specify a cache larger than the range of sequences.
+ */
+ if (seq->seq_cache_size != 0 && (ret = __seq_chk_cachesize(
+ env, seq->seq_cache_size, rp->seq_max, rp->seq_min)) != 0)
+ goto err;
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+ if (ret != 0) {
+ __os_free(env, seq->seq_key.data);
+ seq->seq_key.data = NULL;
+ }
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, keyp, NULL, NULL);
+ return (ret);
+}
+
+/*
+ * __seq_get_cachesize --
+ * Accessor for value passed into DB_SEQUENCE->set_cachesize call.
+ *
+ */
+static int
+__seq_get_cachesize(seq, cachesize)
+ DB_SEQUENCE *seq;
+ int32_t *cachesize;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_cachesize");
+
+ *cachesize = seq->seq_cache_size;
+ return (0);
+}
+
+/*
+ * __seq_set_cachesize --
+ * DB_SEQUENCE->set_cachesize.
+ *
+ */
+static int
+__seq_set_cachesize(seq, cachesize)
+ DB_SEQUENCE *seq;
+ int32_t cachesize;
+{
+ ENV *env;
+ int ret;
+
+ env = seq->seq_dbp->env;
+
+ if (cachesize < 0) {
+ __db_errx(env, DB_STR("4007",
+ "Cache size must be >= 0"));
+ return (EINVAL);
+ }
+
+ /*
+ * It's an error to specify a cache larger than the range of sequences.
+ */
+ if (SEQ_IS_OPEN(seq) && (ret = __seq_chk_cachesize(env,
+ cachesize, seq->seq_rp->seq_max, seq->seq_rp->seq_min)) != 0)
+ return (ret);
+
+ seq->seq_cache_size = cachesize;
+ return (0);
+}
+
+#define SEQ_SET_FLAGS (DB_SEQ_WRAP | DB_SEQ_INC | DB_SEQ_DEC)
+/*
+ * __seq_get_flags --
+ * Accessor for flags passed into DB_SEQUENCE->open call
+ *
+ */
+static int
+__seq_get_flags(seq, flagsp)
+ DB_SEQUENCE *seq;
+ u_int32_t *flagsp;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_flags");
+
+ *flagsp = F_ISSET(seq->seq_rp, SEQ_SET_FLAGS);
+ return (0);
+}
+
+/*
+ * __seq_set_flags --
+ * DB_SEQUENCE->set_flags.
+ *
+ */
+static int
+__seq_set_flags(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+ int ret;
+
+ env = seq->seq_dbp->env;
+ rp = seq->seq_rp;
+
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->set_flags");
+
+ if ((ret = __db_fchk(
+ env, "DB_SEQUENCE->set_flags", flags, SEQ_SET_FLAGS)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env,
+ "DB_SEQUENCE->set_flags", flags, DB_SEQ_DEC, DB_SEQ_INC)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_SEQ_DEC | DB_SEQ_INC))
+ F_CLR(rp, DB_SEQ_DEC | DB_SEQ_INC);
+ F_SET(rp, flags);
+
+ return (0);
+}
+
+/*
+ * __seq_initial_value --
+ * DB_SEQUENCE->initial_value.
+ *
+ */
+static int
+__seq_initial_value(seq, value)
+ DB_SEQUENCE *seq;
+ db_seq_t value;
+{
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+
+ env = seq->seq_dbp->env;
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->initial_value");
+
+ rp = seq->seq_rp;
+ if (F_ISSET(rp, DB_SEQ_RANGE_SET) &&
+ (value > rp->seq_max || value < rp->seq_min)) {
+ __db_errx(env, DB_STR("4008",
+ "Sequence value out of range"));
+ return (EINVAL);
+ }
+
+ rp->seq_value = value;
+
+ return (0);
+}
+
+/*
+ * __seq_get_range --
+ * Accessor for range passed into DB_SEQUENCE->set_range call
+ *
+ */
+static int
+__seq_get_range(seq, minp, maxp)
+ DB_SEQUENCE *seq;
+ db_seq_t *minp, *maxp;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_range");
+
+ *minp = seq->seq_rp->seq_min;
+ *maxp = seq->seq_rp->seq_max;
+ return (0);
+}
+
+/*
+ * __seq_set_range --
+ * SEQUENCE->set_range.
+ *
+ */
+static int
+__seq_set_range(seq, min, max)
+ DB_SEQUENCE *seq;
+ db_seq_t min, max;
+{
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+
+ env = seq->seq_dbp->env;
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->set_range");
+
+ rp = seq->seq_rp;
+ if (min >= max) {
+ __db_errx(env, DB_STR("4009",
+ "Minimum sequence value must be less than maximum sequence value"));
+ return (EINVAL);
+ }
+
+ rp->seq_min = min;
+ rp->seq_max = max;
+ F_SET(rp, DB_SEQ_RANGE_SET);
+
+ return (0);
+}
+
+static int
+__seq_update(seq, ip, txn, delta, flags)
+ DB_SEQUENCE *seq;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ int32_t delta;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *data, ldata;
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+ int32_t adjust;
+ int ret, txn_local, need_mutex;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ need_mutex = 0;
+ data = &seq->seq_data;
+
+ /*
+ * Create a local transaction as necessary, check for consistent
+ * transaction usage, and, if we have no transaction but do have
+ * locking on, acquire a locker id for the handle lock acquisition.
+ */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, flags)) != 0)
+ return (ret);
+ txn_local = 1;
+ } else
+ txn_local = 0;
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+ /*
+ * If we are in a global transaction avoid deadlocking on the mutex.
+ * The write lock on the data will prevent two updaters getting in
+ * at once. Fetch the data then see if things are what we thought
+ * they were.
+ */
+ if (txn_local == 0 && txn != NULL) {
+ MUTEX_UNLOCK(env, seq->mtx_seq);
+ need_mutex = 1;
+ data = &ldata;
+ data->data = NULL;
+ data->flags = DB_DBT_REALLOC;
+ }
+
+retry: if ((ret = __db_get(dbp, ip,
+ txn, &seq->seq_key, data, DB_RMW)) != 0) {
+ if (ret == DB_BUFFER_SMALL &&
+ seq->seq_data.size > sizeof(seq->seq_record)) {
+ data->flags = DB_DBT_REALLOC;
+ data->data = NULL;
+ goto retry;
+ }
+ goto err;
+ }
+
+ if (data->size < sizeof(seq->seq_record)) {
+ __db_errx(env, DB_STR("4010",
+ "Bad sequence record format"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* We have an exclusive lock on the data, see if we raced. */
+ if (need_mutex) {
+ MUTEX_LOCK(env, seq->mtx_seq);
+ need_mutex = 0;
+ rp = seq->seq_rp;
+ /*
+ * Note that caching must be off if we have global
+ * transaction so the value we fetch from the database
+ * is the correct current value.
+ */
+ if (data->size <= seq->seq_data.size) {
+ memcpy(seq->seq_data.data, data->data, data->size);
+ __os_ufree(env, data->data);
+ } else {
+ seq->seq_data.data = data->data;
+ seq->seq_data.size = data->size;
+ }
+ }
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ seq->seq_rp = seq->seq_data.data;
+ SEQ_SWAP_IN(env, seq);
+ rp = seq->seq_rp;
+
+ if (F_ISSET(rp, DB_SEQ_WRAPPED))
+ goto overflow;
+
+ adjust = delta > seq->seq_cache_size ? delta : seq->seq_cache_size;
+
+ /*
+ * Check whether this operation will cause the sequence to wrap.
+ *
+ * The sequence minimum and maximum values can be INT64_MIN and
+ * INT64_MAX, so we need to do the test carefully to cope with
+ * arithmetic overflow. The first part of the test below checks
+ * whether we will hit the end of the 64-bit range. The second part
+ * checks whether we hit the end of the sequence.
+ */
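+	/*
+	 * Worked example (illustrative): for an increasing sequence with
+	 * seq_max == INT64_MAX, seq_value == INT64_MAX - 2 and adjust == 5,
+	 * seq_value + adjust - 1 wraps (on two's-complement hardware) to a
+	 * value less than seq_value, so the first comparison catches the
+	 * overflow that would make the seq_max comparison alone unreliable.
+	 */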
+again: if (F_ISSET(rp, DB_SEQ_INC)) {
+ if (rp->seq_value + adjust - 1 < rp->seq_value ||
+ rp->seq_value + adjust - 1 > rp->seq_max) {
+ /* Don't wrap just to fill the cache. */
+ if (adjust > delta) {
+ adjust = delta;
+ goto again;
+ }
+ if (F_ISSET(rp, DB_SEQ_WRAP))
+ rp->seq_value = rp->seq_min;
+ else {
+overflow: __db_errx(env, DB_STR("4011",
+ "Sequence overflow"));
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ /* See if we are at the end of the 64 bit range. */
+ if (!F_ISSET(rp, DB_SEQ_WRAP) &&
+ rp->seq_value + adjust < rp->seq_value)
+ F_SET(rp, DB_SEQ_WRAPPED);
+ } else {
+ if ((rp->seq_value - adjust) + 1 > rp->seq_value ||
+ (rp->seq_value - adjust) + 1 < rp->seq_min) {
+ /* Don't wrap just to fill the cache. */
+ if (adjust > delta) {
+ adjust = delta;
+ goto again;
+ }
+ if (F_ISSET(rp, DB_SEQ_WRAP))
+ rp->seq_value = rp->seq_max;
+ else
+ goto overflow;
+ }
+ /* See if we are at the end of the 64 bit range. */
+ if (!F_ISSET(rp, DB_SEQ_WRAP) &&
+ rp->seq_value - adjust > rp->seq_value)
+ F_SET(rp, DB_SEQ_WRAPPED);
+ adjust = -adjust;
+ }
+
+ rp->seq_value += adjust;
+ SEQ_SWAP_OUT(env, seq);
+ ret = __db_put(dbp, ip, txn, &seq->seq_key, &seq->seq_data, 0);
+ rp->seq_value -= adjust;
+ if (ret != 0) {
+ __db_errx(env, DB_STR("4012",
+ "Sequence update failed"));
+ goto err;
+ }
+ seq->seq_last_value = rp->seq_value + adjust;
+ if (F_ISSET(rp, DB_SEQ_INC))
+ seq->seq_last_value--;
+ else
+ seq->seq_last_value++;
+
+err: if (need_mutex) {
+ if (data->data != NULL)
+ __os_ufree(env, data->data);
+ MUTEX_LOCK(env, seq->mtx_seq);
+ }
+ return (txn_local ? __db_txn_auto_resolve(
+ env, txn, LF_ISSET(DB_TXN_NOSYNC), ret) : ret);
+}
+
+static int
+__seq_get(seq, txn, delta, retp, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ int32_t delta;
+ db_seq_t *retp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_SEQ_RECORD *rp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ rp = seq->seq_rp;
+ ret = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get");
+
+ if (delta < 0 || (delta == 0 && !LF_ISSET(DB_CURRENT))) {
+ __db_errx(env, "Sequence delta must be greater than 0");
+ return (EINVAL);
+ }
+
+ if (seq->seq_cache_size != 0 && txn != NULL) {
+ __db_errx(env,
+ "Sequence with non-zero cache may not specify transaction handle");
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0)
+ return (ret);
+
+ MUTEX_LOCK(env, seq->mtx_seq);
+
+ if (handle_check && IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ ret = __db_rdonly(env, "DB_SEQUENCE->get");
+ goto err;
+ }
+
+ if (rp->seq_min + delta > rp->seq_max) {
+ __db_errx(env, DB_STR("4013", "Sequence overflow"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_CURRENT)) {
+ *retp = seq->seq_prev_value;
+ } else if (F_ISSET(rp, DB_SEQ_INC)) {
+ if (seq->seq_last_value + 1 - rp->seq_value < delta &&
+ (ret = __seq_update(seq, ip, txn, delta, flags)) != 0)
+ goto err;
+
+ rp = seq->seq_rp;
+ *retp = rp->seq_value;
+ seq->seq_prev_value = rp->seq_value;
+ rp->seq_value += delta;
+ } else {
+ if ((rp->seq_value - seq->seq_last_value) + 1 < delta &&
+ (ret = __seq_update(seq, ip, txn, delta, flags)) != 0)
+ goto err;
+
+ rp = seq->seq_rp;
+ *retp = rp->seq_value;
+ seq->seq_prev_value = rp->seq_value;
+ rp->seq_value -= delta;
+ }
+
+err: MUTEX_UNLOCK(env, seq->mtx_seq);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __seq_get_db --
+ * Accessor for dbp passed into db_sequence_create call
+ *
+ */
+static int
+__seq_get_db(seq, dbpp)
+ DB_SEQUENCE *seq;
+ DB **dbpp;
+{
+ *dbpp = seq->seq_dbp;
+ return (0);
+}
+
+/*
+ * __seq_get_key --
+ * Accessor for key passed into DB_SEQUENCE->open call
+ *
+ */
+static int
+__seq_get_key(seq, key)
+ DB_SEQUENCE *seq;
+ DBT *key;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_key");
+
+ if (F_ISSET(key, DB_DBT_USERCOPY))
+ return (__db_retcopy(seq->seq_dbp->env, key,
+ seq->seq_key.data, seq->seq_key.size, NULL, 0));
+
+ key->data = seq->seq_key.data;
+ key->size = key->ulen = seq->seq_key.size;
+ key->flags = seq->seq_key.flags;
+ return (0);
+}
+
+/*
+ * __seq_close_pp --
+ * Close a sequence pre/post processing
+ *
+ */
+static int
+__seq_close_pp(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ENV_ENTER(seq->seq_dbp->env, ip);
+ ret = __seq_close(seq, flags);
+ ENV_LEAVE(seq->seq_dbp->env, ip);
+
+ return (ret);
+}
+
+/*
+ * __seq_close --
+ * Close a sequence
+ *
+ */
+static int
+__seq_close(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret, t_ret;
+
+ ret = 0;
+ env = seq->seq_dbp->env;
+
+ if (flags != 0)
+ ret = __db_ferr(env, "DB_SEQUENCE->close", 0);
+
+ if ((t_ret = __mutex_free(env, &seq->mtx_seq)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (seq->seq_key.data != NULL)
+ __os_free(env, seq->seq_key.data);
+ if (seq->seq_data.data != NULL &&
+ seq->seq_data.data != &seq->seq_record)
+ __os_ufree(env, seq->seq_data.data);
+ seq->seq_key.data = NULL;
+
+ memset(seq, CLEAR_BYTE, sizeof(*seq));
+ __os_free(env, seq);
+
+ return (ret);
+}
+
+/*
+ * __seq_remove --
+ * Remove a sequence from the database.
+ */
+static int
+__seq_remove(seq, txn, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ txn_local = 0;
+
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->remove");
+
+ /*
+ * Flags can only be 0, unless the database has DB_AUTO_COMMIT enabled.
+ * Then DB_TXN_NOSYNC is allowed.
+ */
+ if (flags != 0 &&
+ (flags != DB_TXN_NOSYNC || !IS_DB_AUTO_COMMIT(dbp, txn)))
+ return (__db_ferr(env, "DB_SEQUENCE->remove illegal flag", 0));
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Create a local transaction as necessary, check for consistent
+ * transaction usage, and, if we have no transaction but do have
+ * locking on, acquire a locker id for the handle lock acquisition.
+ */
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		/*
+		 * Use "goto err" rather than returning directly, so that
+		 * ENV_LEAVE and the replication-block release still run.
+		 */
+		if ((ret = __txn_begin(env, ip, NULL, &txn, flags)) != 0)
+			goto err;
+		txn_local = 1;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	ret = __db_del(dbp, ip, txn, &seq->seq_key, 0);
+
+	if ((t_ret = __seq_close(seq, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __seq_chk_cachesize --
+ * Validate the cache size vs. the range.
+ */
+static int
+__seq_chk_cachesize(env, cachesize, max, min)
+ ENV *env;
+ int32_t cachesize;
+ db_seq_t max, min;
+{
+ /*
+ * It's an error to specify caches larger than the sequence range.
+ *
+	 * The min and max of the range can be either positive or negative,
+	 * but the difference will fit in an unsigned variable of the same
+	 * type.
+ * Assume a 2's complement machine, and simply subtract.
+ */
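+	/*
+	 * Worked example (illustrative): with min == -10 and max == 10,
+	 * (u_int64_t)max - (u_int64_t)min == 20, the width of the range,
+	 * even though the subtraction mixes signs.
+	 */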
+ if ((u_int32_t)cachesize > (u_int64_t)max - (u_int64_t)min) {
+ __db_errx(env, DB_STR("4014",
+ "Number of items to be cached is larger than the sequence range"));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+#else /* !HAVE_64BIT_TYPES */
+
+int
+db_sequence_create(seqp, dbp, flags)
+ DB_SEQUENCE **seqp;
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(seqp, NULL);
+ COMPQUIET(flags, 0);
+ __db_errx(dbp->env, DB_STR("4015",
+ "library build did not include support for sequences"));
+ return (DB_OPNOTSUP);
+}
+#endif /* HAVE_64BIT_TYPES */
diff --git a/src/txn/txn.c b/src/txn/txn.c
new file mode 100644
index 00000000..81225e5c
--- /dev/null
+++ b/src/txn/txn.c
@@ -0,0 +1,2169 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#define LOG_FLAGS(txn) \
+ (DB_LOG_COMMIT | (F_ISSET(txn, TXN_SYNC) ? \
+ DB_FLUSH : (F_ISSET(txn, TXN_WRITE_NOSYNC) ? \
+ DB_LOG_WRNOSYNC : 0)))
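+/*
+ * Illustratively: a TXN_SYNC transaction commits with
+ * DB_LOG_COMMIT | DB_FLUSH, a TXN_WRITE_NOSYNC transaction with
+ * DB_LOG_COMMIT | DB_LOG_WRNOSYNC, and a TXN_NOSYNC transaction with
+ * DB_LOG_COMMIT alone.
+ */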
+
+/*
+ * __txn_isvalid enumerated types. We cannot simply use the transaction
+ * statuses, because different statuses need to be handled differently
+ * depending on the caller.
+ */
+typedef enum {
+ TXN_OP_ABORT,
+ TXN_OP_COMMIT,
+ TXN_OP_DISCARD,
+ TXN_OP_PREPARE
+} txnop_t;
+
+static int __txn_abort_pp __P((DB_TXN *));
+static int __txn_applied __P((ENV *,
+ DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+static void __txn_build_token __P((DB_TXN *, DB_LSN *));
+static int __txn_begin_int __P((DB_TXN *));
+static int __txn_close_cursors __P((DB_TXN *));
+static int __txn_commit_pp __P((DB_TXN *, u_int32_t));
+static int __txn_discard __P((DB_TXN *, u_int32_t));
+static int __txn_dispatch_undo
+ __P((ENV *, DB_TXN *, DBT *, DB_LSN *, DB_TXNHEAD *));
+static int __txn_end __P((DB_TXN *, int));
+static int __txn_isvalid __P((const DB_TXN *, txnop_t));
+static int __txn_undo __P((DB_TXN *));
+static int __txn_set_commit_token __P((DB_TXN *txn, DB_TXN_TOKEN *));
+static void __txn_set_txn_lsnp __P((DB_TXN *, DB_LSN **, DB_LSN **));
+
+#define TxnAlloc "Unable to allocate a transaction handle"
+
+/*
+ * __txn_begin_pp --
+ * ENV->txn_begin pre/post processing.
+ *
+ * PUBLIC: int __txn_begin_pp __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t));
+ */
+int
+__txn_begin_pp(dbenv, parent, txnpp, flags)
+ DB_ENV *dbenv;
+ DB_TXN *parent, **txnpp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_begin", DB_INIT_TXN);
+
+ if ((ret = __db_fchk(env,
+ "txn_begin", flags,
+	    DB_IGNORE_LEASE | DB_READ_COMMITTED | DB_READ_UNCOMMITTED |
+ DB_TXN_FAMILY | DB_TXN_NOSYNC | DB_TXN_SNAPSHOT | DB_TXN_SYNC |
+ DB_TXN_WAIT | DB_TXN_WRITE_NOSYNC | DB_TXN_NOWAIT |
+ DB_TXN_BULK)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "txn_begin", flags,
+ DB_TXN_WRITE_NOSYNC | DB_TXN_NOSYNC, DB_TXN_SYNC)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "txn_begin",
+ flags, DB_TXN_WRITE_NOSYNC, DB_TXN_NOSYNC)) != 0)
+ return (ret);
+ if (parent != NULL && LF_ISSET(DB_TXN_FAMILY)) {
+ __db_errx(env, DB_STR("4521",
+ "Family transactions cannot have parents"));
+ return (EINVAL);
+ } else if (IS_REAL_TXN(parent) &&
+ !F_ISSET(parent, TXN_SNAPSHOT) && LF_ISSET(DB_TXN_SNAPSHOT)) {
+ __db_errx(env, DB_STR("4522",
+ "Child transaction snapshot setting must match parent"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Replication accounts for top-level transactions. */
+ rep_check = IS_ENV_REPLICATED(env) &&
+ !IS_REAL_TXN(parent) && !LF_ISSET(DB_TXN_FAMILY);
+
+ if (rep_check && (ret = __op_rep_enter(env, 0, 1)) != 0)
+ goto err;
+
+ ret = __txn_begin(env, ip, parent, txnpp, flags);
+
+ /*
+ * We only decrement the count if the operation fails.
+ * Otherwise the count will be decremented when the
+ * txn is resolved by txn_commit, txn_abort, etc.
+ */
+ if (ret != 0 && rep_check)
+ (void)__op_rep_exit(env);
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
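+/*
+ * Illustrative use through the public DB_ENV handle (a sketch; error
+ * handling omitted):
+ *
+ *	DB_TXN *txn;
+ *
+ *	(void)dbenv->txn_begin(dbenv, NULL, &txn, DB_TXN_NOSYNC);
+ *	... transactional operations ...
+ *	(void)txn->commit(txn, 0);
+ */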
+
+/*
+ * __txn_begin --
+ * ENV->txn_begin.
+ *
+ * This is a wrapper to the actual begin process. We allocate a DB_TXN
+ * structure for the caller and then call into __txn_begin_int code.
+ *
+ * Internally, we use TXN_DETAIL structures, but the DB_TXN structure
+ * provides access to the transaction ID and the offset in the transaction
+ * region of the TXN_DETAIL structure.
+ *
+ * PUBLIC: int __txn_begin __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_TXN **, u_int32_t));
+ */
+int
+__txn_begin(env, ip, parent, txnpp, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *parent, **txnpp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_LOCKREGION *region;
+ DB_TXN *txn;
+ TXN_DETAIL *ptd, *td;
+ int ret;
+
+ if (F_ISSET(env, ENV_FORCE_TXN_BULK))
+ flags |= DB_TXN_BULK;
+
+ *txnpp = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) {
+ __db_errx(env, TxnAlloc);
+ return (ret);
+ }
+
+ dbenv = env->dbenv;
+ txn->mgrp = env->tx_handle;
+ txn->parent = parent;
+ if (parent != NULL && F_ISSET(parent, TXN_FAMILY))
+ parent = NULL;
+ TAILQ_INIT(&txn->kids);
+ TAILQ_INIT(&txn->events);
+ STAILQ_INIT(&txn->logs);
+ TAILQ_INIT(&txn->my_cursors);
+ TAILQ_INIT(&txn->femfs);
+ txn->flags = TXN_MALLOC;
+ txn->thread_info =
+ ip != NULL ? ip : (parent != NULL ? parent->thread_info : NULL);
+
+ /*
+ * Set the sync mode for commit. Any local bits override those
+ * in the environment. SYNC is the default.
+ */
+ if (LF_ISSET(DB_TXN_SYNC))
+ F_SET(txn, TXN_SYNC);
+ else if (LF_ISSET(DB_TXN_NOSYNC))
+ F_SET(txn, TXN_NOSYNC);
+ else if (LF_ISSET(DB_TXN_WRITE_NOSYNC))
+ F_SET(txn, TXN_WRITE_NOSYNC);
+ else if (F_ISSET(dbenv, DB_ENV_TXN_NOSYNC))
+ F_SET(txn, TXN_NOSYNC);
+ else if (F_ISSET(dbenv, DB_ENV_TXN_WRITE_NOSYNC))
+ F_SET(txn, TXN_WRITE_NOSYNC);
+ else
+ F_SET(txn, TXN_SYNC);
+
+ if (LF_ISSET(DB_TXN_NOWAIT) ||
+ (F_ISSET(dbenv, DB_ENV_TXN_NOWAIT) && !LF_ISSET(DB_TXN_WAIT)))
+ F_SET(txn, TXN_NOWAIT);
+ if (LF_ISSET(DB_READ_COMMITTED))
+ F_SET(txn, TXN_READ_COMMITTED);
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ F_SET(txn, TXN_READ_UNCOMMITTED);
+ if (LF_ISSET(DB_TXN_FAMILY))
+ F_SET(txn, TXN_FAMILY | TXN_INFAMILY | TXN_READONLY);
+ if (LF_ISSET(DB_TXN_SNAPSHOT) || F_ISSET(dbenv, DB_ENV_TXN_SNAPSHOT) ||
+ (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT)))
+ F_SET(txn, TXN_SNAPSHOT);
+ if (LF_ISSET(DB_IGNORE_LEASE))
+ F_SET(txn, TXN_IGNORE_LEASE);
+
+ /*
+ * We set TXN_BULK only for the outermost transaction. This
+ * is a temporary limitation; in the future we will allow it
+ * for nested transactions as well. See #17669 for details.
+ *
+ * Also, ignore requests for DB_TXN_BULK if replication is enabled.
+ */
+ if (LF_ISSET(DB_TXN_BULK) && parent == NULL && !REP_ON(txn->mgrp->env))
+ F_SET(txn, TXN_BULK);
+
+ if ((ret = __txn_begin_int(txn)) != 0)
+ goto err;
+ td = txn->td;
+
+ if (parent != NULL) {
+ ptd = parent->td;
+ TAILQ_INSERT_HEAD(&parent->kids, txn, klinks);
+ SH_TAILQ_INSERT_HEAD(&ptd->kids, td, klinks, __txn_detail);
+ }
+
+ if (LOCKING_ON(env)) {
+ region = env->lk_handle->reginfo.primary;
+ if (parent != NULL) {
+ ret = __lock_inherit_timeout(env,
+ parent->locker, txn->locker);
+ /* No parent locker set yet. */
+ if (ret == EINVAL) {
+ parent = NULL;
+ ret = 0;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Parent is NULL if we have no parent
+ * or it has no timeouts set.
+ */
+ if (parent == NULL && region->tx_timeout != 0)
+ if ((ret = __lock_set_timeout(env, txn->locker,
+ region->tx_timeout, DB_SET_TXN_TIMEOUT)) != 0)
+ goto err;
+ }
+
+ *txnpp = txn;
+ PERFMON2(env, txn, begin, txn->txnid, flags);
+ return (0);
+
+err:
+ __os_free(env, txn);
+ return (ret);
+}
+
+/*
+ * __txn_recycle_id --
+ *	Find a range of usable transaction ids.
+ *
+ * PUBLIC: int __txn_recycle_id __P((ENV *, int));
+ */
+int
+__txn_recycle_id(env, locked)
+ ENV *env;
+ int locked;
+{
+ DB_LSN null_lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ u_int32_t *ids;
+ int nids, ret;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ if ((ret = __os_malloc(env,
+ sizeof(u_int32_t) * region->curtxns, &ids)) != 0) {
+ __db_errx(env, DB_STR("4523",
+ "Unable to allocate transaction recycle buffer"));
+ return (ret);
+ }
+ nids = 0;
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
+ ids[nids++] = td->txnid;
+ region->last_txnid = TXN_MINIMUM - 1;
+ region->cur_maxid = TXN_MAXIMUM;
+ if (nids != 0)
+ __db_idspace(ids, nids,
+ &region->last_txnid, &region->cur_maxid);
+ __os_free(env, ids);
+
+ /*
+ * Check LOGGING_ON rather than DBENV_LOGGING as we want to emit this
+ * record at the end of recovery.
+ */
+ if (LOGGING_ON(env)) {
+ if (locked)
+ TXN_SYSTEM_UNLOCK(env);
+ ret = __txn_recycle_log(env, NULL, &null_lsn,
+ 0, region->last_txnid + 1, region->cur_maxid);
+		/*
+		 * Keep it simple for the caller: if the log write failed,
+		 * re-acquire the lock so we return with it still held.
+		 */
+ if (locked && ret != 0)
+ TXN_SYSTEM_LOCK(env);
+ }
+
+ return (ret);
+}
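+
+/*
+ * Worked example, assumed values for illustration only: suppose the only
+ * active transactions hold ids 0x90000001 and 0x90000002.  The call to
+ * __db_idspace above then resets last_txnid/cur_maxid to bound the
+ * largest run of unused ids, e.g. last_txnid = 0x90000002 and
+ * cur_maxid = 0x90000000 (wrapping through TXN_MAXIMUM), so ids handed
+ * out after the recycle cannot collide with ids that are still in use.
+ */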
+
+/*
+ * __txn_begin_int --
+ * Normal DB version of txn_begin.
+ */
+static int
+__txn_begin_int(txn)
+ DB_TXN *txn;
+{
+ DB_ENV *dbenv;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ ENV *env;
+ TXN_DETAIL *td;
+ u_int32_t id;
+ int inserted, ret;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ dbenv = env->dbenv;
+ region = mgr->reginfo.primary;
+ td = NULL;
+ inserted = 0;
+
+ TXN_SYSTEM_LOCK(env);
+ if (!F_ISSET(txn, TXN_COMPENSATE) && F_ISSET(region, TXN_IN_RECOVERY)) {
+ __db_errx(env, DB_STR("4524",
+ "operation not permitted during recovery"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Allocate a new transaction id. Our current valid range can span
+ * the maximum valid value, so check for it and wrap manually.
+ */
+ if (region->last_txnid == TXN_MAXIMUM &&
+ region->cur_maxid != TXN_MAXIMUM)
+ region->last_txnid = TXN_MINIMUM - 1;
+
+ /* Allocate a new transaction detail structure. */
+ if ((ret =
+ __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) {
+ __db_errx(env, DB_STR("4525",
+ "Unable to allocate memory for transaction detail"));
+ goto err;
+ }
+
+ id = ++region->last_txnid;
+
+#ifdef HAVE_STATISTICS
+ STAT_INC(env, txn, nbegins, region->stat.st_nbegins, id);
+ STAT_INC(env, txn, nactive, region->stat.st_nactive, id);
+ if (region->stat.st_nactive > region->stat.st_maxnactive)
+ STAT_SET(env, txn, maxnactive,
+ region->stat.st_maxnactive, region->stat.st_nactive, id);
+#endif
+
+ td->txnid = id;
+ dbenv->thread_id(dbenv, &td->pid, &td->tid);
+
+ ZERO_LSN(td->last_lsn);
+ ZERO_LSN(td->begin_lsn);
+ SH_TAILQ_INIT(&td->kids);
+ if (txn->parent != NULL && !F_ISSET(txn->parent, TXN_FAMILY))
+ td->parent = R_OFFSET(&mgr->reginfo, txn->parent->td);
+ else
+ td->parent = INVALID_ROFF;
+ td->name = INVALID_ROFF;
+ MAX_LSN(td->read_lsn);
+ MAX_LSN(td->visible_lsn);
+ td->mvcc_ref = 0;
+ td->mvcc_mtx = MUTEX_INVALID;
+ td->status = TXN_RUNNING;
+ td->flags = F_ISSET(txn, TXN_NOWAIT) ? TXN_DTL_NOWAIT : 0;
+ td->nlog_dbs = 0;
+ td->nlog_slots = TXN_NSLOTS;
+ td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots);
+
+ /* XA specific fields. */
+ td->xa_ref = 1;
+ td->xa_br_status = TXN_XA_IDLE;
+
+ /* Place transaction on active transaction list. */
+ SH_TAILQ_INSERT_HEAD(&region->active_txn, td, links, __txn_detail);
+ region->curtxns++;
+
+ /* Increment bulk transaction counter while holding transaction lock. */
+ if (F_ISSET(txn, TXN_BULK))
+ ((DB_TXNREGION *)env->tx_handle->reginfo.primary)->n_bulk_txn++;
+
+ inserted = 1;
+
+ if (region->last_txnid == region->cur_maxid) {
+ if ((ret = __txn_recycle_id(env, 1)) != 0)
+ goto err;
+ } else
+ TXN_SYSTEM_UNLOCK(env);
+
+ txn->txnid = id;
+ txn->td = td;
+
+ /* Allocate a locker for this txn. */
+ if (LOCKING_ON(env) && (ret =
+ __lock_getlocker(env->lk_handle, id, 1, &txn->locker)) != 0)
+ goto err;
+
+ txn->abort = __txn_abort_pp;
+ txn->commit = __txn_commit_pp;
+ txn->discard = __txn_discard;
+ txn->get_name = __txn_get_name;
+ txn->get_priority = __txn_get_priority;
+ txn->id = __txn_id;
+ txn->prepare = __txn_prepare;
+ txn->set_commit_token = __txn_set_commit_token;
+ txn->set_txn_lsnp = __txn_set_txn_lsnp;
+ txn->set_name = __txn_set_name;
+ txn->set_priority = __txn_set_priority;
+ txn->set_timeout = __txn_set_timeout;
+
+	/* We can't call __txn_set_priority until txn->td is set. */
+	if (LOCKING_ON(env)) {
+		if ((ret = __txn_set_priority(txn,
+		    txn->parent == NULL ?
+		    TXN_PRIORITY_DEFAULT : txn->parent->locker->priority)) != 0)
+			goto err;
+	} else
+		td->priority = 0;
+
+ /*
+ * If this is a transaction family, we must link the child to the
+ * maximal grandparent in the lock table for deadlock detection.
+ */
+ if (txn->parent != NULL) {
+ if (LOCKING_ON(env) && (ret = __lock_addfamilylocker(env,
+ txn->parent->txnid, txn->txnid,
+ F_ISSET(txn->parent, TXN_FAMILY))) != 0)
+ goto err;
+
+ /*
+		 * If the parent is only used to establish compatibility, do
+ * not reference it again.
+ */
+ if (F_ISSET(txn->parent, TXN_FAMILY)) {
+ txn->parent = NULL;
+ F_SET(txn, TXN_INFAMILY);
+ }
+ }
+
+ if (F_ISSET(txn, TXN_MALLOC)) {
+ MUTEX_LOCK(env, mgr->mutex);
+ TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
+ MUTEX_UNLOCK(env, mgr->mutex);
+ }
+
+ return (0);
+
+err: if (inserted) {
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_REMOVE(&region->active_txn, td, links, __txn_detail);
+ region->curtxns--;
+ if (F_ISSET(txn, TXN_BULK))
+ ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->n_bulk_txn--;
+ }
+ if (td != NULL)
+ __env_alloc_free(&mgr->reginfo, td);
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+}
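+
+/*
+ * Illustrative sketch, not part of this file: nested transactions as set
+ * up by __txn_begin/__txn_begin_int.  A child is created by passing the
+ * parent handle, and an unresolved child is resolved by the parent's
+ * commit or abort:
+ *
+ *	DB_TXN *parent, *child;
+ *
+ *	if ((ret = dbenv->txn_begin(dbenv, NULL, &parent, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbenv->txn_begin(dbenv, parent, &child, 0)) != 0)
+ *		return (ret);
+ *	(child's updates here)
+ *	ret = parent->commit(parent, 0);	(resolves the child too)
+ */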
+
+/*
+ * __txn_continue
+ * Fill in the fields of the local transaction structure given
+ * the detail transaction structure. Optionally link transactions
+ * to transaction manager list.
+ *
+ * PUBLIC: int __txn_continue __P((ENV *,
+ * PUBLIC: DB_TXN *, TXN_DETAIL *, DB_THREAD_INFO *, int));
+ */
+int
+__txn_continue(env, txn, td, ip, add_to_list)
+ ENV *env;
+ DB_TXN *txn;
+ TXN_DETAIL *td;
+ DB_THREAD_INFO *ip;
+ int add_to_list;
+{
+ DB_LOCKREGION *region;
+ DB_TXNMGR *mgr;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * This code follows the order of the structure definition so it
+ * is relatively easy to make sure that we are setting everything.
+ */
+ mgr = txn->mgrp = env->tx_handle;
+ txn->parent = NULL;
+ txn->thread_info = ip;
+ txn->txnid = td->txnid;
+ txn->name = NULL;
+ txn->td = td;
+ td->xa_ref++;
+
+ /* This never seems to be used: txn->expire */
+ txn->txn_list = NULL;
+
+ TAILQ_INIT(&txn->kids);
+ TAILQ_INIT(&txn->events);
+ STAILQ_INIT(&txn->logs);
+
+ /*
+ * These fields should never persist across different processes as we
+ * require that cursors be opened/closed within the same service routine
+ * and we disallow file level operations in XA transactions.
+ */
+ TAILQ_INIT(&txn->my_cursors);
+ TAILQ_INIT(&txn->femfs);
+
+ /* Put the transaction onto the transaction manager's list. */
+ if (add_to_list) {
+ MUTEX_LOCK(env, mgr->mutex);
+ TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
+ MUTEX_UNLOCK(env, mgr->mutex);
+ }
+
+ txn->token_buffer = 0;
+ txn->cursors = 0;
+
+ txn->abort = __txn_abort_pp;
+ txn->commit = __txn_commit_pp;
+ txn->discard = __txn_discard;
+ txn->get_name = __txn_get_name;
+ txn->get_priority = __txn_get_priority;
+ txn->id = __txn_id;
+ txn->prepare = __txn_prepare;
+ txn->set_commit_token = __txn_set_commit_token;
+ txn->set_name = __txn_set_name;
+ txn->set_priority = __txn_set_priority;
+ txn->set_timeout = __txn_set_timeout;
+ txn->set_txn_lsnp = __txn_set_txn_lsnp;
+
+ /* XXX Do we need to explicitly set a SYNC flag here? */
+ txn->flags = TXN_MALLOC |
+ (F_ISSET(td, TXN_DTL_NOWAIT) ? TXN_NOWAIT : 0);
+ txn->xa_thr_status = TXN_XA_THREAD_NOTA;
+
+ /*
+ * If this is a restored transaction, we need to propagate that fact
+ * to the process-local structure. However, if it's not a restored
+ * transaction, we need to make sure that we have a locker associated
+ * with this transaction.
+ */
+ if (F_ISSET(td, TXN_DTL_RESTORED))
+ F_SET(txn, TXN_RESTORED);
+ else
+ if ((ret = __lock_getlocker(env->lk_handle,
+ txn->txnid, 0, &txn->locker)) == 0)
+ ret = __txn_set_priority(txn, td->priority);
+
+ if (LOCKING_ON(env)) {
+ region = env->lk_handle->reginfo.primary;
+ if (region->tx_timeout != 0 &&
+ (ret = __lock_set_timeout(env, txn->locker,
+ region->tx_timeout, DB_SET_TXN_TIMEOUT)) != 0)
+ return (ret);
+ txn->lock_timeout = region->tx_timeout;
+ }
+
+ return (ret);
+}
+
+/*
+ * __txn_commit_pp --
+ * Interface routine to TXN->commit.
+ */
+static int
+__txn_commit_pp(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+
+ env = txn->mgrp->env;
+ rep_check = IS_ENV_REPLICATED(env) &&
+ txn->parent == NULL && IS_REAL_TXN(txn);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_commit(txn, flags);
+ if (rep_check && (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_commit --
+ * Commit a transaction.
+ *
+ * PUBLIC: int __txn_commit __P((DB_TXN *, u_int32_t));
+ */
+int
+__txn_commit(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBT list_dbt;
+ DB_LOCKREQ request;
+ DB_TXN *kid;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ TXN_DETAIL *td;
+ DB_LSN token_lsn;
+ u_int32_t id;
+ int ret, t_ret;
+
+ env = txn->mgrp->env;
+ td = txn->td;
+ PERFMON2(env, txn, commit, txn->txnid, flags);
+
+ DB_ASSERT(env, txn->xa_thr_status == TXN_XA_THREAD_NOTA ||
+ td->xa_ref == 1);
+ /*
+ * A common mistake in Berkeley DB programs is to mis-handle deadlock
+ * return. If the transaction deadlocked, they want abort, not commit.
+ */
+ if (F_ISSET(txn, TXN_DEADLOCK)) {
+ ret = __db_txn_deadlock_err(env, txn);
+ goto err;
+ }
+
+ /* Close registered cursors before committing. */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ goto err;
+
+ if ((ret = __txn_isvalid(txn, TXN_OP_COMMIT)) != 0)
+ return (ret);
+
+ /*
+ * Check for master leases at the beginning. If we are a master and
+ * cannot have valid leases now, we error and abort this txn. There
+ * should always be a perm record in the log because the master updates
+ * the LSN history system database in rep_start() (with IGNORE_LEASE
+ * set).
+ *
+ * Only check leases if this txn writes to the log file
+ * (i.e. td->last_lsn).
+ */
+ if (txn->parent == NULL && IS_REP_MASTER(env) &&
+ IS_USING_LEASES(env) && !F_ISSET(txn, TXN_IGNORE_LEASE) &&
+ !IS_ZERO_LSN(td->last_lsn) &&
+ (ret = __rep_lease_check(env, 1)) != 0) {
+ DB_ASSERT(env, ret != DB_NOTFOUND);
+ goto err;
+ }
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * No mutex is needed as envid is read-only once it is set.
+ */
+ id = renv->envid;
+
+ /*
+ * We clear flags that are incorrect, ignoring any flag errors, and
+ * default to synchronous operations. By definition, transaction
+ * handles are dead when we return, and this error should never
+	 * happen, but we don't want to fail in the field because the app is
+ * specifying the wrong flag for some reason.
+ */
+ if (__db_fchk(env, "DB_TXN->commit", flags,
+ DB_TXN_NOSYNC | DB_TXN_SYNC | DB_TXN_WRITE_NOSYNC) != 0)
+ flags = DB_TXN_SYNC;
+ if (__db_fcchk(env, "DB_TXN->commit", flags,
+ DB_TXN_SYNC, DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC) != 0)
+ flags = DB_TXN_SYNC;
+
+ if (LF_ISSET(DB_TXN_WRITE_NOSYNC)) {
+ F_CLR(txn, TXN_SYNC_FLAGS);
+ F_SET(txn, TXN_WRITE_NOSYNC);
+ }
+ if (LF_ISSET(DB_TXN_NOSYNC)) {
+ F_CLR(txn, TXN_SYNC_FLAGS);
+ F_SET(txn, TXN_NOSYNC);
+ }
+ if (LF_ISSET(DB_TXN_SYNC)) {
+ F_CLR(txn, TXN_SYNC_FLAGS);
+ F_SET(txn, TXN_SYNC);
+ }
+
+ DB_ASSERT(env, F_ISSET(txn, TXN_SYNC_FLAGS));
+
+ /*
+ * Commit any unresolved children. If anyone fails to commit,
+ * then try to abort the rest of the kids and then abort the parent.
+ * Abort should never fail; if it does, we bail out immediately.
+ */
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((ret = __txn_commit(kid, flags)) != 0)
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((t_ret = __txn_abort(kid)) != 0)
+ return (__env_panic(env, t_ret));
+
+ /*
+ * If there are any log records, write a log record and sync the log,
+ * else do no log writes. If the commit is for a child transaction,
+ * we do not need to commit the child synchronously since it may still
+ * abort (if its parent aborts), and otherwise its parent or ultimate
+ * ancestor will write synchronously.
+ */
+ ZERO_LSN(token_lsn);
+ if (DBENV_LOGGING(env) && (!IS_ZERO_LSN(td->last_lsn) ||
+ STAILQ_FIRST(&txn->logs) != NULL)) {
+ if (txn->parent == NULL) {
+ /*
+ * We are about to free all the read locks for this
+ * transaction below. Some of those locks might be
+ * handle locks which should not be freed, because
+ * they will be freed when the handle is closed. Check
+ * the events and preprocess any trades now so we don't
+ * release the locks below.
+ */
+ if ((ret =
+ __txn_doevents(env, txn, TXN_COMMIT, 1)) != 0)
+ goto err;
+
+ memset(&request, 0, sizeof(request));
+ if (LOCKING_ON(env)) {
+ request.op = DB_LOCK_PUT_READ;
+ if (IS_REP_MASTER(env) &&
+ !IS_ZERO_LSN(td->last_lsn)) {
+ memset(&list_dbt, 0, sizeof(list_dbt));
+ request.obj = &list_dbt;
+ }
+ ret = __lock_vec(env,
+ txn->locker, 0, &request, 1, NULL);
+ }
+
+ if (ret == 0 && !IS_ZERO_LSN(td->last_lsn)) {
+ ret = __txn_flush_fe_files(txn);
+ if (ret == 0)
+ ret = __txn_regop_log(env, txn,
+ &td->visible_lsn, LOG_FLAGS(txn),
+ TXN_COMMIT,
+ (int32_t)time(NULL), id,
+ request.obj);
+ if (ret == 0)
+ token_lsn = td->last_lsn =
+ td->visible_lsn;
+#ifdef DIAGNOSTIC
+ if (ret == 0) {
+ DB_LSN s_lsn;
+
+ DB_ASSERT(env, __log_current_lsn_int(
+ env, &s_lsn, NULL, NULL) == 0);
+ DB_ASSERT(env, LOG_COMPARE(
+ &td->visible_lsn, &s_lsn) <= 0);
+ COMPQUIET(s_lsn.file, 0);
+ }
+#endif
+ }
+
+ if (request.obj != NULL && request.obj->data != NULL)
+ __os_free(env, request.obj->data);
+ if (ret != 0)
+ goto err;
+ } else {
+ /* Log the commit in the parent! */
+ if (!IS_ZERO_LSN(td->last_lsn) &&
+ (ret = __txn_child_log(env, txn->parent,
+ &((TXN_DETAIL *)txn->parent->td)->last_lsn,
+ 0, txn->txnid, &td->last_lsn)) != 0) {
+ goto err;
+ }
+ if (STAILQ_FIRST(&txn->logs) != NULL) {
+ /*
+ * Put the child first so we back it out first.
+ * All records are undone in reverse order.
+ */
+ STAILQ_CONCAT(&txn->logs, &txn->parent->logs);
+ txn->parent->logs = txn->logs;
+ STAILQ_INIT(&txn->logs);
+ }
+
+ F_SET(txn->parent, TXN_CHILDCOMMIT);
+ }
+ }
+ if (txn->token_buffer != NULL && ret == 0 && DBENV_LOGGING(env))
+ __txn_build_token(txn, &token_lsn);
+
+ if (txn->txn_list != NULL) {
+ __db_txnlist_end(env, txn->txn_list);
+ txn->txn_list = NULL;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Check for master leases at the end of only a normal commit.
+ * If we're a child, that is not a perm record. If we are a
+ * master and cannot get valid leases now, something happened
+ * during the commit. The only thing to do is panic.
+ *
+ * Only check leases if this txn writes to the log file
+ * (i.e. td->last_lsn).
+ */
+ if (txn->parent == NULL && IS_REP_MASTER(env) &&
+ IS_USING_LEASES(env) && !F_ISSET(txn, TXN_IGNORE_LEASE) &&
+ !IS_ZERO_LSN(td->last_lsn) &&
+ (ret = __rep_lease_check(env, 1)) != 0)
+ return (__env_panic(env, ret));
+
+ /*
+ * This is here rather than in __txn_end because __txn_end is
+ * called too late during abort. So commit and abort each
+ * call it independently.
+ */
+ __txn_reset_fe_watermarks(txn);
+
+ /* This is OK because __txn_end can only fail with a panic. */
+ return (__txn_end(txn, 1));
+
+err: /*
+ * If we are prepared, then we "must" be able to commit. We panic here
+ * because even though the coordinator might be able to retry it is not
+ * clear it would know to do that. Otherwise we'll try to abort. If
+ * that is successful, then we return whatever was in ret (that is, the
+ * reason we failed). If the abort was unsuccessful, abort probably
+ * returned DB_RUNRECOVERY and we need to propagate that up.
+ */
+ if (td->status == TXN_PREPARED)
+ return (__env_panic(env, ret));
+
+ if ((t_ret = __txn_abort(txn)) != 0)
+ ret = t_ret;
+ return (ret);
+}
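+
+/*
+ * For reference: as the child branch above shows, a committing child
+ * writes no commit record of its own.  Its commit is logged into the
+ * parent's trail as a __txn_child record (see txn.src), and only the
+ * outermost commit writes and flushes a __txn_regop record, e.g.:
+ *
+ *	(child's log records) ... __txn_child(child, c_lsn)
+ *	(parent's log records) ... __txn_regop(TXN_COMMIT), flushed
+ */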
+
+/*
+ * __txn_close_cursors --
+ *	Close a transaction's registered cursors; all of its cursors are
+ *	guaranteed to be closed when this returns.
+ */
+static int
+__txn_close_cursors(txn)
+ DB_TXN *txn;
+{
+ int ret, tret;
+ DBC *dbc;
+
+ ret = tret = 0;
+ dbc = NULL;
+
+ if (txn == NULL)
+ return (0);
+
+ while ((dbc = TAILQ_FIRST(&txn->my_cursors)) != NULL) {
+
+ DB_ASSERT(dbc->env, txn == dbc->txn);
+
+ /*
+ * Unregister the cursor from its transaction, regardless
+ * of return.
+ */
+ TAILQ_REMOVE(&(txn->my_cursors), dbc, txn_cursors);
+ dbc->txn_cursors.tqe_next = NULL;
+ dbc->txn_cursors.tqe_prev = NULL;
+
+ /* Removed from the active queue here. */
+ if (F_ISSET(dbc, DBC_ACTIVE))
+ ret = __dbc_close(dbc);
+
+ dbc->txn = NULL;
+
+ /* We have to close all cursors anyway, so continue on error. */
+ if (ret != 0) {
+ __db_err(dbc->env, ret, "__dbc_close");
+ if (tret == 0)
+ tret = ret;
+ }
+ }
+ txn->my_cursors.tqh_first = NULL;
+ txn->my_cursors.tqh_last = NULL;
+
+	return (tret);	/* Return the first error, if any. */
+}
+
+/*
+ * __txn_set_commit_token --
+ * Store a pointer to user's commit token buffer, for later use.
+ */
+static int
+__txn_set_commit_token(txn, tokenp)
+ DB_TXN *txn;
+ DB_TXN_TOKEN *tokenp;
+{
+ ENV *env;
+
+ env = txn->mgrp->env;
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_TXN->set_commit_token", DB_INIT_LOG);
+ if (txn->parent != NULL) {
+ __db_errx(env, DB_STR("4526",
+ "commit token unavailable for nested txn"));
+ return (EINVAL);
+ }
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("4527",
+ "may not be called on a replication client"));
+ return (EINVAL);
+ }
+
+ txn->token_buffer = tokenp;
+
+#ifdef DIAGNOSTIC
+ /*
+ * Applications may rely on the contents of the token buffer becoming
+ * valid only after a successful commit(). So it is not strictly
+ * necessary to initialize the buffer here. But in case they get
+ * confused we initialize it here to a recognizably invalid value.
+ */
+ memset(tokenp, 0, DB_TXN_TOKEN_SIZE);
+#endif
+
+ return (0);
+}
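+
+/*
+ * Illustrative sketch, not part of this file: the intended token flow.
+ * The application registers a DB_TXN_TOKEN before committing;
+ * __txn_build_token below fills it in at commit time, and the token can
+ * later be handed to DB_ENV->txn_applied (see __txn_applied_pp) to ask
+ * whether the commit is visible at this environment:
+ *
+ *	DB_TXN_TOKEN token;
+ *
+ *	if ((ret = txn->set_commit_token(txn, &token)) != 0)
+ *		return (ret);
+ *	if ((ret = txn->commit(txn, 0)) != 0)
+ *		return (ret);
+ *	ret = dbenv->txn_applied(dbenv, &token, 0, 0);
+ */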
+
+/*
+ * __txn_build_token --
+ * Stash a token describing the committing transaction into the buffer
+ * previously designated by the user. Called only in the case where the user
+ * has indeed supplied a buffer address.
+ */
+static void
+__txn_build_token(txn, lsnp)
+ DB_TXN *txn;
+ DB_LSN *lsnp;
+{
+ ENV *env;
+ REGENV *renv;
+ u_int8_t *bp;
+ u_int32_t gen, version;
+
+ bp = txn->token_buffer->buf;
+ env = txn->mgrp->env;
+ renv = env->reginfo->primary;
+
+ /* Marshal the information into external form. */
+ version = REP_COMMIT_TOKEN_FMT_VERSION;
+ gen = REP_ON(env) ? env->rep_handle->region->gen : 0;
+ DB_HTONL_COPYOUT(env, bp, version);
+ DB_HTONL_COPYOUT(env, bp, gen);
+ DB_HTONL_COPYOUT(env, bp, renv->envid);
+ DB_HTONL_COPYOUT(env, bp, lsnp->file);
+ DB_HTONL_COPYOUT(env, bp, lsnp->offset);
+}
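+
+/*
+ * For reference, the marshaled token layout produced above: five 32-bit
+ * fields in network byte order, DB_TXN_TOKEN_SIZE (20) bytes in all.
+ *
+ *	version | gen | envid | lsn.file | lsn.offset
+ */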
+
+/*
+ * __txn_abort_pp --
+ * Interface routine to TXN->abort.
+ */
+static int
+__txn_abort_pp(txn)
+ DB_TXN *txn;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+
+ env = txn->mgrp->env;
+ rep_check = IS_ENV_REPLICATED(env) &&
+ txn->parent == NULL && IS_REAL_TXN(txn);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_abort(txn);
+ if (rep_check && (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_abort --
+ * Abort a transaction.
+ *
+ * PUBLIC: int __txn_abort __P((DB_TXN *));
+ */
+int
+__txn_abort(txn)
+ DB_TXN *txn;
+{
+ DB_LOCKREQ request;
+ DB_TXN *kid;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ TXN_DETAIL *td;
+ u_int32_t id;
+ int ret;
+
+ env = txn->mgrp->env;
+ td = txn->td;
+ /*
+ * Do not abort an XA transaction if another process is still using
+	 * it; however, make sure that it is aborted when the last process
+ * tries to abort it.
+ */
+ if (txn->xa_thr_status != TXN_XA_THREAD_NOTA && td->xa_ref > 1) {
+ td->status = TXN_NEED_ABORT;
+ return (0);
+ }
+
+ PERFMON1(env, txn, abort, txn->txnid);
+ /*
+ * Close registered cursors before the abort. Even if the call fails,
+ * all cursors are closed.
+ */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ return (__env_panic(env, ret));
+
+	/* Any failure during abort is fatal: panic on an invalid handle. */
+ if ((ret = __txn_isvalid(txn, TXN_OP_ABORT)) != 0)
+ return (__env_panic(env, ret));
+
+ /*
+ * Clear the watermarks now. Can't do this in __txn_end because
+ * __db_refresh, called from undo, will free the DB_MPOOLFILEs.
+ */
+ __txn_reset_fe_watermarks(txn);
+
+ /*
+ * Try to abort any unresolved children.
+ *
+ * Abort either succeeds or panics the region. As soon as we
+ * see any failure, we just get out of here and return the panic
+ * up.
+ */
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((ret = __txn_abort(kid)) != 0)
+ return (ret);
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * No mutex is needed as envid is read-only once it is set.
+ */
+ id = renv->envid;
+
+ /*
+ * Fast path -- no need to do anything fancy if there were no
+ * modifications (e.g., log records) for this transaction.
+	 * We still call txn_undo to clean up the txn_list from our
+ * children.
+ */
+ if (IS_ZERO_LSN(td->last_lsn) && STAILQ_FIRST(&txn->logs) == NULL) {
+ if (txn->txn_list == NULL)
+ goto done;
+ else
+ goto undo;
+ }
+
+ if (LOCKING_ON(env)) {
+ /* Allocate a locker for this restored txn if necessary. */
+ if (txn->locker == NULL &&
+ (ret = __lock_getlocker(env->lk_handle,
+ txn->txnid, 1, &txn->locker)) != 0)
+ return (__env_panic(env, ret));
+ /*
+ * We are about to free all the read locks for this transaction
+ * below. Some of those locks might be handle locks which
+ * should not be freed, because they will be freed when the
+ * handle is closed. Check the events and preprocess any
+ * trades now so that we don't release the locks below.
+ */
+ if ((ret = __txn_doevents(env, txn, TXN_ABORT, 1)) != 0)
+ return (__env_panic(env, ret));
+
+ /* Turn off timeouts. */
+ if ((ret = __lock_set_timeout(env,
+ txn->locker, 0, DB_SET_TXN_TIMEOUT)) != 0)
+ return (__env_panic(env, ret));
+
+ if ((ret = __lock_set_timeout(env,
+ txn->locker, 0, DB_SET_LOCK_TIMEOUT)) != 0)
+ return (__env_panic(env, ret));
+
+ request.op = DB_LOCK_UPGRADE_WRITE;
+ request.obj = NULL;
+ if ((ret = __lock_vec(
+ env, txn->locker, 0, &request, 1, NULL)) != 0)
+ return (__env_panic(env, ret));
+ }
+undo: if ((ret = __txn_undo(txn)) != 0)
+ return (__env_panic(env, ret));
+
+ /*
+ * Normally, we do not need to log aborts. However, if we
+ * are a distributed transaction (i.e., we have a prepare),
+ * then we log the abort so we know that this transaction
+ * was actually completed.
+ */
+done: if (DBENV_LOGGING(env) && td->status == TXN_PREPARED &&
+ (ret = __txn_regop_log(env, txn, &td->last_lsn,
+ LOG_FLAGS(txn), TXN_ABORT, (int32_t)time(NULL), id, NULL)) != 0)
+ return (__env_panic(env, ret));
+
+ /* __txn_end always panics if it errors, so pass the return along. */
+ return (__txn_end(txn, 0));
+}
+
+/*
+ * __txn_discard --
+ * Interface routine to TXN->discard.
+ */
+static int
+__txn_discard(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+
+ env = txn->mgrp->env;
+ rep_check = IS_ENV_REPLICATED(env) &&
+ txn->parent == NULL && IS_REAL_TXN(txn);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_discard_int(txn, flags);
+ if (rep_check && (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_discard_int --
+ * Free the per-process resources associated with this txn handle.
+ *
+ * PUBLIC: int __txn_discard_int __P((DB_TXN *, u_int32_t flags));
+ */
+int
+__txn_discard_int(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_TXNMGR *mgr;
+ ENV *env;
+ int ret;
+
+ COMPQUIET(flags, 0);
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+
+ /* Close registered cursors. */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ return (ret);
+
+ if ((ret = __txn_isvalid(txn, TXN_OP_DISCARD)) != 0)
+ return (ret);
+
+ /* Should be no children. */
+ DB_ASSERT(env, TAILQ_FIRST(&txn->kids) == NULL);
+
+ /* Free the space. */
+ MUTEX_LOCK(env, mgr->mutex);
+ mgr->n_discards++;
+ if (F_ISSET(txn, TXN_MALLOC)) {
+ TAILQ_REMOVE(&mgr->txn_chain, txn, links);
+ }
+ MUTEX_UNLOCK(env, mgr->mutex);
+ if (F_ISSET(txn, TXN_MALLOC) &&
+ txn->xa_thr_status != TXN_XA_THREAD_ASSOCIATED)
+ __os_free(env, txn);
+
+ return (0);
+}
+
+/*
+ * __txn_prepare --
+ * Flush the log so a future commit is guaranteed to succeed.
+ *
+ * PUBLIC: int __txn_prepare __P((DB_TXN *, u_int8_t *));
+ */
+int
+__txn_prepare(txn, gid)
+ DB_TXN *txn;
+ u_int8_t *gid;
+{
+ DBT list_dbt, gid_dbt;
+ DB_LOCKREQ request;
+ DB_THREAD_INFO *ip;
+ DB_TXN *kid;
+ ENV *env;
+ TXN_DETAIL *td;
+ u_int32_t lflags;
+ int ret;
+
+ env = txn->mgrp->env;
+ td = txn->td;
+ PERFMON2(env, txn, prepare, txn->txnid, gid);
+ DB_ASSERT(env, txn->xa_thr_status == TXN_XA_THREAD_NOTA ||
+ td->xa_ref == 1);
+ ENV_ENTER(env, ip);
+
+ /* Close registered cursors. */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ goto err;
+
+ if ((ret = __txn_isvalid(txn, TXN_OP_PREPARE)) != 0)
+ goto err;
+ if (F_ISSET(txn, TXN_DEADLOCK)) {
+ ret = __db_txn_deadlock_err(env, txn);
+ goto err;
+ }
+
+ /* Commit any unresolved children. */
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((ret = __txn_commit(kid, DB_TXN_NOSYNC)) != 0)
+ goto err;
+
+ /* We must set the global transaction ID here. */
+ memcpy(td->gid, gid, DB_GID_SIZE);
+ if ((ret = __txn_doevents(env, txn, TXN_PREPARE, 1)) != 0)
+ goto err;
+ memset(&request, 0, sizeof(request));
+ if (LOCKING_ON(env)) {
+ request.op = DB_LOCK_PUT_READ;
+ if (!IS_ZERO_LSN(td->last_lsn)) {
+ memset(&list_dbt, 0, sizeof(list_dbt));
+ request.obj = &list_dbt;
+ }
+ if ((ret = __lock_vec(env,
+ txn->locker, 0, &request, 1, NULL)) != 0)
+ goto err;
+	}
+ if (DBENV_LOGGING(env)) {
+		memset(&gid_dbt, 0, sizeof(gid_dbt));
+ gid_dbt.data = gid;
+ gid_dbt.size = DB_GID_SIZE;
+ lflags = DB_LOG_COMMIT | DB_FLUSH;
+ if ((ret = __txn_prepare_log(env,
+ txn, &td->last_lsn, lflags, TXN_PREPARE,
+ &gid_dbt, &td->begin_lsn, request.obj)) != 0)
+ __db_err(env, ret, DB_STR("4528",
+ "DB_TXN->prepare: log_write failed"));
+
+ if (request.obj != NULL && request.obj->data != NULL)
+ __os_free(env, request.obj->data);
+ if (ret != 0)
+ goto err;
+	}
+
+ MUTEX_LOCK(env, txn->mgrp->mutex);
+ td->status = TXN_PREPARED;
+ MUTEX_UNLOCK(env, txn->mgrp->mutex);
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
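+
+/*
+ * Illustrative sketch, not part of this file: minimal two-phase commit
+ * from the application's side.  Once prepare() returns, the prepare
+ * record has been flushed (DB_FLUSH above), so a later commit is
+ * guaranteed to succeed even across a crash:
+ *
+ *	u_int8_t gid[DB_GID_SIZE];
+ *
+ *	(fill gid with a globally unique identifier)
+ *	if ((ret = txn->prepare(txn, gid)) != 0)
+ *		return (ret);
+ *	ret = txn->commit(txn, 0);
+ */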
+
+/*
+ * __txn_id --
+ * Return the transaction ID.
+ *
+ * PUBLIC: u_int32_t __txn_id __P((DB_TXN *));
+ */
+u_int32_t
+__txn_id(txn)
+ DB_TXN *txn;
+{
+ return (txn->txnid);
+}
+
+/*
+ * __txn_get_name --
+ * Get a descriptive string from a transaction.
+ *
+ * PUBLIC: int __txn_get_name __P((DB_TXN *, const char **));
+ */
+int
+__txn_get_name(txn, namep)
+ DB_TXN *txn;
+ const char **namep;
+{
+ *namep = txn->name;
+
+ return (0);
+}
+
+/*
+ * __txn_set_name --
+ * Set a descriptive string for a transaction.
+ *
+ * PUBLIC: int __txn_set_name __P((DB_TXN *, const char *));
+ */
+int
+__txn_set_name(txn, name)
+ DB_TXN *txn;
+ const char *name;
+{
+ DB_THREAD_INFO *ip;
+ DB_TXNMGR *mgr;
+ ENV *env;
+ TXN_DETAIL *td;
+ size_t len;
+ int ret;
+ char *p;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ td = txn->td;
+ len = strlen(name) + 1;
+
+ if ((ret = __os_realloc(env, len, &txn->name)) != 0)
+ return (ret);
+ memcpy(txn->name, name, len);
+
+ ENV_ENTER(env, ip);
+ TXN_SYSTEM_LOCK(env);
+ if (td->name != INVALID_ROFF) {
+ __env_alloc_free(
+ &mgr->reginfo, R_ADDR(&mgr->reginfo, td->name));
+ td->name = INVALID_ROFF;
+ }
+ if ((ret = __env_alloc(&mgr->reginfo, len, &p)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ __db_errx(env, DB_STR("4529",
+ "Unable to allocate memory for transaction name"));
+
+ __os_free(env, txn->name);
+ txn->name = NULL;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+ }
+ TXN_SYSTEM_UNLOCK(env);
+ td->name = R_OFFSET(&mgr->reginfo, p);
+ memcpy(p, name, len);
+
+#ifdef DIAGNOSTIC
+ /*
+ * If DIAGNOSTIC is set, map the name into the log so users can track
+ * operations through the log.
+ */
+ if (DBENV_LOGGING(env))
+ (void)__log_printf(env, txn, "transaction %#lx named %s",
+ (u_long)txn->txnid, name);
+#endif
+
+ ENV_LEAVE(env, ip);
+ return (0);
+}
+
+/*
+ * __txn_get_priority --
+ * Get a transaction's priority level
+ * PUBLIC: int __txn_get_priority __P((DB_TXN *, u_int32_t *));
+ */
+int
+__txn_get_priority(txn, priorityp)
+ DB_TXN *txn;
+ u_int32_t *priorityp;
+{
+ if (txn->locker == NULL)
+		return (EINVAL);
+
+ *priorityp = txn->locker->priority;
+ return (0);
+}
+
+/*
+ * __txn_set_priority --
+ * Assign a transaction a priority level
+ * PUBLIC: int __txn_set_priority __P((DB_TXN *, u_int32_t));
+ */
+int
+__txn_set_priority(txn, priority)
+ DB_TXN *txn;
+ u_int32_t priority;
+{
+ if (txn->locker == NULL)
+		return (EINVAL);
+
+ txn->locker->priority = priority;
+ ((TXN_DETAIL *)txn->td)->priority = priority;
+
+ return (0);
+}
+
+/*
+ * __txn_set_timeout --
+ * ENV->set_txn_timeout.
+ * PUBLIC: int __txn_set_timeout __P((DB_TXN *, db_timeout_t, u_int32_t));
+ */
+int
+__txn_set_timeout(txn, timeout, op)
+ DB_TXN *txn;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = txn->mgrp->env;
+
+ if (op != DB_SET_TXN_TIMEOUT && op != DB_SET_LOCK_TIMEOUT)
+ return (__db_ferr(env, "DB_TXN->set_timeout", 0));
+
+ ENV_ENTER(env, ip);
+	ret = __lock_set_timeout(env, txn->locker, timeout, op);
+	ENV_LEAVE(env, ip);
+ return (ret);
+}
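+
+/*
+ * Illustrative sketch, not part of this file: timeouts are expressed in
+ * microseconds.  DB_SET_TXN_TIMEOUT bounds the transaction's lifetime,
+ * DB_SET_LOCK_TIMEOUT bounds each individual lock request, e.g. a one
+ * second transaction timeout:
+ *
+ *	ret = txn->set_timeout(txn, 1000000, DB_SET_TXN_TIMEOUT);
+ */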
+
+/*
+ * __txn_isvalid --
+ * Return 0 if the DB_TXN is reasonable, otherwise panic.
+ */
+static int
+__txn_isvalid(txn, op)
+ const DB_TXN *txn;
+ txnop_t op;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ ENV *env;
+ TXN_DETAIL *td;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ region = mgr->reginfo.primary;
+
+ /* Check for recovery. */
+ if (!F_ISSET(txn, TXN_COMPENSATE) &&
+ F_ISSET(region, TXN_IN_RECOVERY)) {
+ __db_errx(env, DB_STR("4530",
+ "operation not permitted during recovery"));
+ goto err;
+ }
+
+ /* Check for live cursors. */
+ if (txn->cursors != 0) {
+ __db_errx(env, DB_STR("4531",
+ "transaction has active cursors"));
+ goto err;
+ }
+
+ /* Check transaction's state. */
+ td = txn->td;
+
+ /* Handle any operation specific checks. */
+ switch (op) {
+ case TXN_OP_DISCARD:
+ /*
+		 * Since we're just tossing the per-process space, there are
+		 * a lot of problems with the transaction that we can tolerate.
+		 */
+
+		/* The transaction has already been reused. */
+ if (txn->txnid != td->txnid)
+ return (0);
+
+ /*
+ * What we've got had better be either a prepared or
+ * restored transaction.
+ */
+ if (td->status != TXN_PREPARED &&
+ !F_ISSET(td, TXN_DTL_RESTORED)) {
+ __db_errx(env, DB_STR("4532",
+ "not a restored transaction"));
+ return (__env_panic(env, EINVAL));
+ }
+
+ return (0);
+ case TXN_OP_PREPARE:
+ if (txn->parent != NULL) {
+ /*
+ * This is not fatal, because you could imagine an
+ * application that simply prepares everybody because
+ * it doesn't distinguish between children and parents.
+ * I'm not arguing this is good, but I could imagine
+ * someone doing it.
+ */
+ __db_errx(env, DB_STR("4533",
+ "Prepare disallowed on child transactions"));
+ return (EINVAL);
+ }
+ break;
+ case TXN_OP_ABORT:
+ case TXN_OP_COMMIT:
+ default:
+ break;
+ }
+
+ switch (td->status) {
+ case TXN_PREPARED:
+ if (op == TXN_OP_PREPARE) {
+ __db_errx(env, DB_STR("4534",
+ "transaction already prepared"));
+ /*
+ * Txn_prepare doesn't blow away the user handle, so
+ * in this case, give the user the opportunity to
+ * abort or commit.
+ */
+ return (EINVAL);
+ }
+ break;
+ case TXN_RUNNING:
+ case TXN_NEED_ABORT:
+ break;
+ case TXN_ABORTED:
+ case TXN_COMMITTED:
+ default:
+ __db_errx(env, DB_STR_A("4535",
+ "transaction already %s", "%s"),
+ td->status == TXN_COMMITTED ?
+ DB_STR_P("committed") : DB_STR_P("aborted"));
+ goto err;
+ }
+
+ return (0);
+
+err: /*
+ * If there's a serious problem with the transaction, panic. TXN
+ * handles are dead by definition when we return, and if you use
+ * a cursor you forgot to close, we have no idea what will happen.
+ */
+ return (__env_panic(env, EINVAL));
+}
+
+/*
+ * __txn_end --
+ * Internal transaction end routine.
+ */
+static int
+__txn_end(txn, is_commit)
+ DB_TXN *txn;
+ int is_commit;
+{
+ DB_LOCKREQ request;
+ DB_TXNLOGREC *lr;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ ENV *env;
+ TXN_DETAIL *ptd, *td;
+ db_mutex_t mvcc_mtx;
+ int do_closefiles, ret;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ region = mgr->reginfo.primary;
+ do_closefiles = 0;
+
+ /* Process commit events. */
+ if ((ret = __txn_doevents(env,
+ txn, is_commit ? TXN_COMMIT : TXN_ABORT, 0)) != 0)
+ return (__env_panic(env, ret));
+
+ /* End the transaction. */
+ td = txn->td;
+ if (td->nlog_dbs != 0 &&
+ (ret = __txn_dref_fname(env, txn)) != 0 && ret != EIO)
+ return (__env_panic(env, ret));
+
+ if (td->mvcc_ref != 0 && IS_MAX_LSN(td->visible_lsn)) {
+ /*
+ * Some pages were dirtied but nothing was logged. This can
+ * happen easily if we are aborting, but there are also cases
+ * in the compact code where pages are dirtied unconditionally
+ * and then we find out that there is no work to do.
+ *
+ * We need to make sure that the versions become visible to
+ * future transactions. We need to set visible_lsn before
+ * setting td->status to ensure safe reads of visible_lsn in
+ * __memp_fget.
+ */
+ if ((ret = __log_current_lsn_int(env, &td->visible_lsn,
+ NULL, NULL)) != 0)
+ return (__env_panic(env, ret));
+ }
+
+ /*
+ * Release the locks.
+ *
+	 * __txn_end cannot return a simple error; we MUST return
+ * success/failure from commit or abort, ignoring any internal
+ * errors. So, we panic if something goes wrong. We can't
+ * deadlock here because we're not acquiring any new locks,
+ * so DB_LOCK_DEADLOCK is just as fatal as any other error.
+ */
+ if (LOCKING_ON(env)) {
+ /* Allocate a locker for this restored txn if necessary. */
+ if (txn->locker == NULL &&
+ (ret = __lock_getlocker(env->lk_handle,
+ txn->txnid, 1, &txn->locker)) != 0)
+ return (__env_panic(env, ret));
+ request.op = txn->parent == NULL ||
+ is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT;
+ request.obj = NULL;
+ if ((ret = __lock_vec(env,
+ txn->locker, 0, &request, 1, NULL)) != 0)
+ return (__env_panic(env, ret));
+ }
+
+ TXN_SYSTEM_LOCK(env);
+ td->status = is_commit ? TXN_COMMITTED : TXN_ABORTED;
+ SH_TAILQ_REMOVE(&region->active_txn, td, links, __txn_detail);
+ region->curtxns--;
+ if (F_ISSET(td, TXN_DTL_RESTORED)) {
+ region->stat.st_nrestores--;
+ do_closefiles = region->stat.st_nrestores == 0;
+ }
+
+ if (td->name != INVALID_ROFF) {
+ __env_alloc_free(&mgr->reginfo,
+ R_ADDR(&mgr->reginfo, td->name));
+ td->name = INVALID_ROFF;
+ }
+ if (td->nlog_slots != TXN_NSLOTS)
+ __env_alloc_free(&mgr->reginfo,
+ R_ADDR(&mgr->reginfo, td->log_dbs));
+
+ if (txn->parent != NULL) {
+ ptd = txn->parent->td;
+ SH_TAILQ_REMOVE(&ptd->kids, td, klinks, __txn_detail);
+ } else if ((mvcc_mtx = td->mvcc_mtx) != MUTEX_INVALID) {
+ MUTEX_LOCK(env, mvcc_mtx);
+ if (td->mvcc_ref != 0) {
+ SH_TAILQ_INSERT_HEAD(&region->mvcc_txn,
+ td, links, __txn_detail);
+
+ /*
+ * The transaction has been added to the list of
+ * committed snapshot transactions with active pages.
+ * It needs to be freed when the last page is evicted.
+ */
+ F_SET(td, TXN_DTL_SNAPSHOT);
+#ifdef HAVE_STATISTICS
+ STAT_INC(env, txn,
+ nsnapshot, region->stat.st_nsnapshot, txn->txnid);
+ if (region->stat.st_nsnapshot >
+ region->stat.st_maxnsnapshot)
+ STAT_SET(env, txn, maxnsnapshot,
+ region->stat.st_maxnsnapshot,
+ region->stat.st_nsnapshot,
+ txn->txnid);
+#endif
+ td = NULL;
+ }
+ MUTEX_UNLOCK(env, mvcc_mtx);
+ if (td != NULL)
+ if ((ret = __mutex_free(env, &td->mvcc_mtx)) != 0)
+ return (__env_panic(env, ret));
+ }
+
+ if (td != NULL)
+ __env_alloc_free(&mgr->reginfo, td);
+
+#ifdef HAVE_STATISTICS
+ if (is_commit)
+ STAT_INC(env,
+ txn, ncommits, region->stat.st_ncommits, txn->txnid);
+ else
+ STAT_INC(env,
+ txn, naborts, region->stat.st_naborts, txn->txnid);
+ STAT_DEC(env, txn, nactive, region->stat.st_nactive, txn->txnid);
+#endif
+
+	/* Decrement bulk transaction counter while holding transaction lock. */
+ if (F_ISSET(txn, TXN_BULK))
+ ((DB_TXNREGION *)env->tx_handle->reginfo.primary)->n_bulk_txn--;
+
+ TXN_SYSTEM_UNLOCK(env);
+
+ /*
+	 * The transaction cannot get more locks; remove its locker info,
+ * if any.
+ */
+ if (LOCKING_ON(env) && (ret =
+ __lock_freelocker(env->lk_handle, txn->locker)) != 0)
+ return (__env_panic(env, ret));
+ if (txn->parent != NULL)
+ TAILQ_REMOVE(&txn->parent->kids, txn, klinks);
+
+ /* Free the space. */
+ while ((lr = STAILQ_FIRST(&txn->logs)) != NULL) {
+ STAILQ_REMOVE(&txn->logs, lr, __txn_logrec, links);
+ __os_free(env, lr);
+ }
+ if (txn->name != NULL) {
+ __os_free(env, txn->name);
+ txn->name = NULL;
+ }
+
+ /*
+ * Free the transaction structure if we allocated it and if we are
+ * not in an XA transaction that will be freed when we exit the XA
+ * wrapper routines.
+ */
+ if (F_ISSET(txn, TXN_MALLOC) &&
+ txn->xa_thr_status != TXN_XA_THREAD_ASSOCIATED) {
+ MUTEX_LOCK(env, mgr->mutex);
+ TAILQ_REMOVE(&mgr->txn_chain, txn, links);
+ MUTEX_UNLOCK(env, mgr->mutex);
+
+ __os_free(env, txn);
+ }
+
+ if (do_closefiles) {
+ /*
+ * Otherwise, we have resolved the last outstanding prepared
+ * txn and need to invalidate the fileids that were left
+ * open for those txns and then close them.
+ */
+ (void)__dbreg_invalidate_files(env, 1);
+ (void)__dbreg_close_files(env, 1);
+ if (IS_REP_MASTER(env))
+ F_CLR(env->rep_handle, DBREP_OPENFILES);
+ F_CLR(env->lg_handle, DBLOG_OPENFILES);
+ mgr->n_discards = 0;
+ (void)__txn_checkpoint(env, 0, 0,
+ DB_CKP_INTERNAL | DB_FORCE);
+ }
+
+ return (0);
+}
+
+static int
+__txn_dispatch_undo(env, txn, rdbt, key_lsn, txnlist)
+ ENV *env;
+ DB_TXN *txn;
+ DBT *rdbt;
+ DB_LSN *key_lsn;
+ DB_TXNHEAD *txnlist;
+{
+ int ret;
+
+ txnlist->td = txn->td;
+ ret = __db_dispatch(env, &env->recover_dtab,
+ rdbt, key_lsn, DB_TXN_ABORT, txnlist);
+ if (ret == DB_SURPRISE_KID) {
+ F_SET(txn, TXN_CHILDCOMMIT);
+ ret = 0;
+ }
+ if (ret == 0 && F_ISSET(txn, TXN_CHILDCOMMIT) && IS_ZERO_LSN(*key_lsn))
+ ret = __db_txnlist_lsnget(env, txnlist, key_lsn, 0);
+
+ return (ret);
+}
+
+/*
+ * __txn_undo --
+ * Undo the transaction with id txnid.
+ */
+static int
+__txn_undo(txn)
+ DB_TXN *txn;
+{
+ DBT rdbt;
+ DB_LOGC *logc;
+ DB_LSN key_lsn;
+ DB_TXN *ptxn;
+ DB_TXNHEAD *txnlist;
+ DB_TXNLOGREC *lr;
+ DB_TXNMGR *mgr;
+ ENV *env;
+ int ret, t_ret;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ logc = NULL;
+ txnlist = NULL;
+ ret = 0;
+
+ if (!LOGGING_ON(env))
+ return (0);
+
+ /*
+ * This is the simplest way to code this, but if the mallocs during
+ * recovery turn out to be a performance issue, we can do the
+ * allocation here and use DB_DBT_USERMEM.
+ */
+ memset(&rdbt, 0, sizeof(rdbt));
+
+ /*
+ * Allocate a txnlist for children and aborted page allocs.
+ * We need to associate the list with the maximal parent
+ * so that aborted pages are recovered when that transaction
+ * is committed or aborted.
+ */
+ for (ptxn = txn->parent; ptxn != NULL && ptxn->parent != NULL;)
+ ptxn = ptxn->parent;
+
+ if (ptxn != NULL && ptxn->txn_list != NULL)
+ txnlist = ptxn->txn_list;
+ else if (txn->txn_list != NULL)
+ txnlist = txn->txn_list;
+ else if ((ret = __db_txnlist_init(env,
+ txn->thread_info, 0, 0, NULL, &txnlist)) != 0)
+ return (ret);
+ else if (ptxn != NULL)
+ ptxn->txn_list = txnlist;
+
+ /*
+ * Take log records from the linked list stored in the transaction,
+ * then from the log.
+ */
+ STAILQ_FOREACH(lr, &txn->logs, links) {
+ rdbt.data = lr->data;
+ rdbt.size = 0;
+ LSN_NOT_LOGGED(key_lsn);
+ ret =
+ __txn_dispatch_undo(env, txn, &rdbt, &key_lsn, txnlist);
+ if (ret != 0) {
+ __db_err(env, ret, DB_STR("4536",
+ "DB_TXN->abort: in-memory log undo failed"));
+ goto err;
+ }
+ }
+
+ key_lsn = ((TXN_DETAIL *)txn->td)->last_lsn;
+
+ if (!IS_ZERO_LSN(key_lsn) &&
+ (ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ while (!IS_ZERO_LSN(key_lsn)) {
+ /*
+ * The dispatch routine returns the lsn of the record
+ * before the current one in the key_lsn argument.
+ */
+ if ((ret = __logc_get(logc, &key_lsn, &rdbt, DB_SET)) == 0) {
+ ret = __txn_dispatch_undo(env,
+ txn, &rdbt, &key_lsn, txnlist);
+ }
+
+ if (ret != 0) {
+ __db_err(env, ret, DB_STR_A("4537",
+ "DB_TXN->abort: log undo failed for LSN: %lu %lu",
+ "%lu %lu"), (u_long)key_lsn.file,
+ (u_long)key_lsn.offset);
+ goto err;
+ }
+ }
+
+err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ptxn == NULL && txnlist != NULL)
+ __db_txnlist_end(env, txnlist);
+ return (ret);
+}
+
+/*
+ * __txn_activekids --
+ *	Return EPERM if this transaction has any active children, else 0.
+ *
+ * PUBLIC: int __txn_activekids __P((ENV *, u_int32_t, DB_TXN *));
+ */
+int
+__txn_activekids(env, rectype, txn)
+ ENV *env;
+ u_int32_t rectype;
+ DB_TXN *txn;
+{
+ /*
+	 * On a child commit, we know that there are children (i.e., the
+	 * committing child, at the least).  In that case, skip this check.
+ */
+ if (F_ISSET(txn, TXN_COMPENSATE) || rectype == DB___txn_child)
+ return (0);
+
+ if (TAILQ_FIRST(&txn->kids) != NULL) {
+ __db_errx(env, DB_STR("4538",
+ "Child transaction is active"));
+ return (EPERM);
+ }
+ return (0);
+}
+
+/*
+ * __txn_force_abort --
+ * Force an abort record into the log if the commit record
+ * failed to get to disk.
+ *
+ * PUBLIC: int __txn_force_abort __P((ENV *, u_int8_t *));
+ */
+int
+__txn_force_abort(env, buffer)
+ ENV *env;
+ u_int8_t *buffer;
+{
+ DB_CIPHER *db_cipher;
+ HDR hdr, *hdrp;
+ u_int32_t offset, opcode, sum_len;
+ u_int8_t *bp, *key;
+ size_t hdrsize, rec_len;
+ int ret;
+
+ db_cipher = env->crypto_handle;
+
+ /*
+ * This routine depends on the layout of HDR and the __txn_regop
+ * record in txn.src. We are passed the beginning of the commit
+ * record in the log buffer and overwrite the commit with an abort
+ * and recalculate the checksum.
+ */
+ hdrsize = CRYPTO_ON(env) ? HDR_CRYPTO_SZ : HDR_NORMAL_SZ;
+
+ hdrp = (HDR *)buffer;
+ memcpy(&hdr.prev, buffer + SSZ(HDR, prev), sizeof(hdr.prev));
+ memcpy(&hdr.len, buffer + SSZ(HDR, len), sizeof(hdr.len));
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(&hdr, CRYPTO_ON(env));
+ rec_len = hdr.len - hdrsize;
+
+ offset = sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN);
+ if (CRYPTO_ON(env)) {
+ key = db_cipher->mac_key;
+ sum_len = DB_MAC_KEY;
+ if ((ret = db_cipher->decrypt(env, db_cipher->data,
+ &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0)
+ return (__env_panic(env, ret));
+ } else {
+ key = NULL;
+ sum_len = sizeof(u_int32_t);
+ }
+ bp = buffer + hdrsize + offset;
+ opcode = TXN_ABORT;
+ LOGCOPY_32(env, bp, &opcode);
+
+ if (CRYPTO_ON(env) &&
+ (ret = db_cipher->encrypt(env,
+ db_cipher->data, &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0)
+ return (__env_panic(env, ret));
+
+#ifdef HAVE_LOG_CHECKSUM
+ __db_chksum(&hdr, buffer + hdrsize, rec_len, key, NULL);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(&hdr, CRYPTO_ON(env));
+ memcpy(buffer + SSZA(HDR, chksum), hdr.chksum, sum_len);
+#endif
+
+ return (0);
+}
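+
+/*
+ * For reference, the record layout assumed by the offset arithmetic
+ * above.  Following the log record HDR, a generated __txn_regop record
+ * starts with its record type, transaction id and previous LSN, so
+ * "offset" skips those three fields to land on the opcode, which is
+ * rewritten from TXN_COMMIT to TXN_ABORT in place:
+ *
+ *	HDR | rectype (4) | txnid (4) | prev (DB_LSN) | opcode (4) | ...
+ */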
+
+/*
+ * __txn_preclose --
+ * Before we can close an environment, we need to check if we were in the
+ * middle of taking care of restored transactions. If so, close the files
+ * we opened.
+ *
+ * PUBLIC: int __txn_preclose __P((ENV *));
+ */
+int
+__txn_preclose(env)
+ ENV *env;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int do_closefiles, ret;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ do_closefiles = 0;
+
+ TXN_SYSTEM_LOCK(env);
+ if (region != NULL &&
+ region->stat.st_nrestores <= mgr->n_discards &&
+ mgr->n_discards != 0)
+ do_closefiles = 1;
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (do_closefiles) {
+ /*
+ * Set the DBLOG_RECOVER flag while closing these files so they
+ * do not create additional log records that will confuse future
+ * recoveries.
+ */
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ ret = __dbreg_close_files(env, 0);
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+ } else
+ ret = 0;
+
+ return (ret);
+}
+
+/*
+ * __txn_reset --
+ * Reset the last txnid to its minimum value, and log the reset.
+ *
+ * PUBLIC: int __txn_reset __P((ENV *));
+ */
+int
+__txn_reset(env)
+ ENV *env;
+{
+ DB_LSN scrap;
+ DB_TXNREGION *region;
+
+ region = env->tx_handle->reginfo.primary;
+ region->last_txnid = TXN_MINIMUM;
+
+ DB_ASSERT(env, LOGGING_ON(env));
+ return (__txn_recycle_log(env,
+ NULL, &scrap, 0, TXN_MINIMUM, TXN_MAXIMUM));
+}
+
+/*
+ * __txn_set_txn_lsnp --
+ * Set the pointer to the begin_lsn field if that field is zero.
+ * Set the pointer to the last_lsn field.
+ */
+static void
+__txn_set_txn_lsnp(txn, blsnp, llsnp)
+ DB_TXN *txn;
+ DB_LSN **blsnp, **llsnp;
+{
+ TXN_DETAIL *td;
+
+ td = txn->td;
+ *llsnp = &td->last_lsn;
+
+ while (txn->parent != NULL)
+ txn = txn->parent;
+
+ td = txn->td;
+ if (IS_ZERO_LSN(td->begin_lsn))
+ *blsnp = &td->begin_lsn;
+}
+
+/*
+ * PUBLIC: int __txn_applied_pp __P((DB_ENV *,
+ * PUBLIC: DB_TXN_TOKEN *, db_timeout_t, u_int32_t));
+ */
+int
+__txn_applied_pp(dbenv, token, timeout, flags)
+ DB_ENV *dbenv;
+ DB_TXN_TOKEN *token;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO commit_info;
+ u_int8_t *bp;
+ int ret;
+
+ env = dbenv->env;
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB_ENV->txn_applied", 0));
+
+ /* Unmarshal the token from its stored form. */
+ bp = token->buf;
+ DB_NTOHL_COPYIN(env, commit_info.version, bp);
+ DB_ASSERT(env, commit_info.version == REP_COMMIT_TOKEN_FMT_VERSION);
+ DB_NTOHL_COPYIN(env, commit_info.gen, bp);
+ DB_NTOHL_COPYIN(env, commit_info.envid, bp);
+ DB_NTOHL_COPYIN(env, commit_info.lsn.file, bp);
+ DB_NTOHL_COPYIN(env, commit_info.lsn.offset, bp);
+
+ /*
+ * Check for a token representing a transaction that committed without
+ * any log records having been written. Ideally an application should
+ * be smart enough to avoid trying to use a token from such an "empty"
+ * transaction. But in some cases it might be difficult for them to
+ * keep track, so we don't really forbid it.
+ */
+ if (IS_ZERO_LSN(commit_info.lsn))
+ return (DB_KEYEMPTY);
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->txn_applied", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_applied(env, ip, &commit_info, timeout);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__txn_applied(env, ip, commit_info, timeout)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ db_timeout_t timeout;
+{
+ LOG *lp;
+ DB_LSN lsn;
+ REGENV *renv;
+
+ /*
+ * The lockout protection scope between __op_handle_enter and
+ * __env_db_rep_exit is handled within __rep_txn_applied, and is not
+ * needed here since the rest of this function only runs in a
+ * non-replication env.
+ */
+ if (REP_ON(env))
+ return (__rep_txn_applied(env, ip, commit_info, timeout));
+
+ if (commit_info->gen != 0) {
+ __db_errx(env, DB_STR("4539",
+ "replication commit token in non-replication env"));
+ return (EINVAL);
+ }
+
+ lp = env->lg_handle->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ renv = env->reginfo->primary;
+
+ if (renv->envid == commit_info->envid &&
+ LOG_COMPARE(&commit_info->lsn, &lsn) <= 0)
+ return (0);
+ return (DB_NOTFOUND);
+}
diff --git a/src/txn/txn.src b/src/txn/txn.src
new file mode 100644
index 00000000..7e82dc82
--- /dev/null
+++ b/src/txn/txn.src
@@ -0,0 +1,120 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __txn
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/lock.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * This is the standard log operation for commit.
+ * Note that we are using an int32_t for the timestamp.  This means that
+ * in 2038, when a signed 32-bit time_t overflows, we will need to
+ * deprecate this log record and create one that
+ * either changes the Epoch or has a 64-bit offset.
+ * NOTE: The opcode MUST be the first argument in these records, because
+ * the force_abort code overwrites it with an ABORT should the write to
+ * the log fail.
+ * envid:
+ * Environment ID of this operation (4.4+).
+ */
+BEGIN_COMPAT regop 42 10
+ARG opcode u_int32_t lu
+TIME timestamp int32_t ld
+LOCKS locks DBT s
+END
+
+BEGIN regop 44 10
+ARG opcode u_int32_t lu
+TIME timestamp int32_t ld
+ARG envid u_int32_t lu
+LOCKS locks DBT s
+END
+
+/*
+ * This is the checkpoint record. It contains the lsn that the checkpoint
+ * guarantees and a pointer to the last checkpoint so we can walk backwards
+ * by checkpoint.
+ *
+ * ckp_lsn:
+ * The lsn in the log of the most recent point at which all begun
+ * transactions have been aborted. This is the point for which
+ * the checkpoint is relevant.
+ * last_ckp:
+ * The previous checkpoint.
+ * timestamp:
+ * See comment in commit about timestamps.
+ * envid:
+ * Environment ID of this checkpoint (4.3+).
+ * rep_gen:
+ * Persistent replication generation number (4.2-4.5 only).
+ * Renamed to 'spare' in 4.6.
+ */
+BEGIN_COMPAT ckp 42 11
+POINTER ckp_lsn DB_LSN * lu
+POINTER last_ckp DB_LSN * lu
+TIME timestamp int32_t ld
+ARG rep_gen u_int32_t lu
+END
+
+BEGIN ckp 43 11
+POINTER ckp_lsn DB_LSN * lu
+POINTER last_ckp DB_LSN * lu
+TIME timestamp int32_t ld
+ARG envid u_int32_t lu
+ARG spare u_int32_t lu
+END
+
+/*
+ * This is the (new) log operation for a child commit. It is
+ * logged as a record in the PARENT. The child field contains
+ * the transaction ID of the child committing and the c_lsn is
+ * the last LSN of the child's log trail.
+ */
+BEGIN child 42 12
+ARG child u_int32_t lx
+POINTER c_lsn DB_LSN * lu
+END
+
+
+/*
+ * This is the standard log operation for prepare.
+ * NOTE: The opcode MUST be the first argument in these records, because
+ * the force_abort code overwrites it with an ABORT should the write to
+ * the log fail.
+ */
+BEGIN_COMPAT xa_regop 42 13
+ARG opcode u_int32_t lu
+DBT xid DBT s
+ARG formatID int32_t ld
+ARG gtrid u_int32_t lu
+ARG bqual u_int32_t lu
+POINTER begin_lsn DB_LSN * lu
+LOCKS locks DBT s
+END
+
+BEGIN prepare 48 13
+ARG opcode u_int32_t lu
+DBT gid DBT s
+POINTER begin_lsn DB_LSN * lu
+LOCKS locks DBT s
+END
+
+/*
+ * Log the fact that we are recycling txnids.
+ */
+BEGIN recycle 42 14
+ARG min u_int32_t lu
+ARG max u_int32_t lu
+END
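+
+/*
+ * Illustrative note, an assumption about the build rather than part of
+ * the record definitions: gen_rec.awk expands each BEGIN block above
+ * into log, read and print routines (see txn_auto.c and txn_autop.c).
+ * The recycle record, for instance, yields the __txn_recycle_log() call
+ * made by __txn_recycle_id() in txn.c:
+ *
+ *	ret = __txn_recycle_log(env, NULL, &lsn, 0, min, max);
+ */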
diff --git a/src/txn/txn_auto.c b/src/txn/txn_auto.c
new file mode 100644
index 00000000..926d3653
--- /dev/null
+++ b/src/txn/txn_auto.c
@@ -0,0 +1,93 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __txn_regop_42_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_regop_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_TIME, SSZ(__txn_regop_42_args, timestamp), "timestamp", ""},
+ {LOGREC_LOCKS, SSZ(__txn_regop_42_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_regop_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_regop_args, opcode), "opcode", "%lu"},
+ {LOGREC_TIME, SSZ(__txn_regop_args, timestamp), "timestamp", ""},
+ {LOGREC_ARG, SSZ(__txn_regop_args, envid), "envid", "%lu"},
+ {LOGREC_LOCKS, SSZ(__txn_regop_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_ckp_42_desc[] = {
+ {LOGREC_POINTER, SSZ(__txn_ckp_42_args, ckp_lsn), "ckp_lsn", ""},
+ {LOGREC_POINTER, SSZ(__txn_ckp_42_args, last_ckp), "last_ckp", ""},
+ {LOGREC_TIME, SSZ(__txn_ckp_42_args, timestamp), "timestamp", ""},
+ {LOGREC_ARG, SSZ(__txn_ckp_42_args, rep_gen), "rep_gen", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_ckp_desc[] = {
+ {LOGREC_POINTER, SSZ(__txn_ckp_args, ckp_lsn), "ckp_lsn", ""},
+ {LOGREC_POINTER, SSZ(__txn_ckp_args, last_ckp), "last_ckp", ""},
+ {LOGREC_TIME, SSZ(__txn_ckp_args, timestamp), "timestamp", ""},
+ {LOGREC_ARG, SSZ(__txn_ckp_args, envid), "envid", "%lu"},
+ {LOGREC_ARG, SSZ(__txn_ckp_args, spare), "spare", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_child_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_child_args, child), "child", "%lx"},
+ {LOGREC_POINTER, SSZ(__txn_child_args, c_lsn), "c_lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_xa_regop_42_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DBT, SSZ(__txn_xa_regop_42_args, xid), "xid", ""},
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, formatID), "formatID", "%ld"},
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, gtrid), "gtrid", "%lu"},
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, bqual), "bqual", "%lu"},
+ {LOGREC_POINTER, SSZ(__txn_xa_regop_42_args, begin_lsn), "begin_lsn", ""},
+ {LOGREC_LOCKS, SSZ(__txn_xa_regop_42_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_prepare_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_prepare_args, opcode), "opcode", "%lu"},
+ {LOGREC_DBT, SSZ(__txn_prepare_args, gid), "gid", ""},
+ {LOGREC_POINTER, SSZ(__txn_prepare_args, begin_lsn), "begin_lsn", ""},
+ {LOGREC_LOCKS, SSZ(__txn_prepare_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_recycle_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_recycle_args, min), "min", "%lu"},
+ {LOGREC_ARG, SSZ(__txn_recycle_args, max), "max", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __txn_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__txn_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_regop_recover, DB___txn_regop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_ckp_recover, DB___txn_ckp)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_child_recover, DB___txn_child)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_prepare_recover, DB___txn_prepare)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_recycle_recover, DB___txn_recycle)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/txn/txn_autop.c b/src/txn/txn_autop.c
new file mode 100644
index 00000000..0924a401
--- /dev/null
+++ b/src/txn/txn_autop.c
@@ -0,0 +1,175 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __txn_regop_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_regop_42", __txn_regop_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_regop_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_regop", __txn_regop_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_ckp_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_ckp_42", __txn_ckp_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_ckp_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_ckp", __txn_ckp_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_child_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_child_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_child", __txn_child_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_xa_regop_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_xa_regop_42", __txn_xa_regop_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_prepare_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_prepare_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_prepare", __txn_prepare_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_recycle_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_recycle_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_recycle", __txn_recycle_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__txn_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_regop_print, DB___txn_regop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_ckp_print, DB___txn_ckp)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_child_print, DB___txn_child)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_prepare_print, DB___txn_prepare)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_recycle_print, DB___txn_recycle)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/txn/txn_chkpt.c b/src/txn/txn_chkpt.c
new file mode 100644
index 00000000..73715b10
--- /dev/null
+++ b/src/txn/txn_chkpt.c
@@ -0,0 +1,419 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __txn_checkpoint_pp --
+ * ENV->txn_checkpoint pre/post processing.
+ *
+ * PUBLIC: int __txn_checkpoint_pp
+ * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__txn_checkpoint_pp(dbenv, kbytes, minutes, flags)
+ DB_ENV *dbenv;
+ u_int32_t kbytes, minutes, flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "txn_checkpoint", DB_INIT_TXN);
+
+ /*
+ * On a replication client, all transactions are read-only; therefore,
+	 * a checkpoint is a no-op.
+	 *
+	 * We permit txn_checkpoint, instead of rendering it illegal, so
+	 * that an application can let a checkpoint thread continue to
+	 * operate as it is promoted or demoted between being a master
+	 * and a client.
+ */
+ if (IS_REP_CLIENT(env))
+ return (0);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
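+
+/*
+ * Illustrative sketch, not part of the library: a dedicated checkpoint
+ * thread can call the public DB_ENV->txn_checkpoint method (which
+ * resolves to this function) unconditionally, relying on the client
+ * no-op above to keep working across promotions and demotions.  The
+ * handle name "dbenv" and the one-minute sleep are assumptions for
+ * the example.
+ *
+ *	for (;;) {
+ *		if ((ret = dbenv->txn_checkpoint(dbenv, 0, 0, 0)) != 0)
+ *			dbenv->err(dbenv, ret, "txn_checkpoint");
+ *		sleep(60);
+ *	}
+ */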
+
+/*
+ * __txn_checkpoint --
+ * ENV->txn_checkpoint.
+ *
+ * PUBLIC: int __txn_checkpoint
+ * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__txn_checkpoint(env, kbytes, minutes, flags)
+ ENV *env;
+ u_int32_t kbytes, minutes, flags;
+{
+ DB_LOG *dblp;
+ DB_LSN ckp_lsn, last_ckp, msg_lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ time_t last_ckp_time, now;
+ u_int32_t bytes, id, logflags, mbytes, op;
+ int ret;
+
+ ret = 0;
+
+ /*
+	 * A client will only call through here during recovery,
+	 * so just sync the Mpool and go home.  Since queue meta
+	 * pages are not rolled back, we want to be sure they are
+	 * clean in the cache prior to any transaction log
+	 * truncation due to syncup.
+ */
+ if (IS_REP_CLIENT(env)) {
+ if (MPOOL_ON(env) &&
+ (ret = __memp_sync(env, DB_SYNC_CHECKPOINT, NULL)) != 0) {
+ __db_err(env, ret, DB_STR("4518",
+ "txn_checkpoint: failed to flush the buffer cache"));
+ return (ret);
+ }
+ return (0);
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * No mutex is needed as envid is read-only once it is set.
+ */
+ id = renv->envid;
+
+ MUTEX_LOCK(env, region->mtx_ckp);
+ /*
+ * The checkpoint LSN is an LSN such that all transactions begun before
+ * it are complete. Our first guess (corrected below based on the list
+ * of active transactions) is the last-written LSN.
+ */
+ if ((ret = __log_current_lsn_int(env, &ckp_lsn, &mbytes, &bytes)) != 0)
+ goto err;
+
+ /*
+ * Save for possible use in START_SYNC message.
+ */
+ msg_lsn = ckp_lsn;
+ if (!LF_ISSET(DB_FORCE)) {
+ /* Don't checkpoint a quiescent database. */
+ if (bytes == 0 && mbytes == 0)
+ goto err;
+
+ /*
+ * If either kbytes or minutes is non-zero, then only take the
+ * checkpoint if more than "minutes" minutes have passed or if
+ * more than "kbytes" of log data have been written since the
+ * last checkpoint.
+ */
+ if (kbytes != 0 &&
+ mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes)
+ goto do_ckp;
+
+ if (minutes != 0) {
+ (void)time(&now);
+
+ TXN_SYSTEM_LOCK(env);
+ last_ckp_time = region->time_ckp;
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (now - last_ckp_time >= (time_t)(minutes * 60))
+ goto do_ckp;
+ }
+
+ /*
+ * If we checked time and data and didn't go to checkpoint,
+ * we're done.
+ */
+ if (minutes != 0 || kbytes != 0)
+ goto err;
+ }
+
+ /*
+	 * We must single-thread checkpoints, otherwise the ckp_lsn may get
+	 * out of order.  We need to capture the start of the earliest
+	 * currently active transaction (ckp_lsn) and then flush all buffers.
+	 * While doing this we could be overtaken by another checkpoint that
+	 * sees a later ckp_lsn but completes first.  An archive process
+	 * could then remove a log this checkpoint depends on.
+ */
+do_ckp:
+ if ((ret = __txn_getactive(env, &ckp_lsn)) != 0)
+ goto err;
+
+ /*
+ * Checkpoints in replication groups can cause performance problems.
+ *
+ * As on the master, checkpoint on the replica requires the cache be
+ * flushed. The problem occurs when a client has dirty cache pages
+ * to write when the checkpoint record arrives, and the client's PERM
+ * response is necessary in order to meet the system's durability
+ * guarantees. In this case, the master will have to wait until the
+ * client completes its cache flush and writes the checkpoint record
+ * before subsequent transactions can be committed. The delay may
+ * cause transactions to timeout waiting on client response, which
+ * can cause nasty ripple effects in the system's overall throughput.
+ * [#15338]
+ *
+ * First, we send a start-sync record when the checkpoint starts so
+ * clients can start flushing their cache in preparation for the
+ * arrival of the checkpoint record.
+ */
+ if (LOGGING_ON(env) && IS_REP_MASTER(env)) {
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * If repmgr is configured in the shared environment, but no
+ * send() function configured for this process, assume we have a
+ * replication-unaware process that wants to automatically
+ * participate in replication (i.e., sending replication
+ * messages to clients).
+ */
+ if (env->rep_handle->send == NULL &&
+ F_ISSET(env, ENV_THREAD) && APP_IS_REPMGR(env) &&
+ (ret = __repmgr_autostart(env)) != 0)
+ goto err;
+#endif
+ /*
+ * Send the LSN (saved in msg_lsn) where the sync starts
+ * on the master. Clients must have this LSN to assure that
+ * they have applied all txns up to this point.
+ */
+ if (env->rep_handle->send != NULL)
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_START_SYNC, &msg_lsn, NULL, 0, 0);
+ }
+
+ /* Flush the cache. */
+ if (MPOOL_ON(env) &&
+ (ret = __memp_sync_int(
+ env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) {
+ __db_err(env, ret, DB_STR("4519",
+ "txn_checkpoint: failed to flush the buffer cache"));
+ goto err;
+ }
+
+ /*
+ * The client won't have more dirty pages to flush from its cache than
+ * the master did, but there may be differences between the hardware,
+ * I/O configuration and workload on the master and the client that
+ * can result in the client being unable to finish its cache flush as
+	 * fast as the master.  A way to avoid the problem is to pause after
+	 * the master completes its cache flush and before the actual
+	 * checkpoint record is logged, giving the replicas additional time
+	 * to finish.
+ *
+ * !!!
+ * Currently turned off when testing, because it makes the test suite
+ * take a long time to run.
+ */
+#ifndef CONFIG_TEST
+ if (LOGGING_ON(env) &&
+ IS_REP_MASTER(env) && env->rep_handle->send != NULL &&
+ !LF_ISSET(DB_CKP_INTERNAL) &&
+ env->rep_handle->region->chkpt_delay != 0)
+ __os_yield(env, 0, env->rep_handle->region->chkpt_delay);
+#endif
+
+ /*
+ * Because we can't be a replication client here, and because
+ * recovery (somewhat unusually) calls txn_checkpoint and expects
+ * it to write a log message, LOGGING_ON is the correct macro here.
+ */
+ if (LOGGING_ON(env)) {
+ TXN_SYSTEM_LOCK(env);
+ last_ckp = region->last_ckp;
+ TXN_SYSTEM_UNLOCK(env);
+ /*
+ * Put out records for the open files before we log
+ * the checkpoint. The records are certain to be at
+ * or after ckp_lsn, but before the checkpoint record
+ * itself, so they're sure to be included if we start
+ * recovery from the ckp_lsn contained in this
+ * checkpoint.
+ */
+ logflags = DB_LOG_CHKPNT;
+ /*
+ * If this is a normal checkpoint, log files as checkpoints.
+ * If we are recovering, only log as DBREG_RCLOSE if
+ * there are no prepared txns. Otherwise, it should
+ * stay as DBREG_CHKPNT.
+ */
+ op = DBREG_CHKPNT;
+ if (!IS_RECOVERING(env))
+ logflags |= DB_FLUSH;
+ else if (region->stat.st_nrestores == 0)
+ op = DBREG_RCLOSE;
+ if ((ret = __dbreg_log_files(env, op)) != 0 ||
+ (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags,
+ &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) {
+ __db_err(env, ret, DB_STR_A("4520",
+ "txn_checkpoint: log failed at LSN [%ld %ld]",
+ "%ld %ld"),
+ (long)ckp_lsn.file, (long)ckp_lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0)
+ goto err;
+ }
+
+err: MUTEX_UNLOCK(env, region->mtx_ckp);
+ if (ret == 0 && lp->db_log_autoremove)
+ __log_autoremove(env);
+ return (ret);
+}
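+
+/*
+ * Worked instance of the kbytes test in __txn_checkpoint above
+ * (illustrative numbers only): with kbytes == 2048, a log position of
+ * mbytes == 2 and bytes == 512000 gives 2 * 1024 + 512000 / 1024 ==
+ * 2048 + 500 == 2548KB written since the last checkpoint, which is
+ * >= 2048, so the checkpoint proceeds.
+ */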
+
+/*
+ * __txn_getactive --
+ * Find the oldest active transaction and figure out its "begin" LSN.
+ * This is the lowest LSN we can checkpoint, since any record written
+ * after it may be involved in a transaction and may therefore need
+ * to be undone in the case of an abort.
+ *
+ * We check both the file and offset for 0 since the lsn may be in
+ * transition. If it is then we don't care about this txn because it
+ * must be starting after we set the initial value of lsnp in the caller.
+ * All txns must initialize their begin_lsn before writing to the log.
+ *
+ * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *));
+ */
+int
+__txn_getactive(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
+ if (td->begin_lsn.file != 0 &&
+ td->begin_lsn.offset != 0 &&
+ LOG_COMPARE(&td->begin_lsn, lsnp) < 0)
+ *lsnp = td->begin_lsn;
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __txn_getckp --
+ * Get the LSN of the last transaction checkpoint.
+ *
+ * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *));
+ */
+int
+__txn_getckp(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LSN lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+ lsn = region->last_ckp;
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (IS_ZERO_LSN(lsn))
+ return (DB_NOTFOUND);
+
+ *lsnp = lsn;
+ return (0);
+}
+
+/*
+ * __txn_updateckp --
+ * Update the last_ckp field in the transaction region. This happens
+ * at the end of a normal checkpoint and also when a replication client
+ * receives a checkpoint record.
+ *
+ * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *));
+ */
+int
+__txn_updateckp(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * We want to make sure last_ckp only moves forward; since we drop
+ * locks above and in log_put, it's possible for two calls to
+ * __txn_ckp_log to finish in a different order from how they were
+ * called.
+ */
+ TXN_SYSTEM_LOCK(env);
+ if (LOG_COMPARE(&region->last_ckp, lsnp) < 0) {
+ region->last_ckp = *lsnp;
+ (void)time(&region->time_ckp);
+ }
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
diff --git a/src/txn/txn_failchk.c b/src/txn/txn_failchk.c
new file mode 100644
index 00000000..b2007ad6
--- /dev/null
+++ b/src/txn/txn_failchk.c
@@ -0,0 +1,101 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+
+/*
+ * __txn_failchk --
+ * Check for transactions started by dead threads of control.
+ *
+ * PUBLIC: int __txn_failchk __P((ENV *));
+ */
+int
+__txn_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_TXN *ktxn, *txn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *ktd, *td;
+ db_threadid_t tid;
+ int ret;
+ char buf[DB_THREADID_STRLEN];
+ pid_t pid;
+
+ mgr = env->tx_handle;
+ dbenv = env->dbenv;
+ region = mgr->reginfo.primary;
+
+retry: TXN_SYSTEM_LOCK(env);
+
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail) {
+ /*
+ * If this is a child transaction, skip it.
+ * The parent will take care of it.
+ */
+ if (td->parent != INVALID_ROFF)
+ continue;
+ /*
+ * If the txn is prepared, then it does not matter
+ * what the state of the thread is.
+ */
+ if (td->status == TXN_PREPARED)
+ continue;
+
+ /* If the thread is still alive, it's not a problem. */
+ if (dbenv->is_alive(dbenv, td->pid, td->tid, 0))
+ continue;
+
+ if (F_ISSET(td, TXN_DTL_INMEMORY)) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (__db_failed(env, DB_STR("4501",
+ "Transaction has in memory logs"),
+ td->pid, td->tid));
+ }
+
+ /* Abort the transaction. */
+ TXN_SYSTEM_UNLOCK(env);
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0)
+ return (ret);
+ if ((ret = __txn_continue(env, txn, td, NULL, 1)) != 0)
+ return (ret);
+ SH_TAILQ_FOREACH(ktd, &td->kids, klinks, __txn_detail) {
+ if (F_ISSET(ktd, TXN_DTL_INMEMORY))
+ return (__db_failed(env, DB_STR("4502",
+ "Transaction has in memory logs"),
+ td->pid, td->tid));
+ if ((ret =
+ __os_calloc(env, 1, sizeof(DB_TXN), &ktxn)) != 0)
+ return (ret);
+ if ((ret =
+ __txn_continue(env, ktxn, ktd, NULL, 1)) != 0)
+ return (ret);
+ ktxn->parent = txn;
+ ktxn->mgrp = txn->mgrp;
+ TAILQ_INSERT_HEAD(&txn->kids, ktxn, klinks);
+ }
+ pid = td->pid;
+ tid = td->tid;
+ (void)dbenv->thread_id_string(dbenv, pid, tid, buf);
+ __db_msg(env, DB_STR_A("4503",
+ "Aborting txn %#lx: %s", "%#lx %s"),
+ (u_long)txn->txnid, buf);
+ if ((ret = __txn_abort(txn)) != 0)
+ return (__db_failed(env, DB_STR("4504",
+ "Transaction abort failed"), pid, tid));
+ goto retry;
+ }
+
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
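+
+/*
+ * Illustrative sketch, not part of the library: an application enables
+ * this path by registering an is_alive callback and calling the public
+ * DB_ENV->failchk method from a surviving thread.  The callback body,
+ * the thread count, and the handle name "dbenv" are assumptions for
+ * the example.
+ *
+ *	int
+ *	my_is_alive(DB_ENV *dbenv, pid_t pid, db_threadid_t tid,
+ *	    u_int32_t flags)
+ *	{
+ *		return (kill(pid, 0) == 0);	(process liveness only)
+ *	}
+ *
+ *	(void)dbenv->set_thread_count(dbenv, 128);
+ *	(void)dbenv->set_isalive(dbenv, my_is_alive);
+ *	(open the environment with DB_THREAD, then periodically:)
+ *	if ((ret = dbenv->failchk(dbenv, 0)) != 0)
+ *		(a dead thread left unrecoverable state; run recovery)
+ */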
diff --git a/src/txn/txn_method.c b/src/txn/txn_method.c
new file mode 100644
index 00000000..629eac04
--- /dev/null
+++ b/src/txn/txn_method.c
@@ -0,0 +1,124 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+
+/*
+ * __txn_env_create --
+ * Transaction specific initialization of the DB_ENV structure.
+ *
+ * PUBLIC: int __txn_env_create __P((DB_ENV *));
+ */
+int
+__txn_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state or acquire a mutex in the DB_ENV create path.
+ */
+ dbenv->tx_max = 0;
+
+ return (0);
+}
+
+/*
+ * __txn_env_destroy --
+ * Transaction specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __txn_env_destroy __P((DB_ENV *));
+ */
+void
+__txn_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, NULL);
+}
+
+/*
+ * PUBLIC: int __txn_get_tx_max __P((DB_ENV *, u_int32_t *));
+ */
+int
+__txn_get_tx_max(dbenv, tx_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *tx_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->tx_handle, "DB_ENV->get_tx_max", DB_INIT_TXN);
+
+ if (TXN_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *tx_maxp = ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->maxtxns;
+ } else
+ *tx_maxp = dbenv->tx_max;
+ return (0);
+}
+
+/*
+ * __txn_set_tx_max --
+ * DB_ENV->set_tx_max.
+ *
+ * PUBLIC: int __txn_set_tx_max __P((DB_ENV *, u_int32_t));
+ */
+int
+__txn_set_tx_max(dbenv, tx_max)
+ DB_ENV *dbenv;
+ u_int32_t tx_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_tx_max");
+
+ dbenv->tx_max = tx_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __txn_get_tx_timestamp __P((DB_ENV *, time_t *));
+ */
+int
+__txn_get_tx_timestamp(dbenv, timestamp)
+ DB_ENV *dbenv;
+ time_t *timestamp;
+{
+ *timestamp = dbenv->tx_timestamp;
+ return (0);
+}
+
+/*
+ * __txn_set_tx_timestamp --
+ * Set the transaction recovery timestamp.
+ *
+ * PUBLIC: int __txn_set_tx_timestamp __P((DB_ENV *, time_t *));
+ */
+int
+__txn_set_tx_timestamp(dbenv, timestamp)
+ DB_ENV *dbenv;
+ time_t *timestamp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_tx_timestamp");
+
+ dbenv->tx_timestamp = *timestamp;
+ return (0);
+}
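+
+/*
+ * Illustrative sketch, not part of the library: the setters above must
+ * run on the DB_ENV handle before DB_ENV->open, as enforced by
+ * ENV_ILLEGAL_AFTER_OPEN; the getters may run at any time.  Handle and
+ * variable names are assumptions for the example.
+ *
+ *	u_int32_t max;
+ *
+ *	if ((ret = dbenv->set_tx_max(dbenv, 1000)) != 0)
+ *		goto err;
+ *	(... dbenv->open(...) ...)
+ *	if ((ret = dbenv->get_tx_max(dbenv, &max)) != 0)
+ *		goto err;
+ */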
diff --git a/src/txn/txn_rec.c b/src/txn/txn_rec.c
new file mode 100644
index 00000000..b39d56d1
--- /dev/null
+++ b/src/txn/txn_rec.c
@@ -0,0 +1,616 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+/*
+ * PUBLIC: int __txn_regop_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * These records are only ever written for commits. Normally, we redo any
+ * committed transaction; however, if we are doing recovery to a timestamp,
+ * we may treat transactions that committed after the timestamp as aborted.
+ */
+int
+__txn_regop_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_regop_args *argp;
+ DB_TXNHEAD *headp;
+ int ret;
+ u_int32_t status;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_regop_print(env, dbtp, lsnp, op, info);
+#endif
+
+ if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ headp = info;
+ /*
+ * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
+ * We check for the former explicitly and the last two clauses
+ * apply to the BACKWARD_ROLL case.
+ */
+
+ if (op == DB_TXN_FORWARD_ROLL) {
+ /*
+ * If this was a 2-phase-commit transaction, then it
+ * might already have been removed from the list, and
+ * that's OK. Ignore the return code from remove.
+ */
+ if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
+ goto err;
+ } else if ((env->dbenv->tx_timestamp != 0 &&
+ argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
+ (!IS_ZERO_LSN(headp->trunc_lsn) &&
+ LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
+ /*
+ * We failed either the timestamp check or the trunc_lsn check,
+ * so we treat this as an abort even if it was a commit record.
+ */
+ if ((ret = __db_txnlist_update(env, info,
+ argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
+ goto err;
+ else if (status != TXN_IGNORE && status != TXN_OK)
+ goto err;
+ } else {
+ /* This is a normal commit; mark it appropriately. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->txnp->txnid, argp->opcode, lsnp,
+ &status, 0)) == DB_NOTFOUND) {
+ if ((ret = __db_txnlist_add(env,
+ info, argp->txnp->txnid,
+ argp->opcode == TXN_ABORT ?
+ TXN_IGNORE : argp->opcode, lsnp)) != 0)
+ goto err;
+ } else if (ret != 0 ||
+ (status != TXN_IGNORE && status != TXN_OK))
+ goto err;
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+ if (0) {
+err: __db_errx(env, DB_STR_A("4514",
+ "txnid %lx commit record found, already on commit list",
+ "%lx"), (u_long)argp->txnp->txnid);
+ ret = EINVAL;
+ }
+ __os_free(env, argp);
+
+ return (ret);
+}
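+
+/*
+ * Illustrative sketch, not part of the library: the timestamp check
+ * above is driven by the public DB_ENV->set_tx_timestamp method, set
+ * before recovery runs.  The home path and flag choice are assumptions
+ * for the example.
+ *
+ *	time_t stamp;
+ *
+ *	stamp = ...;	(the desired recovery point)
+ *	if ((ret = dbenv->set_tx_timestamp(dbenv, &stamp)) != 0)
+ *		goto err;
+ *	if ((ret = dbenv->open(dbenv, "/db/home", DB_CREATE |
+ *	    DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER_FATAL,
+ *	    0)) != 0)
+ *		goto err;
+ */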
+
+/*
+ * PUBLIC: int __txn_prepare_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * These records are only ever written for prepares.
+ */
+int
+__txn_prepare_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_prepare_args *argp;
+ DBT *lock_dbt;
+ DB_TXNHEAD *headp;
+ DB_LOCKTAB *lt;
+ u_int32_t status;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_prepare_print(env, dbtp, lsnp, op, info);
+#endif
+
+ if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ if (argp->opcode != TXN_PREPARE && argp->opcode != TXN_ABORT) {
+ ret = EINVAL;
+ goto err;
+ }
+ headp = info;
+
+ /*
+	 * The return value here is either DB_NOTFOUND or the
+	 * transaction status from the list.  It is not a normal
+ * error return, so we must make sure that in each of the
+ * cases below, we overwrite the ret value so we return
+ * appropriately.
+ */
+ ret = __db_txnlist_find(env, info, argp->txnp->txnid, &status);
+
+ /*
+ * If we are rolling forward, then an aborted prepare
+ * indicates that this may be the last record we'll see for
+ * this transaction ID, so we should remove it from the list.
+ */
+
+ if (op == DB_TXN_FORWARD_ROLL) {
+ if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != 0)
+ goto txn_err;
+ } else if (op == DB_TXN_BACKWARD_ROLL && status == TXN_PREPARE) {
+ /*
+ * On the backward pass, we have four possibilities:
+ * 1. The transaction is already committed, no-op.
+ * 2. The transaction is already aborted, no-op.
+ * 3. The prepare failed and was aborted, mark as abort.
+ * 4. The transaction is neither committed nor aborted.
+ * Treat this like a commit and roll forward so that
+ * the transaction can be resurrected in the region.
+ * We handle cases 3 and 4 here; cases 1 and 2
+ * are the final clause below.
+ */
+ if (argp->opcode == TXN_ABORT) {
+ if ((ret = __db_txnlist_update(env,
+ info, argp->txnp->txnid,
+ TXN_ABORT, NULL, &status, 0)) != 0 &&
+ status != TXN_PREPARE)
+ goto txn_err;
+ ret = 0;
+ }
+ /*
+		 * This is a prepared, but not yet committed, transaction.  We
+ * need to add it to the transaction list, so that it gets
+ * rolled forward. We also have to add it to the region's
+ * internal state so it can be properly aborted or committed
+ * after recovery (see txn_recover).
+ */
+ else if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != 0) {
+txn_err: __db_errx(env,
+ DB_STR_A("4515",
+ "transaction not in list %lx", "%lx"),
+ (u_long)argp->txnp->txnid);
+ ret = DB_NOTFOUND;
+ } else if (IS_ZERO_LSN(headp->trunc_lsn) ||
+ LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) {
+ if ((ret = __db_txnlist_add(env,
+ info, argp->txnp->txnid, TXN_COMMIT, lsnp)) == 0) {
+ /* Re-acquire the locks for this transaction. */
+ lock_dbt = &argp->locks;
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ if ((ret = __lock_getlocker(lt,
+ argp->txnp->txnid, 1,
+ &argp->txnp->locker)) != 0)
+ goto err;
+ if ((ret = __lock_get_list(env,
+ argp->txnp->locker, 0,
+ DB_LOCK_WRITE, lock_dbt)) != 0)
+ goto err;
+ }
+
+ ret = __txn_restore_txn(env, lsnp, argp);
+ }
+ }
+ } else
+ ret = 0;
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+err: __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_ckp_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_ckp_args *argp;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ __txn_ckp_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ if (op == DB_TXN_BACKWARD_ROLL)
+ __db_txnlist_ckp(env, info, lsnp);
+
+ *lsnp = argp->last_ckp;
+ __os_free(env, argp);
+ return (DB_TXN_CKP);
+}
+
+/*
+ * __txn_child_recover
+ * Recover a commit record for a child transaction.
+ *
+ * PUBLIC: int __txn_child_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_child_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_child_args *argp;
+ u_int32_t c_stat, p_stat, tmpstat;
+ int ret, t_ret;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_child_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * This is a record in a PARENT's log trail indicating that a
+	 * child committed.  If we are aborting, return the child's last
+ * record's LSN. If we are in recovery, then if the
+ * parent is committing, we set ourselves up to commit, else
+ * we do nothing.
+ */
+ if (op == DB_TXN_ABORT) {
+ *lsnp = argp->c_lsn;
+ ret = __db_txnlist_lsnadd(env, info, &argp->prev_lsn);
+ goto out;
+ } else if (op == DB_TXN_BACKWARD_ROLL) {
+ /* Child might exist -- look for it. */
+ ret = __db_txnlist_find(env, info, argp->child, &c_stat);
+ t_ret =
+ __db_txnlist_find(env, info, argp->txnp->txnid, &p_stat);
+ if (ret != 0 && ret != DB_NOTFOUND)
+ goto out;
+ if (t_ret != 0 && t_ret != DB_NOTFOUND) {
+ ret = t_ret;
+ goto out;
+ }
+ /*
+ * If the parent is in state COMMIT or IGNORE, then we apply
+ * that to the child, else we need to abort the child.
+ */
+
+ if (ret == DB_NOTFOUND ||
+ c_stat == TXN_OK || c_stat == TXN_COMMIT) {
+ if (t_ret == DB_NOTFOUND ||
+ (p_stat != TXN_COMMIT && p_stat != TXN_IGNORE))
+ c_stat = TXN_ABORT;
+ else
+ c_stat = p_stat;
+
+ if (ret == DB_NOTFOUND)
+ ret = __db_txnlist_add(env,
+ info, argp->child, c_stat, NULL);
+ else
+ ret = __db_txnlist_update(env, info,
+ argp->child, c_stat, NULL, &tmpstat, 0);
+ } else if (c_stat == TXN_EXPECTED) {
+ /*
+ * The open after this create succeeded. If the
+ * parent succeeded, we don't want to redo; if the
+ * parent aborted, we do want to undo.
+ */
+ switch (p_stat) {
+ case TXN_COMMIT:
+ case TXN_IGNORE:
+ c_stat = TXN_IGNORE;
+ break;
+ default:
+ c_stat = TXN_ABORT;
+ }
+ ret = __db_txnlist_update(env,
+ info, argp->child, c_stat, NULL, &tmpstat, 0);
+ } else if (c_stat == TXN_UNEXPECTED) {
+ /*
+ * The open after this create failed. If the parent
+ * is rolling forward, we need to roll forward. If
+ * the parent failed, then we do not want to abort
+ * (because the file may not be the one in which we
+ * are interested).
+ */
+ ret = __db_txnlist_update(env, info, argp->child,
+ p_stat == TXN_COMMIT ? TXN_COMMIT : TXN_IGNORE,
+ NULL, &tmpstat, 0);
+ }
+ } else if (op == DB_TXN_OPENFILES) {
+ /*
+ * If we have a partial subtransaction, then the whole
+ * transaction should be ignored.
+ */
+ if ((ret = __db_txnlist_find(env,
+ info, argp->child, &c_stat)) == DB_NOTFOUND)
+ ret = __db_txnlist_update(env, info,
+ argp->txnp->txnid, TXN_IGNORE,
+ NULL, &p_stat, 1);
+ } else if (DB_REDO(op)) {
+ /* Forward Roll */
+ if ((ret =
+ __db_txnlist_remove(env, info, argp->child)) != 0)
+ __db_errx(env, DB_STR_A("4516",
+ "Transaction not in list %x", "%x"), argp->child);
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+out: __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * __txn_restore_txn --
+ *	Used only during XA recovery.  If we find any transactions that are
+ * prepared, but not yet committed, then we need to restore the transaction's
+ * state into the shared region, because the TM is going to issue an abort
+ * or commit and we need to respond correctly.
+ *
+ * lsnp is the LSN of the prepare record
+ * argp is the prepare record (in an appropriate structure)
+ *
+ * PUBLIC: int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_prepare_args *));
+ */
+int
+__txn_restore_txn(env, lsnp, argp)
+ ENV *env;
+ DB_LSN *lsnp;
+ __txn_prepare_args *argp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ int ret;
+
+ if (argp->gid.size == 0)
+ return (0);
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ TXN_SYSTEM_LOCK(env);
+
+ /* Allocate a new transaction detail structure. */
+ if ((ret = __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ /* Place transaction on active transaction list. */
+ SH_TAILQ_INSERT_HEAD(&region->active_txn, td, links, __txn_detail);
+ region->curtxns++;
+
+ td->txnid = argp->txnp->txnid;
+ __os_id(env->dbenv, &td->pid, &td->tid);
+ td->last_lsn = *lsnp;
+ td->begin_lsn = argp->begin_lsn;
+ td->parent = INVALID_ROFF;
+ td->name = INVALID_ROFF;
+ SH_TAILQ_INIT(&td->kids);
+ MAX_LSN(td->read_lsn);
+ MAX_LSN(td->visible_lsn);
+ td->mvcc_ref = 0;
+ td->mvcc_mtx = MUTEX_INVALID;
+ td->status = TXN_PREPARED;
+ td->flags = TXN_DTL_RESTORED;
+ memcpy(td->gid, argp->gid.data, argp->gid.size);
+ td->nlog_dbs = 0;
+ td->nlog_slots = TXN_NSLOTS;
+ td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots);
+
+ region->stat.st_nrestores++;
+#ifdef HAVE_STATISTICS
+ STAT_INC(env, txn, nactive, region->stat.st_nactive, td->txnid);
+ if (region->stat.st_nactive > region->stat.st_maxnactive)
+ STAT_SET(env, txn, maxnactive, region->stat.st_maxnactive,
+ region->stat.st_nactive, td->txnid);
+#endif
+ TXN_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __txn_recycle_recover --
+ * Recovery function for recycle.
+ *
+ * PUBLIC: int __txn_recycle_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_recycle_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_recycle_args *argp;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+	(void)__txn_recycle_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ COMPQUIET(lsnp, NULL);
+
+ if ((ret = __db_txnlist_gen(env, info,
+ DB_UNDO(op) ? -1 : 1, argp->min, argp->max)) != 0)
+ return (ret);
+
+ __os_free(env, argp);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __txn_regop_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * These records are only ever written for commits. Normally, we redo any
+ * committed transaction; however, if we are doing recovery to a timestamp,
+ * we may treat transactions that committed after the timestamp as aborted.
+ */
+int
+__txn_regop_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_regop_42_args *argp;
+ DB_TXNHEAD *headp;
+ u_int32_t status;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_regop_42_print(env, dbtp, lsnp, op, info);
+#endif
+
+ if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ headp = info;
+ /*
+ * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
+ * We check for the former explicitly and the last two clauses
+ * apply to the BACKWARD_ROLL case.
+ */
+
+ if (op == DB_TXN_FORWARD_ROLL) {
+ /*
+ * If this was a 2-phase-commit transaction, then it
+ * might already have been removed from the list, and
+ * that's OK. Ignore the return code from remove.
+ */
+ if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
+ goto err;
+ } else if ((env->dbenv->tx_timestamp != 0 &&
+ argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
+ (!IS_ZERO_LSN(headp->trunc_lsn) &&
+ LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
+ /*
+ * We failed either the timestamp check or the trunc_lsn check,
+ * so we treat this as an abort even if it was a commit record.
+ */
+ if ((ret = __db_txnlist_update(env, info,
+ argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
+ goto err;
+ else if (status != TXN_IGNORE && status != TXN_OK)
+ goto err;
+ } else {
+ /* This is a normal commit; mark it appropriately. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->txnp->txnid, argp->opcode, lsnp,
+ &status, 0)) == DB_NOTFOUND) {
+ if ((ret = __db_txnlist_add(env,
+ info, argp->txnp->txnid,
+ argp->opcode == TXN_ABORT ?
+ TXN_IGNORE : argp->opcode, lsnp)) != 0)
+ goto err;
+ } else if (ret != 0 ||
+ (status != TXN_IGNORE && status != TXN_OK))
+ goto err;
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+ if (0) {
+err: __db_errx(env, DB_STR_A("4517",
+ "txnid %lx commit record found, already on commit list",
+ "%lx"), (u_long)argp->txnp->txnid);
+ ret = EINVAL;
+ }
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_ckp_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_ckp_42_args *argp;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ __txn_ckp_42_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ if (op == DB_TXN_BACKWARD_ROLL)
+ __db_txnlist_ckp(env, info, lsnp);
+
+ *lsnp = argp->last_ckp;
+ __os_free(env, argp);
+ return (DB_TXN_CKP);
+}
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
new file mode 100644
index 00000000..67f24439
--- /dev/null
+++ b/src/txn/txn_recover.c
@@ -0,0 +1,317 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc_auto/db_auto.h"
+#include "dbinc_auto/crdel_auto.h"
+#include "dbinc_auto/db_ext.h"
+
+/*
+ * __txn_recover_pp --
+ * ENV->txn_recover pre/post processing.
+ *
+ * PUBLIC: int __txn_recover_pp __P((DB_ENV *,
+ * PUBLIC: DB_PREPLIST *, long, long *, u_int32_t));
+ */
+int
+__txn_recover_pp(dbenv, preplist, count, retp, flags)
+ DB_ENV *dbenv;
+ DB_PREPLIST *preplist;
+ long count, *retp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(
+ env, env->tx_handle, "txn_recover", DB_INIT_TXN);
+
+ if (F_ISSET((DB_TXNREGION *)env->tx_handle->reginfo.primary,
+ TXN_IN_RECOVERY)) {
+ __db_errx(env, DB_STR("4505",
+ "operation not permitted while in recovery"));
+ return (EINVAL);
+ }
+
+ if (flags != DB_FIRST && flags != DB_NEXT)
+ return (__db_ferr(env, "DB_ENV->txn_recover", 0));
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__txn_recover(env, preplist, count, retp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_recover --
+ * ENV->txn_recover.
+ *
+ * PUBLIC: int __txn_recover __P((ENV *,
+ * PUBLIC: DB_PREPLIST *, long, long *, u_int32_t));
+ */
+int
+__txn_recover(env, txns, count, retp, flags)
+ ENV *env;
+ DB_PREPLIST *txns;
+ long count, *retp;
+ u_int32_t flags;
+{
+ /*
+ * Public API to retrieve the list of prepared, but not yet committed
+ * transactions. See __txn_get_prepared for details. This function
+ * and __db_xa_recover both wrap that one.
+ */
+ return (__txn_get_prepared(env,
+ NULL, txns, count, retp, flags));
+}
+
+/*
+ * __txn_get_prepared --
+ * Returns a list of prepared (and for XA, heuristically completed)
+ * transactions (less than or equal to the count parameter). One of
+ * xids or txns must be set to point to an array of the appropriate type.
+ * The count parameter indicates the number of entries in the xids and/or
+ * txns array. The retp parameter will be set to indicate the number of
+ * entries returned in the xids/txns array. Flags indicates the operation,
+ * one of DB_FIRST or DB_NEXT.
+ *
+ * PUBLIC: int __txn_get_prepared __P((ENV *,
+ * PUBLIC: XID *, DB_PREPLIST *, long, long *, u_int32_t));
+ */
+int
+__txn_get_prepared(env, xids, txns, count, retp, flags)
+ ENV *env;
+ XID *xids;
+ DB_PREPLIST *txns;
+ long count; /* This is long for XA compatibility. */
+ long *retp;
+ u_int32_t flags;
+{
+ DB_LSN min;
+ DB_PREPLIST *prepp;
+ DB_THREAD_INFO *ip;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ XID *xidp;
+ long i;
+ int restored, ret;
+
+ *retp = 0;
+ MAX_LSN(min);
+ prepp = txns;
+ xidp = xids;
+ restored = ret = 0;
+
+ /*
+ * If we are starting a scan, then we traverse the active transaction
+ * list once making sure that all transactions are marked as not having
+ * been collected. Then on each pass, we mark the ones we collected
+ * so that if we cannot collect them all at once, we can finish up
+ * next time with a continue.
+ */
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * During this pass we need to figure out if we are going to need
+ * to open files. We need to open files if we've never collected
+ * before (in which case, none of the COLLECTED bits will be set)
+ * and the ones that we are collecting are restored (if they aren't
+ * restored, then we never crashed; just the main server did).
+ */
+ TXN_SYSTEM_LOCK(env);
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Now begin collecting active transactions. */
+ for (td = SH_TAILQ_FIRST(&region->active_txn, __txn_detail);
+ td != NULL && *retp < count;
+ td = SH_TAILQ_NEXT(td, links, __txn_detail)) {
+ if (td->status != TXN_PREPARED ||
+ (flags != DB_FIRST && F_ISSET(td, TXN_DTL_COLLECTED)))
+ continue;
+
+ if (F_ISSET(td, TXN_DTL_RESTORED))
+ restored = 1;
+
+ if (xids != NULL) {
+ xidp->formatID = td->format;
+ /*
+			 * The XID structure uses longs; we use u_int32_t's
+			 * as we log them to disk.  Cast them to make the
+			 * conversion explicit.
+ */
+ xidp->gtrid_length = (long)td->gtrid;
+ xidp->bqual_length = (long)td->bqual;
+ memcpy(xidp->data, td->gid, sizeof(td->gid));
+ xidp++;
+ }
+
+ if (txns != NULL) {
+ if ((ret = __os_calloc(env,
+ 1, sizeof(DB_TXN), &prepp->txn)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ goto err;
+ }
+ prepp->txn->td = td;
+ memcpy(prepp->gid, td->gid, sizeof(td->gid));
+ prepp++;
+ }
+
+ if (!IS_ZERO_LSN(td->begin_lsn) &&
+ LOG_COMPARE(&td->begin_lsn, &min) < 0)
+ min = td->begin_lsn;
+
+ (*retp)++;
+ F_SET(td, TXN_DTL_COLLECTED);
+ }
+ if (flags == DB_FIRST)
+ for (; td != NULL; td = SH_TAILQ_NEXT(td, links, __txn_detail))
+ F_CLR(td, TXN_DTL_COLLECTED);
+ TXN_SYSTEM_UNLOCK(env);
+
+ /*
+ * Now link all the transactions into the transaction manager's list.
+ */
+ if (txns != NULL && *retp != 0) {
+ MUTEX_LOCK(env, mgr->mutex);
+ for (i = 0; i < *retp; i++) {
+ if ((ret = __txn_continue(env,
+ txns[i].txn, txns[i].txn->td, ip, 0)) != 0)
+ goto err;
+ F_SET(txns[i].txn, TXN_MALLOC);
+ if (F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
+ F_SET(txns[i].txn, TXN_NOSYNC);
+ else if (F_ISSET(env->dbenv, DB_ENV_TXN_WRITE_NOSYNC))
+ F_SET(txns[i].txn, TXN_WRITE_NOSYNC);
+ else
+ F_SET(txns[i].txn, TXN_SYNC);
+ TAILQ_INSERT_TAIL(&mgr->txn_chain, txns[i].txn, links);
+ }
+ MUTEX_UNLOCK(env, mgr->mutex);
+
+ /*
+ * If we are restoring, update our count of outstanding
+ * transactions.
+ */
+ if (REP_ON(env)) {
+ REP_SYSTEM_LOCK(env);
+ env->rep_handle->region->op_cnt += (u_long)*retp;
+ REP_SYSTEM_UNLOCK(env);
+ }
+
+ }
+
+ /* If recovery already opened the files for us, don't do it here. */
+ if (restored != 0 && flags == DB_FIRST &&
+ !F_ISSET(env->lg_handle, DBLOG_OPENFILES))
+ ret = __txn_openfiles(env, ip, &min, 0);
+
+ if (0) {
+err: TXN_SYSTEM_UNLOCK(env);
+ }
+ return (ret);
+}
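+
+/*
+ * Illustrative sketch, not part of the library: a transaction manager
+ * resolving prepared transactions after a crash might drain the list
+ * with the public DB_ENV->txn_recover method; the array size and the
+ * decision to abort everything are assumptions for the example.
+ *
+ *	DB_PREPLIST prep[32];
+ *	long count, i;
+ *
+ *	if ((ret = dbenv->txn_recover(dbenv,
+ *	    prep, 32, &count, DB_FIRST)) != 0)
+ *		goto err;
+ *	for (i = 0; i < count; i++)
+ *		(void)prep[i].txn->abort(prep[i].txn);
+ *
+ * A subsequent call with DB_NEXT continues the scan where the first
+ * call left off, using the TXN_DTL_COLLECTED marking described above.
+ */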
+
+/*
+ * __txn_openfiles --
+ * Call env_openfiles.
+ *
+ * PUBLIC: int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int));
+ */
+int
+__txn_openfiles(env, ip, min, force)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_LSN *min;
+ int force;
+{
+ DBT data;
+ DB_LOGC *logc;
+ DB_LSN open_lsn;
+ DB_TXNHEAD *txninfo;
+ __txn_ckp_args *ckp_args;
+ int ret, t_ret;
+
+ /*
+ * Figure out the last checkpoint before the smallest
+ * start_lsn in the region.
+ */
+ logc = NULL;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ memset(&data, 0, sizeof(data));
+ if ((ret = __txn_getckp(env, &open_lsn)) == 0)
+ while (!IS_ZERO_LSN(open_lsn) && (ret =
+ __logc_get(logc, &open_lsn, &data, DB_SET)) == 0 &&
+ (force ||
+ (min != NULL && LOG_COMPARE(min, &open_lsn) < 0))) {
+ /* Format the log record. */
+ if ((ret = __txn_ckp_read(
+ env, data.data, &ckp_args)) != 0) {
+ __db_errx(env, DB_STR_A("4506",
+ "Invalid checkpoint record at [%lu][%lu]",
+ "%lu %lu"), (u_long)open_lsn.file,
+ (u_long)open_lsn.offset);
+ goto err;
+ }
+ /*
+ * If force is set, then we're forcing ourselves
+ * to go back far enough to open files.
+ * Use ckp_lsn and then break out of the loop.
+ */
+ open_lsn = force ? ckp_args->ckp_lsn :
+ ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ if (force) {
+ if ((ret = __logc_get(logc, &open_lsn,
+ &data, DB_SET)) != 0)
+ goto err;
+ break;
+ }
+ }
+
+ /*
+ * There are several ways by which we may have gotten here.
+ * - We got a DB_NOTFOUND -- we need to read the first
+ * log record.
+ * - We found a checkpoint before min. We're done.
+ *	- We found a checkpoint after min whose last_ckp is 0. We
+ * need to start at the beginning of the log.
+ * - We are forcing an openfiles and we have our ckp_lsn.
+ */
+ if ((ret == DB_NOTFOUND || IS_ZERO_LSN(open_lsn)) && (ret =
+ __logc_get(logc, &open_lsn, &data, DB_FIRST)) != 0) {
+ __db_errx(env, DB_STR("4507", "No log records"));
+ goto err;
+ }
+
+ if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
+ goto err;
+ ret = __env_openfiles(
+ env, logc, txninfo, &data, &open_lsn, NULL, (double)0, 0);
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+err:
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/txn/txn_region.c b/src/txn/txn_region.c
new file mode 100644
index 00000000..6f43d45f
--- /dev/null
+++ b/src/txn/txn_region.c
@@ -0,0 +1,518 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __txn_init __P((ENV *, DB_TXNMGR *));
+
+/*
+ * __txn_open --
+ * Open a transaction region.
+ *
+ * PUBLIC: int __txn_open __P((ENV *));
+ */
+int
+__txn_open(env)
+ ENV *env;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int ret;
+
+ /* Create/initialize the transaction manager structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &mgr)) != 0)
+ return (ret);
+ TAILQ_INIT(&mgr->txn_chain);
+ mgr->env = env;
+
+ /* Join/create the txn region. */
+ if ((ret = __env_region_share(env, &mgr->reginfo)) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&mgr->reginfo, REGION_CREATE))
+ if ((ret = __txn_init(env, mgr)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ region = mgr->reginfo.primary =
+ R_ADDR(&mgr->reginfo,
+ ((REGENV *)env->reginfo->primary)->tx_primary);
+
+ /* If threaded, acquire a mutex to protect the active TXN list. */
+ if ((ret = __mutex_alloc(
+ env, MTX_TXN_ACTIVE, DB_MUTEX_PROCESS_ONLY, &mgr->mutex)) != 0)
+ goto err;
+
+ mgr->reginfo.mtx_alloc = region->mtx_region;
+ env->tx_handle = mgr;
+ return (0);
+
+err: env->tx_handle = NULL;
+ if (mgr->reginfo.addr != NULL)
+ (void)__env_region_detach(env, &mgr->reginfo, 0);
+
+ (void)__mutex_free(env, &mgr->mutex);
+ __os_free(env, mgr);
+ return (ret);
+}
+
+/*
+ * __txn_init --
+ * Initialize a transaction region in shared memory.
+ */
+static int
+__txn_init(env, mgr)
+ ENV *env;
+ DB_TXNMGR *mgr;
+{
+ DB_ENV *dbenv;
+ DB_LSN last_ckp;
+ DB_TXNREGION *region;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Find the last checkpoint in the log.
+ */
+ ZERO_LSN(last_ckp);
+ if (LOGGING_ON(env)) {
+ /*
+ * The log system has already walked through the last
+ * file. Get the LSN of a checkpoint it may have found.
+ */
+ if ((ret = __log_get_cached_ckp_lsn(env, &last_ckp)) != 0)
+ return (ret);
+
+ /*
+ * If that didn't work, look backwards from the beginning of
+ * the last log file until we find the last checkpoint.
+ */
+ if (IS_ZERO_LSN(last_ckp) &&
+ (ret = __txn_findlastckp(env, &last_ckp, NULL)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __env_alloc(&mgr->reginfo,
+ sizeof(DB_TXNREGION), &mgr->reginfo.primary)) != 0) {
+ __db_errx(env, DB_STR("4508",
+ "Unable to allocate memory for the transaction region"));
+ return (ret);
+ }
+ ((REGENV *)env->reginfo->primary)->tx_primary =
+ R_OFFSET(&mgr->reginfo, mgr->reginfo.primary);
+ region = mgr->reginfo.primary;
+ memset(region, 0, sizeof(*region));
+
+ /* We share the region so we need the same mutex. */
+ region->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
+ mgr->reginfo.mtx_alloc = region->mtx_region;
+
+ region->maxtxns = dbenv->tx_max;
+ region->inittxns = dbenv->tx_init;
+ region->last_txnid = TXN_MINIMUM;
+ region->cur_maxid = TXN_MAXIMUM;
+
+ if ((ret = __mutex_alloc(
+ env, MTX_TXN_CHKPT, 0, &region->mtx_ckp)) != 0)
+ return (ret);
+ region->last_ckp = last_ckp;
+ region->time_ckp = time(NULL);
+
+ memset(&region->stat, 0, sizeof(region->stat));
+#ifdef HAVE_STATISTICS
+ region->stat.st_maxtxns = region->maxtxns;
+ region->stat.st_inittxns = region->inittxns;
+#endif
+
+ SH_TAILQ_INIT(&region->active_txn);
+ SH_TAILQ_INIT(&region->mvcc_txn);
+ return (ret);
+}
+
+/*
+ * __txn_findlastckp --
+ * Find the last checkpoint in the log, walking backwards from the
+ * max_lsn given or the beginning of the last log file. (The
+ * log system looked through the last log file when it started up.)
+ *
+ * PUBLIC: int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__txn_findlastckp(env, lsnp, max_lsn)
+ ENV *env;
+ DB_LSN *lsnp;
+ DB_LSN *max_lsn;
+{
+ DBT dbt;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ int ret, t_ret;
+ u_int32_t rectype;
+
+ ZERO_LSN(*lsnp);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ /* Get the last LSN. */
+ memset(&dbt, 0, sizeof(dbt));
+ if (max_lsn != NULL) {
+ lsn = *max_lsn;
+ if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
+ goto err;
+ } else {
+ if ((ret = __logc_get(logc, &lsn, &dbt, DB_LAST)) != 0)
+ goto err;
+ /*
+ * Twiddle the last LSN so it points to the beginning of the
+ * last file; we know there's no checkpoint after that, since
+ * the log system already looked there.
+ */
+ lsn.offset = 0;
+ }
+
+ /* Read backwards, looking for checkpoints. */
+ while ((ret = __logc_get(logc, &lsn, &dbt, DB_PREV)) == 0) {
+ if (dbt.size < sizeof(u_int32_t))
+ continue;
+ LOGCOPY_32(env, &rectype, dbt.data);
+ if (rectype == DB___txn_ckp) {
+ *lsnp = lsn;
+ break;
+ }
+ }
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Not finding a checkpoint is not an error; there may not exist
+ * one in the log.
+ */
+ return ((ret == 0 || ret == DB_NOTFOUND) ? 0 : ret);
+}
+
+/*
+ * __txn_env_refresh --
+ * Clean up after the transaction system on a close or failed open.
+ *
+ * PUBLIC: int __txn_env_refresh __P((ENV *));
+ */
+int
+__txn_env_refresh(env)
+ ENV *env;
+{
+ DB_TXN *txn;
+ DB_TXNMGR *mgr;
+ REGINFO *reginfo;
+ u_int32_t txnid;
+ int aborted, ret, t_ret;
+
+ ret = 0;
+ mgr = env->tx_handle;
+ reginfo = &mgr->reginfo;
+
+ /*
+ * This function can only be called once per process (i.e., not
+ * once per thread), so no synchronization is required.
+ *
+ * The caller is probably doing something wrong if close is called with
+	 * active transactions.  Try to abort any active transactions that are
+	 * not prepared, but it's quite likely the aborts will fail because
+	 * recovery won't find open files.  If we can't abort any of the
+	 * unprepared transactions, panic; we have to run recovery to get back
+	 * to a known state.
+ */
+ aborted = 0;
+ if (TAILQ_FIRST(&mgr->txn_chain) != NULL) {
+ while ((txn = TAILQ_FIRST(&mgr->txn_chain)) != NULL) {
+ /* Prepared transactions are OK. */
+ txnid = txn->txnid;
+ if (((TXN_DETAIL *)txn->td)->status == TXN_PREPARED) {
+ if ((ret = __txn_discard_int(txn, 0)) != 0) {
+ __db_err(env, ret, DB_STR_A("4509",
+ "unable to discard txn %#lx",
+ "%#lx"), (u_long)txnid);
+ break;
+ }
+ continue;
+ }
+ aborted = 1;
+ if ((t_ret = __txn_abort(txn)) != 0) {
+ __db_err(env, t_ret, DB_STR_A("4510",
+ "unable to abort transaction %#lx", "%#lx"),
+ (u_long)txnid);
+ ret = __env_panic(env, t_ret);
+ break;
+ }
+ }
+ if (aborted) {
+ __db_errx(env, DB_STR("4511",
+ "Error: closing the transaction region with active transactions"));
+ if (ret == 0)
+ ret = EINVAL;
+ }
+ }
+
+ /* Discard the per-thread lock. */
+ if ((t_ret = __mutex_free(env, &mgr->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Detach from the region. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, mgr);
+
+ env->tx_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __txn_region_mutex_count --
+ * Return the number of mutexes the txn region will need.
+ *
+ * PUBLIC: u_int32_t __txn_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__txn_region_mutex_count(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ /*
+	 * We need one mutex for the DB_TXNMGR structure and two mutexes
+	 * for the DB_TXNREGION structure.
+ */
+ return (1 + 2);
+}
+
+/*
+ * __txn_region_mutex_max --
+ * Return the number of additional mutexes the txn region will need.
+ *
+ * PUBLIC: u_int32_t __txn_region_mutex_max __P((ENV *));
+ */
+u_int32_t
+__txn_region_mutex_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ if ((count = dbenv->tx_max) == 0)
+ count = DEF_MAX_TXNS;
+ /* We may need a mutex for each MVCC txn. */
+ return (count > dbenv->tx_init ? count - dbenv->tx_init : 0);
+}
+
+/*
+ * __txn_region_size --
+ * Return the amount of space needed for the txn region.
+ * PUBLIC: size_t __txn_region_size __P((ENV *));
+ */
+size_t
+__txn_region_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Make the region large enough to hold the primary transaction region
+ * structure, txn_init transaction detail structures, txn_init chunks of
+ * overhead required by the underlying shared region allocator for each
+ * chunk of memory, txn_max transaction names, at an average of 20
+ * bytes each, and 10KB for safety.
+ */
+ s = sizeof(DB_TXNREGION) + dbenv->tx_init *
+ (sizeof(TXN_DETAIL) + __env_alloc_overhead() + 20) + 10 * 1024;
+ return (s);
+}
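+
+/*
+ * Worked instance of the estimate above (illustrative numbers only):
+ * with tx_init == 100 and an allocator overhead of, say, 16 bytes per
+ * chunk, the reservation is sizeof(DB_TXNREGION) +
+ * 100 * (sizeof(TXN_DETAIL) + 16 + 20) + 10240 bytes.
+ */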
+
+/*
+ * __txn_region_max --
+ * Return the additional amount of space needed for the txn region.
+ * PUBLIC: size_t __txn_region_max __P((ENV *));
+ */
+size_t
+__txn_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ if ((count = dbenv->tx_max) == 0)
+ count = DEF_MAX_TXNS;
+ if (count <= dbenv->tx_init)
+ return (0);
+ s = (count - dbenv->tx_init) *
+ (sizeof(TXN_DETAIL) + __env_alloc_overhead() + 20);
+ return (s);
+}
+
+/*
+ * __txn_id_set --
+ * Set the current transaction ID and current maximum unused ID (for
+ * testing purposes only).
+ *
+ * PUBLIC: int __txn_id_set __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__txn_id_set(env, cur_txnid, max_txnid)
+ ENV *env;
+ u_int32_t cur_txnid, max_txnid;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int ret;
+
+ ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_id_set", DB_INIT_TXN);
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ region->last_txnid = cur_txnid;
+ region->cur_maxid = max_txnid;
+
+ ret = 0;
+ if (cur_txnid < TXN_MINIMUM) {
+ __db_errx(env, DB_STR_A("4512",
+ "Current ID value %lu below minimum", "%lu"),
+ (u_long)cur_txnid);
+ ret = EINVAL;
+ }
+ if (max_txnid < TXN_MINIMUM) {
+ __db_errx(env, DB_STR_A("4513",
+ "Maximum ID value %lu below minimum", "%lu"),
+ (u_long)max_txnid);
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * __txn_oldest_reader --
+ *	Find the oldest "read LSN" of any active transaction;
+ * MVCC changes older than this can safely be discarded from the cache.
+ *
+ * PUBLIC: int __txn_oldest_reader __P((ENV *, DB_LSN *));
+ */
+int
+__txn_oldest_reader(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LSN old_lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ int ret;
+
+ if ((mgr = env->tx_handle) == NULL)
+ return (0);
+ region = mgr->reginfo.primary;
+
+ if ((ret = __log_current_lsn_int(env, &old_lsn, NULL, NULL)) != 0)
+ return (ret);
+
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
+ if (LOG_COMPARE(&td->read_lsn, &old_lsn) < 0)
+ old_lsn = td->read_lsn;
+
+ *lsnp = old_lsn;
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __txn_add_buffer --
+ * Add to the count of buffers created by the given transaction.
+ *
+ * PUBLIC: int __txn_add_buffer __P((ENV *, TXN_DETAIL *));
+ */
+int
+__txn_add_buffer(env, td)
+ ENV *env;
+ TXN_DETAIL *td;
+{
+ DB_ASSERT(env, td != NULL);
+
+ MUTEX_LOCK(env, td->mvcc_mtx);
+ DB_ASSERT(env, td->mvcc_ref < UINT32_MAX);
+ ++td->mvcc_ref;
+ MUTEX_UNLOCK(env, td->mvcc_mtx);
+
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+/*
+ * __txn_remove_buffer --
+ * Remove a buffer from a transaction -- free the transaction if necessary.
+ *
+ * PUBLIC: int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t));
+ */
+int
+__txn_remove_buffer(env, td, hash_mtx)
+ ENV *env;
+ TXN_DETAIL *td;
+ db_mutex_t hash_mtx;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int need_free, ret;
+
+ DB_ASSERT(env, td != NULL);
+ ret = 0;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ MUTEX_LOCK(env, td->mvcc_mtx);
+ DB_ASSERT(env, td->mvcc_ref > 0);
+
+ /*
+ * We free the transaction detail here only if this is the last
+ * reference and td is on the list of committed snapshot transactions
+ * with active pages.
+ */
+ need_free = (--td->mvcc_ref == 0) && F_ISSET(td, TXN_DTL_SNAPSHOT);
+ MUTEX_UNLOCK(env, td->mvcc_mtx);
+
+ if (need_free) {
+ MUTEX_UNLOCK(env, hash_mtx);
+
+ ret = __mutex_free(env, &td->mvcc_mtx);
+ td->mvcc_mtx = MUTEX_INVALID;
+
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_REMOVE(&region->mvcc_txn, td, links, __txn_detail);
+ STAT_DEC(env,
+ txn, nsnapshot, region->stat.st_nsnapshot, td->txnid);
+ __env_alloc_free(&mgr->reginfo, td);
+ TXN_SYSTEM_UNLOCK(env);
+
+ MUTEX_READLOCK(env, hash_mtx);
+ }
+
+ return (ret);
+}
diff --git a/src/txn/txn_stat.c b/src/txn/txn_stat.c
new file mode 100644
index 00000000..62fe622d
--- /dev/null
+++ b/src/txn/txn_stat.c
@@ -0,0 +1,461 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static int __txn_compare __P((const void *, const void *));
+static int __txn_print_all __P((ENV *, u_int32_t));
+static int __txn_print_stats __P((ENV *, u_int32_t));
+static int __txn_stat __P((ENV *, DB_TXN_STAT **, u_int32_t));
+static char *__txn_status __P((DB_TXN_ACTIVE *));
+static char *__txn_xa_status __P((DB_TXN_ACTIVE *));
+static void __txn_gid __P((ENV *, DB_MSGBUF *, DB_TXN_ACTIVE *));
+
+/*
+ * __txn_stat_pp --
+ * DB_ENV->txn_stat pre/post processing.
+ *
+ * PUBLIC: int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
+ */
+int
+__txn_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_TXN_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "DB_ENV->txn_stat", DB_INIT_TXN);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->txn_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__txn_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
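+/*
+ * Illustrative sketch (application-level usage, not part of this file):
+ * the public method fills in a buffer allocated on the caller's behalf,
+ * which the application releases with free(). Here dbenv is assumed to
+ * be an open, DB_INIT_TXN-configured environment handle.
+ *
+ *     DB_TXN_STAT *sp;
+ *     u_int32_t i;
+ *
+ *     if (dbenv->txn_stat(dbenv, &sp, 0) == 0) {
+ *             for (i = 0; i < sp->st_nactive; i++)
+ *                     printf("txn %#lx\n",
+ *                         (u_long)sp->st_txnarray[i].txnid);
+ *             free(sp);
+ *     }
+ */
+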
+/*
+ * __txn_stat --
+ * ENV->txn_stat.
+ */
+static int
+__txn_stat(env, statp, flags)
+ ENV *env;
+ DB_TXN_STAT **statp;
+ u_int32_t flags;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ DB_TXN_STAT *stats;
+ TXN_DETAIL *td;
+ size_t nbytes;
+ u_int32_t maxtxn, ndx;
+ int ret;
+
+ *statp = NULL;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+ maxtxn = region->curtxns;
+ nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * maxtxn;
+ if ((ret = __os_umalloc(env, nbytes, &stats)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ memcpy(stats, &region->stat, sizeof(region->stat));
+ stats->st_last_txnid = region->last_txnid;
+ stats->st_last_ckp = region->last_ckp;
+ stats->st_time_ckp = region->time_ckp;
+ stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1];
+
+ for (ndx = 0,
+ td = SH_TAILQ_FIRST(&region->active_txn, __txn_detail);
+ td != NULL && ndx < maxtxn;
+ td = SH_TAILQ_NEXT(td, links, __txn_detail), ++ndx) {
+ stats->st_txnarray[ndx].txnid = td->txnid;
+ if (td->parent == INVALID_ROFF)
+ stats->st_txnarray[ndx].parentid = TXN_INVALID;
+ else
+ stats->st_txnarray[ndx].parentid =
+ ((TXN_DETAIL *)R_ADDR(&mgr->reginfo,
+ td->parent))->txnid;
+ stats->st_txnarray[ndx].pid = td->pid;
+ stats->st_txnarray[ndx].tid = td->tid;
+ stats->st_txnarray[ndx].lsn = td->begin_lsn;
+ stats->st_txnarray[ndx].read_lsn = td->read_lsn;
+ stats->st_txnarray[ndx].mvcc_ref = td->mvcc_ref;
+ stats->st_txnarray[ndx].status = td->status;
+ stats->st_txnarray[ndx].xa_status = td->xa_br_status;
+ stats->st_txnarray[ndx].priority = td->priority;
+
+ if (td->status == TXN_PREPARED)
+ memcpy(stats->st_txnarray[ndx].gid,
+ td->gid, sizeof(td->gid));
+ if (td->name != INVALID_ROFF) {
+ (void)strncpy(stats->st_txnarray[ndx].name,
+ R_ADDR(&mgr->reginfo, td->name),
+ sizeof(stats->st_txnarray[ndx].name) - 1);
+ stats->st_txnarray[ndx].name[
+ sizeof(stats->st_txnarray[ndx].name) - 1] = '\0';
+ } else
+ stats->st_txnarray[ndx].name[0] = '\0';
+ }
+
+ __mutex_set_wait_info(env, region->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ stats->st_regsize = (roff_t)mgr->reginfo.rp->size;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ __mutex_clear(env, region->mtx_region);
+ memset(&region->stat, 0, sizeof(region->stat));
+ region->stat.st_maxtxns = region->maxtxns;
+ region->stat.st_inittxns = region->inittxns;
+ region->stat.st_maxnactive =
+ region->stat.st_nactive = stats->st_nactive;
+ region->stat.st_maxnsnapshot =
+ region->stat.st_nsnapshot = stats->st_nsnapshot;
+ }
+
+ TXN_SYSTEM_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __txn_stat_print_pp --
+ * DB_ENV->txn_stat_print pre/post processing.
+ *
+ * PUBLIC: int __txn_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__txn_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "DB_ENV->txn_stat_print", DB_INIT_TXN);
+
+ if ((ret = __db_fchk(env, "DB_ENV->txn_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__txn_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_stat_print
+ * ENV->txn_stat_print method.
+ *
+ * PUBLIC: int __txn_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__txn_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __txn_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __txn_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __txn_print_stats --
+ * Display default transaction region statistics.
+ */
+static int
+__txn_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_MSGBUF mb;
+ DB_TXN_ACTIVE *txn;
+ DB_TXN_STAT *sp;
+ u_int32_t i;
+ int ret;
+ char buf[DB_THREADID_STRLEN], time_buf[CTIME_BUFLEN];
+
+ dbenv = env->dbenv;
+
+ if ((ret = __txn_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default transaction region information:");
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_last_ckp.file, (u_long)sp->st_last_ckp.offset,
+ sp->st_last_ckp.file == 0 ?
+ "No checkpoint LSN" : "File/offset for last checkpoint LSN");
+ if (sp->st_time_ckp == 0)
+ __db_msg(env, "0\tNo checkpoint timestamp");
+ else
+ __db_msg(env, "%.24s\tCheckpoint timestamp",
+ __os_ctime(&sp->st_time_ckp, time_buf));
+ __db_msg(env, "%#lx\tLast transaction ID allocated",
+ (u_long)sp->st_last_txnid);
+ __db_dl(env, "Maximum number of active transactions configured",
+ (u_long)sp->st_maxtxns);
+ __db_dl(env, "Initial number of transactions configured",
+ (u_long)sp->st_inittxns);
+ __db_dl(env, "Active transactions", (u_long)sp->st_nactive);
+ __db_dl(env,
+ "Maximum active transactions", (u_long)sp->st_maxnactive);
+ __db_dl(env,
+ "Number of transactions begun", (u_long)sp->st_nbegins);
+ __db_dl(env,
+ "Number of transactions aborted", (u_long)sp->st_naborts);
+ __db_dl(env,
+ "Number of transactions committed", (u_long)sp->st_ncommits);
+ __db_dl(env, "Snapshot transactions", (u_long)sp->st_nsnapshot);
+ __db_dl(env, "Maximum snapshot transactions",
+ (u_long)sp->st_maxnsnapshot);
+ __db_dl(env,
+ "Number of transactions restored", (u_long)sp->st_nrestores);
+
+ __db_dlbytes(env, "Region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+
+ qsort(sp->st_txnarray,
+ sp->st_nactive, sizeof(sp->st_txnarray[0]), __txn_compare);
+ __db_msg(env, "Active transactions:");
+ DB_MSGBUF_INIT(&mb);
+ for (i = 0; i < sp->st_nactive; ++i) {
+ txn = &sp->st_txnarray[i];
+ __db_msgadd(env, &mb, "\t%lx: %s; xa_status %s;"
+ " pid/thread %s; begin LSN: file/offset %lu/%lu",
+ (u_long)txn->txnid, __txn_status(txn), __txn_xa_status(txn),
+ dbenv->thread_id_string(dbenv, txn->pid, txn->tid, buf),
+ (u_long)txn->lsn.file, (u_long)txn->lsn.offset);
+ if (txn->parentid != 0)
+ __db_msgadd(env, &mb,
+ "; parent: %lx", (u_long)txn->parentid);
+ if (!IS_MAX_LSN(txn->read_lsn))
+ __db_msgadd(env, &mb, "; read LSN: %lu/%lu",
+ (u_long)txn->read_lsn.file,
+ (u_long)txn->read_lsn.offset);
+ if (txn->mvcc_ref != 0)
+ __db_msgadd(env, &mb,
+ "; mvcc refcount: %lu", (u_long)txn->mvcc_ref);
+ if (LOCKING_ON(env))
+ __db_msgadd(env, &mb,
+ "; priority: %lu", (u_long)txn->priority);
+ if (txn->name[0] != '\0')
+ __db_msgadd(env, &mb, "; \"%s\"", txn->name);
+ if (txn->status == TXN_PREPARED)
+ __txn_gid(env, &mb, txn);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __txn_print_all --
+ * Display debugging transaction region statistics.
+ */
+static int
+__txn_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { TXN_IN_RECOVERY, "TXN_IN_RECOVERY" },
+ { 0, NULL }
+ };
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ char time_buf[CTIME_BUFLEN];
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+
+ __db_print_reginfo(env, &mgr->reginfo, "Transaction", flags);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_TXNMGR handle information:");
+ __mutex_print_debug_single(env, "DB_TXNMGR mutex", mgr->mutex, flags);
+ __db_dl(env,
+ "Number of transactions discarded", (u_long)mgr->n_discards);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_TXNREGION handle information:");
+ __mutex_print_debug_single(
+ env, "DB_TXNREGION region mutex", region->mtx_region, flags);
+ STAT_ULONG("Maximum number of active txns", region->maxtxns);
+ STAT_HEX("Last transaction ID allocated", region->last_txnid);
+ STAT_HEX("Current maximum unused ID", region->cur_maxid);
+
+ __mutex_print_debug_single(
+ env, "checkpoint mutex", region->mtx_ckp, flags);
+ STAT_LSN("Last checkpoint LSN", &region->last_ckp);
+ __db_msg(env,
+ "%.24s\tLast checkpoint timestamp",
+ region->time_ckp == 0 ? "0" :
+ __os_ctime(&region->time_ckp, time_buf));
+
+ __db_prflags(env, NULL, region->flags, fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+static char *
+__txn_status(txn)
+ DB_TXN_ACTIVE *txn;
+{
+ switch (txn->status) {
+ case TXN_ABORTED:
+ return ("aborted");
+ case TXN_COMMITTED:
+ return ("committed");
+ case TXN_NEED_ABORT:
+ return ("need abort");
+ case TXN_PREPARED:
+ return ("prepared");
+ case TXN_RUNNING:
+ return ("running");
+ default:
+ break;
+ }
+ return ("unknown state");
+}
+
+static char *
+__txn_xa_status(txn)
+ DB_TXN_ACTIVE *txn;
+{
+ switch (txn->xa_status) {
+ case TXN_XA_ACTIVE:
+ return ("xa active");
+ case TXN_XA_DEADLOCKED:
+ return ("xa deadlock");
+ case TXN_XA_IDLE:
+ return ("xa idle");
+ case TXN_XA_PREPARED:
+ return ("xa prepared");
+ case TXN_XA_ROLLEDBACK:
+ return ("xa rollback");
+ default:
+ break;
+ }
+ return ("no xa state");
+}
+
+static void
+__txn_gid(env, mbp, txn)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB_TXN_ACTIVE *txn;
+{
+ u_int32_t v, *xp;
+ u_int i;
+ int cnt;
+
+ __db_msgadd(env, mbp, "\n\tGID:");
+ for (cnt = 0, xp = (u_int32_t *)txn->gid, i = 0;;) {
+ memcpy(&v, xp++, sizeof(u_int32_t));
+ __db_msgadd(env, mbp, "%#lx ", (u_long)v);
+ if ((i += sizeof(u_int32_t)) >= DB_GID_SIZE)
+ break;
+ if (++cnt == 4) {
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msgadd(env, mbp, "\t\t");
+ cnt = 0;
+ }
+ }
+}
+
+static int
+__txn_compare(a1, b1)
+ const void *a1, *b1;
+{
+ const DB_TXN_ACTIVE *a, *b;
+
+ a = a1;
+ b = b1;
+
+ if (a->txnid > b->txnid)
+ return (1);
+ if (a->txnid < b->txnid)
+ return (-1);
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__txn_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_TXN_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__txn_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/txn/txn_util.c b/src/txn/txn_util.c
new file mode 100644
index 00000000..0ecd7f6c
--- /dev/null
+++ b/src/txn/txn_util.c
@@ -0,0 +1,696 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+typedef struct __txn_event TXN_EVENT;
+struct __txn_event {
+ TXN_EVENT_T op;
+ TAILQ_ENTRY(__txn_event) links;
+ union {
+ struct {
+ /* Delayed close. */
+ DB *dbp;
+ } c;
+ struct {
+ /* Delayed remove. */
+ char *name;
+ u_int8_t *fileid;
+ int inmem;
+ } r;
+ struct {
+ /* Lock event. */
+ DB_LOCK lock;
+ DB_LOCKER *locker;
+ DB *dbp;
+ } t;
+ } u;
+};
+
+#define TXN_TOP_PARENT(txn) do { \
+ while (txn->parent != NULL) \
+ txn = txn->parent; \
+} while (0)
+
+static void __clear_fe_watermark __P((DB_TXN *, DB *));
+
+/*
+ * __txn_closeevent --
+ *
+ * Creates a close event that can be added to the [so-called] commit list, so
+ * that we can redo a failed DB handle close once we've aborted the transaction.
+ *
+ * PUBLIC: int __txn_closeevent __P((ENV *, DB_TXN *, DB *));
+ */
+int
+__txn_closeevent(env, txn, dbp)
+ ENV *env;
+ DB_TXN *txn;
+ DB *dbp;
+{
+ int ret;
+ TXN_EVENT *e;
+
+ e = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
+ return (ret);
+
+ e->u.c.dbp = dbp;
+ e->op = TXN_CLOSE;
+ TXN_TOP_PARENT(txn);
+ TAILQ_INSERT_TAIL(&txn->events, e, links);
+
+ return (0);
+}
+
+/*
+ * __txn_remevent --
+ *
+ * Creates a remove event that can be added to the commit list.
+ *
+ * PUBLIC: int __txn_remevent __P((ENV *,
+ * PUBLIC: DB_TXN *, const char *, u_int8_t *, int));
+ */
+int
+__txn_remevent(env, txn, name, fileid, inmem)
+ ENV *env;
+ DB_TXN *txn;
+ const char *name;
+ u_int8_t *fileid;
+ int inmem;
+{
+ int ret;
+ TXN_EVENT *e;
+
+ e = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
+ return (ret);
+
+ if ((ret = __os_strdup(env, name, &e->u.r.name)) != 0)
+ goto err;
+
+ if (fileid != NULL) {
+ if ((ret = __os_calloc(env,
+ 1, DB_FILE_ID_LEN, &e->u.r.fileid)) != 0) {
+ __os_free(env, e->u.r.name);
+ goto err;
+ }
+ memcpy(e->u.r.fileid, fileid, DB_FILE_ID_LEN);
+ }
+
+ e->u.r.inmem = inmem;
+ e->op = TXN_REMOVE;
+ TAILQ_INSERT_TAIL(&txn->events, e, links);
+
+ return (0);
+
+err: __os_free(env, e);
+
+ return (ret);
+}
+
+/*
+ * __txn_remrem --
+ * Remove a remove event because the remove has been superseded,
+ * for example by a create of the same name.
+ *
+ * PUBLIC: void __txn_remrem __P((ENV *, DB_TXN *, const char *));
+ */
+void
+__txn_remrem(env, txn, name)
+ ENV *env;
+ DB_TXN *txn;
+ const char *name;
+{
+ TXN_EVENT *e, *next_e;
+
+ for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
+ next_e = TAILQ_NEXT(e, links);
+ if (e->op != TXN_REMOVE || strcmp(name, e->u.r.name) != 0)
+ continue;
+ TAILQ_REMOVE(&txn->events, e, links);
+ __os_free(env, e->u.r.name);
+ if (e->u.r.fileid != NULL)
+ __os_free(env, e->u.r.fileid);
+ __os_free(env, e);
+ }
+
+ return;
+}
+
+/*
+ * __txn_lockevent --
+ *
+ * Add a lock event to the commit queue. The lock event indicates a locker
+ * trade.
+ *
+ * PUBLIC: int __txn_lockevent __P((ENV *,
+ * PUBLIC: DB_TXN *, DB *, DB_LOCK *, DB_LOCKER *));
+ */
+int
+__txn_lockevent(env, txn, dbp, lock, locker)
+ ENV *env;
+ DB_TXN *txn;
+ DB *dbp;
+ DB_LOCK *lock;
+ DB_LOCKER *locker;
+{
+ int ret;
+ TXN_EVENT *e;
+
+ if (!LOCKING_ON(env))
+ return (0);
+
+ e = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
+ return (ret);
+
+ e->u.t.locker = locker;
+ e->u.t.lock = *lock;
+ e->u.t.dbp = dbp;
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ e->op = TXN_XTRADE;
+ else
+ e->op = TXN_TRADE;
+ /* This event goes on the current transaction, not its parent. */
+ TAILQ_INSERT_TAIL(&txn->events, e, links);
+ dbp->cur_txn = txn;
+
+ return (0);
+}
+
+/*
+ * __txn_remlock --
+ * Remove a lock event because the locker is going away. We can remove
+ * by lock (using offset) or by locker_id (or by both).
+ *
+ * PUBLIC: void __txn_remlock __P((ENV *, DB_TXN *, DB_LOCK *, DB_LOCKER *));
+ */
+void
+__txn_remlock(env, txn, lock, locker)
+ ENV *env;
+ DB_TXN *txn;
+ DB_LOCK *lock;
+ DB_LOCKER *locker;
+{
+ TXN_EVENT *e, *next_e;
+
+ for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
+ next_e = TAILQ_NEXT(e, links);
+ if ((e->op != TXN_TRADE && e->op != TXN_TRADED &&
+ e->op != TXN_XTRADE) ||
+ (e->u.t.lock.off != lock->off && e->u.t.locker != locker))
+ continue;
+ TAILQ_REMOVE(&txn->events, e, links);
+ __os_free(env, e);
+ }
+
+ return;
+}
+
+/*
+ * __txn_doevents --
+ * Process the list of events associated with a transaction. On commit,
+ * apply the events; on abort, just toss the entries.
+ *
+ * PUBLIC: int __txn_doevents __P((ENV *, DB_TXN *, int, int));
+ */
+
+/*
+ * Trade a locker associated with a thread for one that is associated
+ * only with the handle. Mark the locker so failcheck will know.
+ */
+#define DO_TRADE do { \
+ memset(&req, 0, sizeof(req)); \
+ req.lock = e->u.t.lock; \
+ req.op = DB_LOCK_TRADE; \
+ t_ret = __lock_vec(env, txn->parent ? \
+ txn->parent->locker : e->u.t.locker, 0, &req, 1, NULL); \
+ if (t_ret == 0) { \
+ if (txn->parent != NULL) { \
+ e->u.t.dbp->cur_txn = txn->parent; \
+ e->u.t.dbp->cur_locker = txn->parent->locker; \
+ } else { \
+ e->op = TXN_TRADED; \
+ e->u.t.dbp->cur_locker = e->u.t.locker; \
+ F_SET(e->u.t.dbp->cur_locker, \
+ DB_LOCKER_HANDLE_LOCKER); \
+ if (opcode != TXN_PREPARE) \
+ e->u.t.dbp->cur_txn = NULL; \
+ } \
+ } else if (t_ret == DB_NOTFOUND) \
+ t_ret = 0; \
+ if (t_ret != 0 && ret == 0) \
+ ret = t_ret; \
+} while (0)
+
+int
+__txn_doevents(env, txn, opcode, preprocess)
+ ENV *env;
+ DB_TXN *txn;
+ int opcode, preprocess;
+{
+ DB_LOCKREQ req;
+ TXN_EVENT *e, *enext;
+ int ret, t_ret;
+
+ ret = 0;
+
+ /*
+ * This phase only gets called if we have a phase where we
+ * release read locks. Since not all paths will call this
+ * phase, we have to check for it below as well. So, when
+ * we do the trade, we update the opcode of the entry so that
+ * we don't try the trade again.
+ */
+ if (preprocess) {
+ for (e = TAILQ_FIRST(&txn->events);
+ e != NULL; e = enext) {
+ enext = TAILQ_NEXT(e, links);
+ /*
+ * Move all exclusive handle locks and
+ * read handle locks to the handle locker.
+ */
+ if (!(opcode == TXN_COMMIT && e->op == TXN_XTRADE) &&
+ (e->op != TXN_TRADE ||
+ IS_WRITELOCK(e->u.t.lock.mode)))
+ continue;
+ DO_TRADE;
+ if (txn->parent != NULL) {
+ TAILQ_REMOVE(&txn->events, e, links);
+ TAILQ_INSERT_HEAD(
+ &txn->parent->events, e, links);
+ }
+ }
+ return (ret);
+ }
+
+ /*
+ * Prepare should only cause a preprocess, since the transaction
+ * isn't over.
+ */
+ DB_ASSERT(env, opcode != TXN_PREPARE);
+ while ((e = TAILQ_FIRST(&txn->events)) != NULL) {
+ TAILQ_REMOVE(&txn->events, e, links);
+ /*
+ * Most deferred events should only happen on
+ * commits, not aborts or prepares. The two exceptions are
+ * close and xtrade, which get done on commit and abort, but
+ * not prepare. If we're not applying the operations, we
+ * can simply free the resources.
+ */
+ if (opcode == TXN_ABORT && (e->op != TXN_CLOSE &&
+ e->op != TXN_XTRADE))
+ goto dofree;
+ switch (e->op) {
+ case TXN_CLOSE:
+ if ((t_ret = __db_close(e->u.c.dbp,
+ NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case TXN_REMOVE:
+ if (txn->parent != NULL)
+ TAILQ_INSERT_TAIL(
+ &txn->parent->events, e, links);
+ else if (e->u.r.fileid != NULL) {
+ if ((t_ret = __memp_nameop(env,
+ e->u.r.fileid, NULL, e->u.r.name,
+ NULL, e->u.r.inmem)) != 0 && ret == 0)
+ ret = t_ret;
+ } else if ((t_ret =
+ __os_unlink(env, e->u.r.name, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case TXN_TRADE:
+ case TXN_XTRADE:
+ DO_TRADE;
+ if (txn->parent != NULL) {
+ TAILQ_INSERT_HEAD(
+ &txn->parent->events, e, links);
+ continue;
+ }
+ /* Fall through */
+ case TXN_TRADED:
+ /*
+ * Downgrade the lock if it is not an exclusive
+ * database handle lock. An exclusive database
+ * should not have any locks other than the
+ * handle lock.
+ */
+ if (ret == 0 && !F2_ISSET(e->u.t.dbp, DB2_AM_EXCL)) {
+ if ((t_ret = __lock_downgrade(env,
+ &e->u.t.lock, DB_LOCK_READ, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ /* Update the handle lock mode. */
+ if (ret == 0 && e->u.t.lock.off ==
+ e->u.t.dbp->handle_lock.off &&
+ e->u.t.lock.ndx ==
+ e->u.t.dbp->handle_lock.ndx)
+ e->u.t.dbp->handle_lock.mode =
+ DB_LOCK_READ;
+ }
+ break;
+ default:
+ /* This had better never happen. */
+ DB_ASSERT(env, 0);
+ }
+dofree:
+ /* Free resources here. */
+ switch (e->op) {
+ case TXN_REMOVE:
+ if (txn->parent != NULL)
+ continue;
+ if (e->u.r.fileid != NULL)
+ __os_free(env, e->u.r.fileid);
+ __os_free(env, e->u.r.name);
+ break;
+ case TXN_TRADE:
+ case TXN_XTRADE:
+ if (opcode == TXN_ABORT)
+ e->u.t.dbp->cur_txn = NULL;
+ break;
+ case TXN_CLOSE:
+ case TXN_TRADED:
+ default:
+ break;
+ }
+ __os_free(env, e);
+ }
+
+ return (ret);
+}
+
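+/*
+ * Event life cycle, summarized: __txn_closeevent, __txn_remevent and
+ * __txn_lockevent queue TXN_CLOSE, TXN_REMOVE and TXN_{X}TRADE entries
+ * while the transaction runs. __txn_doevents may then run once with
+ * preprocess set (to trade handle locks early, e.g. at prepare) and once
+ * at resolution: on commit the events are applied, on abort only
+ * TXN_CLOSE and TXN_XTRADE are, and everything else is simply freed.
+ */
+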
+/*
+ * PUBLIC: int __txn_record_fname __P((ENV *, DB_TXN *, FNAME *));
+ */
+int
+__txn_record_fname(env, txn, fname)
+ ENV *env;
+ DB_TXN *txn;
+ FNAME *fname;
+{
+ DB_LOG *dblp;
+ DB_TXNMGR *mgr;
+ TXN_DETAIL *td;
+ roff_t fname_off;
+ roff_t *np, *ldbs;
+ u_int32_t i;
+ int ret;
+
+ if ((td = txn->td) == NULL)
+ return (0);
+ mgr = env->tx_handle;
+ dblp = env->lg_handle;
+ fname_off = R_OFFSET(&dblp->reginfo, fname);
+
+ /* See if we already have a ref to this DB handle. */
+ ldbs = R_ADDR(&mgr->reginfo, td->log_dbs);
+ for (i = 0, np = ldbs; i < td->nlog_dbs; i++, np++)
+ if (*np == fname_off)
+ return (0);
+
+ if (td->nlog_slots <= td->nlog_dbs) {
+ TXN_SYSTEM_LOCK(env);
+ if ((ret = __env_alloc(&mgr->reginfo,
+ sizeof(roff_t) * (td->nlog_slots << 1), &np)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ memcpy(np, ldbs, td->nlog_dbs * sizeof(roff_t));
+ if (td->nlog_slots > TXN_NSLOTS)
+ __env_alloc_free(&mgr->reginfo, ldbs);
+
+ TXN_SYSTEM_UNLOCK(env);
+ td->log_dbs = R_OFFSET(&mgr->reginfo, np);
+ ldbs = np;
+ td->nlog_slots = td->nlog_slots << 1;
+ }
+
+ ldbs[td->nlog_dbs] = fname_off;
+ td->nlog_dbs++;
+ fname->txn_ref++;
+
+ return (0);
+}
+
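+/*
+ * Worked example of the slot growth above (illustrative): the offset
+ * array starts with the transaction's inline TXN_NSLOTS entries and
+ * doubles on demand, growing TXN_NSLOTS -> 2*TXN_NSLOTS -> 4*TXN_NSLOTS
+ * as a transaction touches more distinct databases; each doubling copies
+ * the old offsets forward and frees the previous array unless it was the
+ * inline one.
+ */
+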
+/*
+ * __txn_dref_fname --
+ * Either pass the fname to our parent txn or decrement the refcount
+ * and close the fileid if it goes to zero.
+ *
+ * PUBLIC: int __txn_dref_fname __P((ENV *, DB_TXN *));
+ */
+int
+__txn_dref_fname(env, txn)
+ ENV *env;
+ DB_TXN *txn;
+{
+ DB_LOG *dblp;
+ DB_TXNMGR *mgr;
+ FNAME *fname;
+ roff_t *np;
+ TXN_DETAIL *ptd, *td;
+ u_int32_t i;
+ int ret;
+
+ td = txn->td;
+
+ if (td->nlog_dbs == 0)
+ return (0);
+
+ mgr = env->tx_handle;
+ dblp = env->lg_handle;
+ ret = 0;
+
+ ptd = txn->parent != NULL ? txn->parent->td : NULL;
+
+ np = R_ADDR(&mgr->reginfo, td->log_dbs);
+ /*
+ * The order in which FNAMEs are cleaned up matters. Cleaning up
+ * in the wrong order can result in database handles leaking. If
+ * we are passing the FNAMEs to the parent transaction make sure
+ * they are passed in order. If we are cleaning up the FNAMEs,
+ * make sure that is done in reverse order.
+ */
+ if (ptd != NULL) {
+ for (i = 0; i < td->nlog_dbs; i++, np++) {
+ fname = R_ADDR(&dblp->reginfo, *np);
+ MUTEX_LOCK(env, fname->mutex);
+ ret = __txn_record_fname(env, txn->parent, fname);
+ fname->txn_ref--;
+ MUTEX_UNLOCK(env, fname->mutex);
+ if (ret != 0)
+ break;
+ }
+ } else {
+ np += td->nlog_dbs - 1;
+ for (i = 0; i < td->nlog_dbs; i++, np--) {
+ fname = R_ADDR(&dblp->reginfo, *np);
+ MUTEX_LOCK(env, fname->mutex);
+ if (fname->txn_ref == 1) {
+ MUTEX_UNLOCK(env, fname->mutex);
+ DB_ASSERT(env, fname->txn_ref != 0);
+ ret = __dbreg_close_id_int(
+ env, fname, DBREG_CLOSE, 0);
+ } else {
+ fname->txn_ref--;
+ MUTEX_UNLOCK(env, fname->mutex);
+ }
+ if (ret != 0 && ret != EIO)
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Common removal routine. This is called only after verifying that
+ * the DB_MPOOLFILE is in the list.
+ */
+static void
+__clear_fe_watermark(txn, db)
+ DB_TXN *txn;
+ DB *db;
+{
+ MPOOLFILE *mpf;
+
+ mpf = db->mpf->mfp;
+ mpf->fe_watermark = PGNO_INVALID;
+ mpf->fe_txnid = 0U;
+ mpf->fe_nlws = 0U;
+ TAILQ_REMOVE(&txn->femfs, db, felink);
+}
+
+/*
+ * __txn_reset_fe_watermarks
+ * Reset the file extension state of MPOOLFILEs involved in this transaction.
+ *
+ * PUBLIC: void __txn_reset_fe_watermarks __P((DB_TXN *));
+ */
+void
+__txn_reset_fe_watermarks(txn)
+ DB_TXN *txn;
+{
+ DB *db;
+
+ if (txn->parent) {
+ DB_ASSERT(txn->mgrp->env, TAILQ_FIRST(&txn->femfs) == NULL);
+ }
+
+ while ((db = TAILQ_FIRST(&txn->femfs)))
+ __clear_fe_watermark(txn, db);
+}
+
+/*
+ * __txn_remove_fe_watermark
+ * Remove a watermark from the transaction's list
+ *
+ * PUBLIC: void __txn_remove_fe_watermark __P((DB_TXN *,DB *));
+ */
+void
+__txn_remove_fe_watermark(txn, db)
+ DB_TXN *txn;
+ DB *db;
+{
+ DB *db_tmp;
+
+ if (txn == NULL || !F_ISSET(txn, TXN_BULK))
+ return;
+
+ TAILQ_FOREACH(db_tmp, &txn->femfs, felink) {
+ if (db_tmp == db) {
+ __clear_fe_watermark(txn, db);
+ break;
+ }
+ }
+}
+
+/*
+ * __txn_add_fe_watermark
+ *
+ * Add an entry to the transaction's list of
+ * file_extension_watermarks, if warranted. Also, set the watermark
+ * page number in the MPOOLFILE. The metadata lock associated with
+ * the mfp must be held when this function is called.
+ *
+ * PUBLIC: void __txn_add_fe_watermark __P((DB_TXN *, DB *, db_pgno_t));
+ */
+void
+__txn_add_fe_watermark(txn, db, pgno)
+ DB_TXN *txn;
+ DB *db;
+ db_pgno_t pgno;
+{
+ MPOOLFILE *mfp;
+
+ if (txn == NULL || !F_ISSET(txn, TXN_BULK))
+ return;
+
+ mfp = db->mpf->mfp;
+ /* If the watermark is already set, there's nothing to do. */
+ if (mfp->fe_watermark != PGNO_INVALID) {
+#ifdef DIAGNOSTIC
+ DB_ASSERT(txn->mgrp->env, mfp->fe_txnid == txn->txnid);
+#endif
+ return;
+ }
+
+ /* We can update MPOOLFILE because the metadata lock is held. */
+ mfp->fe_watermark = pgno;
+ mfp->fe_txnid = txn->txnid;
+
+ TAILQ_INSERT_TAIL(&txn->femfs, db, felink);
+}
+
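+/*
+ * Illustrative sketch (assumed caller, not part of this file): when a
+ * TXN_BULK transaction extends a file while holding the metadata lock,
+ * the access method would record where the extension began:
+ *
+ *     __txn_add_fe_watermark(txn, dbp, first_new_pgno);
+ *
+ * where first_new_pgno is a hypothetical name for the first page number
+ * created by the extension.
+ */
+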
+/*
+ * __txn_flush_fe_files
+ * For every extended file in which a log record write was skipped,
+ * flush the data pages. This is called during commit.
+ *
+ * PUBLIC: int __txn_flush_fe_files __P((DB_TXN *));
+ */
+int
+__txn_flush_fe_files(txn)
+ DB_TXN *txn;
+{
+ DB *db;
+ ENV *env;
+ int ret;
+
+ env = txn->mgrp->env;
+
+ DB_ASSERT(env, txn->mgrp != NULL);
+ DB_ASSERT(env, env != NULL);
+
+#ifdef DIAGNOSTIC
+ DB_ASSERT(env, txn->parent == NULL);
+#endif
+
+ TAILQ_FOREACH(db, &txn->femfs, felink) {
+ if (db->mpf->mfp->fe_nlws > 0 &&
+ (ret = __memp_sync_int(env, db->mpf, 0,
+ DB_SYNC_FILE, NULL, NULL)))
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __txn_pg_above_fe_watermark --
+ *
+ * Test whether there is a file extension watermark for the given
+ * database, and, if so, whether the given page number is above the
+ * watermark. If this test returns true, then logging of the page's
+ * update can be suppressed when the file extension/bulk loading
+ * optimization is in force.
+ *
+ * PUBLIC: int __txn_pg_above_fe_watermark
+ * PUBLIC: __P((DB_TXN*, MPOOLFILE*, db_pgno_t));
+ */
+int
+__txn_pg_above_fe_watermark(txn, mpf, pgno)
+ DB_TXN *txn;
+ MPOOLFILE *mpf;
+ db_pgno_t pgno;
+{
+ ENV *env;
+ int skip;
+
+ if (txn == NULL || (!F_ISSET(txn, TXN_BULK)) ||
+ mpf->fe_watermark == PGNO_INVALID)
+ return (0);
+
+ env = txn->mgrp->env;
+
+ skip = 0;
+ TXN_SYSTEM_LOCK(env);
+ if (((DB_TXNREGION *)env->tx_handle->reginfo.primary)->n_hotbackup > 0)
+ skip = 1;
+ TXN_SYSTEM_UNLOCK(env);
+ if (skip)
+ return (0);
+
+ /*
+ * If the watermark is a valid page number, then the extending
+ * transaction should be the current outermost transaction.
+ */
+ DB_ASSERT(txn->mgrp->env, mpf->fe_txnid == txn->txnid);
+
+ return (mpf->fe_watermark <= pgno);
+}
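+
+/*
+ * Worked example (illustrative): with fe_watermark == 1000, an update to
+ * page 1500 by the extending TXN_BULK transaction returns nonzero here
+ * and its logging can be suppressed, while an update to page 500 returns
+ * 0 and is logged normally. A hot backup in progress (n_hotbackup > 0)
+ * disables the optimization so that the backup's log is complete.
+ */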
diff --git a/src/xa/xa.c b/src/xa/xa.c
new file mode 100644
index 00000000..ee75e792
--- /dev/null
+++ b/src/xa/xa.c
@@ -0,0 +1,1068 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/xa_ext.h"
+
+static void corrupted_env __P((ENV *, int));
+
+static int __xa_get_txn __P((ENV *,
+ XID *, TXN_DETAIL *, DB_TXN **, u_long, int));
+static void __xa_put_txn __P((ENV *, DB_TXN *));
+
+static int __xa_txn_get_prepared
+ __P((ENV *, XID *, DB_PREPLIST *, long, long *, u_int32_t));
+static int __xa_thread_enter __P((ENV *, DB_THREAD_INFO **));
+
+static int __db_xa_close __P((char *, int, long));
+static int __db_xa_commit __P((XID *, int, long));
+static int __db_xa_complete __P((int *, int *, int, long));
+static int __db_xa_end __P((XID *, int, long));
+static int __db_xa_forget __P((XID *, int, long));
+static int __db_xa_open __P((char *, int, long));
+static int __db_xa_prepare __P((XID *, int, long));
+static int __db_xa_recover __P((XID *, long, int, long));
+static int __db_xa_rollback __P((XID *, int, long));
+static int __db_xa_start __P((XID *, int, long));
+
+/*
+ * Possible flag values:
+ * Dynamic registration 0 => no dynamic registration
+ * TMREGISTER => dynamic registration
+ * Asynchronous operation 0 => no support for asynchrony
+ * TMUSEASYNC => async support
+ * Migration support 0 => migration of transactions across
+ * threads is possible
+ * TMNOMIGRATE => no migration across threads
+ */
+const struct xa_switch_t db_xa_switch = {
+ "Berkeley DB", /* name[RMNAMESZ] */
+ TMNOMIGRATE, /* flags */
+ 0, /* version */
+ __db_xa_open, /* xa_open_entry */
+ __db_xa_close, /* xa_close_entry */
+ __db_xa_start, /* xa_start_entry */
+ __db_xa_end, /* xa_end_entry */
+ __db_xa_rollback, /* xa_rollback_entry */
+ __db_xa_prepare, /* xa_prepare_entry */
+ __db_xa_commit, /* xa_commit_entry */
+ __db_xa_recover, /* xa_recover_entry */
+ __db_xa_forget, /* xa_forget_entry */
+ __db_xa_complete /* xa_complete_entry */
+};
+
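+/*
+ * Illustrative sketch (not part of the library): a transaction manager
+ * drives Berkeley DB through these entry points. A minimal direct use,
+ * assuming a hypothetical rmid of 1 and an existing environment home:
+ *
+ *     if (db_xa_switch.xa_open_entry("/var/dbenv", 1, TMNOFLAGS) == XA_OK)
+ *             (void)db_xa_switch.xa_close_entry("", 1, TMNOFLAGS);
+ */
+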
+/*
+ * __xa_get_txn --
+ * Return a pointer to the current transaction structure for the
+ * designated environment. We take the XA flags so we can specifically
+ * test for TMJOIN and TMRESUME. These are testing for compliance with
+ * the XA state machine. The various cases are:
+ *
+ * TMRESUME: DB_TXN should already exist for this thread and should be
+ * in state SUSPENDED. Either error or change state.
+ * TMJOIN: DB_TXN should *not* exist, but TXN_DETAIL should -- create
+ * the DB_TXN and __txn_continue it.
+ * neither: Neither DB_TXN nor TXN_DETAIL should exist (td should be NULL) --
+ * start transaction.
+ *
+ * In addition, we use this to retrieve the current txn during __db_xa_end.
+ * In this case, the td and the txn should exist and the txn should currently
+ * be associated.
+ *
+ */
+static int
+__xa_get_txn(env, xid, td, txnp, flags, ending)
+ ENV *env;
+ XID *xid;
+ TXN_DETAIL *td;
+ DB_TXN **txnp;
+ u_long flags;
+ int ending;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ dbenv = env->dbenv;
+ COMPQUIET(ip, NULL);
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret != 0)
+ return (XAER_RMFAIL);
+ else
+ ret = XA_OK;
+ DB_ASSERT(env, ip != NULL);
+ if (ending != 0)
+ DB_ASSERT(env,
+ ip->dbth_xa_status == TXN_XA_THREAD_ASSOCIATED);
+ else
+ DB_ASSERT(env,
+ ip->dbth_xa_status != TXN_XA_THREAD_ASSOCIATED);
+
+ /*
+ * Two cases: the transaction should already exist in this
+ * environment or it should not. If it should exist, then
+ * we should have found its detail and the JOIN or RESUME
+ * flags should have been set.
+ */
+ if (td == NULL) {
+ DB_ASSERT(env, ending == 0);
+ if (LF_ISSET(TMJOIN | TMRESUME))
+ ret = XAER_NOTA;
+ /*
+ * The snapshot flag is ignored if the database is not
+ * enabled for MVCC. This allows MVCC to be used
+ * with XA transactions.
+ */
+ else if ((ret = __txn_begin(env,
+ ip, NULL, txnp, DB_TXN_NOWAIT|DB_TXN_SNAPSHOT)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4540",
+ "xa_get_txn: transaction begin failed"));
+ ret = XAER_RMERR;
+ } else {
+ SH_TAILQ_INSERT_HEAD(&ip->dbth_xatxn,
+ *txnp, xa_links, __db_txn);
+ (*txnp)->xa_thr_status = TXN_XA_THREAD_ASSOCIATED;
+ ip->dbth_xa_status = TXN_XA_THREAD_ASSOCIATED;
+
+ /* Initialize XA fields in the detail structure. */
+ /* XXX Does this need protection of the TXN lock? */
+ td = (TXN_DETAIL *)((*txnp)->td);
+ memcpy(td->gid, xid->data, XIDDATASIZE);
+ td->bqual = (u_int32_t)xid->bqual_length;
+ td->gtrid = (u_int32_t)xid->gtrid_length;
+ td->format = (int32_t)xid->formatID;
+ td->xa_br_status = TXN_XA_ACTIVE;
+ }
+ } else {
+ /* If we get here, the transaction exists. */
+ if (ending == 0 && !LF_ISSET(TMRESUME) && !LF_ISSET(TMJOIN)) {
+ ret = XAER_DUPID;
+ goto out;
+ }
+
+ SH_TAILQ_FOREACH(*txnp, &ip->dbth_xatxn, xa_links, __db_txn)
+ if ((*txnp)->td == td)
+ break;
+
+ /* Check that we are not a child transaction. */
+ if (td->parent != INVALID_ROFF) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4541",
+ "xa_get_txn: XA transaction with parent"));
+ ret = XAER_RMERR;
+ goto out;
+ }
+
+ if (*txnp != NULL) {
+ if (ending) {
+ DB_ASSERT(env, (*txnp)->xa_thr_status ==
+ TXN_XA_THREAD_ASSOCIATED);
+ DB_ASSERT(env, (*txnp) ==
+ SH_TAILQ_FIRST(&ip->dbth_xatxn, __db_txn));
+ } else if (LF_ISSET(TMRESUME)) {
+ DB_ASSERT(env, (*txnp)->xa_thr_status ==
+ TXN_XA_THREAD_SUSPENDED);
+ DB_ASSERT(env, ip->dbth_xa_status ==
+ TXN_XA_THREAD_SUSPENDED);
+ (*txnp)->xa_thr_status =
+ TXN_XA_THREAD_ASSOCIATED;
+ ip->dbth_xa_status = TXN_XA_THREAD_ASSOCIATED;
+ if ((*txnp) !=
+ SH_TAILQ_FIRST(&ip->dbth_xatxn, __db_txn)) {
+ SH_TAILQ_REMOVE(&ip->dbth_xatxn,
+ (*txnp), xa_links, __db_txn);
+ SH_TAILQ_INSERT_HEAD(&ip->dbth_xatxn,
+ (*txnp), xa_links, __db_txn);
+ }
+ if (td->xa_br_status == TXN_XA_IDLE)
+ td->xa_br_status = TXN_XA_ACTIVE;
+ } else
+ ret = XAER_PROTO;
+ } else {
+ if (LF_ISSET(TMRESUME)) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4542",
+ "xa_get_txn: transaction does not exist"));
+ ret = XAER_PROTO;
+ } else if ((ret =
+ __os_malloc(env, sizeof(DB_TXN), txnp)) == 0) {
+ /* We are joining this branch. */
+ ret = __txn_continue(env, *txnp, td, ip, 1);
+ if (ret != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4543",
+ "xa_get_txn: txn_continue fails"));
+ ret = XAER_RMFAIL;
+ }
+ ip->dbth_xa_status = TXN_XA_THREAD_ASSOCIATED;
+ (*txnp)->xa_thr_status =
+ TXN_XA_THREAD_ASSOCIATED;
+ SH_TAILQ_INSERT_HEAD(&ip->dbth_xatxn,
+ (*txnp), xa_links, __db_txn);
+ if (td->xa_br_status == TXN_XA_IDLE)
+ td->xa_br_status = TXN_XA_ACTIVE;
+ } else {
+ dbenv->err(dbenv, ret, DB_STR("4544",
+ "xa_get_txn: os_malloc failed"));
+ ret = XAER_RMERR;
+ }
+ }
+ }
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Release use of this transaction.
+ */
+static void
+__xa_put_txn(env, txnp)
+ ENV *env;
+ DB_TXN *txnp;
+{
+ DB_THREAD_INFO *ip;
+ TXN_DETAIL *td;
+
+ ip = txnp->thread_info;
+ DB_ASSERT(env, ip != NULL);
+ SH_TAILQ_REMOVE(&ip->dbth_xatxn, txnp, xa_links, __db_txn);
+ TAILQ_REMOVE(&txnp->mgrp->txn_chain, txnp, links);
+ td = txnp->td;
+ DB_ASSERT(env, td->xa_ref > 0);
+ td->xa_ref--;
+ __os_free(env, txnp);
+ ip->dbth_xa_status = TXN_XA_THREAD_UNASSOCIATED;
+}
+
+static int
+__xa_thread_enter(env, ipp)
+ ENV *env;
+ DB_THREAD_INFO **ipp;
+{
+ int ret;
+ DB_THREAD_INFO *ip;
+
+ COMPQUIET(ip, NULL);
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret == 0)
+ ip->dbth_xa_status = TXN_XA_THREAD_UNASSOCIATED;
+ *ipp = ip;
+ return (ret);
+}
+
+/*
+ * __xa_txn_get_prepared --
+ * Wrap the internal call to __txn_get_prepared so that we can call
+ * it from XA. XA routines are not considered to be running "inside" the
+ * library, so when they make calls into the library, we need to use interface
+ * routines that support replication and failchk. Since __txn_get_prepared
+ * is internal, there is no user API to call, so we use this wrapper routine
+ * instead.
+ */
+static int
+__xa_txn_get_prepared(env, xids, txns, count, retp, flags)
+ ENV *env;
+ XID *xids;
+ DB_PREPLIST *txns;
+ long count; /* This is long for XA compatibility. */
+ long *retp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = NULL;
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__txn_get_prepared(env, xids, txns, count, retp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+#define XA_FLAGS \
+ (DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | \
+ DB_INIT_TXN | DB_THREAD | DB_REGISTER | DB_RECOVER)
+
+/*
+ * __db_xa_open --
+ * The open call in the XA protocol. The rmid field is an id number
+ * that the TM assigned us and will pass us on every xa call. We need to
+ * map that rmid number into an env structure that we create during
+ * initialization. The file xa_map.c implements all such xa->db mappings.
+ * The xa_info field is instance-specific information. We require
+ * that the value of DB_HOME be passed in xa_info. Since xa_info is the
+ * only thing that we get to pass to db_env_create, any config information
+ * will have to be done via a config file instead of via the db_env_create
+ * call.
+ */
+static int
+__db_xa_open(xa_info, rmid, arg_flags)
+ char *xa_info;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int inmem, ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ /* Verify if we already have this environment open. */
+ if (__db_rmid_to_env(rmid, &env) == 0) {
+ env->xa_ref++;
+ /* Indicate that this thread is in an XA environment. */
+ if ((ret = __xa_thread_enter(env, &ip)) == 0) {
+ DB_ASSERT(env, ip != NULL);
+ ENV_LEAVE(env, ip);
+ return (XA_OK);
+ } else
+ return (XAER_RMERR);
+ }
+
+ /*
+ * Open a new environment. Note that we cannot report a failure
+ * here through dbenv->err: if db_env_create fails, no handle was
+ * created to report through.
+ */
+ if ((ret = db_env_create(&dbenv, 0)) != 0)
+ return (XAER_RMERR);
+ if ((ret = dbenv->set_thread_count(dbenv, 25)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4546",
+ "xa_open: Failure setting thread count"));
+ goto err;
+ }
+ env = dbenv->env;
+ if ((ret = dbenv->open(dbenv, xa_info, XA_FLAGS, 0)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4547",
+ "xa_open: Failure opening environment"));
+ goto err;
+ }
+
+ /*
+ * Make sure that the environment is not configured for in-memory
+ * logging.
+ */
+ if ((ret = dbenv->log_get_config(dbenv,
+ DB_LOG_IN_MEMORY, &inmem)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4548",
+ "xa_open: Failure getting log configuration"));
+ goto err;
+ }
+ if (inmem != 0) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4549",
+ "xa_open: In-memory logging not allowed in XA environment"));
+ goto err;
+ }
+
+ /* Create the mapping. */
+ __db_map_rmid(rmid, env);
+ env->xa_ref = 1;
+
+ /* Indicate that this thread is in an XA environment. */
+ if ((ret = __xa_thread_enter(env, &ip)) == 0) {
+ ENV_LEAVE(env, ip);
+ return (XA_OK);
+ } else
+ return (XAER_RMERR);
+
+err: (void)dbenv->close(dbenv, 0);
+ /*
+ * If the environment is corrupt, then we need to get all threads
+ * and processes out of it and run recovery. There is no particularly
+ * clean way to do that, so we'll use a really big hammer and
+ * crash the server.
+ */
+ if (ret == DB_RUNRECOVERY)
+ exit(1);
+
+ return (XAER_RMERR);
+}
+
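+/*
+ * Illustrative note (configuration, not code in this file): because
+ * xa_info carries only the environment home, a TM open string is simply
+ * the directory name, e.g. "/var/dbenv"; any further environment tuning
+ * must come from a DB_CONFIG file in that directory.
+ */
+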
+/*
+ * __db_xa_close --
+ * The close call of the XA protocol. The only trickiness here
+ * is that if there are any active transactions, we must fail. It is
+ * *not* an error to call close on an environment that has already been
+ * closed (I am interpreting that to mean it's OK to call close on an
+ * environment that has never been opened).
+ */
+static int
+__db_xa_close(xa_info, rmid, arg_flags)
+ char *xa_info;
+ int rmid;
+ long arg_flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+ u_long flags;
+
+ COMPQUIET(xa_info, NULL);
+ COMPQUIET(ip, NULL);
+ ret = 0;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ /* If the environment is closed, then we're done. */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XA_OK);
+
+ /* Check if there are any pending transactions. */
+ ENV_ENTER_RET(env, ip, ret);
+ /*
+ * If the environment is corrupt, then we need to get all threads
+ * and processes out of it and run recovery. There is no particularly
+ * clean way to do that, so we'll use a really big hammer and
+ * crash the server.
+ */
+ if (ret == DB_RUNRECOVERY)
+ exit(1);
+ else if (ret != 0)
+ return (XAER_RMFAIL);
+
+ /*
+ * If we are calling close without ever having called open, then we
+ * don't want to do anything, because if we do, our ref counts would
+ * be all wrong.
+ */
+ if (ip->dbth_xa_status == TXN_XA_THREAD_NOTA) {
+ ret = XAER_PROTO;
+ goto err;
+ }
+
+ /*
+ * It is an error for a transaction manager to call xa_close from
+ * a thread of control that is associated with a transaction branch.
+ */
+ if (SH_TAILQ_FIRST(&ip->dbth_xatxn, __db_txn) != NULL) {
+ ret = XAER_PROTO;
+ goto err;
+ }
+
+ if (env->xa_ref > 1) {
+ env->xa_ref--;
+ goto err;
+ } else {
+ /* Destroy the mapping. */
+ ret = __db_unmap_rmid(rmid);
+
+ /* Close the environment. */
+ t_ret = env->dbenv->close(env->dbenv, 0);
+
+ if (ret != 0 || t_ret != 0)
+ ret = XAER_RMERR;
+ /* Don't try to leave an environment we just closed. */
+ goto out;
+ }
+
+err: ENV_LEAVE(env, ip);
+out: return (ret == 0 ? XA_OK : ret);
+}
+
+/*
+ * __db_xa_start --
+ * Begin a transaction for the current resource manager.
+ */
+static int
+__db_xa_start(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+#define OK_FLAGS (TMJOIN | TMRESUME | TMNOWAIT | TMASYNC | TMNOFLAGS)
+ if (LF_ISSET(~OK_FLAGS))
+ return (XAER_INVAL);
+
+ if (LF_ISSET(TMJOIN) && LF_ISSET(TMRESUME))
+ return (XAER_INVAL);
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /* Die if the environment is corrupted. */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY)
+ exit(1);
+
+ /*
+ * If td comes back NULL, then we know that we don't have a
+ * transaction yet.
+ */
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4550",
+ "xa_start: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+
+ /*
+ * This can't block, so we can ignore TMNOWAIT.
+ *
+ * Other error conditions: RMERR, OUTSIDE, PROTO, RB*
+ */
+ if (td != NULL) {
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+ }
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, flags, 0)) != 0)
+ return (ret);
+
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_end --
+ * Disassociate the current transaction from the current process.
+ */
+static int
+__db_xa_end(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Convert for bit manipulation. */
+ if (flags != TMNOFLAGS && !LF_ISSET(TMSUSPEND | TMSUCCESS | TMFAIL))
+ return (XAER_INVAL);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4551",
+ "xa_end: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL)
+ return (XAER_NOTA);
+
+ if ((ret = __xa_get_txn(env, xid, td, &txn, flags, 1)) != 0)
+ return (ret);
+
+ /* We are ending; make sure there are no open cursors. */
+ if (txn->cursors != 0) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4552",
+ "xa_end: cannot end with open cursors"));
+ return (XAER_RMERR);
+ }
+
+ if (td != txn->td) {
+ dbenv->err(dbenv, ret, DB_STR("4553",
+ "xa_end: txn_detail mismatch"));
+ return (XAER_RMERR);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+
+ /*
+ * This happens if this process timed out and the TMS called
+ * __db_xa_rollback while this process was still holding the txn.
+ * The txn must then be resolved in this process.
+ */
+ if (td->status == TXN_NEED_ABORT) {
+ if (txn->abort(txn) != 0)
+ return (XAER_RMERR);
+ __xa_put_txn(env, txn);
+ return (XA_RBOTHER);
+ }
+
+ if (td->xa_br_status == TXN_XA_IDLE) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4554",
+ "xa_end: ending transaction that is idle"));
+ return (XAER_PROTO);
+ }
+
+ /*
+ * If we are deadlocked or prepared, don't change this, but
+ * if we are active and the only handle, then make this transaction
+ * idle.
+ */
+ if (td->xa_ref == 1 && td->xa_br_status == TXN_XA_ACTIVE)
+ td->xa_br_status = TXN_XA_IDLE;
+ if (LF_ISSET(TMSUSPEND)) {
+ txn->thread_info->dbth_xa_status = TXN_XA_THREAD_SUSPENDED;
+ txn->xa_thr_status = TXN_XA_THREAD_SUSPENDED;
+ } else {
+ __xa_put_txn(env, txn);
+ }
+ return (XA_OK);
+}
+
+/*
+ * If, during a transaction completion operation (commit, abort, prepare)
+ * we detect a corrupt environment, we must close and reopen the
+ * environment and check if the transaction in question exists. If it
+ * does, then we can complete the operation as requested. If it does
+ * not, then we have to return aborted, because we just recovered the
+ * environment, aborting this transaction.
+ */
+static void
+corrupted_env(env, rmid)
+ ENV *env;
+ int rmid;
+{
+ DB_ENV *dbenv;
+ const char *path;
+ char *home;
+ int ret;
+ ENV *env2;
+
+ COMPQUIET(home, NULL);
+ ret = 0;
+ dbenv = env->dbenv;
+ path = NULL;
+ if (dbenv->get_home(dbenv, &path) != 0)
+ goto err;
+ if (path != NULL && (__os_strdup(NULL, path, &home) != 0))
+ goto err;
+ /*
+ * Check that no one else came in and cleaned
+ * up the environment before we could. If they
+ * did then just call __db_xa_open to get the
+ * new environment. If they have not then
+ * unmap the old handle so no one else can get
+ * it.
+ */
+ if (__db_rmid_to_env(rmid, &env2) == 0) {
+ PANIC_CHECK_RET(env2, ret);
+ if (ret != 0)
+ (void)__db_unmap_rmid(rmid);
+ }
+
+ /*
+ * If we cannot get the environment, then it is corrupted and we
+ * are currently unable to run recovery. In that case all we can
+ * do is crash and restart, and recovery will clean up the lost
+ * transaction.
+ */
+ if (__db_xa_open(home, rmid, 0) != XA_OK)
+ goto err;
+
+ __os_free(NULL, home);
+ if (0) {
+err: exit(1);
+ }
+}
+
+/*
+ * __db_xa_prepare --
+ * Sync the log to disk so we can guarantee recoverability.
+ */
+static int
+__db_xa_prepare(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ /*
+ * We need to know if we've ever called prepare on this.
+ * As part of the prepare, we set the xa_status field to
+ * reflect that fact that prepare has been called, and if
+ * it's ever called again, it's an error.
+ */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If the environment is corrupted, reopen it or die if that
+ * is not possible.
+ */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY) {
+ corrupted_env(env, rmid);
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+ }
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4555",
+ "xa_prepare: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4556",
+ "xa_prepare: xid not found"));
+ return (XAER_NOTA);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+
+ if (td->xa_br_status != TXN_XA_ACTIVE &&
+ td->xa_br_status != TXN_XA_IDLE) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4557",
+ "xa_prepare: transaction neither active nor idle"));
+ return (XAER_PROTO);
+ }
+
+ /* Now, fill in the global transaction structure. */
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+
+ if ((ret = txnp->prepare(txnp, (u_int8_t *)xid->data)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4558",
+ "xa_prepare: txnp->prepare failed"));
+ td->xa_br_status = TXN_XA_IDLE;
+ return (XAER_RMERR);
+ }
+ td->xa_br_status = TXN_XA_PREPARED;
+
+ __xa_put_txn(env, txnp);
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_commit --
+ * Commit the transaction
+ */
+static int
+__db_xa_commit(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+#undef OK_FLAGS
+#define OK_FLAGS (TMNOFLAGS | TMNOWAIT | TMONEPHASE)
+ if (LF_ISSET(~OK_FLAGS))
+ return (XAER_INVAL);
+
+ /*
+ * We need to know if we've ever called prepare on this.
+ * We can verify this by examining the xa_status field.
+ */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If the environment is corrupted, reopen it or die if that
+ * is not possible.
+ */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY) {
+ corrupted_env(env, rmid);
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+ }
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4559",
+ "xa_commit: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4560",
+ "xa_commit: xid not found"));
+ return (XAER_NOTA);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+
+ if (LF_ISSET(TMONEPHASE) && td->xa_br_status != TXN_XA_IDLE) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4561",
+ "xa_commit: commiting transaction active in branch"));
+ return (XAER_PROTO);
+ }
+
+ if (!LF_ISSET(TMONEPHASE) && td->xa_br_status != TXN_XA_PREPARED) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4562",
+ "xa_commit: attempting to commit unprepared transaction"));
+ return (XAER_PROTO);
+ }
+
+ /* Now, fill in the global transaction structure. */
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+
+ /*
+ * Because this transaction is currently associated, commit will not free
+ * the transaction structure, which is good, because we need to do that
+ * in xa_put_txn below.
+ */
+ if ((ret = txnp->commit(txnp, 0)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4563",
+ "xa_commit: txnp->commit failed"));
+ return (XAER_RMERR);
+ }
+
+ __xa_put_txn(env, txnp);
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_recover --
+ * Returns a list of prepared and heuristically completed transactions.
+ *
+ * The return value is the number of xids placed into the xid array (less
+ * than or equal to the count parameter). The flags are going to indicate
+ * whether we are starting a scan or continuing one.
+ */
+static int
+__db_xa_recover(xids, count, rmid, flags)
+ XID *xids;
+ long count, flags;
+ int rmid;
+{
+ ENV *env;
+ int ret;
+ u_int32_t newflags;
+ long rval;
+
+ /* If the environment is closed, then we're done. */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+
+ if (LF_ISSET(TMSTARTRSCAN))
+ newflags = DB_FIRST;
+ else if (LF_ISSET(TMENDRSCAN))
+ newflags = DB_LAST;
+ else
+ newflags = DB_NEXT;
+
+ rval = 0;
+ if ((ret = __xa_txn_get_prepared(env,
+ xids, NULL, count, &rval, newflags)) != 0) {
+ env->dbenv->err(env->dbenv, ret, DB_STR("4564",
+ "xa_recover: txn_get_prepared failed"));
+ return (XAER_RMERR);
+ }
+
+ return (rval);
+}
+
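+/*
+ * Illustrative sketch (TM-side usage, not part of this file): after a
+ * crash the TM scans for prepared branches and resolves each one. Here
+ * rmid and resolve() are hypothetical:
+ *
+ *     XID xids[10];
+ *     int n;
+ *
+ *     n = db_xa_switch.xa_recover_entry(xids, 10, rmid, TMSTARTRSCAN);
+ *     while (n > 0) {
+ *             resolve(xids, n);
+ *             n = db_xa_switch.xa_recover_entry(xids, 10, rmid, TMNOFLAGS);
+ *     }
+ */
+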
+/*
+ * __db_xa_rollback
+ * Abort an XA transaction.
+ */
+static int
+__db_xa_rollback(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If the environment is corrupted, reopen it or die if that
+ * is not possible.
+ */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY) {
+ corrupted_env(env, rmid);
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+ }
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4565",
+ "xa_rollback: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4566",
+ "xa_rollback: xid not found"));
+ return (XAER_NOTA);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+
+ if (td->xa_br_status != TXN_XA_ACTIVE &&
+ td->xa_br_status != TXN_XA_IDLE &&
+ td->xa_br_status != TXN_XA_PREPARED) {
+ dbenv->err(dbenv, EINVAL, DB_STR_A("4567",
+ "xa_rollback: transaction in invalid state %d",
+ "%d"), (int)td->xa_br_status);
+ return (XAER_PROTO);
+ }
+
+ /* Now, fill in the global transaction structure. */
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+ /*
+ * Normally abort frees the txnp, but if this is an associated XA
+ * transaction, then abort will not free it; we do that below.
+ */
+ if ((ret = txnp->abort(txnp)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4568",
+ "xa_rollback: failure aborting transaction"));
+ return (XAER_RMERR);
+ }
+
+ __xa_put_txn(env, txnp);
+
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_forget --
+ * Forget about an XID for a transaction that was heuristically
+ * completed. Since we do not heuristically complete anything, I
+ * don't think we have to do anything here, but we should make sure
+ * that we reclaim the slots in the txnid table.
+ */
+static int
+__db_xa_forget(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If mapping is gone, then we're done.
+ */
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4569",
+ "xa_forget: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, ret, DB_STR("4570",
+ "xa_forget: xid not found"));
+ return (XA_OK);
+ }
+
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+
+ if ((ret = txnp->discard(txnp, 0)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4571",
+ "xa_forget: txnp->discard failed"));
+ return (XAER_RMFAIL);
+ }
+
+ __xa_put_txn(env, txnp);
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_complete --
+ * Used to wait for asynchronous operations to complete. Since we're
+ * not doing asynch, this is an invalid operation.
+ */
+static int
+__db_xa_complete(handle, retval, rmid, flags)
+ int *handle, *retval, rmid;
+ long flags;
+{
+ COMPQUIET(handle, NULL);
+ COMPQUIET(retval, NULL);
+ COMPQUIET(rmid, 0);
+ COMPQUIET(flags, 0);
+
+ return (XAER_INVAL);
+}
diff --git a/src/xa/xa_map.c b/src/xa/xa_map.c
new file mode 100644
index 00000000..4dcf4d75
--- /dev/null
+++ b/src/xa/xa_map.c
@@ -0,0 +1,152 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/xa_ext.h"
+
+/*
+ * This file contains all the mapping information that we need to support
+ * the DB/XA interface.
+ */
+
+/*
+ * __db_rmid_to_env
+ * Return the environment associated with a given XA rmid.
+ *
+ * PUBLIC: int __db_rmid_to_env __P((int, ENV **));
+ */
+int
+__db_rmid_to_env(rmid, envp)
+ int rmid;
+ ENV **envp;
+{
+ ENV *env;
+
+ *envp = NULL;
+ if (TAILQ_EMPTY(&DB_GLOBAL(envq)))
+ TAILQ_INIT(&DB_GLOBAL(envq));
+
+ /*
+ * When we map an rmid, move that environment to be the first one in
+ * the list of environments, so we acquire the correct environment
+ * in DB->open.
+ */
+ for (env = TAILQ_FIRST(&DB_GLOBAL(envq));
+ env != NULL; env = TAILQ_NEXT(env, links)) {
+ if (env->xa_rmid == rmid) {
+ *envp = env;
+ if (env != TAILQ_FIRST(&DB_GLOBAL(envq))) {
+ TAILQ_REMOVE(&DB_GLOBAL(envq), env, links);
+ TAILQ_INSERT_HEAD(&DB_GLOBAL(envq), env, links);
+ }
+ return (0);
+ }
+ }
+ return (1);
+}
+
+/*
+ * __db_xid_to_txn
+ * Return the txn that corresponds to this XID.
+ *
+ * PUBLIC: int __db_xid_to_txn __P((ENV *, XID *, TXN_DETAIL **));
+ */
+int
+__db_xid_to_txn(env, xid, tdp)
+ ENV *env;
+ XID *xid;
+ TXN_DETAIL **tdp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ u_int8_t *gid;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * Search the internal active transaction table to find the
+ * matching xid. If this is a performance hit, then we
+ * can create a hash table, but I doubt it's worth it.
+ */
+ TXN_SYSTEM_LOCK(env);
+ gid = (u_int8_t *)(xid->data);
+ SH_TAILQ_FOREACH(*tdp, &region->active_txn, links, __txn_detail)
+ if (memcmp(gid, (*tdp)->gid, sizeof((*tdp)->gid)) == 0)
+ break;
+ TXN_SYSTEM_UNLOCK(env);
+
+ /*
+ * The return type is int because TXN_SYSTEM_{UN}LOCK could, in
+ * principle, fail; at present this always returns 0.
+ */
+ return (0);
+}
+
+/*
+ * __db_map_rmid
+ * Create a mapping between the specified rmid and environment.
+ *
+ * PUBLIC: void __db_map_rmid __P((int, ENV *));
+ */
+void
+__db_map_rmid(rmid, env)
+ int rmid;
+ ENV *env;
+{
+ env->xa_rmid = rmid;
+ TAILQ_INSERT_HEAD(&DB_GLOBAL(envq), env, links);
+}
+
+/*
+ * __db_unmap_rmid
+ * Destroy the mapping for the given rmid.
+ *
+ * PUBLIC: int __db_unmap_rmid __P((int));
+ */
+int
+__db_unmap_rmid(rmid)
+ int rmid;
+{
+ ENV *e;
+
+ for (e = TAILQ_FIRST(&DB_GLOBAL(envq));
+ e != NULL && e->xa_rmid != rmid;
+ e = TAILQ_NEXT(e, links))
+ ;
+
+ if (e == NULL)
+ return (EINVAL);
+
+ TAILQ_REMOVE(&DB_GLOBAL(envq), e, links);
+ return (0);
+}
+
+/*
+ * __db_unmap_xid
+ * Destroy the mapping for the specified XID.
+ *
+ * PUBLIC: void __db_unmap_xid __P((ENV *, XID *, size_t));
+ */
+void
+__db_unmap_xid(env, xid, off)
+ ENV *env;
+ XID *xid;
+ size_t off;
+{
+ TXN_DETAIL *td;
+
+ COMPQUIET(xid, NULL);
+
+ td = R_ADDR(&env->tx_handle->reginfo, off);
+ memset(td->gid, 0, sizeof(td->gid));
+}