Diffstat (limited to 'bdb/mp')
-rw-r--r-- | bdb/mp/Design | 52
-rw-r--r-- | bdb/mp/mp_alloc.c | 430
-rw-r--r-- | bdb/mp/mp_bh.c | 568
-rw-r--r-- | bdb/mp/mp_fget.c | 763
-rw-r--r-- | bdb/mp/mp_fopen.c | 1167
-rw-r--r-- | bdb/mp/mp_fput.c | 196
-rw-r--r-- | bdb/mp/mp_fset.c | 63
-rw-r--r-- | bdb/mp/mp_method.c | 109
-rw-r--r-- | bdb/mp/mp_region.c | 211
-rw-r--r-- | bdb/mp/mp_register.c | 33
-rw-r--r-- | bdb/mp/mp_stat.c | 325
-rw-r--r-- | bdb/mp/mp_sync.c | 909
-rw-r--r-- | bdb/mp/mp_trickle.c | 136
13 files changed, 2917 insertions, 2045 deletions
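The first hunk below deletes the mp/Design notes, which describe mpool's locking protocol: a single region lock protects all shared structures, including the buffer headers, while a per-buffer mutex is held for the duration of any I/O so that other threads can wait on the BH_LOCKED flag. As background for reading that hunk, here is a minimal illustrative sketch of the buffer-search wait loop the notes describe. The names (region_lock, find_buffer, yield_cpu) are hypothetical stand-ins, and plain POSIX mutexes substitute for Berkeley DB's shared-memory mutex layer; this is not the library's code.

#include <pthread.h>
#include <stddef.h>

/* Hypothetical stand-ins for the structures the Design notes discuss. */
#define BH_LOCKED 0x01                    /* set while I/O is in progress */

struct buffer {
	pthread_mutex_t mutex;            /* per-buffer I/O lock */
	int flags;
	int ref;                          /* reference count */
};

extern pthread_mutex_t region_lock;              /* region-wide mutex */
extern struct buffer *find_buffer(int pgno);     /* hash lookup (assumed) */
extern void yield_cpu(void);                     /* e.g., sched_yield() */

/* Search for a buffer, waiting out any in-progress I/O on it. */
struct buffer *
search_buffer(int pgno)
{
	struct buffer *bhp;
	int first;

	pthread_mutex_lock(&region_lock);
	if ((bhp = find_buffer(pgno)) == NULL) {  /* assume page resident */
		pthread_mutex_unlock(&region_lock);
		return (NULL);
	}
	++bhp->ref;                       /* guarantee the buffer stays */
	for (first = 1; bhp->flags & BH_LOCKED; first = 0) {
		pthread_mutex_unlock(&region_lock);
		/*
		 * Yield after the first pass; otherwise we would spend
		 * our quantum ping-ponging between the two locks.
		 */
		if (!first)
			yield_cpu();
		pthread_mutex_lock(&bhp->mutex);  /* blocks until I/O done */
		pthread_mutex_unlock(&bhp->mutex);
		pthread_mutex_lock(&region_lock);
	}
	pthread_mutex_unlock(&region_lock);
	return (bhp);
}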
diff --git a/bdb/mp/Design b/bdb/mp/Design deleted file mode 100644 index 1b26aae6cba..00000000000 --- a/bdb/mp/Design +++ /dev/null @@ -1,52 +0,0 @@ -$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $ - -There are three ways we do locking in the mpool code: - -Locking a handle mutex to provide concurrency for DB_THREAD operations. -Locking the region mutex to provide mutual exclusion while reading and - writing structures in the shared region. -Locking buffer header mutexes during I/O. - -The first will not be further described here. We use the shared mpool -region lock to provide mutual exclusion while reading/modifying all of -the data structures, including the buffer headers. We use a per-buffer -header lock to wait on buffer I/O. The order of locking is as follows: - -Searching for a buffer: - Acquire the region lock. - Find the buffer header. - Increment the reference count (guarantee the buffer stays). - While the BH_LOCKED flag is set (I/O is going on) { - Release the region lock. - Explicitly yield the processor if it's not the first pass - through this loop, otherwise, we can simply spin because - we'll be simply switching between the two locks. - Request the buffer lock. - The I/O will complete... - Acquire the buffer lock. - Release the buffer lock. - Acquire the region lock. - } - Return the buffer. - -Reading/writing a buffer: - Acquire the region lock. - Find/create the buffer header. - If reading, increment the reference count (guarantee the buffer stays). - Set the BH_LOCKED flag. - Acquire the buffer lock (guaranteed not to block). - Release the region lock. - Do the I/O and/or initialize the buffer contents. - Release the buffer lock. - At this point, the buffer lock is available, but the logical - operation (flagged by BH_LOCKED) is not yet completed. For - this reason, among others, threads checking the BH_LOCKED flag - must loop around their test. - Acquire the region lock. - Clear the BH_LOCKED flag. - Release the region lock. - Return/discard the buffer. - -Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are -not reacquired when a region lock is reacquired because they couldn't -have been closed/discarded and because they never move in memory. diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c index 731f569f57f..96dd612d7ba 100644 --- a/bdb/mp/mp_alloc.c +++ b/bdb/mp/mp_alloc.c @@ -1,22 +1,31 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $"; +static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#include <string.h> #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" + +typedef struct { + DB_MPOOL_HASH *bucket; + u_int32_t priority; +} HS; + +static void __memp_bad_buffer __P((DB_MPOOL_HASH *)); +static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *)); /* * __memp_alloc -- @@ -34,14 +43,32 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) roff_t *offsetp; void *retp; { - BH *bhp, *nbhp; + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp; + DB_MUTEX *mutexp; MPOOL *c_mp; MPOOLFILE *bh_mfp; - size_t total; - int nomore, restart, ret, wrote; + size_t freed_space; + u_int32_t buckets, buffers, high_priority, max_na, priority; + int aggressive, ret; void *p; + dbenv = dbmp->dbenv; c_mp = memreg->primary; + dbht = R_ADDR(memreg, c_mp->htab); + hp_end = &dbht[c_mp->htab_buckets]; + + buckets = buffers = 0; + aggressive = 0; + + c_mp->stat.st_alloc++; + + /* + * Get aggressive if we've tried to flush the number of pages as are + * in the system without finding space. + */ + max_na = 5 * c_mp->htab_buckets; /* * If we're allocating a buffer, and the one we're discarding is the @@ -53,100 +80,363 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) if (mfp != NULL) len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; - nomore = 0; + R_LOCK(dbenv, memreg); + + /* + * On every buffer allocation we update the buffer generation number + * and check for wraparound. + */ + if (++c_mp->lru_count == UINT32_T_MAX) + __memp_reset_lru(dbenv, memreg, c_mp); + + /* + * Anything newer than 1/10th of the buffer pool is ignored during + * allocation (unless allocation starts failing). + */ + DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10); + high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; + + /* + * First we try to allocate from free memory. If that fails, scan the + * buffer pool to find buffers with low priorities. We consider small + * sets of hash buckets each time to limit the amount of work needing + * to be done. This approximates LRU, but not very well. We either + * find a buffer of the same size to use, or we will free 3 times what + * we need in the hopes it will coalesce into a contiguous chunk of the + * right size. In the latter case we branch back here and try again. + */ alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) { - if (offsetp != NULL) + if (mfp != NULL) + c_mp->stat.st_pages++; + R_UNLOCK(dbenv, memreg); + +found: if (offsetp != NULL) *offsetp = R_OFFSET(memreg, p); *(void **)retp = p; + + /* + * Update the search statistics. + * + * We're not holding the region locked here, these statistics + * can't be trusted. 
+ */ + if (buckets != 0) { + if (buckets > c_mp->stat.st_alloc_max_buckets) + c_mp->stat.st_alloc_max_buckets = buckets; + c_mp->stat.st_alloc_buckets += buckets; + } + if (buffers != 0) { + if (buffers > c_mp->stat.st_alloc_max_pages) + c_mp->stat.st_alloc_max_pages = buffers; + c_mp->stat.st_alloc_pages += buffers; + } return (0); } - if (nomore) { - __db_err(dbmp->dbenv, - "Unable to allocate %lu bytes from mpool shared region: %s\n", - (u_long)len, db_strerror(ret)); - return (ret); - } -retry: /* Find a buffer we can flush; pure LRU. */ - restart = total = 0; - for (bhp = - SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) { - nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + /* + * We re-attempt the allocation every time we've freed 3 times what + * we need. Reset our free-space counter. + */ + freed_space = 0; - /* Ignore pinned or locked (I/O in progress) buffers. */ - if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + /* + * Walk the hash buckets and find the next two with potentially useful + * buffers. Free the buffer with the lowest priority from the buckets' + * chains. + */ + for (hp_tmp = NULL;;) { + /* Check for wrap around. */ + hp = &dbht[c_mp->last_checked++]; + if (hp >= hp_end) { + c_mp->last_checked = 0; + + /* + * If we've gone through all of the hash buckets, try + * an allocation. If the cache is small, the old page + * size is small, and the new page size is large, we + * might have freed enough memory (but not 3 times the + * memory). + */ + goto alloc; + } + + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) continue; - /* Find the associated MPOOLFILE. */ - bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + /* + * The failure mode is when there are too many buffers we can't + * write or there's not enough memory in the system. We don't + * have a metric for deciding if allocation has no possible way + * to succeed, so we don't ever fail, we assume memory will be + * available if we wait long enough. + * + * Get aggressive if we've tried to flush 5 times the number of + * hash buckets as are in the system -- it's possible we have + * been repeatedly trying to flush the same buffers, although + * it's unlikely. Aggressive means: + * + * a: set a flag to attempt to flush high priority buffers as + * well as other buffers. + * b: sync the mpool to force out queue extent pages. While we + * might not have enough space for what we want and flushing + * is expensive, why not? + * c: sleep for a second -- hopefully someone else will run and + * free up some memory. Try to allocate memory too, in case + * the other thread returns its memory to the region. + * d: look at a buffer in every hash bucket rather than choose + * the more preferable of two. + * + * !!! + * This test ignores pathological cases like no buffers in the + * system -- that shouldn't be possible. + */ + if ((++buckets % max_na) == 0) { + aggressive = 1; - /* Write the page if it's dirty. */ - if (F_ISSET(bhp, BH_DIRTY)) { - ++bhp->ref; - if ((ret = __memp_bhwrite(dbmp, - bh_mfp, bhp, &restart, &wrote)) != 0) - return (ret); - --bhp->ref; + R_UNLOCK(dbenv, memreg); - /* - * Another process may have acquired this buffer and - * incremented the ref count after we wrote it. 
- */ - if (bhp->ref != 0) - goto retry; + (void)__memp_sync_int( + dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); + + (void)__os_sleep(dbenv, 1, 0); + + R_LOCK(dbenv, memreg); + goto alloc; + } + + if (!aggressive) { + /* Skip high priority buckets. */ + if (hp->hash_priority > high_priority) + continue; /* - * If we wrote the page, continue and free the buffer. - * We don't have to rewalk the list to acquire the - * buffer because it was never available for any other - * process to modify it. - * - * If we didn't write the page, but we discarded and - * reacquired the region lock, restart the list walk. - * - * If we neither wrote the buffer nor discarded the - * region lock, continue down the buffer list. + * Find two buckets and select the one with the lowest + * priority. Performance testing shows that looking + * at two improves the LRUness and looking at more only + * does a little better. */ - if (wrote) - ++c_mp->stat.st_rw_evict; - else { - if (restart) - goto retry; + if (hp_tmp == NULL) { + hp_tmp = hp; continue; } + if (hp->hash_priority > hp_tmp->hash_priority) + hp = hp_tmp; + hp_tmp = NULL; + } + + /* Remember the priority of the buffer we're looking for. */ + priority = hp->hash_priority; + + /* Unlock the region and lock the hash bucket. */ + R_UNLOCK(dbenv, memreg); + mutexp = &hp->hash_mutex; + MUTEX_LOCK(dbenv, mutexp); + +#ifdef DIAGNOSTIC + __memp_check_order(hp); +#endif + /* + * The lowest priority page is first in the bucket, as they are + * maintained in sorted order. + * + * The buffer may have been freed or its priority changed while + * we switched from the region lock to the hash lock. If so, + * we have to restart. We will still take the first buffer on + * the bucket's list, though, if it has a low enough priority. + */ + if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL || + bhp->ref != 0 || bhp->priority > priority) + goto next_hb; + + buffers++; + + /* Find the associated MPOOLFILE. */ + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* If the page is dirty, pin it and write it. */ + ret = 0; + if (F_ISSET(bhp, BH_DIRTY)) { + ++bhp->ref; + ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0); + --bhp->ref; + if (ret == 0) + ++c_mp->stat.st_rw_evict; } else ++c_mp->stat.st_ro_evict; /* + * If a write fails for any reason, we can't proceed. + * + * We released the hash bucket lock while doing I/O, so another + * thread may have acquired this buffer and incremented the ref + * count after we wrote it, in which case we can't have it. + * + * If there's a write error, avoid selecting this buffer again + * by making it the bucket's least-desirable buffer. + */ + if (ret != 0 || bhp->ref != 0) { + if (ret != 0 && aggressive) + __memp_bad_buffer(hp); + goto next_hb; + } + + /* * Check to see if the buffer is the size we're looking for. - * If it is, simply reuse it. + * If so, we can simply reuse it. Else, free the buffer and + * its space and keep looking. */ if (mfp != NULL && mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { - __memp_bhfree(dbmp, bhp, 0); + __memp_bhfree(dbmp, hp, bhp, 0); - if (offsetp != NULL) - *offsetp = R_OFFSET(memreg, bhp); - *(void **)retp = bhp; - return (0); + p = bhp; + goto found; } - /* Note how much space we've freed, and free the buffer. */ - total += __db_shsizeof(bhp); - __memp_bhfree(dbmp, bhp, 1); + freed_space += __db_shsizeof(bhp); + __memp_bhfree(dbmp, hp, bhp, 1); /* - * Retry as soon as we've freed up sufficient space. 
If we - * have to coalesce of memory to satisfy the request, don't - * try until it's likely (possible?) that we'll succeed. + * Unlock this hash bucket and re-acquire the region lock. If + * we're reaching here as a result of calling memp_bhfree, the + * hash bucket lock has already been discarded. */ - if (total >= 3 * len) + if (0) { +next_hb: MUTEX_UNLOCK(dbenv, mutexp); + } + R_LOCK(dbenv, memreg); + + /* + * Retry the allocation as soon as we've freed up sufficient + * space. We're likely to have to coalesce of memory to + * satisfy the request, don't try until it's likely (possible?) + * we'll succeed. + */ + if (freed_space >= 3 * len) goto alloc; + } + /* NOTREACHED */ +} + +/* + * __memp_bad_buffer -- + * Make the first buffer in a hash bucket the least desirable buffer. + */ +static void +__memp_bad_buffer(hp) + DB_MPOOL_HASH *hp; +{ + BH *bhp, *t_bhp; + u_int32_t priority; + + /* Remove the first buffer from the bucket. */ + bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + + /* + * Find the highest priority buffer in the bucket. Buffers are + * sorted by priority, so it's the last one in the bucket. + * + * XXX + * Should use SH_TAILQ_LAST, but I think that macro is broken. + */ + priority = bhp->priority; + for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh)) + priority = t_bhp->priority; + + /* + * Set our buffer's priority to be just as bad, and append it to + * the bucket. + */ + bhp->priority = priority; + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - /* Restart the walk if we discarded the region lock. */ - if (restart) - goto retry; + /* Reset the hash bucket's priority. */ + hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; +} + +/* + * __memp_reset_lru -- + * Reset the cache LRU counter. + */ +static void +__memp_reset_lru(dbenv, memreg, c_mp) + DB_ENV *dbenv; + REGINFO *memreg; + MPOOL *c_mp; +{ + BH *bhp; + DB_MPOOL_HASH *hp; + int bucket; + + /* + * Update the counter so all future allocations will start at the + * bottom. + */ + c_mp->lru_count -= MPOOL_BASE_DECREMENT; + + /* Release the region lock. */ + R_UNLOCK(dbenv, memreg); + + /* Adjust the priority of every buffer in the system. */ + for (hp = R_ADDR(memreg, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + if (bhp->priority != UINT32_T_MAX && + bhp->priority > MPOOL_BASE_DECREMENT) + bhp->priority -= MPOOL_BASE_DECREMENT; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } - nomore = 1; - goto alloc; + + /* Reacquire the region lock. */ + R_LOCK(dbenv, memreg); +} + +#ifdef DIAGNOSTIC +/* + * __memp_check_order -- + * Verify the priority ordering of a hash bucket chain. + * + * PUBLIC: #ifdef DIAGNOSTIC + * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *)); + * PUBLIC: #endif + */ +void +__memp_check_order(hp) + DB_MPOOL_HASH *hp; +{ + BH *bhp; + u_int32_t priority; + + /* + * Assumes the hash bucket is locked. 
+ */ + if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) + return; + + DB_ASSERT(bhp->priority == hp->hash_priority); + + for (priority = bhp->priority; + (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL; + priority = bhp->priority) + DB_ASSERT(priority <= bhp->priority); } +#endif diff --git a/bdb/mp/mp_bh.c b/bdb/mp/mp_bh.c index e802b165b2d..85d15218abf 100644 --- a/bdb/mp/mp_bh.c +++ b/bdb/mp/mp_bh.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $"; +static const char revid[] = "$Id: mp_bh.c,v 11.71 2002/09/04 19:06:45 margo Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,40 +18,41 @@ static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" -#include "log.h" -#include "db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" +#include "dbinc/log.h" +#include "dbinc/db_page.h" +static int __memp_pgwrite + __P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *)); static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); /* * __memp_bhwrite -- - * Write the page associated with a given bucket header. + * Write the page associated with a given buffer header. * - * PUBLIC: int __memp_bhwrite - * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); + * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *, + * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int)); */ int -__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) +__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; MPOOLFILE *mfp; BH *bhp; - int *restartp, *wrotep; + int open_extents; { + DB_ENV *dbenv; DB_MPOOLFILE *dbmfp; DB_MPREG *mpreg; - int incremented, ret; + int local_open, incremented, ret; - if (restartp != NULL) - *restartp = 0; - if (wrotep != NULL) - *wrotep = 0; - incremented = 0; + dbenv = dbmp->dbenv; + local_open = incremented = 0; /* - * If the file has been removed or is a closed temporary file, Jump - * right ahead and pretend that we've found the file we want-- the + * If the file has been removed or is a closed temporary file, jump + * right ahead and pretend that we've found the file we want -- the * page-write function knows how to handle the fact that we don't have * (or need!) any real file descriptor information. */ @@ -66,52 +67,60 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) * If we find a descriptor on the file that's not open for writing, we * try and upgrade it to make it writeable. If that fails, we're done. */ - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) if (dbmfp->mfp == mfp) { if (F_ISSET(dbmfp, MP_READONLY) && - __memp_upgrade(dbmp, dbmfp, mfp)) { - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); - return (0); + !F_ISSET(dbmfp, MP_UPGRADE) && + (F_ISSET(dbmfp, MP_UPGRADE_FAIL) || + __memp_upgrade(dbmp, dbmfp, mfp))) { + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + return (EPERM); } /* * Increment the reference count -- see the comment in - * memp_fclose(). + * __memp_fclose_int(). 
*/ ++dbmfp->ref; incremented = 1; break; } - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if (dbmfp != NULL) goto found; /* * !!! + * It's the caller's choice if we're going to open extent files. + */ + if (!open_extents && F_ISSET(mfp, MP_EXTENT)) + return (EPERM); + + /* + * !!! * Don't try to attach to temporary files. There are two problems in * trying to do that. First, if we have different privileges than the * process that "owns" the temporary file, we might create the backing * disk file such that the owning process couldn't read/write its own - * buffers, e.g., memp_trickle() running as root creating a file owned + * buffers, e.g., memp_trickle running as root creating a file owned * as root, mode 600. Second, if the temporary file has already been * created, we don't have any way of finding out what its real name is, * and, even if we did, it was already unlinked (so that it won't be * left if the process dies horribly). This decision causes a problem, * however: if the temporary file consumes the entire buffer cache, * and the owner doesn't flush the buffers to disk, we could end up - * with resource starvation, and the memp_trickle() thread couldn't do + * with resource starvation, and the memp_trickle thread couldn't do * anything about it. That's a pretty unlikely scenario, though. * - * Note that we should never get here when the temporary file - * in question has already been closed in another process, in which - * case it should be marked MP_DEADFILE. + * Note we should never get here when the temporary file in question + * has already been closed in another process, in which case it should + * be marked MP_DEADFILE. */ - if (F_ISSET(mfp, MP_TEMP)) { - DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE)); - return (0); - } + if (F_ISSET(mfp, MP_TEMP)) + return (EPERM); /* * It's not a page from a file we've opened. If the file requires @@ -120,14 +129,14 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) * nothing we can do. */ if (mfp->ftype != 0) { - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (mpreg = LIST_FIRST(&dbmp->dbregq); mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) if (mpreg->ftype == mfp->ftype) break; - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (mpreg == NULL) - return (0); + return (EPERM); } /* @@ -138,17 +147,24 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) * There's no negative cache, so we may repeatedly try and open files * that we have previously tried (and failed) to open. 
*/ - if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off), - 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) - return (0); + if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0) + return (ret); + if ((ret = __memp_fopen_int(dbmfp, mfp, + R_ADDR(dbmp->reginfo, mfp->path_off), + 0, 0, mfp->stat.st_pagesize)) != 0) { + (void)dbmfp->close(dbmfp, 0); + return (ret); + } + local_open = 1; -found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep); +found: ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp); - if (incremented) { - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + if (incremented) --dbmfp->ref; - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); - } + else if (local_open) + F_SET(dbmfp, MP_FLUSH); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); return (ret); } @@ -157,11 +173,12 @@ found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep); * __memp_pgread -- * Read a page from a file. * - * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int)); */ int -__memp_pgread(dbmfp, bhp, can_create) +__memp_pgread(dbmfp, mutexp, bhp, can_create) DB_MPOOLFILE *dbmfp; + DB_MUTEX *mutexp; BH *bhp; int can_create; { @@ -169,171 +186,129 @@ __memp_pgread(dbmfp, bhp, can_create) DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; - size_t len, pagesize; - size_t nr; - int created, ret; + size_t len, nr, pagesize; + int ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; mfp = dbmfp->mfp; pagesize = mfp->stat.st_pagesize; + /* We should never be called with a dirty or a locked buffer. */ + DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED)); + + /* Lock the buffer and swap the hash bucket lock for the buffer lock. */ F_SET(bhp, BH_LOCKED | BH_TRASH); - MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); - R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, &bhp->mutex); + MUTEX_UNLOCK(dbenv, mutexp); /* * Temporary files may not yet have been created. We don't create * them now, we create them when the pages have to be flushed. */ nr = 0; - if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) { - /* - * Ignore read errors if we have permission to create the page. - * Assume that the page doesn't exist, and that we'll create it - * when we write it out. - * - * XXX - * Theoretically, we could overwrite a page of data if it were - * possible for a file to be successfully opened for reading - * and then for the read to fail. Shouldn't ever happen, but - * it might be worth checking to see if the offset is past the - * known end-of-file. - */ - db_io.fhp = &dbmfp->fh; + if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) { + db_io.fhp = dbmfp->fhp; db_io.mutexp = dbmfp->mutexp; db_io.pagesize = db_io.bytes = pagesize; db_io.pgno = bhp->pgno; db_io.buf = bhp->buf; - ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr); - } else - ret = 0; + /* + * The page may not exist; if it doesn't, nr may well be 0, + * but we expect the underlying OS calls not to return an + * error code in this case. + */ + if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0) + goto err; + } - created = 0; if (nr < pagesize) { - if (can_create) - created = 1; - else { - /* - * If we had a short read, ret may be 0. This may not - * be an error -- in particular DB recovery processing - * may request pages that have never been written to - * disk, in which case we won't find the page. So, the - * caller must know how to handle the error. - */ - if (ret == 0) - ret = EIO; + /* + * Don't output error messages for short reads. 
In particular, + * DB recovery processing may request pages never written to + * disk or for which only some part have been written to disk, + * in which case we won't find the page. The caller must know + * how to handle the error. + */ + if (can_create == 0) { + ret = DB_PAGE_NOTFOUND; goto err; } - } - /* - * Clear any bytes we didn't read that need to be cleared. If we're - * running in diagnostic mode, smash any bytes on the page that are - * unknown quantities for the caller. - */ - if (nr != pagesize) { + /* Clear any bytes that need to be cleared. */ len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; - if (nr < len) - memset(bhp->buf + nr, 0, len - nr); -#ifdef DIAGNOSTIC - if (nr > len) - len = nr; + memset(bhp->buf, 0, len); + +#if defined(DIAGNOSTIC) || defined(UMRW) + /* + * If we're running in diagnostic mode, corrupt any bytes on + * the page that are unknown quantities for the caller. + */ if (len < pagesize) memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); #endif - } + ++mfp->stat.st_page_create; + } else + ++mfp->stat.st_page_in; /* Call any pgin function. */ ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); - /* Unlock the buffer and reacquire the region lock. */ + /* Unlock the buffer and reacquire the hash bucket lock. */ err: MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, mutexp); /* * If no errors occurred, the data is now valid, clear the BH_TRASH * flag; regardless, clear the lock bit and let other threads proceed. */ F_CLR(bhp, BH_LOCKED); - if (ret == 0) { + if (ret == 0) F_CLR(bhp, BH_TRASH); - /* Update the statistics. */ - if (created) - ++mfp->stat.st_page_create; - else - ++mfp->stat.st_page_in; - } - return (ret); } /* * __memp_pgwrite -- * Write a page to a file. - * - * PUBLIC: int __memp_pgwrite - * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *)); */ -int -__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep) +static int +__memp_pgwrite(dbmp, dbmfp, hp, bhp) DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; BH *bhp; - int *restartp, *wrotep; { DB_ENV *dbenv; DB_IO db_io; DB_LSN lsn; - MPOOL *c_mp, *mp; MPOOLFILE *mfp; size_t nw; - int callpgin, dosync, ret, syncfail; - const char *fail; + int callpgin, ret; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; mfp = dbmfp == NULL ? NULL : dbmfp->mfp; - - if (restartp != NULL) - *restartp = 0; - if (wrotep != NULL) - *wrotep = 0; - callpgin = 0; + callpgin = ret = 0; /* - * Check the dirty bit -- this buffer may have been written since we - * decided to write it. + * We should never be called with a clean or trash buffer. + * The sync code does call us with already locked buffers. */ - if (!F_ISSET(bhp, BH_DIRTY)) { - if (wrotep != NULL) - *wrotep = 1; - return (0); - } - - MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + DB_ASSERT(F_ISSET(bhp, BH_DIRTY)); + DB_ASSERT(!F_ISSET(bhp, BH_TRASH)); /* - * If there were two writers, we may have just been waiting while the - * other writer completed I/O on this buffer. Check the dirty bit one - * more time. + * If we have not already traded the hash bucket lock for the buffer + * lock, do so now. 
*/ - if (!F_ISSET(bhp, BH_DIRTY)) { - MUTEX_UNLOCK(dbenv, &bhp->mutex); - - if (wrotep != NULL) - *wrotep = 1; - return (0); + if (!F_ISSET(bhp, BH_LOCKED)) { + F_SET(bhp, BH_LOCKED); + MUTEX_LOCK(dbenv, &bhp->mutex); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } - F_SET(bhp, BH_LOCKED); - R_UNLOCK(dbenv, dbmp->reginfo); - - if (restartp != NULL) - *restartp = 1; - /* * It's possible that the underlying file doesn't exist, either * because of an outright removal or because it was a temporary @@ -347,155 +322,122 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep) goto file_dead; /* - * Ensure the appropriate log records are on disk. If the page is - * being written as part of a sync operation, the flush has already - * been done, unless it was written by the application *after* the - * sync was scheduled. + * If the page is in a file for which we have LSN information, we have + * to ensure the appropriate log records are on disk. */ - if (LOGGING_ON(dbenv) && - (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) { + if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) { memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); - if ((ret = log_flush(dbenv, &lsn)) != 0) + if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0) goto err; } - DB_ASSERT(!LOGGING_ON(dbenv) || - log_compare(&((LOG *)((DB_LOG *) - dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0); + +#ifdef DIAGNOSTIC + /* + * Verify write-ahead logging semantics. + * + * !!! + * One special case. There is a single field on the meta-data page, + * the last-page-number-in-the-file field, for which we do not log + * changes. If the page was originally created in a database that + * didn't have logging turned on, we can see a page marked dirty but + * for which no corresponding log record has been written. However, + * the only way that a page can be created for which there isn't a + * previous log record and valid LSN is when the page was created + * without logging turned on, and so we check for that special-case + * LSN value. + */ + if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) { + /* + * There is a potential race here. If we are in the midst of + * switching log files, it's possible we could test against the + * old file and the new offset in the log region's LSN. If we + * fail the first test, acquire the log mutex and check again. + */ + DB_LOG *dblp; + LOG *lp; + + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + if (!IS_NOT_LOGGED_LSN(LSN(bhp->buf)) && + log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) { + R_LOCK(dbenv, &dblp->reginfo); + DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0); + R_UNLOCK(dbenv, &dblp->reginfo); + } + } +#endif /* * Call any pgout function. We set the callpgin flag so that we flag * that the contents of the buffer will need to be passed through pgin * before they are reused. */ - if (mfp->ftype == 0) - ret = 0; - else { + if (mfp->ftype != 0) { callpgin = 1; if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) goto err; } /* Temporary files may not yet have been created. */ - if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) { + if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) { MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) && - ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL, - DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP, - &dbmfp->fh, NULL)) != 0 || - !F_ISSET(&dbmfp->fh, DB_FH_VALID))) { - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 : + __db_appname(dbenv, DB_APP_TMP, NULL, + F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? 
DB_OSO_DIRECT : 0, + dbmfp->fhp, NULL); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if (ret != 0) { __db_err(dbenv, "unable to create temporary backing file"); goto err; } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); } /* Write the page. */ - db_io.fhp = &dbmfp->fh; + db_io.fhp = dbmfp->fhp; db_io.mutexp = dbmfp->mutexp; db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; db_io.pgno = bhp->pgno; db_io.buf = bhp->buf; if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - ret = __db_panic(dbenv, ret); - fail = "write"; - goto syserr; - } - if (nw != mfp->stat.st_pagesize) { - ret = EIO; - fail = "write"; - goto syserr; + __db_err(dbenv, "%s: write failed for page %lu", + __memp_fn(dbmfp), (u_long)bhp->pgno); + goto err; } + ++mfp->stat.st_page_out; +err: file_dead: /* * !!! * Once we pass this point, dbmfp and mfp may be NULL, we may not have * a valid file reference. * - * Unlock the buffer and reacquire the region lock. + * Unlock the buffer and reacquire the hash lock. */ MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, &hp->hash_mutex); /* - * Clean up the flags based on a successful write. - * * If we rewrote the page, it will need processing by the pgin * routine before reuse. */ if (callpgin) F_SET(bhp, BH_CALLPGIN); - F_CLR(bhp, BH_DIRTY | BH_LOCKED); /* - * If we write a buffer for which a checkpoint is waiting, update - * the count of pending buffers (both in the mpool as a whole and - * for this file). If the count for this file goes to zero, set a - * flag so we flush the writes. + * Update the hash bucket statistics, reset the flags. + * If we were successful, the page is no longer dirty. */ - dosync = 0; - if (F_ISSET(bhp, BH_SYNC)) { - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - - --mp->lsn_cnt; - if (mfp != NULL) - dosync = --mfp->lsn_cnt == 0 ? 1 : 0; - } - - /* Update the page clean/dirty statistics. */ - c_mp = BH_TO_CACHE(dbmp, bhp); - ++c_mp->stat.st_page_clean; - --c_mp->stat.st_page_dirty; - - /* Update I/O statistics. */ - if (mfp != NULL) - ++mfp->stat.st_page_out; + if (ret == 0) { + DB_ASSERT(hp->hash_page_dirty != 0); + --hp->hash_page_dirty; - /* - * Do the sync after everything else has been updated, so any incoming - * checkpoint doesn't see inconsistent information. - * - * XXX: - * Don't lock the region around the sync, fsync(2) has no atomicity - * issues. - * - * XXX: - * We ignore errors from the sync -- it makes no sense to return an - * error to the calling process, so set a flag causing the checkpoint - * to be retried later. There is a possibility, of course, that a - * subsequent checkpoint was started and that we're going to force it - * to fail. That should be unlikely, and fixing it would be difficult. - */ - if (dosync) { - R_UNLOCK(dbenv, dbmp->reginfo); - syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0; - R_LOCK(dbenv, dbmp->reginfo); - if (syncfail) - F_SET(mp, MP_LSN_RETRY); + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); } - if (wrotep != NULL) - *wrotep = 1; - - return (0); - -syserr: __db_err(dbenv, "%s: %s failed for page %lu", - __memp_fn(dbmfp), fail, (u_long)bhp->pgno); - -err: /* Unlock the buffer and reacquire the region lock. */ - MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); - - /* - * Clean up the flags based on a failure. - * - * The page remains dirty but we remove our lock. If we rewrote the - * page, it will need processing by the pgin routine before reuse. - */ - if (callpgin) - F_SET(bhp, BH_CALLPGIN); + /* Regardless, clear any sync wait-for count and remove our lock. 
*/ + bhp->ref_sync = 0; F_CLR(bhp, BH_LOCKED); return (ret); @@ -514,15 +456,17 @@ __memp_pg(dbmfp, bhp, is_pgin) int is_pgin; { DBT dbt, *dbtp; + DB_ENV *dbenv; DB_MPOOL *dbmp; DB_MPREG *mpreg; MPOOLFILE *mfp; int ftype, ret; dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; mfp = dbmfp->mfp; - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); ftype = mfp->ftype; for (mpreg = LIST_FIRST(&dbmp->dbregq); @@ -536,28 +480,28 @@ __memp_pg(dbmfp, bhp, is_pgin) dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off); dbtp = &dbt; } - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (is_pgin) { if (mpreg->pgin != NULL && - (ret = mpreg->pgin(dbmp->dbenv, + (ret = mpreg->pgin(dbenv, bhp->pgno, bhp->buf, dbtp)) != 0) goto err; } else if (mpreg->pgout != NULL && - (ret = mpreg->pgout(dbmp->dbenv, + (ret = mpreg->pgout(dbenv, bhp->pgno, bhp->buf, dbtp)) != 0) goto err; break; } if (mpreg == NULL) - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); return (0); -err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); - __db_err(dbmp->dbenv, "%s: %s failed for page %lu", +err: MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + __db_err(dbenv, "%s: %s failed for page %lu", __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); return (ret); } @@ -566,55 +510,78 @@ err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); * __memp_bhfree -- * Free a bucket header and its referenced data. * - * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int)); + * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int)); */ void -__memp_bhfree(dbmp, bhp, free_mem) +__memp_bhfree(dbmp, hp, bhp, free_mem) DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; BH *bhp; int free_mem; { - DB_HASHTAB *dbht; + DB_ENV *dbenv; MPOOL *c_mp, *mp; MPOOLFILE *mfp; - int n_bucket, n_cache; + u_int32_t n_cache; + /* + * Assumes the hash bucket is locked and the MPOOL is not. + */ + dbenv = dbmp->dbenv; mp = dbmp->reginfo[0].primary; - c_mp = BH_TO_CACHE(dbmp, bhp); - n_cache = NCACHE(mp, bhp->pgno); - n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno); - dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno); - /* Delete the buffer header from the hash bucket queue. */ - SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh); + /* + * Delete the buffer header from the hash bucket queue and reset + * the hash bucket's priority, if necessary. + */ + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + if (bhp->priority == hp->hash_priority) + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ? + 0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - /* Delete the buffer header from the LRU queue. */ - SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh); + /* + * Discard the hash bucket's mutex, it's no longer needed, and + * we don't want to be holding it when acquiring other locks. + */ + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - /* Clear the mutex this buffer recorded */ - __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], - (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); /* * Find the underlying MPOOLFILE and decrement its reference count. * If this is its last reference, remove it. 
*/ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + MUTEX_LOCK(dbenv, &mfp->mutex); if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) __memp_mf_discard(dbmp, mfp); + else + MUTEX_UNLOCK(dbenv, &mfp->mutex); + + R_LOCK(dbenv, &dbmp->reginfo[n_cache]); + + /* + * Clear the mutex this buffer recorded; requires the region lock + * be held. + */ + __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], + (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); /* - * If we're not reusing it immediately, free the buffer header + * If we're not reusing the buffer immediately, free the buffer header * and data for real. */ if (free_mem) { - --c_mp->stat.st_page_clean; __db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp); + c_mp = dbmp->reginfo[n_cache].primary; + c_mp->stat.st_pages--; } + R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); } /* * __memp_upgrade -- - * Upgrade a file descriptor from readonly to readwrite. + * Upgrade a file descriptor from read-only to read-write. */ static int __memp_upgrade(dbmp, dbmfp, mfp) @@ -622,41 +589,58 @@ __memp_upgrade(dbmp, dbmfp, mfp) DB_MPOOLFILE *dbmfp; MPOOLFILE *mfp; { - DB_FH fh; + DB_ENV *dbenv; + DB_FH *fhp, *tfhp; int ret; char *rpath; - /* - * !!! - * We expect the handle to already be locked. - */ - - /* Check to see if we've already upgraded. */ - if (F_ISSET(dbmfp, MP_UPGRADE)) - return (0); - - /* Check to see if we've already failed. */ - if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) - return (1); + dbenv = dbmp->dbenv; + fhp = NULL; + rpath = NULL; /* * Calculate the real name for this file and try to open it read/write. * We know we have a valid pathname for the file because it's the only * way we could have gotten a file descriptor of any kind. */ - if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA, - NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) - return (ret); - if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) { + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0) + goto err; + + if ((ret = __db_appname(dbenv, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) + goto err; + + if (__os_open(dbenv, rpath, + F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) { F_SET(dbmfp, MP_UPGRADE_FAIL); - ret = 1; - } else { - /* Swap the descriptors and set the upgrade flag. */ - (void)__os_closehandle(&dbmfp->fh); - dbmfp->fh = fh; - F_SET(dbmfp, MP_UPGRADE); - ret = 0; + goto err; } - __os_freestr(rpath); + + /* + * Swap the descriptors and set the upgrade flag. + * + * XXX + * There is a race here. If another process schedules a read using the + * existing file descriptor and is swapped out before making the system + * call, this code could theoretically close the file descriptor out + * from under it. While it's very unlikely, this code should still be + * rewritten. + */ + tfhp = dbmfp->fhp; + dbmfp->fhp = fhp; + fhp = tfhp; + + (void)__os_closehandle(dbenv, fhp); + F_SET(dbmfp, MP_UPGRADE); + + ret = 0; + if (0) { +err: ret = 1; + } + if (fhp != NULL) + __os_free(dbenv, fhp); + if (rpath != NULL) + __os_free(dbenv, rpath); + return (ret); } diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c index 1bff5e136ab..be0785a2184 100644 --- a/bdb/mp/mp_fget.c +++ b/bdb/mp/mp_fget.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $"; +static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex #include <string.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" +#ifdef HAVE_FILESYSTEM_NOTZERO +static int __memp_fs_notzero + __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *)); #endif /* - * memp_fget -- + * __memp_fget -- * Get a page from the file. + * + * PUBLIC: int __memp_fget + * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *)); */ int -memp_fget(dbmfp, pgnoaddr, flags, addrp) +__memp_fget(dbmfp, pgnoaddr, flags, addrp) DB_MPOOLFILE *dbmfp; db_pgno_t *pgnoaddr; u_int32_t flags; void *addrp; { - BH *bhp; + enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; + BH *alloc_bhp, *bhp; DB_ENV *dbenv; DB_MPOOL *dbmp; - DB_HASHTAB *dbht; + DB_MPOOL_HASH *hp; MPOOL *c_mp, *mp; MPOOLFILE *mfp; - size_t n_bucket, n_cache, mf_offset; - u_int32_t st_hsearch; - int b_incr, first, ret; + roff_t mf_offset; + u_int32_t n_cache, st_hsearch; + int b_incr, extending, first, ret; + + *(void **)addrp = NULL; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - mfp = dbmfp->mfp; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp)); -#endif PANIC_CHECK(dbenv); + mp = dbmp->reginfo[0].primary; + mfp = dbmfp->mfp; + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + alloc_bhp = bhp = NULL; + hp = NULL; + b_incr = extending = ret = 0; + /* * Validate arguments. * @@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) * is to keep database files small. It's sleazy as hell, but we catch * any attempt to actually write the file in memp_fput(). */ -#define OKFLAGS \ - (DB_MPOOL_CREATE | DB_MPOOL_LAST | \ - DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT) +#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) if (flags != 0) { if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0) return (ret); - switch (flags & ~DB_MPOOL_EXTENT) { + switch (flags) { case DB_MPOOL_CREATE: + break; case DB_MPOOL_LAST: + /* Get the last page number in the file. */ + if (flags == DB_MPOOL_LAST) { + R_LOCK(dbenv, dbmp->reginfo); + *pgnoaddr = mfp->last_pgno; + R_UNLOCK(dbenv, dbmp->reginfo); + } + break; case DB_MPOOL_NEW: - case DB_MPOOL_NEW_GROUP: - case 0: + /* + * If always creating a page, skip the first search + * of the hash bucket. + */ + if (flags == DB_MPOOL_NEW) + goto alloc; break; default: return (__db_ferr(dbenv, "memp_fget", 1)); } } -#ifdef DIAGNOSTIC - /* - * XXX - * We want to switch threads as often as possible. Yield every time - * we get a new page to ensure contention. - */ - if (DB_GLOBAL(db_pageyield)) - __os_yield(dbenv, 1); -#endif - - /* Initialize remaining local variables. */ - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - bhp = NULL; - st_hsearch = 0; - b_incr = ret = 0; - - R_LOCK(dbenv, dbmp->reginfo); - - /* - * Check for the new, last or last + 1 page requests. - * - * Examine and update the file's last_pgno value. 
We don't care if - * the last_pgno value immediately changes due to another thread -- - * at this instant in time, the value is correct. We do increment the - * current last_pgno value if the thread is asking for a new page, - * however, to ensure that two threads creating pages don't get the - * same one. - * - * If we create a page, there is the potential that a page after it - * in the file will be written before it will be written. Recovery - * depends on pages that are "created" in the file by subsequent pages - * being written be zeroed out, not have random garbage. Ensure that - * the OS agrees. - * - * !!! - * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs - * to allocate contiguous groups of pages in order to do subdatabases. - * We return the first page in the group, but the caller must put an - * LSN on the *last* page and write it, otherwise after a crash we may - * not create all of the pages we need to create. - */ - if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { - if (LF_ISSET(DB_MPOOL_NEW)) { - if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = - __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, - 1, mfp->stat.st_pagesize)) != 0) { - R_UNLOCK(dbenv, dbmp->reginfo); - return (ret); - } - ++mfp->last_pgno; - } - if (LF_ISSET(DB_MPOOL_NEW_GROUP)) { - if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = - __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, - (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) { - R_UNLOCK(dbenv, dbmp->reginfo); - return (ret); - } - mfp->last_pgno += *pgnoaddr; - } - *pgnoaddr = mfp->last_pgno; - } - - /* - * Determine the hash bucket where this page will live, and get local - * pointers to the cache and its hash table. - */ - n_cache = NCACHE(mp, *pgnoaddr); - c_mp = dbmp->reginfo[n_cache].primary; - n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr); - dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - - if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) - goto alloc; - /* * If mmap'ing the file and the page is not past the end of the file, * just return a pointer. @@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) * goes through the cache. All pages previously returned will be safe, * as long as the correct locking protocol was observed. * - * XXX * We don't discard the map because we don't know when all of the * pages will have been discarded from the process' address space. * It would be possible to do so by reference counting the open * pages from the mmap, but it's unclear to me that it's worth it. */ - if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) { - if (*pgnoaddr > mfp->orig_last_pgno) { - /* - * !!! - * See the comment above about non-existent pages and - * the hash access method. - */ - if (!LF_ISSET(DB_MPOOL_CREATE)) { - if (!LF_ISSET(DB_MPOOL_EXTENT)) - __db_err(dbenv, - "%s: page %lu doesn't exist", - __memp_fn(dbmfp), (u_long)*pgnoaddr); - ret = EINVAL; - goto err; - } - } else { - *(void **)addrp = - R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); - ++mfp->stat.st_map; - goto done; - } + if (dbmfp->addr != NULL && + F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { + *(void **)addrp = + R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); + ++mfp->stat.st_map; + return (0); } +hb_search: + /* + * Determine the cache and hash bucket where this page lives and get + * local pointers to them. Reset on each pass through this code, the + * page number can change. 
+ */ + n_cache = NCACHE(mp, mf_offset, *pgnoaddr); + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)]; + /* Search the hash chain for the page. */ - for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh); +retry: st_hsearch = 0; + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { ++st_hsearch; if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) continue; - /* Increment the reference count. */ + /* + * Increment the reference count. We may discard the hash + * bucket lock as we evaluate and/or read the buffer, so we + * need to ensure it doesn't move and its contents remain + * unchanged. + */ if (bhp->ref == UINT16_T_MAX) { __db_err(dbenv, "%s: page %lu: reference count overflow", __memp_fn(dbmfp), (u_long)bhp->pgno); ret = EINVAL; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); goto err; } - - /* - * Increment the reference count. We may discard the region - * lock as we evaluate and/or read the buffer, so we need to - * ensure that it doesn't move and that its contents remain - * unchanged. - */ ++bhp->ref; b_incr = 1; /* - * Any buffer we find might be trouble. - * * BH_LOCKED -- - * I/O is in progress. Because we've incremented the buffer - * reference count, we know the buffer can't move. Unlock - * the region lock, wait for the I/O to complete, and reacquire - * the region. + * I/O is in progress or sync is waiting on the buffer to write + * it. Because we've incremented the buffer reference count, + * we know the buffer can't move. Unlock the bucket lock, wait + * for the buffer to become available, reacquire the bucket. */ - for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) { - R_UNLOCK(dbenv, dbmp->reginfo); + for (first = 1; F_ISSET(bhp, BH_LOCKED) && + !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) { + /* + * If someone is trying to sync this buffer and the + * buffer is hot, they may never get in. Give up + * and try again. + */ + if (!first && bhp->ref_sync != 0) { + --bhp->ref; + b_incr = 0; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + __os_yield(dbenv, 1); + goto retry; + } + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); /* - * Explicitly yield the processor if it's not the first - * pass through this loop -- if we don't, we might end - * up running to the end of our CPU quantum as we will - * simply be swapping between the two locks. + * Explicitly yield the processor if not the first pass + * through this loop -- if we don't, we might run to the + * end of our CPU quantum as we will simply be swapping + * between the two locks. */ if (!first) __os_yield(dbenv, 1); - MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + MUTEX_LOCK(dbenv, &bhp->mutex); /* Wait for I/O to finish... */ MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, &hp->hash_mutex); + } + + ++mfp->stat.st_cache_hit; + break; + } + + /* + * Update the hash bucket search statistics -- do now because our next + * search may be for a different bucket. 
+ */ + ++c_mp->stat.st_hash_searches; + if (st_hsearch > c_mp->stat.st_hash_longest) + c_mp->stat.st_hash_longest = st_hsearch; + c_mp->stat.st_hash_examined += st_hsearch; + + /* + * There are 4 possible paths to this location: + * + * FIRST_MISS: + * Didn't find the page in the hash bucket on our first pass: + * bhp == NULL, alloc_bhp == NULL + * + * FIRST_FOUND: + * Found the page in the hash bucket on our first pass: + * bhp != NULL, alloc_bhp == NULL + * + * SECOND_FOUND: + * Didn't find the page in the hash bucket on the first pass, + * allocated space, and found the page in the hash bucket on + * our second pass: + * bhp != NULL, alloc_bhp != NULL + * + * SECOND_MISS: + * Didn't find the page in the hash bucket on the first pass, + * allocated space, and didn't find the page in the hash bucket + * on our second pass: + * bhp == NULL, alloc_bhp != NULL + */ + state = bhp == NULL ? + (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) : + (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND); + switch (state) { + case FIRST_FOUND: + /* We found the buffer in our first check -- we're done. */ + break; + case FIRST_MISS: + /* + * We didn't find the buffer in our first check. Figure out + * if the page exists, and allocate structures so we can add + * the page to the buffer pool. + */ + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + +alloc: /* + * If DB_MPOOL_NEW is set, we have to allocate a page number. + * If neither DB_MPOOL_CREATE or DB_MPOOL_CREATE is set, then + * it's an error to try and get a page past the end of file. + */ + COMPQUIET(n_cache, 0); + + extending = ret = 0; + R_LOCK(dbenv, dbmp->reginfo); + switch (flags) { + case DB_MPOOL_NEW: + extending = 1; + *pgnoaddr = mfp->last_pgno + 1; + break; + case DB_MPOOL_CREATE: + extending = *pgnoaddr > mfp->last_pgno; + break; + default: + ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0; + break; } + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + goto err; /* - * BH_TRASH -- - * The contents of the buffer are garbage. Shouldn't happen, - * and this read is likely to fail, but might as well try. + * !!! + * In the DB_MPOOL_NEW code path, mf_offset and n_cache have + * not yet been initialized. */ - if (F_ISSET(bhp, BH_TRASH)) - goto reread; + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + n_cache = NCACHE(mp, mf_offset, *pgnoaddr); + /* Allocate a new buffer header and data space. */ + if ((ret = __memp_alloc(dbmp, + &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0) + goto err; +#ifdef DIAGNOSTIC + if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { + __db_err(dbenv, + "Error: buffer data is NOT size_t aligned"); + ret = EINVAL; + goto err; + } +#endif /* - * BH_CALLPGIN -- - * The buffer was converted so it could be written, and the - * contents need to be converted again. + * If we are extending the file, we'll need the region lock + * again. */ - if (F_ISSET(bhp, BH_CALLPGIN)) { - if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) + if (extending) + R_LOCK(dbenv, dbmp->reginfo); + + /* + * DB_MPOOL_NEW does not guarantee you a page unreferenced by + * any other thread of control. (That guarantee is interesting + * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller + * did not specify the page number, and so, may reasonably not + * have any way to lock the page outside of mpool.) Regardless, + * if we allocate the page, and some other thread of control + * requests the page by number, we will not detect that and the + * thread of control that allocated using DB_MPOOL_NEW may not + * have a chance to initialize the page. 
(Note: we *could* + * detect this case if we set a flag in the buffer header which + * guaranteed that no gets of the page would succeed until the + * reference count went to 0, that is, until the creating page + * put the page.) What we do guarantee is that if two threads + * of control are both doing DB_MPOOL_NEW calls, they won't + * collide, that is, they won't both get the same page. + * + * There's a possibility that another thread allocated the page + * we were planning to allocate while we were off doing buffer + * allocation. We can do that by making sure the page number + * we were going to use is still available. If it's not, then + * we check to see if the next available page number hashes to + * the same mpool region as the old one -- if it does, we can + * continue, otherwise, we have to start over. + */ + if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { + *pgnoaddr = mfp->last_pgno + 1; + if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) { + __db_shalloc_free( + dbmp->reginfo[n_cache].addr, alloc_bhp); + /* + * flags == DB_MPOOL_NEW, so extending is set + * and we're holding the region locked. + */ + R_UNLOCK(dbenv, dbmp->reginfo); + + alloc_bhp = NULL; + goto alloc; + } + } + + /* + * We released the region lock, so another thread might have + * extended the file. Update the last_pgno and initialize + * the file, as necessary, if we extended the file. + */ + if (extending) { +#ifdef HAVE_FILESYSTEM_NOTZERO + if (*pgnoaddr > mfp->last_pgno && + __os_fs_notzero() && + F_ISSET(dbmfp->fhp, DB_FH_VALID)) + ret = __memp_fs_notzero( + dbenv, dbmfp, mfp, pgnoaddr); + else + ret = 0; +#endif + if (ret == 0 && *pgnoaddr > mfp->last_pgno) + mfp->last_pgno = *pgnoaddr; + + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) goto err; - F_CLR(bhp, BH_CALLPGIN); } + goto hb_search; + case SECOND_FOUND: + /* + * We allocated buffer space for the requested page, but then + * found the page in the buffer cache on our second check. + * That's OK -- we can use the page we found in the pool, + * unless DB_MPOOL_NEW is set. + * + * Free the allocated memory, we no longer need it. Since we + * can't acquire the region lock while holding the hash bucket + * lock, we have to release the hash bucket and re-acquire it. + * That's OK, because we have the buffer pinned down. + */ + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + R_LOCK(dbenv, &dbmp->reginfo[n_cache]); + __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); + alloc_bhp = NULL; + R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); + MUTEX_LOCK(dbenv, &hp->hash_mutex); - ++mfp->stat.st_cache_hit; - *(void **)addrp = bhp->buf; - goto done; - } + /* + * We can't use the page we found in the pool if DB_MPOOL_NEW + * was set. (For details, see the above comment beginning + * "DB_MPOOL_NEW does not guarantee you a page unreferenced by + * any other thread of control".) If DB_MPOOL_NEW is set, we + * release our pin on this particular buffer, and try to get + * another one. + */ + if (flags == DB_MPOOL_NEW) { + --bhp->ref; + b_incr = 0; + goto alloc; + } + break; + case SECOND_MISS: + /* + * We allocated buffer space for the requested page, and found + * the page still missing on our second pass through the buffer + * cache. Instantiate the page. + */ + bhp = alloc_bhp; + alloc_bhp = NULL; -alloc: /* Allocate new buffer header and data space. */ - if ((ret = __memp_alloc(dbmp, - &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0) - goto err; + /* + * Initialize all the BH and hash bucket fields so we can call + * __memp_bhfree if an error occurs. 
+ * + * Append the buffer to the tail of the bucket list and update + * the hash bucket's priority. + */ + b_incr = 1; + + memset(bhp, 0, sizeof(BH)); + bhp->ref = 1; + bhp->priority = UINT32_T_MAX; + bhp->pgno = *pgnoaddr; + bhp->mf_offset = mf_offset; + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; + + /* If we extended the file, make sure the page is never lost. */ + if (extending) { + ++hp->hash_page_dirty; + F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } - ++c_mp->stat.st_page_clean; + /* + * If we created the page, zero it out. If we didn't create + * the page, read from the backing file. + * + * !!! + * DB_MPOOL_NEW doesn't call the pgin function. + * + * If DB_MPOOL_CREATE is used, then the application's pgin + * function has to be able to handle pages of 0's -- if it + * uses DB_MPOOL_NEW, it can detect all of its page creates, + * and not bother. + * + * If we're running in diagnostic mode, smash any bytes on the + * page that are unknown quantities for the caller. + * + * Otherwise, read the page into memory, optionally creating it + * if DB_MPOOL_CREATE is set. + */ + if (extending) { + if (mfp->clear_len == 0) + memset(bhp->buf, 0, mfp->stat.st_pagesize); + else { + memset(bhp->buf, 0, mfp->clear_len); +#if defined(DIAGNOSTIC) || defined(UMRW) + memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + } - /* - * Initialize the BH fields so that we can call the __memp_bhfree - * routine if an error occurs. - */ - memset(bhp, 0, sizeof(BH)); - bhp->ref = 1; - bhp->pgno = *pgnoaddr; - bhp->mf_offset = mf_offset; + if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) + F_SET(bhp, BH_CALLPGIN); - /* Increment the count of buffers referenced by this MPOOLFILE. */ - ++mfp->block_cnt; + ++mfp->stat.st_page_create; + } else { + F_SET(bhp, BH_TRASH); + ++mfp->stat.st_cache_miss; + } - /* - * Prepend the bucket header to the head of the appropriate MPOOL - * bucket hash list. Append the bucket header to the tail of the - * MPOOL LRU chain. - */ - SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh); - SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + /* Increment buffer count referenced by MPOOLFILE. */ + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); -#ifdef DIAGNOSTIC - if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) { - __db_err(dbenv, "Internal error: BH data NOT size_t aligned."); - ret = EINVAL; - __memp_bhfree(dbmp, bhp, 1); - goto err; + /* + * Initialize the mutex. This is the last initialization step, + * because it's the only one that can fail, and everything else + * must be set up or we can't jump to the err label because it + * will call __memp_bhfree. + */ + if ((ret = __db_mutex_setup(dbenv, + &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0) + goto err; } -#endif - if ((ret = __db_shmutex_init(dbenv, &bhp->mutex, - R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL, - 0, &dbmp->reginfo[n_cache], - (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off))) - != 0) { - __memp_bhfree(dbmp, bhp, 1); - goto err; + DB_ASSERT(bhp->ref != 0); + + /* + * If we're the only reference, update buffer and bucket priorities. + * We may be about to release the hash bucket lock, and everything + * should be correct, first. (We've already done this if we created + * the buffer, so there is no need to do it again.) 
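The invariant the allocator maintains here is worth stating once in isolation: a bucket's priority always mirrors its lowest-priority (head) buffer, and a buffer with a single reference is re-pinned at the tail with the maximum priority. The sketch below is illustrative only -- an ordinary linked list stands in for the shared-memory SH_TAILQ macros, UINT_MAX for UINT32_T_MAX, and no locking is shown.

#include <limits.h>
#include <stdio.h>

struct buf {
	unsigned int priority;		/* LRU counter value at release */
	struct buf *next;
};

struct bucket {
	struct buf *head;		/* sorted, lowest priority first */
	unsigned int priority;		/* always mirrors head->priority */
};

/* Insert in ascending priority order, then refresh the bucket priority. */
static void
bucket_insert(struct bucket *hp, struct buf *bhp)
{
	struct buf **bpp;

	for (bpp = &hp->head; *bpp != NULL; bpp = &(*bpp)->next)
		if ((*bpp)->priority > bhp->priority)
			break;
	bhp->next = *bpp;
	*bpp = bhp;
	hp->priority = hp->head->priority;
}

/*
 * Pinning a buffer makes it ineligible for eviction: give it the maximum
 * priority and move it to the tail of the bucket, as the code above does
 * for a buffer with a single reference.
 */
static void
bucket_pin(struct bucket *hp, struct buf *bhp)
{
	struct buf **bpp;

	for (bpp = &hp->head; *bpp != bhp; bpp = &(*bpp)->next)
		;
	*bpp = bhp->next;
	bhp->priority = UINT_MAX;
	bucket_insert(hp, bhp);
}

int
main(void)
{
	struct buf a = { 10, NULL }, b = { 30, NULL }, c = { 20, NULL };
	struct bucket hp = { NULL, 0 };

	bucket_insert(&hp, &a);
	bucket_insert(&hp, &b);
	bucket_insert(&hp, &c);
	bucket_pin(&hp, &a);			/* a moves to the tail */
	printf("bucket priority: %u\n", hp.priority);	/* prints 20 */
	return (0);
}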
+ */ + if (state != SECOND_MISS && bhp->ref == 1) { + bhp->priority = UINT32_T_MAX; + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; } /* - * If we created the page, zero it out and continue. - * - * !!! - * Note: DB_MPOOL_NEW specifically doesn't call the pgin function. - * If DB_MPOOL_CREATE is used, then the application's pgin function - * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, - * it can detect all of its page creates, and not bother. + * BH_TRASH -- + * The buffer we found may need to be filled from the disk. * - * If we're running in diagnostic mode, smash any bytes on the - * page that are unknown quantities for the caller. - * - * Otherwise, read the page into memory, optionally creating it if - * DB_MPOOL_CREATE is set. + * It's possible for the read function to fail, which means we fail as + * well. Note, the __memp_pgread() function discards and reacquires + * the hash lock, so the buffer must be pinned down so that it cannot + * move and its contents are unchanged. Discard the buffer on failure + * unless another thread is waiting on our I/O to complete. It's OK to + * leave the buffer around, as the waiting thread will see the BH_TRASH + * flag set, and will also attempt to discard it. If there's a waiter, + * we need to decrement our reference count. */ - if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { - if (mfp->clear_len == 0) - memset(bhp->buf, 0, mfp->stat.st_pagesize); - else { - memset(bhp->buf, 0, mfp->clear_len); -#ifdef DIAGNOSTIC - memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, - mfp->stat.st_pagesize - mfp->clear_len); -#endif - } + if (F_ISSET(bhp, BH_TRASH) && + (ret = __memp_pgread(dbmfp, + &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) + goto err; - ++mfp->stat.st_page_create; - } else { - /* - * It's possible for the read function to fail, which means - * that we fail as well. Note, the __memp_pgread() function - * discards the region lock, so the buffer must be pinned - * down so that it cannot move and its contents are unchanged. - */ -reread: if ((ret = __memp_pgread(dbmfp, - bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) { - /* - * !!! - * Discard the buffer unless another thread is waiting - * on our I/O to complete. Regardless, the header has - * the BH_TRASH flag set. - */ - if (bhp->ref == 1) - __memp_bhfree(dbmp, bhp, 1); + /* + * BH_CALLPGIN -- + * The buffer was processed for being written to disk, and now has + * to be re-converted for use. + */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) goto err; - } - - ++mfp->stat.st_cache_miss; + F_CLR(bhp, BH_CALLPGIN); } + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + +#ifdef DIAGNOSTIC + /* Update the file's pinned reference count. */ + R_LOCK(dbenv, dbmp->reginfo); + ++dbmfp->pinref; + R_UNLOCK(dbenv, dbmp->reginfo); + /* - * If we're returning a page after our current notion of the last-page, - * update our information. Note, there's no way to un-instantiate this - * page, it's going to exist whether it's returned to us dirty or not. + * We want to switch threads as often as possible, and at awkward + * times. Yield every time we get a new page to ensure contention. */ - if (bhp->pgno > mfp->last_pgno) - mfp->last_pgno = bhp->pgno; + if (F_ISSET(dbenv, DB_ENV_YIELDCPU)) + __os_yield(dbenv, 1); +#endif *(void **)addrp = bhp->buf; + return (0); -done: /* Update the chain search statistics. 
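The two buffer fixups above always run in this order: BH_TRASH (fill the contents from the backing file) before BH_CALLPGIN (convert the contents for in-memory use). A stand-alone restatement, with stub functions in place of __memp_pgread and the registered pgin method:

#include <string.h>

#define	BH_TRASH	0x01		/* contents must be read from disk */
#define	BH_CALLPGIN	0x02		/* contents must be converted */

struct bh {
	unsigned int flags;
	char buf[512];
};

/* Stubs standing in for __memp_pgread and the application's pgin. */
static int read_page(struct bh *bhp) { memset(bhp->buf, 0, sizeof(bhp->buf)); return (0); }
static int pgin(struct bh *bhp) { (void)bhp->buf; return (0); }

static int
fixup(struct bh *bhp)
{
	int ret;

	/* First fill garbage contents from the backing file... */
	if (bhp->flags & BH_TRASH) {
		if ((ret = read_page(bhp)) != 0)
			return (ret);
		bhp->flags &= ~BH_TRASH;
	}

	/* ...then run the application's byte-swap/conversion pass. */
	if (bhp->flags & BH_CALLPGIN) {
		if ((ret = pgin(bhp)) != 0)
			return (ret);
		bhp->flags &= ~BH_CALLPGIN;
	}
	return (0);
}

int
main(void)
{
	struct bh b;

	b.flags = BH_TRASH | BH_CALLPGIN;
	return (fixup(&b) == 0 && b.flags == 0 ? 0 : 1);
}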
*/ - if (st_hsearch) { - ++c_mp->stat.st_hash_searches; - if (st_hsearch > c_mp->stat.st_hash_longest) - c_mp->stat.st_hash_longest = st_hsearch; - c_mp->stat.st_hash_examined += st_hsearch; +err: /* + * Discard our reference. If we're the only reference, discard the + * the buffer entirely. If we held a reference to a buffer, we are + * also still holding the hash bucket mutex. + */ + if (b_incr) { + if (bhp->ref == 1) + (void)__memp_bhfree(dbmp, hp, bhp, 1); + else { + --bhp->ref; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + } } - ++dbmfp->pinref; + /* If alloc_bhp is set, free the memory. */ + if (alloc_bhp != NULL) + __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); - R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); +} - return (0); +#ifdef HAVE_FILESYSTEM_NOTZERO +/* + * __memp_fs_notzero -- + * Initialize the underlying allocated pages in the file. + */ +static int +__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr) + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; + db_pgno_t *pgnoaddr; +{ + DB_IO db_io; + u_int32_t i, npages; + size_t nw; + int ret; + u_int8_t *page; + char *fail; -err: /* Discard our reference. */ - if (b_incr) - --bhp->ref; - R_UNLOCK(dbenv, dbmp->reginfo); + /* + * Pages allocated by writing pages past end-of-file are not zeroed, + * on some systems. Recovery could theoretically be fooled by a page + * showing up that contained garbage. In order to avoid this, we + * have to write the pages out to disk, and flush them. The reason + * for the flush is because if we don't sync, the allocation of another + * page subsequent to this one might reach the disk first, and if we + * crashed at the right moment, leave us with this page as the one + * allocated by writing a page past it in the file. + * + * Hash is the only access method that allocates groups of pages. We + * know that it will use the existence of the last page in a group to + * signify that the entire group is OK; so, write all the pages but + * the last one in the group, flush them to disk, and then write the + * last one to disk and flush it. + */ + if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0) + return (ret); + + db_io.fhp = dbmfp->fhp; + db_io.mutexp = dbmfp->mutexp; + db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; + db_io.buf = page; + + npages = *pgnoaddr - mfp->last_pgno; + for (i = 1; i < npages; ++i) { + db_io.pgno = mfp->last_pgno + i; + if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { + fail = "write"; + goto err; + } + } + if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { + fail = "sync"; + goto err; + } - *(void **)addrp = NULL; + db_io.pgno = mfp->last_pgno + npages; + if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { + fail = "write"; + goto err; + } + if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { + fail = "sync"; +err: __db_err(dbenv, "%s: %s failed for page %lu", + __memp_fn(dbmfp), fail, (u_long)db_io.pgno); + } + + __os_free(dbenv, page); return (ret); } +#endif diff --git a/bdb/mp/mp_fopen.c b/bdb/mp/mp_fopen.c index 3611ded18f4..a91bf264652 100644 --- a/bdb/mp/mp_fopen.c +++ b/bdb/mp/mp_fopen.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
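Returning briefly to __memp_fs_notzero above: the two syncs enforce ordering, so the last page of the group cannot reach stable storage before the pages it vouches for. A minimal POSIX sketch of the same ordering, assuming n >= 1 and that fd, base, and n come from the caller; this is not the DB code, which goes through __os_io and __os_fsync:

#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>

static int
zero_fill_group(int fd, off_t base, unsigned int n, size_t pagesize)
{
	char *page;
	unsigned int i;
	int ret;

	if ((page = calloc(1, pagesize)) == NULL)
		return (-1);
	ret = 0;

	/* Write and sync all pages but the last one first... */
	for (i = 0; i < n - 1; ++i)
		if (pwrite(fd, page, pagesize,
		    base + (off_t)i * (off_t)pagesize) != (ssize_t)pagesize) {
			ret = -1;
			goto err;
		}
	if (n > 1 && fsync(fd) != 0) {
		ret = -1;
		goto err;
	}

	/* ...then write and sync the sentinel last page. */
	if (pwrite(fd, page, pagesize,
	    base + (off_t)(n - 1) * (off_t)pagesize) != (ssize_t)pagesize ||
	    fsync(fd) != 0)
		ret = -1;

err:	free(page);
	return (ret);
}

int
main(void)
{
	char tmpl[] = "/tmp/nzXXXXXX";
	int fd;

	if ((fd = mkstemp(tmpl)) == -1)
		return (1);
	(void)unlink(tmpl);
	return (zero_fill_group(fd, 0, 4, 4096) == 0 ? 0 : 1);
}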
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell Exp $"; +static const char revid[] = "$Id: mp_fopen.c,v 11.90 2002/08/26 15:22:01 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,211 +16,464 @@ static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell E #include <string.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif - -static int __memp_mf_open __P((DB_MPOOL *, const char *, - size_t, db_pgno_t, DB_MPOOL_FINFO *, u_int32_t, MPOOLFILE **)); +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" + +static int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); +static int __memp_fopen __P((DB_MPOOLFILE *, + const char *, u_int32_t, int, size_t)); +static void __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *)); +static void __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); +static void __memp_refcnt __P((DB_MPOOLFILE *, db_pgno_t *)); +static int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t)); +static int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *)); +static int __memp_set_ftype __P((DB_MPOOLFILE *, int)); +static int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t)); +static int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *)); +static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY)); +static void __memp_set_unlink __P((DB_MPOOLFILE *, int)); + +/* Initialization methods cannot be called after open is called. */ +#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \ + if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \ + return (__db_mi_open((dbmfp)->dbmp->dbenv, name, 1)); /* - * MEMP_FREMOVE -- - * Discard an MPOOLFILE and any buffers it references: update the flags - * so we never try to write buffers associated with the file, nor can we - * find it when looking for files to join. In addition, clear the ftype - * field, there's no reason to post-process pages, they can be discarded - * by any thread. - */ -#define MEMP_FREMOVE(mfp) { \ - mfp->ftype = 0; \ - F_SET(mfp, MP_DEADFILE); \ -} - -/* - * memp_fopen -- - * Open a backing file for the memory pool. + * __memp_fcreate -- + * Create a DB_MPOOLFILE handle. + * + * PUBLIC: int __memp_fcreate __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); */ int -memp_fopen(dbenv, path, flags, mode, pagesize, finfop, retp) +__memp_fcreate(dbenv, retp, flags) DB_ENV *dbenv; - const char *path; - u_int32_t flags; - int mode; - size_t pagesize; - DB_MPOOL_FINFO *finfop; DB_MPOOLFILE **retp; + u_int32_t flags; { DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fopen(dbenv, path, flags, - mode, pagesize, finfop, retp)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_fcreate", DB_INIT_MPOOL); dbmp = dbenv->mp_handle; /* Validate arguments. */ - if ((ret = __db_fchk(dbenv, "memp_fopen", flags, - DB_CREATE | - DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + if ((ret = __db_fchk(dbenv, "memp_fcreate", flags, 0)) != 0) return (ret); - /* Require a non-zero pagesize. */ - if (pagesize == 0 || - (finfop != NULL && finfop->clear_len > pagesize)) { - __db_err(dbenv, "memp_fopen: illegal page size."); - return (EINVAL); + /* Allocate and initialize the per-process structure. 
*/ + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) + return (ret); + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &dbmfp->fhp)) != 0) + goto err; + + /* Allocate and initialize a mutex if necessary. */ + if (F_ISSET(dbenv, DB_ENV_THREAD) && + (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmfp->mutexp, + MUTEX_ALLOC | MUTEX_THREAD)) != 0) + goto err; + + dbmfp->ref = 1; + dbmfp->lsn_offset = -1; + dbmfp->dbmp = dbmp; + dbmfp->mfp = INVALID_ROFF; + + dbmfp->close = __memp_fclose; + dbmfp->get = __memp_fget; + dbmfp->get_fileid = __memp_get_fileid; + dbmfp->last_pgno = __memp_last_pgno; + dbmfp->open = __memp_fopen; + dbmfp->put = __memp_fput; + dbmfp->refcnt = __memp_refcnt; + dbmfp->set = __memp_fset; + dbmfp->set_clear_len = __memp_set_clear_len; + dbmfp->set_fileid = __memp_set_fileid; + dbmfp->set_ftype = __memp_set_ftype; + dbmfp->set_lsn_offset = __memp_set_lsn_offset; + dbmfp->set_pgcookie = __memp_set_pgcookie; + dbmfp->set_priority = __memp_set_priority; + dbmfp->set_unlink = __memp_set_unlink; + dbmfp->sync = __memp_fsync; + + *retp = dbmfp; + return (0); + +err: if (dbmfp != NULL) { + if (dbmfp->fhp != NULL) + (void)__os_free(dbenv, dbmfp->fhp); + (void)__os_free(dbenv, dbmfp); } + return (ret); +} - return (__memp_fopen(dbmp, - NULL, path, flags, mode, pagesize, 1, finfop, retp)); +/* + * __memp_set_clear_len -- + * Set the clear length. + */ +static int +__memp_set_clear_len(dbmfp, clear_len) + DB_MPOOLFILE *dbmfp; + u_int32_t clear_len; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_clear_len"); + + dbmfp->clear_len = clear_len; + return (0); } /* - * __memp_set_unlink -- set unlink on last close flag. - * - * PUBLIC: void __memp_set_unlink __P((DB_MPOOLFILE *)); + * __memp_set_fileid -- + * Set the file ID. */ -void -__memp_set_unlink(dbmpf) - DB_MPOOLFILE *dbmpf; +static int +__memp_set_fileid(dbmfp, fileid) + DB_MPOOLFILE *dbmfp; + u_int8_t *fileid; { - DB_MPOOL *dbmp; - dbmp = dbmpf->dbmp; + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_fileid"); - R_LOCK(dbmp->dbenv, dbmp->reginfo); - F_SET(dbmpf->mfp, MP_UNLINK); - R_UNLOCK(dbmp->dbenv, dbmp->reginfo); + /* + * XXX + * This is dangerous -- we're saving the caller's pointer instead + * of allocating memory and copying the contents. + */ + dbmfp->fileid = fileid; + return (0); } /* - * __memp_clear_unlink -- clear unlink on last close flag. - * - * PUBLIC: void __memp_clear_unlink __P((DB_MPOOLFILE *)); + * __memp_set_ftype -- + * Set the file type (as registered). */ -void -__memp_clear_unlink(dbmpf) - DB_MPOOLFILE *dbmpf; +static int +__memp_set_ftype(dbmfp, ftype) + DB_MPOOLFILE *dbmfp; + int ftype; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_ftype"); + + dbmfp->ftype = ftype; + return (0); +} + +/* + * __memp_set_lsn_offset -- + * Set the page's LSN offset. + */ +static int +__memp_set_lsn_offset(dbmfp, lsn_offset) + DB_MPOOLFILE *dbmfp; + int32_t lsn_offset; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_lsn_offset"); + + dbmfp->lsn_offset = lsn_offset; + return (0); +} + +/* + * __memp_set_pgcookie -- + * Set the pgin/pgout cookie. + */ +static int +__memp_set_pgcookie(dbmfp, pgcookie) + DB_MPOOLFILE *dbmfp; + DBT *pgcookie; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_pgcookie"); + + dbmfp->pgcookie = pgcookie; + return (0); +} + +/* + * __memp_set_priority -- + * Set the cache priority for pages from this file. 
+ */ +static int +__memp_set_priority(dbmfp, priority) + DB_MPOOLFILE *dbmfp; + DB_CACHE_PRIORITY priority; +{ + switch (priority) { + case DB_PRIORITY_VERY_LOW: + dbmfp->mfp->priority = MPOOL_PRI_VERY_LOW; + break; + case DB_PRIORITY_LOW: + dbmfp->mfp->priority = MPOOL_PRI_LOW; + break; + case DB_PRIORITY_DEFAULT: + dbmfp->mfp->priority = MPOOL_PRI_DEFAULT; + break; + case DB_PRIORITY_HIGH: + dbmfp->mfp->priority = MPOOL_PRI_HIGH; + break; + case DB_PRIORITY_VERY_HIGH: + dbmfp->mfp->priority = MPOOL_PRI_VERY_HIGH; + break; + default: + __db_err(dbmfp->dbmp->dbenv, + "Unknown priority value: %d", priority); + return (EINVAL); + } + + return (0); +} + +/* + * __memp_fopen -- + * Open a backing file for the memory pool. + */ +static int +__memp_fopen(dbmfp, path, flags, mode, pagesize) + DB_MPOOLFILE *dbmfp; + const char *path; + u_int32_t flags; + int mode; + size_t pagesize; { + DB_ENV *dbenv; DB_MPOOL *dbmp; - dbmp = dbmpf->dbmp; + int ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + + PANIC_CHECK(dbenv); + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "memp_fopen", flags, + DB_CREATE | DB_DIRECT | DB_EXTENT | + DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + return (ret); /* - * This bit is protected in the queue code because the metapage - * is locked so we can avoid geting the region lock. - * If this gets used from other than the queue code, we cannot. + * Require a non-zero, power-of-two pagesize, smaller than the + * clear length. */ - if (!F_ISSET(dbmpf->mfp, MP_UNLINK)) - return; - R_LOCK(dbmp->dbenv, dbmp->reginfo); - F_CLR(dbmpf->mfp, MP_UNLINK); - R_UNLOCK(dbmp->dbenv, dbmp->reginfo); + if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { + __db_err(dbenv, + "memp_fopen: page sizes must be a power-of-2"); + return (EINVAL); + } + if (dbmfp->clear_len > pagesize) { + __db_err(dbenv, + "memp_fopen: clear length larger than page size"); + return (EINVAL); + } + + /* Read-only checks, and local flag. */ + if (LF_ISSET(DB_RDONLY) && path == NULL) { + __db_err(dbenv, + "memp_fopen: temporary files can't be readonly"); + return (EINVAL); + } + + return (__memp_fopen_int(dbmfp, NULL, path, flags, mode, pagesize)); } /* - * __memp_fopen -- + * __memp_fopen_int -- * Open a backing file for the memory pool; internal version. * - * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *, - * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); + * PUBLIC: int __memp_fopen_int __P((DB_MPOOLFILE *, + * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t)); */ int -__memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) - DB_MPOOL *dbmp; +__memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize) + DB_MPOOLFILE *dbmfp; MPOOLFILE *mfp; const char *path; u_int32_t flags; - int mode, needlock; + int mode; size_t pagesize; - DB_MPOOL_FINFO *finfop; - DB_MPOOLFILE **retp; { DB_ENV *dbenv; - DB_MPOOLFILE *dbmfp; - DB_MPOOL_FINFO finfo; + DB_MPOOL *dbmp; + MPOOL *mp; db_pgno_t last_pgno; size_t maxmap; u_int32_t mbytes, bytes, oflags; - int ret; + int mfp_alloc, ret; u_int8_t idbuf[DB_FILE_ID_LEN]; char *rpath; + void *p; + dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - ret = 0; + mp = dbmp->reginfo[0].primary; + mfp_alloc = ret = 0; rpath = NULL; /* - * If mfp is provided, we take the DB_MPOOL_FINFO information from - * the mfp. We don't bother initializing everything, because some - * of them are expensive to acquire. If no mfp is provided and the - * finfop argument is NULL, we default the values. 
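Taken together, the method table built by __memp_fcreate and the MPF_ILLEGAL_AFTER_OPEN guard enforce a create / configure / open / use / close lifecycle on the handle. A minimal usage fragment (not a complete program), assuming an environment already opened with DB_INIT_MPOOL and the 4.1-era method signatures shown in this diff; error handling is abbreviated:

#include <db.h>

int
use_mpoolfile(DB_ENV *dbenv)
{
	DB_MPOOLFILE *mpf;
	db_pgno_t pgno;
	void *addr;
	int ret;

	if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
		return (ret);

	/* Configure first: these methods fail once open has been called. */
	if ((ret = mpf->set_clear_len(mpf, 32)) != 0)
		goto err;

	/* A power-of-two page size no smaller than the clear length. */
	if ((ret = mpf->open(mpf, "a.db", DB_CREATE, 0660, 1024)) != 0)
		goto err;

	pgno = 0;
	if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
		goto err;
	/* ... use the page ... */
	ret = mpf->put(mpf, addr, DB_MPOOL_DIRTY);

err:	(void)mpf->close(mpf, 0);
	return (ret);
}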
+ * Set the page size so os_open can decide whether to turn buffering + * off if the DB_DIRECT_DB flag is set. */ - if (finfop == NULL) { - memset(&finfo, 0, sizeof(finfo)); - if (mfp != NULL) { - finfo.ftype = mfp->ftype; - finfo.pgcookie = NULL; - finfo.fileid = NULL; - finfo.lsn_offset = mfp->lsn_off; - finfo.clear_len = mfp->clear_len; - } else { - finfo.ftype = 0; - finfo.pgcookie = NULL; - finfo.fileid = NULL; - finfo.lsn_offset = -1; - finfo.clear_len = 0; - } - finfop = &finfo; - } + dbmfp->fhp->pagesize = (u_int32_t)pagesize; - /* Allocate and initialize the per-process structure. */ - if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) - return (ret); - dbmfp->dbmp = dbmp; - dbmfp->ref = 1; - if (LF_ISSET(DB_RDONLY)) + /* + * If it's a temporary file, delay the open until we actually need + * to write the file, and we know we can't join any existing files. + */ + if (path == NULL) + goto alloc; + + /* + * Get the real name for this file and open it. If it's a Queue extent + * file, it may not exist, and that's OK. + */ + oflags = 0; + if (LF_ISSET(DB_CREATE)) + oflags |= DB_OSO_CREATE; + if (LF_ISSET(DB_DIRECT)) + oflags |= DB_OSO_DIRECT; + if (LF_ISSET(DB_RDONLY)) { F_SET(dbmfp, MP_READONLY); + oflags |= DB_OSO_RDONLY; + } + if ((ret = + __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) != 0) + goto err; + if ((ret = __os_open(dbenv, rpath, oflags, mode, dbmfp->fhp)) != 0) { + if (!LF_ISSET(DB_EXTENT)) + __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); + goto err; + } - if (path == NULL) { - if (LF_ISSET(DB_RDONLY)) { - __db_err(dbenv, - "memp_fopen: temporary files can't be readonly"); - ret = EINVAL; + /* + * Get the file id if we weren't given one. Generated file id's + * don't use timestamps, otherwise there'd be no chance of any + * other process joining the party. + */ + if (dbmfp->fileid == NULL) { + if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) goto err; + dbmfp->fileid = idbuf; + } + + /* + * If our caller knows what mfp we're using, increment the ref count, + * no need to search. + * + * We don't need to acquire a lock other than the mfp itself, because + * we know there's another reference and it's not going away. + */ + if (mfp != NULL) { + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); + goto check_map; + } + + /* + * If not creating a temporary file, walk the list of MPOOLFILE's, + * looking for a matching file. Files backed by temporary files + * or previously removed files can't match. + * + * DB_TRUNCATE support. + * + * The fileID is a filesystem unique number (e.g., a UNIX dev/inode + * pair) plus a timestamp. If files are removed and created in less + * than a second, the fileID can be repeated. The problem with + * repetition happens when the file that previously had the fileID + * value still has pages in the pool, since we don't want to use them + * to satisfy requests for the new file. + * + * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated + * opens with that flag set guarantees matching fileIDs when the + * machine can open a file and then re-open with truncate within a + * second. For this reason, we pass that flag down, and, if we find + * a matching entry, we ensure that it's never found again, and we + * create a new entry for the current request. + */ + R_LOCK(dbenv, dbmp->reginfo); + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + /* Skip dead files and temporary files. 
*/ + if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + + /* Skip non-matching files. */ + if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, + mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + /* + * If the file is being truncated, remove it from the system + * and create a new entry. + * + * !!! + * We should be able to set mfp to NULL and break out of the + * loop, but I like the idea of checking all the entries. + */ + if (LF_ISSET(DB_TRUNCATE)) { + MUTEX_LOCK(dbenv, &mfp->mutex); + MPOOLFILE_IGNORE(mfp); + MUTEX_UNLOCK(dbenv, &mfp->mutex); + continue; } - last_pgno = 0; - } else { - /* Get the real name for this file and open it. */ - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0) - goto err; - oflags = 0; - if (LF_ISSET(DB_CREATE)) - oflags |= DB_OSO_CREATE; - if (LF_ISSET(DB_RDONLY)) - oflags |= DB_OSO_RDONLY; - if ((ret = - __os_open(dbenv, rpath, oflags, mode, &dbmfp->fh)) != 0) { - if (!LF_ISSET(DB_EXTENT)) - __db_err(dbenv, - "%s: %s", rpath, db_strerror(ret)); + + /* + * Some things about a file cannot be changed: the clear length, + * page size, or lSN location. + * + * The file type can change if the application's pre- and post- + * processing needs change. For example, an application that + * created a hash subdatabase in a database that was previously + * all btree. + * + * XXX + * We do not check to see if the pgcookie information changed, + * or update it if it is, this might be a bug. + */ + if (dbmfp->clear_len != mfp->clear_len || + pagesize != mfp->stat.st_pagesize || + dbmfp->lsn_offset != mfp->lsn_off) { + __db_err(dbenv, + "%s: clear length, page size or LSN location changed", + path); + R_UNLOCK(dbenv, dbmp->reginfo); + ret = EINVAL; goto err; } + if (dbmfp->ftype != 0) + mfp->ftype = dbmfp->ftype; + + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); + break; + } + R_UNLOCK(dbenv, dbmp->reginfo); + + if (mfp != NULL) + goto check_map; + +alloc: /* Allocate and initialize a new MPOOLFILE. */ + if ((ret = __memp_alloc( + dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto err; + mfp_alloc = 1; + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->mpf_cnt = 1; + mfp->ftype = dbmfp->ftype; + mfp->stat.st_pagesize = pagesize; + mfp->lsn_off = dbmfp->lsn_offset; + mfp->clear_len = dbmfp->clear_len; + + if (LF_ISSET(DB_DIRECT)) + F_SET(mfp, MP_DIRECT); + if (LF_ISSET(DB_EXTENT)) + F_SET(mfp, MP_EXTENT); + + if (path == NULL) + F_SET(mfp, MP_TEMP); + else { /* * Don't permit files that aren't a multiple of the pagesize, * and find the number of the last page in the file, all the @@ -234,93 +487,84 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) * environments where an off_t is 32-bits, but still run where * offsets are 64-bits, and they pay us a lot of money. */ - if ((ret = __os_ioinfo(dbenv, rpath, - &dbmfp->fh, &mbytes, &bytes, NULL)) != 0) { + if ((ret = __os_ioinfo( + dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); goto err; } /* - * If we're doing a verify, we might have to cope with - * a truncated file; if the file size is not a multiple - * of the page size, round down to a page--we'll - * take care of the partial page outside the memp system. + * During verify or recovery, we might have to cope with a + * truncated file; if the file size is not a multiple of the + * page size, round down to a page, we'll take care of the + * partial page outside the mpool system. 
*/ - - /* Page sizes have to be a power-of-two, ignore mbytes. */ if (bytes % pagesize != 0) { if (LF_ISSET(DB_ODDFILESIZE)) - /* - * If we're doing a verify, we might - * have to cope with a truncated file; - * round down, we'll worry about the partial - * page outside the memp system. - */ - bytes -= (bytes % pagesize); + bytes -= (u_int32_t)(bytes % pagesize); else { __db_err(dbenv, - "%s: file size not a multiple of the pagesize", - rpath); + "%s: file size not a multiple of the pagesize", rpath); ret = EINVAL; goto err; } } - last_pgno = mbytes * (MEGABYTE / pagesize); - last_pgno += bytes / pagesize; - - /* Correction: page numbers are zero-based, not 1-based. */ + /* + * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a + * page get, we have to increment the last page in the file. + * Figure it out and save it away. + * + * Note correction: page numbers are zero-based, not 1-based. + */ + last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); + last_pgno += (db_pgno_t)(bytes / pagesize); if (last_pgno != 0) --last_pgno; + mfp->orig_last_pgno = mfp->last_pgno = last_pgno; - /* - * Get the file id if we weren't given one. Generated file id's - * don't use timestamps, otherwise there'd be no chance of any - * other process joining the party. - */ - if (finfop->fileid == NULL) { - if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) - goto err; - finfop->fileid = idbuf; - } - } + /* Copy the file path into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, path, strlen(path) + 1); - /* - * If we weren't provided an underlying shared object to join with, - * find/allocate the shared file objects. Also allocate space for - * for the per-process thread lock. - */ - if (needlock) - R_LOCK(dbenv, dbmp->reginfo); - if (mfp == NULL) - ret = __memp_mf_open( - dbmp, path, pagesize, last_pgno, finfop, flags, &mfp); - else { - ++mfp->mpf_cnt; - ret = 0; + /* Copy the file identification string into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); } - if (needlock) - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - if ((ret = __db_mutex_alloc( - dbenv, dbmp->reginfo, &dbmfp->mutexp)) != 0) - goto err; - if ((ret = __db_mutex_init( - dbenv, dbmfp->mutexp, 0, MUTEX_THREAD)) != 0) + /* Copy the page cookie into shared memory. */ + if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) goto err; - - /* XXX: KEITH: CLOSE THE FILE ON FAILURE? */ + memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size); + mfp->pgcookie_len = dbmfp->pgcookie->size; } - dbmfp->mfp = mfp; + /* + * Prepend the MPOOLFILE to the list of MPOOLFILE's. 
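The last-page arithmetic above is easy to get wrong, so a worked example may help. With invented sizes -- __os_ioinfo reporting one megabyte plus 8192 bytes, and 4096-byte pages -- the file holds 258 pages and the zero-based last page is 257:

#include <stdio.h>

#define	MEGABYTE	1048576

int
main(void)
{
	unsigned int mbytes, bytes, pagesize, last_pgno;

	mbytes = 1;			/* __os_ioinfo: 1MB ... */
	bytes = 8192;			/* ... plus 8192 bytes */
	pagesize = 4096;

	last_pgno = mbytes * (MEGABYTE / pagesize);	/* 256 pages */
	last_pgno += bytes / pagesize;			/* + 2 = 258 pages */
	if (last_pgno != 0)
		--last_pgno;		/* zero-based: last page is 257 */

	printf("last_pgno = %u\n", last_pgno);
	return (0);
}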
+ */ + R_LOCK(dbenv, dbmp->reginfo); + ret = __db_mutex_setup(dbenv, dbmp->reginfo, &mfp->mutex, + MUTEX_NO_RLOCK); + if (ret == 0) + SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + goto err; +check_map: /* * If a file: - * + is read-only * + isn't temporary + * + is read-only * + doesn't require any pgin/pgout support * + the DB_NOMMAP flag wasn't set (in either the file open or * the environment in which it was opened) @@ -332,7 +576,6 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) * NFS mounted partition, and we can fail in buffer I/O just as easily * as here. * - * XXX * We'd like to test to see if the file is too big to mmap. Since we * don't know what size or type off_t's or size_t's are, or the largest * unsigned integral type is, or what random insanity the local C @@ -341,11 +584,11 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) */ #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ if (F_ISSET(mfp, MP_CAN_MMAP)) { - if (!F_ISSET(dbmfp, MP_READONLY)) - F_CLR(mfp, MP_CAN_MMAP); if (path == NULL) F_CLR(mfp, MP_CAN_MMAP); - if (finfop->ftype != 0) + if (!F_ISSET(dbmfp, MP_READONLY)) + F_CLR(mfp, MP_CAN_MMAP); + if (dbmfp->ftype != 0) F_CLR(mfp, MP_CAN_MMAP); if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) F_CLR(mfp, MP_CAN_MMAP); @@ -354,260 +597,239 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) if (mbytes > maxmap / MEGABYTE || (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) F_CLR(mfp, MP_CAN_MMAP); - } - dbmfp->addr = NULL; - if (F_ISSET(mfp, MP_CAN_MMAP)) { - dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; - if (__os_mapfile(dbenv, rpath, - &dbmfp->fh, dbmfp->len, 1, &dbmfp->addr) != 0) { - dbmfp->addr = NULL; - F_CLR(mfp, MP_CAN_MMAP); + + dbmfp->addr = NULL; + if (F_ISSET(mfp, MP_CAN_MMAP)) { + dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; + if (__os_mapfile(dbenv, rpath, + dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { + dbmfp->addr = NULL; + F_CLR(mfp, MP_CAN_MMAP); + } } } - if (rpath != NULL) - __os_freestr(rpath); + dbmfp->mfp = mfp; + + F_SET(dbmfp, MP_OPEN_CALLED); + + /* Add the file to the process' list of DB_MPOOLFILEs. */ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - *retp = dbmfp; - return (0); + if (0) { +err: if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) + (void)__os_closehandle(dbenv, dbmfp->fhp); + + if (mfp_alloc) { + R_LOCK(dbenv, dbmp->reginfo); + if (mfp->path_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + __db_shalloc_free(dbmp->reginfo[0].addr, mfp); + R_UNLOCK(dbenv, dbmp->reginfo); + } -err: /* - * Note that we do not have to free the thread mutex, because we - * never get to here after we have successfully allocated it. - */ - if (rpath != NULL) - __os_freestr(rpath); - if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) - (void)__os_closehandle(&dbmfp->fh); - if (dbmfp != NULL) { - if (dbmfp->mutexp != NULL) - __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); - __os_free(dbmfp, sizeof(DB_MPOOLFILE)); } + if (rpath != NULL) + __os_free(dbenv, rpath); return (ret); } /* - * __memp_mf_open -- - * Open an MPOOLFILE. + * __memp_get_fileid -- + * Return the file ID. + * + * XXX + * Undocumented interface: DB private. 
*/ -static int -__memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, flags, retp) - DB_MPOOL *dbmp; - const char *path; - size_t pagesize; - db_pgno_t last_pgno; - DB_MPOOL_FINFO *finfop; - u_int32_t flags; - MPOOLFILE **retp; +static void +__memp_get_fileid(dbmfp, fidp) + DB_MPOOLFILE *dbmfp; + u_int8_t *fidp; { - MPOOL *mp; - MPOOLFILE *mfp; - int ret; - void *p; - -#define ISTEMPORARY (path == NULL) - /* - * If not creating a temporary file, walk the list of MPOOLFILE's, - * looking for a matching file. Files backed by temporary files - * or previously removed files can't match. + * No lock needed -- we're using the handle, it had better not + * be going away. * - * DB_TRUNCATE support. - * - * The fileID is a filesystem unique number (e.g., a UNIX dev/inode - * pair) plus a timestamp. If files are removed and created in less - * than a second, the fileID can be repeated. The problem with - * repetition happens when the file that previously had the fileID - * value still has pages in the pool, since we don't want to use them - * to satisfy requests for the new file. - * - * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated - * opens with that flag set guarantees matching fileIDs when the - * machine can open a file and then re-open with truncate within a - * second. For this reason, we pass that flag down, and, if we find - * a matching entry, we ensure that it's never found again, and we - * create a new entry for the current request. + * !!! + * Get the fileID out of the region, not out of the DB_MPOOLFILE + * structure because the DB_MPOOLFILE reference is possibly short + * lived, and isn't to be trusted. */ - if (!ISTEMPORARY) { - mp = dbmp->reginfo[0].primary; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) - continue; - if (memcmp(finfop->fileid, R_ADDR(dbmp->reginfo, - mfp->fileid_off), DB_FILE_ID_LEN) == 0) { - if (LF_ISSET(DB_TRUNCATE)) { - MEMP_FREMOVE(mfp); - continue; - } - if (finfop->clear_len != mfp->clear_len || - pagesize != mfp->stat.st_pagesize) { - __db_err(dbmp->dbenv, - "%s: page size or clear length changed", - path); - return (EINVAL); - } - - /* - * It's possible that our needs for pre- and - * post-processing are changing. For example, - * an application created a hash subdatabase - * in a database that was previously all btree. - */ - if (finfop->ftype != 0) - mfp->ftype = finfop->ftype; - - ++mfp->mpf_cnt; - - *retp = mfp; - return (0); - } - } - } + memcpy(fidp, R_ADDR( + dbmfp->dbmp->reginfo, dbmfp->mfp->fileid_off), DB_FILE_ID_LEN); +} - /* Allocate a new MPOOLFILE. */ - if ((ret = __memp_alloc( - dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) - goto mem_err; - *retp = mfp; +/* + * __memp_last_pgno -- + * Return the page number of the last page in the file. + * + * XXX + * Undocumented interface: DB private. + */ +static void +__memp_last_pgno(dbmfp, pgnoaddr) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; - /* Initialize the structure. */ - memset(mfp, 0, sizeof(MPOOLFILE)); - mfp->mpf_cnt = 1; - mfp->ftype = finfop->ftype; - mfp->lsn_off = finfop->lsn_offset; - mfp->clear_len = finfop->clear_len; + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; - /* - * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget, - * we have to know the last page in the file. Figure it out and save - * it away. 
- */ - mfp->stat.st_pagesize = pagesize; - mfp->orig_last_pgno = mfp->last_pgno = last_pgno; + R_LOCK(dbenv, dbmp->reginfo); + *pgnoaddr = dbmfp->mfp->last_pgno; + R_UNLOCK(dbenv, dbmp->reginfo); +} - if (ISTEMPORARY) - F_SET(mfp, MP_TEMP); - else { - /* Copy the file path into shared memory. */ - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) - goto err; - memcpy(p, path, strlen(path) + 1); +/* + * __memp_refcnt -- + * Return the current reference count. + * + * XXX + * Undocumented interface: DB private. + */ +static void +__memp_refcnt(dbmfp, cntp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *cntp; +{ + DB_ENV *dbenv; - /* Copy the file identification string into shared memory. */ - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) - goto err; - memcpy(p, finfop->fileid, DB_FILE_ID_LEN); + dbenv = dbmfp->dbmp->dbenv; - F_SET(mfp, MP_CAN_MMAP); - } + MUTEX_LOCK(dbenv, &dbmfp->mfp->mutex); + *cntp = dbmfp->mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, &dbmfp->mfp->mutex); +} - /* Copy the page cookie into shared memory. */ - if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) { - mfp->pgcookie_len = 0; - mfp->pgcookie_off = 0; - } else { - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) - goto err; - memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size); - mfp->pgcookie_len = finfop->pgcookie->size; - } +/* + * __memp_set_unlink -- + * Set unlink on last close flag. + * + * XXX + * Undocumented interface: DB private. + */ +static void +__memp_set_unlink(dbmpf, set) + DB_MPOOLFILE *dbmpf; + int set; +{ + DB_ENV *dbenv; - /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ - mp = dbmp->reginfo[0].primary; - SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); + dbenv = dbmpf->dbmp->dbenv; - if (0) { -err: if (mfp->path_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->path_off)); - if (mfp->fileid_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->fileid_off)); - if (mfp != NULL) - __db_shalloc_free(dbmp->reginfo[0].addr, mfp); -mem_err: __db_err(dbmp->dbenv, - "Unable to allocate memory for mpool file"); - } - return (ret); + MUTEX_LOCK(dbenv, &dbmpf->mfp->mutex); + if (set) + F_SET(dbmpf->mfp, MP_UNLINK); + else + F_CLR(dbmpf->mfp, MP_UNLINK); + MUTEX_UNLOCK(dbenv, &dbmpf->mfp->mutex); } /* * memp_fclose -- * Close a backing file for the memory pool. */ +static int +__memp_fclose(dbmfp, flags) + DB_MPOOLFILE *dbmfp; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret, t_ret; + + dbenv = dbmfp->dbmp->dbenv; + + PANIC_CHECK(dbenv); + + /* + * XXX + * DB_MPOOL_DISCARD: Undocumented flag: DB private. + */ + ret = __db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD); + + if ((t_ret = __memp_fclose_int(dbmfp, flags)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __memp_fclose_int -- + * Internal version of __memp_fclose. + * + * PUBLIC: int __memp_fclose_int __P((DB_MPOOLFILE *, u_int32_t)); + */ int -memp_fclose(dbmfp) +__memp_fclose_int(dbmfp, flags) DB_MPOOLFILE *dbmfp; + u_int32_t flags; { DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; char *rpath; - int ret, t_ret; + int deleted, ret, t_ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; ret = 0; - PANIC_CHECK(dbenv); - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fclose(dbmfp)); -#endif - /* - * Remove the DB_MPOOLFILE from the queue. 
This has to happen before - * we perform any action that can fail, otherwise __memp_close may - * loop infinitely when calling us to discard all of the DB_MPOOLFILEs. + * We have to reference count DB_MPOOLFILE structures as other threads + * in the process may be using them. Here's the problem: + * + * Thread A opens a database. + * Thread B uses thread A's DB_MPOOLFILE to write a buffer + * in order to free up memory in the mpool cache. + * Thread A closes the database while thread B is using the + * DB_MPOOLFILE structure. + * + * By opening all databases before creating any threads, and closing + * the databases after all the threads have exited, applications get + * better performance and avoid the problem path entirely. + * + * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer is a + * short-term lock, even in worst case, since we better be the only + * thread of control using the DB_MPOOLFILE structure to read pages + * *into* the cache. Wait until we're the only reference holder and + * remove the DB_MPOOLFILE structure from the list, so nobody else can + * find it. We do this, rather than have the last reference holder + * (whoever that might be) discard the DB_MPOOLFILE structure, because + * we'd rather write error messages to the application in the close + * routine, not in the checkpoint/sync routine. + * + * !!! + * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE + * file list, check the DB_OPEN_CALLED flag to be sure. */ - for (;;) { + for (deleted = 0;;) { MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - - /* - * We have to reference count DB_MPOOLFILE structures as other - * threads may be using them. The problem only happens if the - * application makes a bad design choice. Here's the path: - * - * Thread A opens a database. - * Thread B uses thread A's DB_MPOOLFILE to write a buffer - * in order to free up memory in the mpool cache. - * Thread A closes the database while thread B is using the - * DB_MPOOLFILE structure. - * - * By opening all databases before creating the threads, and - * closing them after the threads have exited, applications - * get better performance and avoid the problem path entirely. - * - * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer - * is a short-term lock, even in worst case, since we better be - * the only thread of control using the DB_MPOOLFILE structure - * to read pages *into* the cache. Wait until we're the only - * reference holder and remove the DB_MPOOLFILE structure from - * the list, so nobody else can even find it. - */ if (dbmfp->ref == 1) { - TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); - break; + if (F_ISSET(dbmfp, MP_OPEN_CALLED)) + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + deleted = 1; } MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - (void)__os_sleep(dbenv, 1, 0); + if (deleted) + break; + __os_sleep(dbenv, 1, 0); } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); /* Complain if pinned blocks never returned. */ - if (dbmfp->pinref != 0) + if (dbmfp->pinref != 0) { __db_err(dbenv, "%s: close: %lu blocks left pinned", __memp_fn(dbmfp), (u_long)dbmfp->pinref); + ret = __db_panic(dbenv, DB_RUNRECOVERY); + } /* Discard any mmap information. */ if (dbmfp->addr != NULL && @@ -615,11 +837,11 @@ memp_fclose(dbmfp) __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret)); /* Close the file; temporary files may not yet have been created. 
*/ - if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && - (t_ret = __os_closehandle(&dbmfp->fh)) != 0) { + if (F_ISSET(dbmfp->fhp, DB_FH_VALID) && + (t_ret = __os_closehandle(dbenv, dbmfp->fhp)) != 0) { __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret)); - if (ret != 0) - t_ret = ret; + if (ret == 0) + ret = t_ret; } /* Discard the thread mutex. */ @@ -628,38 +850,51 @@ memp_fclose(dbmfp) /* * Discard our reference on the the underlying MPOOLFILE, and close - * it if it's no longer useful to anyone. - * - * If we're not discarding it, and it's a temp file, this means - * all the outstanding references belong to unflushed buffers. - * (A temp file can only be referenced by one DB_MPOOLFILE). - * We don't care about preserving any of those buffers, so mark - * the MPOOLFILE as dead so that when we try to flush them, - * even the dirty ones just get discarded. + * it if it's no longer useful to anyone. It possible the open of + * the file never happened or wasn't successful, in which case, mpf + * will be NULL; */ - R_LOCK(dbenv, dbmp->reginfo); - mfp = dbmfp->mfp; - if (--mfp->mpf_cnt == 0) { + if ((mfp = dbmfp->mfp) == NULL) + goto done; + + /* + * If it's a temp file, all outstanding references belong to unflushed + * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). + * We don't care about preserving any of those buffers, so mark the + * MPOOLFILE as dead so that even the dirty ones just get discarded + * when we try to flush them. + */ + deleted = 0; + MUTEX_LOCK(dbenv, &mfp->mutex); + if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { + if (LF_ISSET(DB_MPOOL_DISCARD) || + F_ISSET(mfp, MP_TEMP | MP_UNLINK)) + MPOOLFILE_IGNORE(mfp); if (F_ISSET(mfp, MP_UNLINK)) { - MEMP_FREMOVE(mfp); if ((t_ret = __db_appname(dbmp->dbenv, - DB_APP_DATA, NULL, R_ADDR(dbmp->reginfo, + DB_APP_DATA, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) ret = t_ret; - if (t_ret == 0 && (t_ret = - __os_unlink(dbmp->dbenv, rpath) != 0 && ret == 0)) + if (t_ret == 0) { + if ((t_ret = __os_unlink( + dbmp->dbenv, rpath) != 0) && ret == 0) + ret = t_ret; + __os_free(dbenv, rpath); + } + } + if (mfp->block_cnt == 0) { + if ((t_ret = + __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) ret = t_ret; - __os_free(rpath, 0); + deleted = 1; } - if (mfp->block_cnt == 0) - __memp_mf_discard(dbmp, mfp); } - else if (F_ISSET(mfp, MP_TEMP)) - MEMP_FREMOVE(mfp); - R_UNLOCK(dbenv, dbmp->reginfo); + if (deleted == 0) + MUTEX_UNLOCK(dbenv, &mfp->mutex); /* Discard the DB_MPOOLFILE structure. */ - __os_free(dbmfp, sizeof(DB_MPOOLFILE)); +done: __os_free(dbenv, dbmfp->fhp); + __os_free(dbenv, dbmfp); return (ret); } @@ -668,20 +903,69 @@ memp_fclose(dbmfp) * __memp_mf_discard -- * Discard an MPOOLFILE. * - * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); + * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); */ -void +int __memp_mf_discard(dbmp, mfp) DB_MPOOL *dbmp; MPOOLFILE *mfp; { + DB_ENV *dbenv; + DB_FH fh; + DB_MPOOL_STAT *sp; MPOOL *mp; + char *rpath; + int ret; + dbenv = dbmp->dbenv; mp = dbmp->reginfo[0].primary; + ret = 0; + + /* + * Expects caller to be holding the MPOOLFILE mutex. + * + * When discarding a file, we have to flush writes from it to disk. + * The scenario is that dirty buffers from this file need to be + * flushed to satisfy a future checkpoint, but when the checkpoint + * calls mpool sync, the sync code won't know anything about them. 
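The close path above reduces to a small decision tree: the last handle (or an explicit discard) marks temporary, discarded, and unlink-on-close files dead; unlink-on-close files are removed from the filesystem; and the shared MPOOLFILE is reclaimed only once no buffers remain. A compact restatement, with stand-in fields for the MPOOLFILE flags and all mutexes omitted:

struct mfile {
	int mpf_cnt;			/* open handles on this file */
	int block_cnt;			/* buffers still in the cache */
	int is_temp, unlink_on_close, dead;
};

static void file_ignore(struct mfile *m) { m->dead = 1; }
static void file_unlink(struct mfile *m) { (void)m; }
static void file_discard(struct mfile *m) { (void)m; }

static void
mfile_close(struct mfile *m, int discard_flag)
{
	if (--m->mpf_cnt == 0 || discard_flag) {
		/* Dead files: remaining buffers are dropped, not written. */
		if (discard_flag || m->is_temp || m->unlink_on_close)
			file_ignore(m);
		if (m->unlink_on_close)
			file_unlink(m);
		/* Reclaim the region memory only once no buffers remain. */
		if (m->block_cnt == 0)
			file_discard(m);
	}
}

int
main(void)
{
	struct mfile m = { 1, 0, 1, 0, 0 };

	mfile_close(&m, 0);	/* temp file, last handle: dead, discarded */
	return (m.dead ? 0 : 1);
}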
+ */ + if (!F_ISSET(mfp, MP_DEADFILE) && + (ret = __db_appname(dbenv, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) { + if ((ret = __os_open(dbenv, rpath, 0, 0, &fh)) == 0) { + ret = __os_fsync(dbenv, &fh); + (void)__os_closehandle(dbenv, &fh); + } + __os_free(dbenv, rpath); + } + + /* + * We have to release the MPOOLFILE lock before acquiring the region + * lock so that we don't deadlock. Make sure nobody ever looks at + * this structure again. + */ + MPOOLFILE_IGNORE(mfp); + + /* Discard the mutex we're holding. */ + MUTEX_UNLOCK(dbenv, &mfp->mutex); /* Delete from the list of MPOOLFILEs. */ + R_LOCK(dbenv, dbmp->reginfo); SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); + /* Copy the statistics into the region. */ + sp = &mp->stat; + sp->st_cache_hit += mfp->stat.st_cache_hit; + sp->st_cache_miss += mfp->stat.st_cache_miss; + sp->st_map += mfp->stat.st_map; + sp->st_page_create += mfp->stat.st_page_create; + sp->st_page_in += mfp->stat.st_page_in; + sp->st_page_out += mfp->stat.st_page_out; + + /* Clear the mutex this MPOOLFILE recorded. */ + __db_shlocks_clear(&mfp->mutex, dbmp->reginfo, + (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off)); + /* Free the space. */ if (mfp->path_off != 0) __db_shalloc_free(dbmp->reginfo[0].addr, @@ -693,35 +977,10 @@ __memp_mf_discard(dbmp, mfp) __db_shalloc_free(dbmp->reginfo[0].addr, R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); __db_shalloc_free(dbmp->reginfo[0].addr, mfp); -} - -/* - * __memp_fremove -- - * Remove an underlying file from the system. - * - * PUBLIC: int __memp_fremove __P((DB_MPOOLFILE *)); - */ -int -__memp_fremove(dbmfp) - DB_MPOOLFILE *dbmfp; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - MPOOLFILE *mfp; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - mfp = dbmfp->mfp; - - PANIC_CHECK(dbenv); - - R_LOCK(dbenv, dbmp->reginfo); - - MEMP_FREMOVE(mfp); R_UNLOCK(dbenv, dbmp->reginfo); - return (0); + return (ret); } /* diff --git a/bdb/mp/mp_fput.c b/bdb/mp/mp_fput.c index be03b721f36..271e44a4ef8 100644 --- a/bdb/mp/mp_fput.c +++ b/bdb/mp/mp_fput.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $"; +static const char revid[] = "$Id: mp_fput.c,v 11.36 2002/08/09 19:04:11 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -15,43 +15,32 @@ static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Ex #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" /* - * memp_fput -- + * __memp_fput -- * Mpool file put function. 
+ * + * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t)); */ int -memp_fput(dbmfp, pgaddr, flags) +__memp_fput(dbmfp, pgaddr, flags) DB_MPOOLFILE *dbmfp; void *pgaddr; u_int32_t flags; { - BH *bhp; + BH *argbhp, *bhp, *prev; DB_ENV *dbenv; DB_MPOOL *dbmp; - MPOOL *c_mp, *mp; - int ret, wrote; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + u_int32_t n_cache; + int adjust, ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fput(dbmfp, pgaddr, flags)); -#endif PANIC_CHECK(dbenv); @@ -72,17 +61,6 @@ memp_fput(dbmfp, pgaddr, flags) } } - R_LOCK(dbenv, dbmp->reginfo); - - /* Decrement the pinned reference count. */ - if (dbmfp->pinref == 0) { - __db_err(dbenv, - "%s: more pages returned than retrieved", __memp_fn(dbmfp)); - R_UNLOCK(dbenv, dbmp->reginfo); - return (EINVAL); - } else - --dbmfp->pinref; - /* * If we're mapping the file, there's nothing to do. Because we can * stop mapping the file at any time, we have to check on each buffer @@ -90,97 +68,135 @@ memp_fput(dbmfp, pgaddr, flags) * region. */ if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && - (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) { - R_UNLOCK(dbenv, dbmp->reginfo); + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) return (0); + +#ifdef DIAGNOSTIC + /* + * Decrement the per-file pinned buffer count (mapped pages aren't + * counted). + */ + R_LOCK(dbenv, dbmp->reginfo); + if (dbmfp->pinref == 0) { + ret = EINVAL; + __db_err(dbenv, + "%s: more pages returned than retrieved", __memp_fn(dbmfp)); + } else { + ret = 0; + --dbmfp->pinref; } + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + return (ret); +#endif - /* Convert the page address to a buffer header. */ + /* Convert a page address to a buffer header and hash bucket. */ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; - /* Convert the buffer header to a cache. */ - c_mp = BH_TO_CACHE(dbmp, bhp); - -/* UNLOCK THE REGION, LOCK THE CACHE. */ + MUTEX_LOCK(dbenv, &hp->hash_mutex); /* Set/clear the page bits. */ - if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { - ++c_mp->stat.st_page_clean; - --c_mp->stat.st_page_dirty; + if (LF_ISSET(DB_MPOOL_CLEAN) && + F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) { + DB_ASSERT(hp->hash_page_dirty != 0); + --hp->hash_page_dirty; F_CLR(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { - --c_mp->stat.st_page_clean; - ++c_mp->stat.st_page_dirty; + ++hp->hash_page_dirty; F_SET(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DISCARD)) F_SET(bhp, BH_DISCARD); /* - * If the page is dirty and being scheduled to be written as part of - * a checkpoint, we no longer know that the log is up-to-date. - */ - if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC)) - F_SET(bhp, BH_SYNC_LOGFLSH); - - /* * Check for a reference count going to zero. This can happen if the * application returns a page twice. */ if (bhp->ref == 0) { __db_err(dbenv, "%s: page %lu: unpinned page returned", __memp_fn(dbmfp), (u_long)bhp->pgno); - R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); return (EINVAL); } /* - * If more than one reference to the page, we're done. Ignore the - * discard flags (for now) and leave it at its position in the LRU - * chain. 
The rest gets done at last reference close.
+	 * If more than one reference to the page or a reference other than a
+	 * thread waiting to flush the buffer to disk, we're done.  Ignore the
+	 * discard flags (for now) and leave the buffer's priority alone.
 	 */
-	if (--bhp->ref > 0) {
-		R_UNLOCK(dbenv, dbmp->reginfo);
+	if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) {
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 		return (0);
 	}
 
+	/* Update priority values. */
+	if (F_ISSET(bhp, BH_DISCARD) ||
+	    dbmfp->mfp->priority == MPOOL_PRI_VERY_LOW)
+		bhp->priority = 0;
+	else {
+		/*
+		 * We don't lock the LRU counter or the stat.st_pages field;
+		 * if we get garbage (which won't happen on a 32-bit machine),
+		 * it only means a buffer has the wrong priority.
+		 */
+		bhp->priority = c_mp->lru_count;
+
+		adjust = 0;
+		if (dbmfp->mfp->priority != 0)
+			adjust =
+			    (int)c_mp->stat.st_pages / dbmfp->mfp->priority;
+		if (F_ISSET(bhp, BH_DIRTY))
+			adjust += c_mp->stat.st_pages / MPOOL_PRI_DIRTY;
+
+		if (adjust > 0) {
+			if (UINT32_T_MAX - bhp->priority >= (u_int32_t)adjust)
+				bhp->priority += adjust;
+		} else if (adjust < 0)
+			if (bhp->priority > (u_int32_t)-adjust)
+				bhp->priority += adjust;
+	}
+
 	/*
-	 * Move the buffer to the head/tail of the LRU chain.  We do this
-	 * before writing the buffer for checkpoint purposes, as the write
-	 * can discard the region lock and allow another process to acquire
-	 * the buffer.  We could keep that from happening, but there seems
-	 * no reason to do so.
+	 * Buffers on hash buckets are sorted by priority -- move the buffer
+	 * to the correct position in the list.
 	 */
-	SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
-	if (F_ISSET(bhp, BH_DISCARD))
-		SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh);
+	argbhp = bhp;
+	SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh);
+
+	prev = NULL;
+	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+	    bhp != NULL; prev = bhp, bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+		if (bhp->priority > argbhp->priority)
+			break;
+	if (prev == NULL)
+		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, argbhp, hq, __bh);
 	else
-		SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+		SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh);
+
+	/* Reset the hash bucket's priority. */
+	hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+#ifdef DIAGNOSTIC
+	__memp_check_order(hp);
+#endif
 
 	/*
-	 * If this buffer is scheduled for writing because of a checkpoint, we
-	 * need to write it (if it's dirty), or update the checkpoint counters
-	 * (if it's not dirty).  If we try to write it and can't, that's not
-	 * necessarily an error as it's not completely unreasonable that the
-	 * application doesn't have permission to write the underlying file;
-	 * set a flag so that the next time the memp_sync function is called
-	 * we try writing it there, as the checkpoint thread of control had
-	 * better be able to write all of the files.
+	 * The sync code has a separate counter for buffers on which it waits.
+	 * It reads that value without holding a lock so we update it as the
+	 * last thing we do.  Once that value goes to 0, we won't see another
+	 * reference to that buffer being returned to the cache until the sync
+	 * code has finished, so we're safe as long as we don't let the value
+	 * go to 0 before we finish with the buffer.
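The priority calculation above is easier to see with numbers. The values below are invented -- a 1000-page cache, an LRU counter at 50000, a file priority divisor of 10, and a dirty buffer -- and UINT_MAX stands in for UINT32_T_MAX; the guard applies the adjustment only when it cannot wrap the 32-bit counter:

#include <limits.h>
#include <stdio.h>

#define	MPOOL_PRI_DIRTY	10		/* stand-in for the real constant */

int
main(void)
{
	unsigned int lru_count, st_pages, priority;
	int adjust, dirty, file_pri;

	lru_count = 50000;		/* c_mp->lru_count */
	st_pages = 1000;		/* c_mp->stat.st_pages */
	file_pri = 10;			/* mfp->priority; 0 means default */
	dirty = 1;			/* F_ISSET(bhp, BH_DIRTY) */

	priority = lru_count;
	adjust = 0;
	if (file_pri != 0)
		adjust = (int)st_pages / file_pri;		/* +100 */
	if (dirty)
		adjust += (int)(st_pages / MPOOL_PRI_DIRTY);	/* +100 */

	/* Apply the adjustment only when it cannot wrap the counter. */
	if (adjust > 0) {
		if (UINT_MAX - priority >= (unsigned int)adjust)
			priority += (unsigned int)adjust;
	} else if (adjust < 0)
		if (priority > (unsigned int)-adjust)
			priority += (unsigned int)adjust;

	printf("priority = %u\n", priority);	/* prints 50200 */
	return (0);
}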
*/ - if (F_ISSET(bhp, BH_SYNC)) { - if (F_ISSET(bhp, BH_DIRTY)) { - if (__memp_bhwrite(dbmp, - dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) - F_SET(mp, MP_LSN_RETRY); - } else { - F_CLR(bhp, BH_SYNC); - - --mp->lsn_cnt; - --dbmfp->mfp->lsn_cnt; - } - } + if (F_ISSET(argbhp, BH_LOCKED) && argbhp->ref_sync != 0) + --argbhp->ref_sync; + + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - R_UNLOCK(dbenv, dbmp->reginfo); return (0); } diff --git a/bdb/mp/mp_fset.c b/bdb/mp/mp_fset.c index 08313c9b6f5..65cd6286ac9 100644 --- a/bdb/mp/mp_fset.c +++ b/bdb/mp/mp_fset.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $"; +static const char revid[] = "$Id: mp_fset.c,v 11.25 2002/05/03 15:21:17 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -15,25 +15,18 @@ static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Ex #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" /* - * memp_fset -- + * __memp_fset -- * Mpool page set-flag routine. + * + * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t)); */ int -memp_fset(dbmfp, pgaddr, flags) +__memp_fset(dbmfp, pgaddr, flags) DB_MPOOLFILE *dbmfp; void *pgaddr; u_int32_t flags; @@ -41,17 +34,13 @@ memp_fset(dbmfp, pgaddr, flags) BH *bhp; DB_ENV *dbenv; DB_MPOOL *dbmp; - MPOOL *c_mp, *mp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + u_int32_t n_cache; int ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fset(dbmfp, pgaddr, flags)); -#endif PANIC_CHECK(dbenv); @@ -60,7 +49,7 @@ memp_fset(dbmfp, pgaddr, flags) return (__db_ferr(dbenv, "memp_fset", 1)); if ((ret = __db_fchk(dbenv, "memp_fset", flags, - DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0) + DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) return (ret); if ((ret = __db_fcchk(dbenv, "memp_fset", flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) @@ -72,27 +61,29 @@ memp_fset(dbmfp, pgaddr, flags) return (EACCES); } - /* Convert the page address to a buffer header. */ + /* Convert the page address to a buffer header and hash bucket. */ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - - /* Convert the buffer header to a cache. */ - c_mp = BH_TO_CACHE(dbmp, bhp); - - R_LOCK(dbenv, dbmp->reginfo); - - if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { - ++c_mp->stat.st_page_clean; - --c_mp->stat.st_page_dirty; + n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + + /* Set/clear the page bits. 
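
The NCACHE and NBUCKET macros live in dbinc/mp.h and aren't shown in this hunk; they hash the file's region offset and the page number together, then reduce modulo the cache count and the table size. A plausible reduction (the real macros' mixing may differ):

	static unsigned int
	pick_cache(unsigned int mf_offset, unsigned int pgno, unsigned int nreg)
	{
		return ((pgno ^ (mf_offset >> 3)) % nreg);	/* which region */
	}

	static unsigned int
	pick_bucket(unsigned int mf_offset, unsigned int pgno, unsigned int nbuckets)
	{
		return ((pgno ^ (mf_offset >> 3)) % nbuckets);	/* which bucket */
	}
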
*/ + if (LF_ISSET(DB_MPOOL_CLEAN) && + F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) { + DB_ASSERT(hp->hash_page_dirty != 0); + --hp->hash_page_dirty; F_CLR(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { - --c_mp->stat.st_page_clean; - ++c_mp->stat.st_page_dirty; + ++hp->hash_page_dirty; F_SET(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DISCARD)) F_SET(bhp, BH_DISCARD); - R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); return (0); } diff --git a/bdb/mp/mp_method.c b/bdb/mp/mp_method.c index 85a6239b032..38f0a645f16 100644 --- a/bdb/mp/mp_method.c +++ b/bdb/mp/mp_method.c @@ -1,30 +1,30 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $"; +static const char revid[] = "$Id: mp_method.c,v 11.29 2002/03/27 04:32:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> -#endif -#ifdef HAVE_RPC -#include "db_server.h" +#ifdef HAVE_RPC +#include <rpc/rpc.h> +#endif #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" #ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" +#include "dbinc_auto/db_server.h" +#include "dbinc_auto/rpc_client_ext.h" #endif static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int)); @@ -41,29 +41,46 @@ __memp_dbenv_create(dbenv) DB_ENV *dbenv; { /* + * !!! + * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. + * * We default to 32 8K pages. We don't default to a flat 256K, because * some systems require significantly more memory to hold 32 pages than * others. For example, HP-UX with POSIX pthreads needs 88 bytes for * a POSIX pthread mutex and almost 200 bytes per buffer header, while - * Solaris needs 24 and 52 bytes for the same structures. + * Solaris needs 24 and 52 bytes for the same structures. The minimum + * number of hash buckets is 37. These contain a mutex also. */ - dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH)); + dbenv->mp_bytes = + 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH); dbenv->mp_ncache = 1; - dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize; - dbenv->set_cachesize = __memp_set_cachesize; - -#ifdef HAVE_RPC - /* - * If we have a client, overwrite what we just setup to - * point to client functions. 
- */ +#ifdef HAVE_RPC if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) { dbenv->set_cachesize = __dbcl_env_cachesize; dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize; - } + dbenv->memp_dump_region = NULL; + dbenv->memp_fcreate = __dbcl_memp_fcreate; + dbenv->memp_nameop = NULL; + dbenv->memp_register = __dbcl_memp_register; + dbenv->memp_stat = __dbcl_memp_stat; + dbenv->memp_sync = __dbcl_memp_sync; + dbenv->memp_trickle = __dbcl_memp_trickle; + } else #endif - + { + dbenv->set_cachesize = __memp_set_cachesize; + dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize; + dbenv->memp_dump_region = __memp_dump_region; + dbenv->memp_fcreate = __memp_fcreate; + dbenv->memp_nameop = __memp_nameop; + dbenv->memp_register = __memp_register; + dbenv->memp_stat = __memp_stat; + dbenv->memp_sync = __memp_sync; + dbenv->memp_trickle = __memp_trickle; + } } /* @@ -78,26 +95,50 @@ __memp_set_cachesize(dbenv, gbytes, bytes, ncache) { ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize"); - dbenv->mp_gbytes = gbytes + bytes / GIGABYTE; - dbenv->mp_bytes = bytes % GIGABYTE; - dbenv->mp_ncache = ncache == 0 ? 1 : ncache; + /* Normalize the values. */ + if (ncache == 0) + ncache = 1; /* - * If the application requested less than 500Mb, increase the - * cachesize by 25% to account for our overhead. (I'm guessing - * that caches over 500Mb are specifically sized, i.e., it's - * a large server and the application actually knows how much - * memory is available.) + * You can only store 4GB-1 in an unsigned 32-bit value, so correct for + * applications that specify 4GB cache sizes -- we know what they meant. + */ + if (gbytes / ncache == 4 && bytes == 0) { + --gbytes; + bytes = GIGABYTE - 1; + } else { + gbytes += bytes / GIGABYTE; + bytes %= GIGABYTE; + } + + /* Avoid too-large cache sizes, they result in a region size of zero. */ + if (gbytes / ncache > 4 || (gbytes / ncache == 4 && bytes != 0)) { + __db_err(dbenv, "individual cache size too large"); + return (EINVAL); + } + + /* + * If the application requested less than 500Mb, increase the cachesize + * by 25% and factor in the size of the hash buckets to account for our + * overhead. (I'm guessing caches over 500Mb are specifically sized, + * that is, it's a large server and the application actually knows how + * much memory is available. We only document the 25% overhead number, + * not the hash buckets, but I don't see a reason to confuse the issue, + * it shouldn't matter to an application.) * * There is a minimum cache size, regardless. */ - if (dbenv->mp_gbytes == 0) { - if (dbenv->mp_bytes < 500 * MEGABYTE) - dbenv->mp_bytes += dbenv->mp_bytes / 4; - if (dbenv->mp_bytes < DB_CACHESIZE_MIN) - dbenv->mp_bytes = DB_CACHESIZE_MIN; + if (gbytes == 0) { + if (bytes < 500 * MEGABYTE) + bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH); + if (bytes / ncache < DB_CACHESIZE_MIN) + bytes = ncache * DB_CACHESIZE_MIN; } + dbenv->mp_gbytes = gbytes; + dbenv->mp_bytes = bytes; + dbenv->mp_ncache = ncache; + return (0); } diff --git a/bdb/mp/mp_region.c b/bdb/mp/mp_region.c index 4b85466ce63..06eca2f8646 100644 --- a/bdb/mp/mp_region.c +++ b/bdb/mp/mp_region.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
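
A worked example of the normalization above, with local macros standing in for the db_int.h values and the too-large check elided: a 100MB request in one cache comes out 25% larger, before the hash-bucket overhead the library adds on top.

	#include <stdio.h>

	#define	GIGABYTE	(1024 * 1024 * 1024)
	#define	MEGABYTE	(1024 * 1024)

	int
	main()
	{
		unsigned int gbytes, bytes, ncache;

		gbytes = 0, bytes = 100 * MEGABYTE, ncache = 1;

		if (ncache == 0)
			ncache = 1;
		if (gbytes / ncache == 4 && bytes == 0) {
			--gbytes;		/* a 4GB request becomes 4GB-1 */
			bytes = GIGABYTE - 1;
		} else {
			gbytes += bytes / GIGABYTE;
			bytes %= GIGABYTE;
		}
		if (gbytes == 0 && bytes < 500 * MEGABYTE)
			bytes += bytes / 4;	/* the real call also adds
						 * 37 * sizeof(DB_MPOOL_HASH) */

		/* Prints 0GB + 131072000 bytes: 100MB grew to 125MB. */
		printf("%uGB + %u bytes in %u cache(s)\n", gbytes, bytes, ncache);
		return (0);
	}
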
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $"; +static const char revid[] = "$Id: mp_region.c,v 11.49 2002/05/07 18:42:20 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -17,11 +17,11 @@ static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int)); -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES static size_t __mpool_region_maint __P((REGINFO *)); #endif @@ -119,6 +119,8 @@ __memp_open(dbenv) regids[i] = dbmp->reginfo[i].id; } + + R_UNLOCK(dbenv, dbmp->reginfo); } else { /* * Determine how many regions there are going to be, allocate @@ -135,6 +137,19 @@ __memp_open(dbenv) dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[0] = reginfo; + /* + * We have to unlock the primary mpool region before we attempt + * to join the additional mpool regions. If we don't, we can + * deadlock. The scenario is that we hold the primary mpool + * region lock. We then try to attach to an additional mpool + * region, which requires the acquisition/release of the main + * region lock (to search the list of regions). If another + * thread of control already holds the main region lock and is + * waiting on our primary mpool region lock, we'll deadlock. + * See [#4696] for more information. + */ + R_UNLOCK(dbenv, dbmp->reginfo); + /* Join remaining regions. */ regids = R_ADDR(dbmp->reginfo, mp->regids); for (i = 1; i < dbmp->nreg; ++i) { @@ -155,17 +170,10 @@ __memp_open(dbenv) R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); /* If the region is threaded, allocate a mutex to lock the handles. */ - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - if ((ret = __db_mutex_alloc( - dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) { - goto err; - } - if ((ret = - __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0) - goto err; - } - - R_UNLOCK(dbenv, dbmp->reginfo); + if (F_ISSET(dbenv, DB_ENV_THREAD) && + (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmp->mutexp, + MUTEX_ALLOC | MUTEX_THREAD)) != 0) + goto err; dbenv->mp_handle = dbmp; return (0); @@ -180,12 +188,11 @@ err: if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { if (dbmp->reginfo[i].id != INVALID_REGION_ID) (void)__db_r_detach( dbenv, &dbmp->reginfo[i], 0); - __os_free(dbmp->reginfo, - dbmp->nreg * sizeof(*dbmp->reginfo)); + __os_free(dbenv, dbmp->reginfo); } if (dbmp->mutexp != NULL) __db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp); - __os_free(dbmp, sizeof(*dbmp)); + __os_free(dbenv, dbmp); return (ret); } @@ -199,13 +206,13 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) DB_MPOOL *dbmp; int reginfo_off, htab_buckets; { - DB_HASHTAB *htab; + DB_MPOOL_HASH *htab; MPOOL *mp; REGINFO *reginfo; -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES size_t maint_size; #endif - int ret; + int i, ret; void *p; mp = NULL; @@ -218,7 +225,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) mp = reginfo->primary; memset(mp, 0, sizeof(*mp)); -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES maint_size = __mpool_region_maint(reginfo); /* Allocate room for the maintenance info and initialize it. 
*/ if ((ret = __db_shalloc(reginfo->addr, @@ -231,14 +238,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) if (reginfo_off == 0) { SH_TAILQ_INIT(&mp->mpfq); - if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex, - R_OFFSET(dbmp->reginfo, &mp->sync_mutex) + - DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo, - (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0) - goto err; - ZERO_LSN(mp->lsn); - mp->lsn_cnt = 0; mp->nreg = dbmp->nreg; if ((ret = __db_shalloc(dbmp->reginfo[0].addr, @@ -247,32 +247,41 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) mp->regids = R_OFFSET(dbmp->reginfo, p); } - SH_TAILQ_INIT(&mp->bhq); - /* Allocate hash table space and initialize it. */ if ((ret = __db_shalloc(reginfo->addr, - htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0) + htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0) goto mem_err; - __db_hashinit(htab, htab_buckets); mp->htab = R_OFFSET(reginfo, htab); - mp->htab_buckets = htab_buckets; + for (i = 0; i < htab_buckets; i++) { + if ((ret = __db_mutex_setup(dbenv, + reginfo, &htab[i].hash_mutex, + MUTEX_NO_RLOCK)) != 0) + return (ret); + SH_TAILQ_INIT(&htab[i].hash_bucket); + htab[i].hash_page_dirty = htab[i].hash_priority = 0; + } + mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets; + /* + * Only the environment creator knows the total cache size, fill in + * those statistics now. + */ + mp->stat.st_gbytes = dbenv->mp_gbytes; + mp->stat.st_bytes = dbenv->mp_bytes; return (0); mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region"); -err: if (reginfo->primary != NULL) - __db_shalloc_free(reginfo->addr, reginfo->primary); return (ret); } /* - * __memp_close -- - * Internal version of memp_close: only called from DB_ENV->close. + * __memp_dbenv_refresh -- + * Clean up after the mpool system on a close or failed open. * - * PUBLIC: int __memp_close __P((DB_ENV *)); + * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *)); */ int -__memp_close(dbenv) +__memp_dbenv_refresh(dbenv) DB_ENV *dbenv; { DB_MPOOL *dbmp; @@ -287,12 +296,12 @@ __memp_close(dbenv) /* Discard DB_MPREGs. */ while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { LIST_REMOVE(mpreg, q); - __os_free(mpreg, sizeof(DB_MPREG)); + __os_free(dbenv, mpreg); } /* Discard DB_MPOOLFILEs. */ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) - if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0) + if ((t_ret = __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0) ret = t_ret; /* Discard the thread mutex. */ @@ -305,14 +314,14 @@ __memp_close(dbenv) dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0) ret = t_ret; - __os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo)); - __os_free(dbmp, sizeof(*dbmp)); + __os_free(dbenv, dbmp->reginfo); + __os_free(dbenv, dbmp); dbenv->mp_handle = NULL; return (ret); } -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES /* * __mpool_region_maint -- * Return the amount of space needed for region maintenance info. @@ -328,9 +337,11 @@ __mpool_region_maint(infop) /* * For mutex maintenance we need one mutex per possible page. * Compute the maximum number of pages this cache can have. - * Also add in an mpool mutex. + * Also add in an mpool mutex and mutexes for all dbenv and db + * handles. 
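
The per-bucket setup amounts to the following sketch, with pthreads and heap allocation standing in for DB's mutex abstraction and shared-region allocator (so this is the shape of the loop, not its actual environment):

	#include <pthread.h>
	#include <stdlib.h>

	struct hash_bucket {			/* simplified DB_MPOOL_HASH */
		pthread_mutex_t mutex;		/* per-bucket latch */
		void *head;			/* priority-sorted buffer chain */
		unsigned int page_dirty;	/* dirty pages in this bucket */
		unsigned int priority;		/* lowest priority present */
	};

	static struct hash_bucket *
	init_table(unsigned int nbuckets)
	{
		struct hash_bucket *htab;
		unsigned int i;

		if ((htab = calloc(nbuckets, sizeof(*htab))) == NULL)
			return (NULL);
		for (i = 0; i < nbuckets; i++)
			if (pthread_mutex_init(&htab[i].mutex, NULL) != 0) {
				free(htab);
				return (NULL);
			}
		return (htab);
	}
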
*/ numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1); + numlocks += DB_MAX_HANDLES; s = sizeof(roff_t) * numlocks; return (s); } @@ -347,11 +358,109 @@ __mpool_region_destroy(dbenv, infop) DB_ENV *dbenv; REGINFO *infop; { - MPOOL *mp; + __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, + ((MPOOL *)R_ADDR(infop, infop->rp->primary))->maint_off)); COMPQUIET(dbenv, NULL); - mp = R_ADDR(infop, infop->rp->primary); + COMPQUIET(infop, NULL); +} + +/* + * __memp_nameop + * Remove or rename a file in the pool. + * + * PUBLIC: int __memp_nameop __P((DB_ENV *, + * PUBLIC: u_int8_t *, const char *, const char *, const char *)); + * + * XXX + * Undocumented interface: DB private. + */ +int +__memp_nameop(dbenv, fileid, newname, fullold, fullnew) + DB_ENV *dbenv; + u_int8_t *fileid; + const char *newname, *fullold, *fullnew; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + MPOOLFILE *mfp; + roff_t newname_off; + int locked, ret; + void *p; + + locked = 0; + dbmp = NULL; - __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off)); - return; + if (!MPOOL_ON(dbenv)) + goto fsop; + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* + * Remove or rename a file that the mpool might know about. We assume + * that the fop layer has the file locked for exclusive access, so we + * don't worry about locking except for the mpool mutexes. Checkpoint + * can happen at any time, independent of file locking, so we have to + * do the actual unlink or rename system call to avoid any race. + * + * If this is a rename, allocate first, because we can't recursively + * grab the region lock. + */ + if (newname == NULL) + p = NULL; + else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(newname) + 1, &newname_off, &p)) != 0) + return (ret); + memcpy(p, newname, strlen(newname) + 1); + } + + locked = 1; + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Find the file -- if mpool doesn't know about this file, that's not + * an error-- we may not have it open. + */ + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + /* Ignore non-active files. */ + if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + + /* Ignore non-matching files. */ + if (memcmp(fileid, R_ADDR( + dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + /* If newname is NULL, we're removing the file. */ + if (newname == NULL) { + MUTEX_LOCK(dbenv, &mfp->mutex); + MPOOLFILE_IGNORE(mfp); + MUTEX_UNLOCK(dbenv, &mfp->mutex); + } else { + /* + * Else, it's a rename. We've allocated memory + * for the new name. Swap it with the old one. + */ + p = R_ADDR(dbmp->reginfo, mfp->path_off); + mfp->path_off = newname_off; + } + break; + } + + /* Delete the memory we no longer need. */ + if (p != NULL) + __db_shalloc_free(dbmp->reginfo[0].addr, p); + +fsop: if (newname == NULL) + (void)__os_unlink(dbenv, fullold); + else + (void)__os_rename(dbenv, fullold, fullnew, 1); + + if (locked) + R_UNLOCK(dbenv, dbmp->reginfo); + + return (0); } diff --git a/bdb/mp/mp_register.c b/bdb/mp/mp_register.c index 27859f69d7b..46eefad986f 100644 --- a/bdb/mp/mp_register.c +++ b/bdb/mp/mp_register.c @@ -1,38 +1,33 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
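
The allocate-then-lock ordering matters because __memp_alloc itself takes the region lock. The same pattern, reduced to a sketch with malloc and a plain mutex standing in for the region primitives:

	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	static pthread_mutex_t region_lock = PTHREAD_MUTEX_INITIALIZER;
	static char *current_path;

	static int
	rename_in_pool(const char *newname)
	{
		char *p, *old;

		if ((p = strdup(newname)) == NULL)	/* allocate first */
			return (-1);

		pthread_mutex_lock(&region_lock);	/* then swap, locked */
		old = current_path;
		current_path = p;
		pthread_mutex_unlock(&region_lock);

		free(old);				/* discard the old name */
		return (0);
	}
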
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $"; +static const char revid[] = "$Id: mp_register.c,v 11.21 2002/03/27 04:32:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" /* * memp_register -- * Register a file type's pgin, pgout routines. + * + * PUBLIC: int __memp_register __P((DB_ENV *, int, + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *), + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); */ int -memp_register(dbenv, ftype, pgin, pgout) +__memp_register(dbenv, ftype, pgin, pgout) DB_ENV *dbenv; int ftype; int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); @@ -42,13 +37,9 @@ memp_register(dbenv, ftype, pgin, pgout) DB_MPREG *mpreg; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_register(dbenv, ftype, pgin, pgout)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL); dbmp = dbenv->mp_handle; @@ -70,7 +61,7 @@ memp_register(dbenv, ftype, pgin, pgout) return (0); /* New entry. */ - if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0) + if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), &mpreg)) != 0) return (ret); mpreg->ftype = ftype; diff --git a/bdb/mp/mp_stat.c b/bdb/mp/mp_stat.c index 7982513448d..12e72b91d70 100644 --- a/bdb/mp/mp_stat.c +++ b/bdb/mp/mp_stat.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $"; +static const char revid[] = "$Id: mp_stat.c,v 11.51 2002/08/06 06:13:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,123 +18,150 @@ static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic E #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "db_am.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_am.h" +#include "dbinc/mp.h" -static void __memp_dumpcache - __P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t)); +static void __memp_dumpcache __P((DB_ENV *, + DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t)); static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *)); +static void __memp_stat_wait __P((REGINFO *, MPOOL *, DB_MPOOL_STAT *, int)); /* - * memp_stat -- + * __memp_stat -- * Display MPOOL statistics. 
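
Applications drive this through the DB_ENV method. A sketch of registering byte-order conversion callbacks for an application page type; the type number and the callback bodies here are placeholders, only the signatures follow the interface:

	#include <db.h>

	#define	MY_FTYPE	1		/* application-chosen page type */

	static int
	my_pgin(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
	{
		/* Convert the page image at pgaddr to host byte order here. */
		return (0);
	}

	static int
	my_pgout(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
	{
		/* Convert the page image back to disk byte order here. */
		return (0);
	}

	static int
	register_conversion(DB_ENV *dbenv)
	{
		return (dbenv->memp_register(dbenv, MY_FTYPE, my_pgin, my_pgout));
	}
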
+ * + * PUBLIC: int __memp_stat + * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t)); */ int -memp_stat(dbenv, gspp, fspp, db_malloc) +__memp_stat(dbenv, gspp, fspp, flags) DB_ENV *dbenv; DB_MPOOL_STAT **gspp; DB_MPOOL_FSTAT ***fspp; - void *(*db_malloc) __P((size_t)); + u_int32_t flags; { DB_MPOOL *dbmp; DB_MPOOL_FSTAT **tfsp, *tstruct; DB_MPOOL_STAT *sp; MPOOL *c_mp, *mp; MPOOLFILE *mfp; - char *tname; - size_t len, nlen; - u_int32_t i; + size_t len, nlen, pagesize; + u_int32_t pages, i; int ret; - char *name; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc)); -#endif + char *name, *tname; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_stat", DB_INIT_MPOOL); + + if ((ret = __db_fchk(dbenv, + "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0) + return (ret); dbmp = dbenv->mp_handle; - sp = NULL; + mp = dbmp->reginfo[0].primary; /* Global statistics. */ - mp = dbmp->reginfo[0].primary; if (gspp != NULL) { *gspp = NULL; - if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0) + if ((ret = __os_umalloc(dbenv, sizeof(**gspp), gspp)) != 0) return (ret); + memset(*gspp, 0, sizeof(**gspp)); sp = *gspp; /* * Initialization and information that is not maintained on * a per-cache basis. */ - sp->st_hash_longest = 0; - sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait; - sp->st_region_nowait = - dbmp->reginfo[0].rp->mutex.mutex_set_nowait; - sp->st_gbytes = dbenv->mp_gbytes; - sp->st_bytes = dbenv->mp_bytes; + c_mp = dbmp->reginfo[0].primary; + sp->st_gbytes = c_mp->stat.st_gbytes; + sp->st_bytes = c_mp->stat.st_bytes; sp->st_ncache = dbmp->nreg; sp->st_regsize = dbmp->reginfo[0].rp->size; - R_LOCK(dbenv, dbmp->reginfo); - /* Walk the cache list and accumulate the global information. 
*/ for (i = 0; i < mp->nreg; ++i) { c_mp = dbmp->reginfo[i].primary; + + sp->st_map += c_mp->stat.st_map; sp->st_cache_hit += c_mp->stat.st_cache_hit; sp->st_cache_miss += c_mp->stat.st_cache_miss; - sp->st_map += c_mp->stat.st_map; sp->st_page_create += c_mp->stat.st_page_create; sp->st_page_in += c_mp->stat.st_page_in; sp->st_page_out += c_mp->stat.st_page_out; sp->st_ro_evict += c_mp->stat.st_ro_evict; sp->st_rw_evict += c_mp->stat.st_rw_evict; + sp->st_page_trickle += c_mp->stat.st_page_trickle; + sp->st_pages += c_mp->stat.st_pages; + /* + * st_page_dirty calculated by __memp_stat_hash + * st_page_clean calculated here + */ + __memp_stat_hash( + &dbmp->reginfo[i], c_mp, &sp->st_page_dirty); + sp->st_page_clean = sp->st_pages - sp->st_page_dirty; sp->st_hash_buckets += c_mp->stat.st_hash_buckets; sp->st_hash_searches += c_mp->stat.st_hash_searches; - if (c_mp->stat.st_hash_longest > sp->st_hash_longest) - sp->st_hash_longest = - c_mp->stat.st_hash_longest; + sp->st_hash_longest += c_mp->stat.st_hash_longest; sp->st_hash_examined += c_mp->stat.st_hash_examined; - sp->st_page_clean += c_mp->stat.st_page_clean; - sp->st_page_dirty += c_mp->stat.st_page_dirty; - sp->st_page_trickle += c_mp->stat.st_page_trickle; - sp->st_region_wait += c_mp->stat.st_region_wait; - sp->st_region_nowait += c_mp->stat.st_region_nowait; + /* + * st_hash_nowait calculated by __memp_stat_wait + * st_hash_wait + */ + __memp_stat_wait(&dbmp->reginfo[i], c_mp, sp, flags); + sp->st_region_nowait += + dbmp->reginfo[i].rp->mutex.mutex_set_nowait; + sp->st_region_wait += + dbmp->reginfo[i].rp->mutex.mutex_set_wait; + sp->st_alloc += c_mp->stat.st_alloc; + sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets; + if (sp->st_alloc_max_buckets < + c_mp->stat.st_alloc_max_buckets) + sp->st_alloc_max_buckets = + c_mp->stat.st_alloc_max_buckets; + sp->st_alloc_pages += c_mp->stat.st_alloc_pages; + if (sp->st_alloc_max_pages < + c_mp->stat.st_alloc_max_pages) + sp->st_alloc_max_pages = + c_mp->stat.st_alloc_max_pages; + + if (LF_ISSET(DB_STAT_CLEAR)) { + dbmp->reginfo[i].rp->mutex.mutex_set_wait = 0; + dbmp->reginfo[i].rp->mutex.mutex_set_nowait = 0; + pages = c_mp->stat.st_pages; + memset(&c_mp->stat, 0, sizeof(c_mp->stat)); + c_mp->stat.st_hash_buckets = c_mp->htab_buckets; + c_mp->stat.st_pages = pages; + } } /* - * We have duplicate statistics fields in the cache and - * per-file structures. The counters are only incremented - * in the per-file structures, though. The intent is that - * if we ever flush files from the pool we can save their - * last known totals in the cache structure. + * We have duplicate statistics fields in per-file structures + * and the cache. The counters are only incremented in the + * per-file structures, except if a file is flushed from the + * mpool, at which time we copy its information into the cache + * statistics. We added the cache information above, now we + * add the per-file information. 
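
A sketch of the calling side, assuming no application-specific allocator is configured, in which case the returned blocks come from the library's malloc and are released with free():

	#include <stdio.h>
	#include <stdlib.h>
	#include <db.h>

	static int
	print_cache_stats(DB_ENV *dbenv)
	{
		DB_MPOOL_STAT *gsp;
		DB_MPOOL_FSTAT **fsp, **p;
		int ret;

		/* Fetch global and per-file statistics, clearing as we go. */
		if ((ret = dbenv->memp_stat(dbenv, &gsp, &fsp, DB_STAT_CLEAR)) != 0)
			return (ret);

		printf("hit %lu, miss %lu\n",
		    (unsigned long)gsp->st_cache_hit,
		    (unsigned long)gsp->st_cache_miss);
		for (p = fsp; p != NULL && *p != NULL; ++p)
			printf("%s: %lu pages read in\n",
			    (*p)->file_name, (unsigned long)(*p)->st_page_in);

		free(gsp);	/* both blocks were allocated for us */
		free(fsp);
		return (0);
	}
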
*/ + R_LOCK(dbenv, dbmp->reginfo); for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + sp->st_map += mfp->stat.st_map; sp->st_cache_hit += mfp->stat.st_cache_hit; sp->st_cache_miss += mfp->stat.st_cache_miss; - sp->st_map += mfp->stat.st_map; sp->st_page_create += mfp->stat.st_page_create; sp->st_page_in += mfp->stat.st_page_in; sp->st_page_out += mfp->stat.st_page_out; + if (fspp == NULL && LF_ISSET(DB_STAT_CLEAR)) { + pagesize = mfp->stat.st_pagesize; + memset(&mfp->stat, 0, sizeof(mfp->stat)); + mfp->stat.st_pagesize = pagesize; + } } - R_UNLOCK(dbenv, dbmp->reginfo); } @@ -142,9 +169,8 @@ memp_stat(dbenv, gspp, fspp, db_malloc) if (fspp != NULL) { *fspp = NULL; - R_LOCK(dbenv, dbmp->reginfo); - /* Count the MPOOLFILE structures. */ + R_LOCK(dbenv, dbmp->reginfo); for (i = 0, len = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; @@ -153,18 +179,15 @@ memp_stat(dbenv, gspp, fspp, db_malloc) sizeof(DB_MPOOL_FSTAT) + strlen(__memp_fns(dbmp, mfp)) + 1; len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */ - R_UNLOCK(dbenv, dbmp->reginfo); - if (len == 0) + if (i == 0) return (0); /* Allocate space */ - if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0) + if ((ret = __os_umalloc(dbenv, len, fspp)) != 0) return (ret); - R_LOCK(dbenv, dbmp->reginfo); - /* * Build each individual entry. We assume that an array of * pointers are aligned correctly to be followed by an array @@ -179,20 +202,30 @@ memp_stat(dbenv, gspp, fspp, db_malloc) tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1); tname = (char *)(tstruct + i); + /* + * Files may have been opened since we counted, don't walk + * off the end of the allocated space. + */ + R_LOCK(dbenv, dbmp->reginfo); for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; + mfp != NULL && i-- > 0; ++tfsp, ++tstruct, tname += nlen, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { name = __memp_fns(dbmp, mfp); nlen = strlen(name) + 1; *tfsp = tstruct; *tstruct = mfp->stat; + if (LF_ISSET(DB_STAT_CLEAR)) { + pagesize = mfp->stat.st_pagesize; + memset(&mfp->stat, 0, sizeof(mfp->stat)); + mfp->stat.st_pagesize = pagesize; + } tstruct->file_name = tname; memcpy(tname, name, nlen); } - *tfsp = NULL; - R_UNLOCK(dbenv, dbmp->reginfo); + + *tfsp = NULL; } return (0); } @@ -200,7 +233,6 @@ memp_stat(dbenv, gspp, fspp, db_malloc) #define FMAP_ENTRIES 200 /* Files we map. */ #define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */ -#define MPOOL_DUMP_LRU 0x02 /* Debug LRU chains. */ #define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */ #define MPOOL_DUMP_ALL 0x07 /* Debug all. */ @@ -208,14 +240,23 @@ memp_stat(dbenv, gspp, fspp, db_malloc) * __memp_dump_region -- * Display MPOOL structures. * - * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *)); + * PUBLIC: int __memp_dump_region __P((DB_ENV *, char *, FILE *)); */ -void +int __memp_dump_region(dbenv, area, fp) DB_ENV *dbenv; char *area; FILE *fp; { + static const FN fn[] = { + { MP_CAN_MMAP, "mmapped" }, + { MP_DEADFILE, "dead" }, + { MP_DIRECT, "no buffer" }, + { MP_EXTENT, "extent" }, + { MP_TEMP, "temporary" }, + { MP_UNLINK, "unlink" }, + { 0, NULL } + }; DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; MPOOL *mp; @@ -225,6 +266,10 @@ __memp_dump_region(dbenv, area, fp) int cnt; u_int8_t *p; + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_dump_region", DB_INIT_MPOOL); + dbmp = dbenv->mp_handle; /* Make it easy to call from the debugger. 
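
Given the debugger note, the intended use is presumably along these lines, using the "h" and "m" area flags handled below (the old "l" LRU listing is gone now that buffers live on per-bucket priority-sorted chains):

	(gdb) call __memp_dump_region(dbenv, "hm", stderr)
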
*/ @@ -239,40 +284,42 @@ __memp_dump_region(dbenv, area, fp) case 'h': LF_SET(MPOOL_DUMP_HASH); break; - case 'l': - LF_SET(MPOOL_DUMP_LRU); - break; case 'm': LF_SET(MPOOL_DUMP_MEM); break; } - R_LOCK(dbenv, dbmp->reginfo); - mp = dbmp->reginfo[0].primary; /* Display MPOOL structures. */ (void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n", - DB_LINE, (u_long)dbmp->reginfo[0].addr); + DB_LINE, P_TO_ULONG(dbmp->reginfo[0].addr)); /* Display the MPOOLFILE structures. */ - cnt = 0; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + R_LOCK(dbenv, dbmp->reginfo); + for (cnt = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) { - (void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ", - cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype, - F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write"); + (void)fprintf(fp, "File #%d: %s: pagesize %lu\n", cnt + 1, + __memp_fns(dbmp, mfp), (u_long)mfp->stat.st_pagesize); + (void)fprintf(fp, "\t type %ld; ref %lu; blocks %lu; last %lu;", + (long)mfp->ftype, (u_long)mfp->mpf_cnt, + (u_long)mfp->block_cnt, (u_long)mfp->last_pgno); + __db_prflags(mfp->flags, fn, fp); + + (void)fprintf(fp, "\n\t UID: "); p = R_ADDR(dbmp->reginfo, mfp->fileid_off); - for (i = 0; i < DB_FILE_ID_LEN; ++i) { - (void)fprintf(fp, "%x", *p++); + for (i = 0; i < DB_FILE_ID_LEN; ++i, ++p) { + (void)fprintf(fp, "%x", (u_int)*p); if (i < DB_FILE_ID_LEN - 1) (void)fprintf(fp, " "); } - (void)fprintf(fp, "]\n"); + (void)fprintf(fp, "\n"); if (cnt < FMAP_ENTRIES) fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); } + R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) { (void)fprintf(fp, "File #%d: %s: per-process, %s\n", @@ -281,6 +328,7 @@ __memp_dump_region(dbenv, area, fp) if (cnt < FMAP_ENTRIES) fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (cnt < FMAP_ENTRIES) fmap[cnt] = INVALID_ROFF; else @@ -289,13 +337,14 @@ __memp_dump_region(dbenv, area, fp) /* Dump the memory pools. */ for (i = 0; i < mp->nreg; ++i) { (void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1); - __memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags); + __memp_dumpcache( + dbenv, dbmp, &dbmp->reginfo[i], fmap, fp, flags); } - R_UNLOCK(dbenv, dbmp->reginfo); - /* Flush in case we're debugging. */ (void)fflush(fp); + + return (0); } /* @@ -303,7 +352,8 @@ __memp_dump_region(dbenv, area, fp) * Display statistics for a cache. */ static void -__memp_dumpcache(dbmp, reginfo, fmap, fp, flags) +__memp_dumpcache(dbenv, dbmp, reginfo, fmap, fp, flags) + DB_ENV *dbenv; DB_MPOOL *dbmp; REGINFO *reginfo; size_t *fmap; @@ -311,7 +361,7 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags) u_int32_t flags; { BH *bhp; - DB_HASHTAB *dbht; + DB_MPOOL_HASH *hp; MPOOL *c_mp; int bucket; @@ -320,27 +370,24 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags) /* Display the hash table list of BH's. 
*/ if (LF_ISSET(MPOOL_DUMP_HASH)) { (void)fprintf(fp, - "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n", + "%s\nBH hash table (%lu hash slots)\nbucket (priority):\n", DB_LINE, (u_long)c_mp->htab_buckets); - for (dbht = R_ADDR(reginfo, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) { - if (SH_TAILQ_FIRST(dbht, __bh) != NULL) - (void)fprintf(fp, "%lu:\n", (u_long)bucket); - for (bhp = SH_TAILQ_FIRST(dbht, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + (void)fprintf(fp, + "\tpageno, file, ref, address [LSN] priority\n"); + + for (hp = R_ADDR(reginfo, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + MUTEX_LOCK(dbenv, &hp->hash_mutex); + if ((bhp = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) + (void)fprintf(fp, "%lu (%u):\n", + (u_long)bucket, hp->hash_priority); + for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) __memp_pbh(dbmp, bhp, fmap, fp); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } } - /* Display the LRU list of BH's. */ - if (LF_ISSET(MPOOL_DUMP_LRU)) { - (void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE); - (void)fprintf(fp, "pageno, file, ref, address\n"); - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) - __memp_pbh(dbmp, bhp, fmap, fp); - } - /* Dump the memory pool. */ if (LF_ISSET(MPOOL_DUMP_MEM)) __db_shalloc_dump(reginfo->addr, fp); @@ -360,10 +407,9 @@ __memp_pbh(dbmp, bhp, fmap, fp) static const FN fn[] = { { BH_CALLPGIN, "callpgin" }, { BH_DIRTY, "dirty" }, + { BH_DIRTY_CREATE, "created" }, { BH_DISCARD, "discard" }, { BH_LOCKED, "locked" }, - { BH_SYNC, "sync" }, - { BH_SYNC_LOGFLSH, "sync:logflush" }, { BH_TRASH, "trash" }, { 0, NULL } }; @@ -374,15 +420,72 @@ __memp_pbh(dbmp, bhp, fmap, fp) break; if (fmap[i] == INVALID_ROFF) - (void)fprintf(fp, " %4lu, %lu, %2lu, %lu", + (void)fprintf(fp, "\t%5lu, %lu, %2lu, %8lu [%lu,%lu] %lu", (u_long)bhp->pgno, (u_long)bhp->mf_offset, - (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp)); + (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp), + (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset, + (u_long)bhp->priority); else - (void)fprintf(fp, " %4lu, #%d, %2lu, %lu", + (void)fprintf(fp, "\t%5lu, #%d, %2lu, %8lu [%lu,%lu] %lu", (u_long)bhp->pgno, i + 1, - (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp)); + (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp), + (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset, + (u_long)bhp->priority); __db_prflags(bhp->flags, fn, fp); (void)fprintf(fp, "\n"); } + +/* + * __memp_stat_hash -- + * Total hash bucket stats (other than mutex wait) into the region. + * + * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *)); + */ +void +__memp_stat_hash(reginfo, mp, dirtyp) + REGINFO *reginfo; + MPOOL *mp; + u_int32_t *dirtyp; +{ + DB_MPOOL_HASH *hp; + u_int32_t dirty; + int i; + + hp = R_ADDR(reginfo, mp->htab); + for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++) + dirty += hp->hash_page_dirty; + *dirtyp = dirty; +} + +/* + * __memp_stat_wait -- + * Total hash bucket wait stats into the region. 
+ */ +static void +__memp_stat_wait(reginfo, mp, mstat, flags) + REGINFO *reginfo; + MPOOL *mp; + DB_MPOOL_STAT *mstat; + int flags; +{ + DB_MPOOL_HASH *hp; + DB_MUTEX *mutexp; + int i; + + mstat->st_hash_max_wait = 0; + hp = R_ADDR(reginfo, mp->htab); + for (i = 0; i < mp->htab_buckets; i++, hp++) { + mutexp = &hp->hash_mutex; + mstat->st_hash_nowait += mutexp->mutex_set_nowait; + mstat->st_hash_wait += mutexp->mutex_set_wait; + if (mutexp->mutex_set_wait > mstat->st_hash_max_wait) + mstat->st_hash_max_wait = mutexp->mutex_set_wait; + + if (LF_ISSET(DB_STAT_CLEAR)) { + mutexp->mutex_set_wait = 0; + mutexp->mutex_set_nowait = 0; + } + } +} diff --git a/bdb/mp/mp_sync.c b/bdb/mp/mp_sync.c index 1b0751db709..03b42208b39 100644 --- a/bdb/mp/mp_sync.c +++ b/bdb/mp/mp_sync.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $"; +static const char revid[] = "$Id: mp_sync.c,v 11.64 2002/08/25 16:00:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,339 +16,92 @@ static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic E #include <stdlib.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +typedef struct { + DB_MPOOL_HASH *track_hp; /* Hash bucket. */ + + roff_t track_off; /* Page file offset. */ + db_pgno_t track_pgno; /* Page number. */ +} BH_TRACK; static int __bhcmp __P((const void *, const void *)); -static int __memp_fsync __P((DB_MPOOLFILE *)); -static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *)); +static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *)); +static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *)); /* - * memp_sync -- + * __memp_sync -- * Mpool sync function. + * + * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *)); */ int -memp_sync(dbenv, lsnp) +__memp_sync(dbenv, lsnp) DB_ENV *dbenv; DB_LSN *lsnp; { - BH *bhp, **bharray; DB_MPOOL *dbmp; - DB_LSN tlsn; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - u_int32_t ar_cnt, i, ndirty; - int ret, retry_done, retry_need, wrote; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_sync(dbenv, lsnp)); -#endif + MPOOL *mp; + int ret; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL); /* - * If no LSN is provided, flush the entire cache. - * - * !!! - * Our current behavior is to flush the entire cache, so there's - * nothing special we have to do here other than deal with NULL - * pointers. + * If no LSN is provided, flush the entire cache (reasonable usage + * even if there's no log subsystem configured). 
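
A sketch of the usual application-level call through the DB_ENV method: a NULL LSN asks for the entire cache to be flushed, while a real LSN returns immediately once the cache has already been flushed that far.

	#include <db.h>

	static int
	flush_cache(DB_ENV *dbenv)
	{
		return (dbenv->memp_sync(dbenv, NULL));
	}
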
*/ - if (lsnp == NULL) { - ZERO_LSN(tlsn); - lsnp = &tlsn; - F_SET(mp, MP_LSN_RETRY); - } else if (!LOGGING_ON(dbenv)) { - __db_err(dbenv, "memp_sync: requires logging"); - return (EINVAL); - } + if (lsnp != NULL) + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "memp_sync", DB_INIT_LOG); - /* - * Sync calls are single-threaded so that we don't have multiple - * threads, with different checkpoint LSNs, walking the caches - * and updating the checkpoint LSNs and how many buffers remain - * to be written for the checkpoint. This shouldn't be a problem, - * any application that has multiple checkpoint threads isn't what - * I'd call trustworthy. - */ - MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp); + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; - /* - * If the application is asking about a previous call to memp_sync(), - * and we haven't found any buffers that the application holding the - * pin couldn't write, return yes or no based on the current count. - * Note, if the application is asking about a LSN *smaller* than one - * we've already handled or are currently handling, then we return a - * result based on the count for the larger LSN. - */ - R_LOCK(dbenv, dbmp->reginfo); - if (!IS_ZERO_LSN(*lsnp) && - !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { - if (mp->lsn_cnt == 0) { + /* If we've flushed to the requested LSN, return that information. */ + if (lsnp != NULL) { + R_LOCK(dbenv, dbmp->reginfo); + if (log_compare(lsnp, &mp->lsn) <= 0) { *lsnp = mp->lsn; - ret = 0; - } else - ret = DB_INCOMPLETE; + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } R_UNLOCK(dbenv, dbmp->reginfo); - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); - return (ret); } - /* - * Allocate room for a list of buffers, and decide how many buffers - * we can pin down. - * - * !!! - * Note: __memp_sballoc has released the region lock if we're not - * continuing forward. - */ - if ((ret = - __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) { - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); + if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0) return (ret); - } - retry_done = 0; -retry: retry_need = 0; - /* - * Start a new checkpoint. - * - * Save the LSN. We know that it's a new LSN, a retry, or larger than - * the one for which we were already doing a checkpoint. (BTW, I don't - * expect to see multiple LSN's from the same or multiple processes, - * but You Just Never Know. Responding as if they all called with the - * largest of the LSNs specified makes everything work.) - * - * We don't currently use the LSN we save. We could potentially save - * the last-written LSN in each buffer header and use it to determine - * what buffers need to be written. The problem with this is that it's - * sizeof(LSN) more bytes of buffer header. We currently write all the - * dirty buffers instead, but with a sufficiently large cache that's - * going to be a problem. - */ - mp->lsn = *lsnp; - - /* - * Clear the global count of buffers waiting to be written, walk the - * list of files clearing the count of buffers waiting to be written. - * - * Clear the retry flag. - */ - mp->lsn_cnt = 0; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) - mfp->lsn_cnt = 0; - F_CLR(mp, MP_LSN_RETRY); - - /* - * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written (we can't - * know if they'll need to be written until the holder returns them to - * the cache). 
We do this in one pass while holding the region locked - * so that processes can't make new buffers dirty, causing us to never - * finish. Since the application may have restarted the sync using a - * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that - * appear leftover from previous calls. - * - * Keep a count of the total number of buffers we need to write in - * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count. - */ - for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { - if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { - F_SET(bhp, BH_SYNC); - - ++mp->lsn_cnt; - - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - ++mfp->lsn_cnt; - - /* - * If the buffer isn't being used, we can write - * it immediately, so increment its reference - * count to lock it down, and save a reference - * to it. - * - * If we've run out space to store buffer refs, - * we're screwed. We don't want to realloc the - * array while holding a region lock, so we set - * a flag and deal with it later. - */ - if (bhp->ref == 0) { - ++bhp->ref; - bharray[ar_cnt] = bhp; - - if (++ar_cnt >= ndirty) { - retry_need = 1; - break; - } - } - } else - if (F_ISSET(bhp, BH_SYNC)) - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - } - if (ar_cnt >= ndirty) - break; - } - - /* If there no buffers we can write immediately, we're done. */ - if (ar_cnt == 0) { - ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; - goto done; - } - - R_UNLOCK(dbenv, dbmp->reginfo); - - /* - * Sort the buffers we're going to write immediately. - * - * We try and write the buffers in file/page order: it should reduce - * seeks by the underlying filesystem and possibly reduce the actual - * number of writes. - */ - if (ar_cnt > 1) - qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); - - /* - * Flush the log. We have to ensure the log records reflecting the - * changes on the database pages we're writing have already made it - * to disk. We usually do that as we write each page, but if we - * are going to write a large number of pages, repeatedly acquiring - * the log region lock is going to be expensive. Flush the entire - * log now, so that sync doesn't require any more log flushes. - */ - if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0) - goto done; - - R_LOCK(dbenv, dbmp->reginfo); - - /* Walk the array, writing buffers. */ - for (i = 0; i < ar_cnt; ++i) { - /* - * It's possible for a thread to have gotten the buffer since - * we listed it for writing. If the reference count is still - * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. - */ - if (bharray[i]->ref > 1) { - --bharray[i]->ref; - continue; - } - - /* Write the buffer. */ - mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset); - ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote); - - /* Release the buffer. */ - --bharray[i]->ref; - - if (ret == 0 && wrote) - continue; - - /* - * Any process syncing the shared memory buffer pool had best - * be able to write to any underlying file. Be understanding, - * but firm, on this point. - */ - if (ret == 0) { - __db_err(dbenv, "%s: unable to flush page: %lu", - __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno); - ret = EPERM; - } - - /* - * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no - * future checkpoint return can depend on this failure. 
Clear - * the buffer's BH_SYNC flag, because it's used to determine - * if lsn_cnt values are incremented/decremented. Don't bother - * to reset/clear: - * - * MPOOL->lsn_cnt - * MPOOLFILE->lsn_cnt - * - * they don't make any difference. - */ - ZERO_LSN(mp->lsn); - F_SET(mp, MP_LSN_RETRY); - - /* Release any buffers we're still pinning down. */ - while (++i < ar_cnt) { - bhp = bharray[i]; - --bhp->ref; - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - } - - goto done; - } - - ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0; - - /* - * If there were too many buffers and we're not returning an error, we - * re-try the checkpoint once -- since we allocated 80% of the total - * buffer count, once should be enough. If it still doesn't work, some - * other thread of control is dirtying buffers as fast as we're writing - * them, and we might as well give up for now. In the latter case, set - * the global retry flag, we'll have to start from scratch on the next - * checkpoint. - */ - if (retry_need) { - if (retry_done) { - ret = DB_INCOMPLETE; - F_SET(mp, MP_LSN_RETRY); - } else { - retry_done = 1; - goto retry; - } + if (lsnp != NULL) { + R_LOCK(dbenv, dbmp->reginfo); + if (log_compare(lsnp, &mp->lsn) > 0) + mp->lsn = *lsnp; + R_UNLOCK(dbenv, dbmp->reginfo); } -done: R_UNLOCK(dbenv, dbmp->reginfo); - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); - - __os_free(bharray, ndirty * sizeof(BH *)); - - return (ret); + return (0); } /* - * memp_fsync -- + * __memp_fsync -- * Mpool file sync function. + * + * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *)); */ int -memp_fsync(dbmfp) +__memp_fsync(dbmfp) DB_MPOOLFILE *dbmfp; { DB_ENV *dbenv; DB_MPOOL *dbmp; - int is_tmp; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fsync(dbmfp)); -#endif - PANIC_CHECK(dbenv); /* @@ -359,13 +112,10 @@ memp_fsync(dbmfp) if (F_ISSET(dbmfp, MP_READONLY)) return (0); - R_LOCK(dbenv, dbmp->reginfo); - is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP); - R_UNLOCK(dbenv, dbmp->reginfo); - if (is_tmp) + if (F_ISSET(dbmfp->mfp, MP_TEMP)) return (0); - return (__memp_fsync(dbmfp)); + return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); } /* @@ -379,6 +129,7 @@ __mp_xxx_fh(dbmfp, fhp) DB_MPOOLFILE *dbmfp; DB_FH **fhp; { + DB_ENV *dbenv; /* * This is a truly spectacular layering violation, intended ONLY to * support compatibility for the DB 1.85 DB->fd call. @@ -393,239 +144,457 @@ __mp_xxx_fh(dbmfp, fhp) * because we want to write to the backing file regardless so that * we get a file descriptor to return. */ - *fhp = &dbmfp->fh; - return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp)); + *fhp = dbmfp->fhp; + if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) + return (0); + dbenv = dbmfp->dbmp->dbenv; + + return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); } /* - * __memp_fsync -- - * Mpool file internal sync function. + * __memp_sync_int -- + * Mpool sync internal function. 
+ * + * PUBLIC: int __memp_sync_int + * PUBLIC: __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *)); */ -static int -__memp_fsync(dbmfp) +int +__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep) + DB_ENV *dbenv; DB_MPOOLFILE *dbmfp; + int ar_max, *wrotep; + db_sync_op op; { - BH *bhp, **bharray; - DB_ENV *dbenv; + BH *bhp; + BH_TRACK *bharray; DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + DB_MUTEX *mutexp; MPOOL *c_mp, *mp; - size_t mf_offset; - u_int32_t ar_cnt, i, ndirty; - int incomplete, ret, retry_done, retry_need, wrote; + MPOOLFILE *mfp; + u_int32_t n_cache; + int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote; - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; + dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; - - R_LOCK(dbenv, dbmp->reginfo); + pass = wrote = 0; /* - * Allocate room for a list of buffers, and decide how many buffers - * we can pin down. - * - * !!! - * Note: __memp_sballoc has released our region lock if we're not - * continuing forward. + * If the caller does not specify how many pages assume one + * per bucket. */ + if (ar_max == 0) + ar_max = mp->nreg * mp->htab_buckets; + if ((ret = - __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) + __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0) return (ret); - retry_done = 0; -retry: retry_need = 0; /* * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written (we can't - * know if they'll need to be written until the holder returns them to - * the cache). We do this in one pass while holding the region locked - * so that processes can't make new buffers dirty, causing us to never - * finish. + * written and all pinned buffers to be potentially written, depending + * on our flags. */ - mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); - for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { - if (!F_ISSET(bhp, BH_DIRTY) || - bhp->mf_offset != mf_offset) - continue; - if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { - incomplete = 1; - continue; - } + for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) { + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + for (i = 0; i < c_mp->htab_buckets; i++, hp++) { /* - * If the buffer isn't being used, we can write - * it immediately, so increment its reference - * count to lock it down, and save a reference - * to it. - * - * If we've run out space to store buffer refs, - * we're screwed. We don't want to realloc the - * array while holding a region lock, so we set - * a flag and deal with it later. + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. We + * can ignore empty buckets because we only need write + * buffers that were dirty before we started. */ - ++bhp->ref; - bharray[ar_cnt] = bhp; - if (++ar_cnt >= ndirty) { - retry_need = 1; - break; + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { + /* Always ignore unreferenced, clean pages. */ + if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY)) + continue; + + /* + * Checkpoints have to wait on all pinned pages, + * as pages may be marked dirty when returned to + * the cache. 
+				 *
+				 * File syncs only wait on pages both pinned and
+				 * dirty.  (We don't care if pages are marked
+				 * dirty when returned to the cache, that means
+				 * there's another writing thread and flushing
+				 * the cache for this handle is meaningless.)
+				 */
+				if (op == DB_SYNC_FILE &&
+				    !F_ISSET(bhp, BH_DIRTY))
+					continue;
+
+				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+				/*
+				 * Ignore temporary files -- this means you
+				 * can't even flush temporary files by handle.
+				 * (Checkpoint doesn't require temporary files
+				 * be flushed and the underlying buffer write
+				 * routine may not be able to write it anyway.)
+				 */
+				if (F_ISSET(mfp, MP_TEMP))
+					continue;
+
+				/*
+				 * If we're flushing a specific file, see if
+				 * this page is from that file.
+				 */
+				if (dbmfp != NULL && mfp != dbmfp->mfp)
+					continue;
+
+				/*
+				 * Ignore files that aren't involved in DB's
+				 * transactional operations during checkpoints.
+				 */
+				if (dbmfp == NULL && mfp->lsn_off == -1)
+					continue;
+
+				/* Track the buffer, we want it. */
+				bharray[ar_cnt].track_hp = hp;
+				bharray[ar_cnt].track_pgno = bhp->pgno;
+				bharray[ar_cnt].track_off = bhp->mf_offset;
+				ar_cnt++;
+
+				if (ar_cnt >= ar_max) {
+					if ((ret = __os_realloc(dbenv,
+					    (ar_max * 2) * sizeof(BH_TRACK),
+					    &bharray)) != 0)
+						break;
+					ar_max *= 2;
+				}
 			}
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+			if (ret != 0)
+				goto err;
 		}
-		if (ar_cnt >= ndirty)
-			break;
 	}
 
-	/* If there no buffers we can write immediately, we're done. */
-	if (ar_cnt == 0) {
-		ret = 0;
+	/* If there are no buffers to write, we're done. */
+	if (ar_cnt == 0)
 		goto done;
-	}
 
-	R_UNLOCK(dbenv, dbmp->reginfo);
-
-	/* Sort the buffers we're going to write. */
+	/*
+	 * Write the buffers in file/page order, trying to reduce seeks by the
+	 * filesystem and, when pages are smaller than filesystem block sizes,
+	 * reduce the actual number of writes.
+	 */
 	if (ar_cnt > 1)
-		qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
 
-	R_LOCK(dbenv, dbmp->reginfo);
+	/*
+	 * If we're trickling buffers, only write enough to reach the correct
+	 * percentage for this region.  We may not write enough if the dirty
+	 * buffers have an unbalanced distribution among the regions, but that
+	 * seems unlikely.
+	 */
+	if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg)
+		ar_cnt = ar_max / (int)mp->nreg;
+
+	/*
+	 * Flush the log.  We have to ensure the log records reflecting the
+	 * changes on the database pages we're writing have already made it
+	 * to disk.  We still have to check the log each time we write a page
+	 * (because pages we are about to write may be modified after we have
+	 * flushed the log), but in general this will at least avoid any I/O
+	 * on the log's part.
+	 */
+	if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0)
+		goto err;
+
+	/*
+	 * Walk the array, writing buffers.  When we write a buffer, we NULL
+	 * out its hash bucket pointer so we don't process a slot more than
+	 * once.
+	 */
+	for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) {
+		if (i >= ar_cnt) {
+			i = 0;
+			++pass;
+			__os_sleep(dbenv, 1, 0);
+		}
+		if ((hp = bharray[i].track_hp) == NULL)
+			continue;
+
+		/* Lock the hash bucket and find the buffer. */
+		mutexp = &hp->hash_mutex;
+		MUTEX_LOCK(dbenv, mutexp);
+		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+			if (bhp->pgno == bharray[i].track_pgno &&
+			    bhp->mf_offset == bharray[i].track_off)
+				break;
 
-	/* Walk the array, writing buffers.
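
__bhcmp's body falls outside this hunk; given the BH_TRACK layout it presumably orders by file (the shared-region offset) and then by page number, so writes land sequentially within each file. Along these lines:

	static int
	__bhcmp_sketch(const void *p1, const void *p2)
	{
		const BH_TRACK *bhp1 = p1, *bhp2 = p2;

		/* Sort by file, then by page number within the file. */
		if (bhp1->track_off != bhp2->track_off)
			return (bhp1->track_off < bhp2->track_off ? -1 : 1);
		if (bhp1->track_pgno != bhp2->track_pgno)
			return (bhp1->track_pgno < bhp2->track_pgno ? -1 : 1);
		return (0);
	}
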
*/ - for (i = 0; i < ar_cnt;) { /* - * It's possible for a thread to have gotten the buffer since - * we listed it for writing. If the reference count is still - * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. + * If we can't find the buffer we're done, somebody else had + * to have written it. + * + * If the buffer isn't pinned or dirty, we're done, there's + * no work needed. */ - if (bharray[i]->ref > 1) { - incomplete = 1; - --bharray[i++]->ref; + if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) { + MUTEX_UNLOCK(dbenv, mutexp); + --remaining; + bharray[i].track_hp = NULL; continue; } - /* Write the buffer. */ - ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote); + /* + * If the buffer is locked by another thread, ignore it, we'll + * come back to it. + * + * If the buffer is pinned and it's only the first or second + * time we have looked at it, ignore it, we'll come back to + * it. + * + * In either case, skip the buffer if we're not required to + * write it. + */ + if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) { + MUTEX_UNLOCK(dbenv, mutexp); + if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) { + --remaining; + bharray[i].track_hp = NULL; + } + continue; + } + + /* + * The buffer is either pinned or dirty. + * + * Set the sync wait-for count, used to count down outstanding + * references to this buffer as they are returned to the cache. + */ + bhp->ref_sync = bhp->ref; - /* Release the buffer. */ - --bharray[i++]->ref; + /* Pin the buffer into memory and lock it. */ + ++bhp->ref; + F_SET(bhp, BH_LOCKED); + MUTEX_LOCK(dbenv, &bhp->mutex); - if (ret == 0) { - if (!wrote) - incomplete = 1; - continue; + /* + * Unlock the hash bucket and wait for the wait-for count to + * go to 0. No new thread can acquire the buffer because we + * have it locked. + * + * If a thread attempts to re-pin a page, the wait-for count + * will never go to 0 (the thread spins on our buffer lock, + * while we spin on the thread's ref count). Give up if we + * don't get the buffer in 3 seconds, we can try again later. + * + * If, when the wait-for count goes to 0, the buffer is found + * to be dirty, write it. + */ + MUTEX_UNLOCK(dbenv, mutexp); + for (wait_cnt = 1; + bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt) + __os_sleep(dbenv, 1, 0); + MUTEX_LOCK(dbenv, mutexp); + hb_lock = 1; + + /* + * If the ref_sync count has gone to 0, we're going to be done + * with this buffer no matter what happens. + */ + if (bhp->ref_sync == 0) { + --remaining; + bharray[i].track_hp = NULL; } /* - * On error: + * If the ref_sync count has gone to 0 and the buffer is still + * dirty, we write it. We only try to write the buffer once. + * Any process checkpointing or trickle-flushing the pool + * must be able to write any underlying file -- if the write + * fails, error out. It would be very strange if file sync + * failed to write, but we don't care if it happens. 
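
The DB_SYNC_TRICKLE case above is presumably what DB_ENV->memp_trickle drives (mp_trickle.c is part of this change). A sketch of the application-level call, asking that at least 20% of the cache be clean and learning how many buffers that cost:

	#include <db.h>

	static int
	trickle_some(DB_ENV *dbenv)
	{
		int nwrote;

		return (dbenv->memp_trickle(dbenv, 20, &nwrote));
	}
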
 /*
- * On error:
+ * If the ref_sync count has gone to 0 and the buffer is still
+ * dirty, we write it. We only try to write the buffer once.
+ * Any process checkpointing or trickle-flushing the pool
+ * must be able to write any underlying file -- if the write
+ * fails, error out. It would be very strange if file sync
+ * failed to write, but we don't care if it happens.
+ */
+ if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
+ hb_lock = 0;
+ MUTEX_UNLOCK(dbenv, mutexp);
+
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
+ ++wrote;
+ else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE)
+ __db_err(dbenv, "%s: unable to flush page: %lu",
+ __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
+ else
+ ret = 0;
+ }
+
+ /*
+ * If ref_sync count never went to 0, the buffer was written
+ * by another thread, or the write failed, we still have the
+ * buffer locked.
+ *
+ * We may or may not currently hold the hash bucket mutex. If
+ * the __memp_bhwrite -> __memp_pgwrite call was successful,
+ * then __memp_pgwrite will have swapped the buffer lock for
+ * the hash lock. All other call paths will leave us without
+ * the hash bucket lock.
 *
- * Release any buffers we're still pinning down.
+ * The order of mutexes above was to acquire the buffer lock
+ * while holding the hash bucket lock. Don't deadlock here,
+ * release the buffer lock and then acquire the hash bucket
+ * lock.
 */
- while (i < ar_cnt)
- --bharray[i++]->ref;
- break;
- }
+ if (F_ISSET(bhp, BH_LOCKED)) {
+ F_CLR(bhp, BH_LOCKED);
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
- /*
- * If there were too many buffers and we're not returning an error, we
- * re-try the flush once -- since we allocated 80% of the total
- * buffer count, once should be enough. If it still doesn't work, some
- * other thread of control is dirtying buffers as fast as we're writing
- * them, and we might as well give up.
- */
- if (retry_need) {
- if (retry_done)
- incomplete = 1;
- else {
- retry_done = 1;
- goto retry;
+ if (!hb_lock)
+ MUTEX_LOCK(dbenv, mutexp);
 }
- }
-done: R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+ * Reset the ref_sync count regardless of our success, we're
+ * done with this buffer for now.
+ */
+ bhp->ref_sync = 0;
+
+ /* Discard our reference and unlock the bucket. */
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, mutexp);
- __os_free(bharray, ndirty * sizeof(BH *));
+ if (ret != 0)
+ break;
+ }
+
+done: /* If we've opened files to flush pages, close them. */
+ if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0)
+ ret = t_ret;
 /*
- * Sync the underlying file as the last thing we do, so that the OS
- * has a maximal opportunity to flush buffers before we request it.
- *
- * !!!:
- * Don't lock the region around the sync, fsync(2) has no atomicity
- * issues.
+ * If doing a checkpoint or flushing a file for the application, we
+ * have to force the pages to disk. We don't do this as we go along
+ * because we want to give the OS as much time as possible to lazily
+ * flush, and because we have to flush files that might not even have
+ * had dirty buffers in the cache, so we have to walk the files list.
 */
- if (ret == 0)
- ret = incomplete ?
- DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh);
+ if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
+ if (dbmfp == NULL)
+ ret = __memp_sync_files(dbenv, dbmp);
+ else
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ }
+
+err: __os_free(dbenv, bharray);
+ if (wrotep != NULL)
+ *wrotep = wrote;
 return (ret);
 }
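The function ends by separating write scheduling from durability: page writes are issued as the array is walked, and only in the done path are files forced to disk, giving the OS time to flush lazily. A simplified single-file sketch of that two-phase shape; flush_then_force and its parameters are invented, and error handling is trimmed:

#include <sys/types.h>
#include <unistd.h>

/*
 * flush_then_force --
 *	Phase 1: issue every dirty-page write.  Phase 2: a single
 *	fsync makes them all durable at once.
 */
static int
flush_then_force(int dbfd,
    const off_t *offsets, char *const *pages, int npages, size_t pgsz)
{
	int i;

	for (i = 0; i < npages; ++i)
		if (pwrite(dbfd, pages[i], pgsz, offsets[i]) != (ssize_t)pgsz)
			return (-1);
	return (fsync(dbfd));	/* one sync per file, not one per page */
}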
 /*
- * __memp_sballoc --
- * Allocate room for a list of buffers.
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
 */
-static int
-__memp_sballoc(dbenv, bharrayp, ndirtyp)
+static int
+__memp_sync_files(dbenv, dbmp)
 DB_ENV *dbenv;
- BH ***bharrayp;
- u_int32_t *ndirtyp;
-{
 DB_MPOOL *dbmp;
- MPOOL *c_mp, *mp;
- u_int32_t i, nclean, ndirty, maxpin;
- int ret;
+{
+ DB_MPOOLFILE *dbmfp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ int ret, t_ret;
- dbmp = dbenv->mp_handle;
+ ret = 0;
 mp = dbmp->reginfo[0].primary;
- /*
- * We don't want to hold the region lock while we write the buffers,
- * so only lock it while we create a list.
- *
- * Walk through the list of caches, figuring out how many buffers
- * we're going to need.
- *
- * Make a point of not holding the region lock across the library
- * allocation call.
- */
- for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- ndirty += c_mp->stat.st_page_dirty;
- nclean += c_mp->stat.st_page_clean;
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ if (mfp->stat.st_page_out == 0 ||
+ F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Look for an already open handle. */
+ ret = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (dbmfp->mfp == mfp) {
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ break;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if (ret != 0)
+ goto err;
+
+ /* If we don't find one, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+ goto err;
+ ret = __memp_fopen_int(
+ dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize);
+ if (ret == 0)
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ if ((t_ret =
+ __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
 }
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (ndirty == 0) {
- *ndirtyp = 0;
- return (0);
+
+ if (0) {
+err: __db_err(dbenv, "%s: cannot sync: %s",
+ R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret));
 }
+ R_UNLOCK(dbenv, dbmp->reginfo);
- /*
- * We don't want to pin down the entire buffer cache, otherwise we'll
- * starve threads needing new pages. Don't pin down more than 80% of
- * the cache, making sure that we don't screw up just because only a
- * few pages have been created.
- */
- maxpin = ((ndirty + nclean) * 8) / 10;
- if (maxpin < 10)
- maxpin = 10;
+ return (ret);
+}
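__memp_sync_files() has to sync every file whether or not the environment currently holds an open handle for it, so it reuses a handle when one exists and otherwise opens, syncs and closes a throwaway one. The same reuse-or-open shape in bare POSIX terms; sync_one_file is hypothetical, and -1 stands for "no open descriptor":

#include <fcntl.h>
#include <unistd.h>

/*
 * sync_one_file --
 *	Reuse an already open descriptor if we have one; otherwise
 *	open, fsync and close a temporary descriptor.
 */
static int
sync_one_file(const char *path, int openfd)
{
	int fd, ret;

	if (openfd != -1)
		return (fsync(openfd));
	if ((fd = open(path, O_RDWR)) == -1)
		return (-1);
	ret = fsync(fd);
	(void)close(fd);
	return (ret);
}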
+
+/*
+ * __memp_close_flush_files --
+ * Close files opened only to flush buffers.
+ */
+static int
+__memp_close_flush_files(dbenv, dbmp)
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+{
+ DB_MPOOLFILE *dbmfp;
+ int ret;
 /*
- * Get a good-sized block of memory to hold buffer pointers, we don't
- * want to run out, but correct if we want to allocate more than we
- * would be allowed to store, regardless.
+ * The routine exists because we must close files opened by sync to
+ * flush buffers. There are two cases: first, extent files have to
+ * be closed so they may be removed when empty. Second, regular
+ * files have to be closed so we don't run out of descriptors (for
+ * example, an application partitioning its data into databases
+ * based on timestamps, so there's a continually increasing set of
+ * files).
+ *
+ * We mark files opened in the __memp_bhwrite() function with the
+ * MP_FLUSH flag. Here we walk through our file descriptor list,
+ * and, if a file was opened by __memp_bhwrite(), we close it.
 */
- ndirty += ndirty / 2 + 10;
- if (ndirty > maxpin)
- ndirty = maxpin;
- if ((ret =
- __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0)
- return (ret);
-
- *ndirtyp = ndirty;
-
- R_LOCK(dbenv, dbmp->reginfo);
+retry: MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if ((ret = __memp_fclose_int(dbmfp, 0)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
 return (0);
 }
@@ -634,15 +603,15 @@ static int
 __bhcmp(p1, p2)
 const void *p1, *p2;
 {
- BH *bhp1, *bhp2;
+ BH_TRACK *bhp1, *bhp2;
- bhp1 = *(BH * const *)p1;
- bhp2 = *(BH * const *)p2;
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
 /* Sort by file (shared memory pool offset). */
- if (bhp1->mf_offset < bhp2->mf_offset)
+ if (bhp1->track_off < bhp2->track_off)
 return (-1);
- if (bhp1->mf_offset > bhp2->mf_offset)
+ if (bhp1->track_off > bhp2->track_off)
 return (1);
 /*
@@ -650,9 +619,9 @@ __bhcmp(p1, p2)
 * Defend against badly written quicksort code calling the comparison
 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
 */
- if (bhp1->pgno < bhp2->pgno)
+ if (bhp1->track_pgno < bhp2->track_pgno)
 return (-1);
- if (bhp1->pgno > bhp2->pgno)
+ if (bhp1->track_pgno > bhp2->track_pgno)
 return (1);
 return (0);
 }
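The __bhcmp() rewrite above orders BH_TRACK records by file and then by page, and is careful to return 0 for truly equal keys. A self-contained equivalent over an invented TRACK struct, suitable for qsort():

#include <stdint.h>
#include <stdlib.h>

typedef struct {		/* stand-in for BH_TRACK */
	uint32_t file_id;	/* plays the role of mf_offset */
	uint32_t pgno;
} TRACK;

/* Sort by file first, then by page number within each file. */
static int
track_cmp(const void *p1, const void *p2)
{
	const TRACK *t1 = p1, *t2 = p2;

	if (t1->file_id != t2->file_id)
		return (t1->file_id < t2->file_id ? -1 : 1);
	if (t1->pgno != t2->pgno)
		return (t1->pgno < t2->pgno ? -1 : 1);
	return (0);		/* identical keys really compare equal */
}

/* Usage: qsort(trackarray, n, sizeof(TRACK), track_cmp); */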
diff --git a/bdb/mp/mp_trickle.c b/bdb/mp/mp_trickle.c
index f937805cf40..71077ab60cc 100644
--- a/bdb/mp/mp_trickle.c
+++ b/bdb/mp/mp_trickle.c
@@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 * Sleepycat Software. All rights reserved.
 */
 #include "db_config.h"
 #ifndef lint
-static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_trickle.c,v 11.24 2002/08/06 06:13:53 bostic Exp $";
 #endif /* not lint */
 #ifndef NO_SYSTEM_INCLUDES
@@ -16,42 +16,29 @@ static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell
 #include <stdlib.h>
 #endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-static int __memp_trick __P((DB_ENV *, int, int, int *));
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
 /*
- * memp_trickle --
+ * __memp_trickle --
 * Keep a specified percentage of the buffers clean.
+ *
+ * PUBLIC: int __memp_trickle __P((DB_ENV *, int, int *));
 */
 int
-memp_trickle(dbenv, pct, nwrotep)
+__memp_trickle(dbenv, pct, nwrotep)
 DB_ENV *dbenv;
 int pct, *nwrotep;
 {
 DB_MPOOL *dbmp;
- MPOOL *mp;
- u_int32_t i;
- int ret;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_trickle(dbenv, pct, nwrotep));
-#endif
+ MPOOL *c_mp, *mp;
+ u_int32_t clean, dirty, i, total, dtmp;
+ int ret, wrote;
 PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_trickle", DB_INIT_MPOOL);
 dbmp = dbenv->mp_handle;
 mp = dbmp->reginfo[0].primary;
@@ -62,88 +49,35 @@ memp_trickle(dbenv, pct, nwrotep)
 if (pct < 1 || pct > 100)
 return (EINVAL);
- R_LOCK(dbenv, dbmp->reginfo);
-
- /* Loop through the caches... */
- for (ret = 0, i = 0; i < mp->nreg; ++i)
- if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0)
- break;
-
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
-}
-
-/*
- * __memp_trick --
- * Trickle a single cache.
- */
-static int
-__memp_trick(dbenv, ncache, pct, nwrotep)
- DB_ENV *dbenv;
- int ncache, pct, *nwrotep;
-{
- BH *bhp;
- DB_MPOOL *dbmp;
- MPOOL *c_mp;
- MPOOLFILE *mfp;
- db_pgno_t pgno;
- u_long total;
- int ret, wrote;
-
- dbmp = dbenv->mp_handle;
- c_mp = dbmp->reginfo[ncache].primary;
-
 /*
- * If there are sufficient clean buffers, or no buffers or no dirty
+ * If there are sufficient clean buffers, no buffers or no dirty
 * buffers, we're done.
 *
 * XXX
- * Using st_page_clean and st_page_dirty is our only choice at the
- * moment, but it's not as correct as we might like in the presence
- * of pools with more than one buffer size, as a free 512-byte buffer
- * isn't the same as a free 8K buffer.
+ * Using hash_page_dirty is our only choice at the moment, but it's not
+ * as correct as we might like in the presence of pools having more
+ * than one page size, as a free 512B buffer isn't the same as a free
+ * 8KB buffer.
+ *
+ * Loop through the caches counting total/dirty buffers.
 */
-loop: total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty;
- if (total == 0 || c_mp->stat.st_page_dirty == 0 ||
- (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct)
- return (0);
-
- /* Loop until we write a buffer. */
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (bhp->ref != 0 ||
- !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
- continue;
-
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
-
- /*
- * We can't write to temporary files -- see the comment in
- * mp_bh.c:__memp_bhwrite().
- */
- if (F_ISSET(mfp, MP_TEMP))
- continue;
+ for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+ total += c_mp->stat.st_pages;
+ __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+ dirty += dtmp;
+ }
- pgno = bhp->pgno;
- if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
- return (ret);
+ clean = total - dirty;
+ if (clean == total || (clean * 100) / total >= (u_long)pct)
+ return (0);
- /*
- * Any process syncing the shared memory buffer pool had better
- * be able to write to any underlying file. Be understanding,
- * but firm, on this point.
- */
- if (!wrote) {
- __db_err(dbenv, "%s: unable to flush page: %lu",
- __memp_fns(dbmp, mfp), (u_long)pgno);
- return (EPERM);
- }
+ if (nwrotep == NULL)
+ nwrotep = &wrote;
+ ret = __memp_sync_int(dbenv, NULL,
+ ((total * pct) / 100) - clean, DB_SYNC_TRICKLE, nwrotep);
- ++c_mp->stat.st_page_trickle;
- if (nwrotep != NULL)
- ++*nwrotep;
- goto loop;
- }
+ mp->stat.st_page_trickle += *nwrotep;
- return (0);
+ return (ret);
 }
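The rewritten trickle path reduces to one computation: if the clean fraction is already at least pct percent of the pool, do nothing; otherwise ask __memp_sync_int() to write (total * pct) / 100 - clean buffers. For example, with total = 1000 pages, clean = 300 and pct = 50, the target is 500 - 300 = 200 buffers. A hypothetical helper showing just that arithmetic:

/*
 * trickle_target --
 *	How many buffers must be written to bring the clean fraction
 *	of a "total"-page pool up to "pct" percent.  (Integer overflow
 *	for very large pools is ignored, as in the diff above.)
 */
static unsigned int
trickle_target(unsigned int total, unsigned int clean, unsigned int pct)
{
	if (total == 0 || (clean * 100) / total >= pct)
		return (0);	/* already at or above the target */
	return ((total * pct) / 100 - clean);
}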