Diffstat (limited to 'bdb/mp/mp_fget.c')
-rw-r--r--  bdb/mp/mp_fget.c  763
1 file changed, 500 insertions, 263 deletions
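The diff below replaces the public memp_fget() entry point with the internal
__memp_fget() and narrows the legal flags to DB_MPOOL_CREATE, DB_MPOOL_LAST,
and DB_MPOOL_NEW. As a minimal caller-side sketch of those flags (dbmfp is
assumed to be an already-opened DB_MPOOLFILE, error handling is abbreviated,
and callers in the public API would normally reach this code through the
DB_MPOOLFILE get method rather than calling __memp_fget() directly):

	db_pgno_t pgno;
	void *addr;
	int ret;

	/* Fetch an existing page by number; fails past end-of-file. */
	pgno = 2;
	if ((ret = __memp_fget(dbmfp, &pgno, 0, &addr)) != 0)
		return (ret);

	/* Fetch the last page; its number is returned through pgno. */
	if ((ret = __memp_fget(dbmfp, &pgno, DB_MPOOL_LAST, &addr)) != 0)
		return (ret);

	/* Fetch a page, creating it (and extending the file) if needed. */
	pgno = 100;
	if ((ret = __memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
		return (ret);

	/* Allocate a brand-new page; pgno is an output, not an input. */
	if ((ret = __memp_fget(dbmfp, &pgno, DB_MPOOL_NEW, &addr)) != 0)
		return (ret);

Each successful call pins the page in the cache; the pin is dropped by the
matching memp_fput() call.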
diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c
index 1bff5e136ab..be0785a2184 100644
--- a/bdb/mp/mp_fget.c
+++ b/bdb/mp/mp_fget.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex
#include <string.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#ifdef HAVE_FILESYSTEM_NOTZERO
+static int __memp_fs_notzero
+ __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif
/*
- * memp_fget --
+ * __memp_fget --
* Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
*/
int
-memp_fget(dbmfp, pgnoaddr, flags, addrp)
+__memp_fget(dbmfp, pgnoaddr, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
u_int32_t flags;
void *addrp;
{
- BH *bhp;
+ enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+ BH *alloc_bhp, *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- DB_HASHTAB *dbht;
+ DB_MPOOL_HASH *hp;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
- size_t n_bucket, n_cache, mf_offset;
- u_int32_t st_hsearch;
- int b_incr, first, ret;
+ roff_t mf_offset;
+ u_int32_t n_cache, st_hsearch;
+ int b_incr, extending, first, ret;
+
+ *(void **)addrp = NULL;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
- mfp = dbmfp->mfp;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
-#endif
PANIC_CHECK(dbenv);
+ mp = dbmp->reginfo[0].primary;
+ mfp = dbmfp->mfp;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ alloc_bhp = bhp = NULL;
+ hp = NULL;
+ b_incr = extending = ret = 0;
+
/*
* Validate arguments.
*
@@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* is to keep database files small. It's sleazy as hell, but we catch
* any attempt to actually write the file in memp_fput().
*/
-#define OKFLAGS \
- (DB_MPOOL_CREATE | DB_MPOOL_LAST | \
- DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
+#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
if (flags != 0) {
if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
- switch (flags & ~DB_MPOOL_EXTENT) {
+ switch (flags) {
case DB_MPOOL_CREATE:
+ break;
case DB_MPOOL_LAST:
+ /* Get the last page number in the file. */
+ if (flags == DB_MPOOL_LAST) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ *pgnoaddr = mfp->last_pgno;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ }
+ break;
case DB_MPOOL_NEW:
- case DB_MPOOL_NEW_GROUP:
- case 0:
+ /*
+ * If always creating a page, skip the first search
+ * of the hash bucket.
+ */
+ if (flags == DB_MPOOL_NEW)
+ goto alloc;
break;
default:
return (__db_ferr(dbenv, "memp_fget", 1));
}
}
-#ifdef DIAGNOSTIC
- /*
- * XXX
- * We want to switch threads as often as possible. Yield every time
- * we get a new page to ensure contention.
- */
- if (DB_GLOBAL(db_pageyield))
- __os_yield(dbenv, 1);
-#endif
-
- /* Initialize remaining local variables. */
- mf_offset = R_OFFSET(dbmp->reginfo, mfp);
- bhp = NULL;
- st_hsearch = 0;
- b_incr = ret = 0;
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- /*
- * Check for the new, last or last + 1 page requests.
- *
- * Examine and update the file's last_pgno value. We don't care if
- * the last_pgno value immediately changes due to another thread --
- * at this instant in time, the value is correct. We do increment the
- * current last_pgno value if the thread is asking for a new page,
- * however, to ensure that two threads creating pages don't get the
- * same one.
- *
- * If we create a page, there is the potential that a page after it
- * in the file will be written before it will be written. Recovery
- * depends on pages that are "created" in the file by subsequent pages
- * being written be zeroed out, not have random garbage. Ensure that
- * the OS agrees.
- *
- * !!!
- * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
- * to allocate contiguous groups of pages in order to do subdatabases.
- * We return the first page in the group, but the caller must put an
- * LSN on the *last* page and write it, otherwise after a crash we may
- * not create all of the pages we need to create.
- */
- if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
- if (LF_ISSET(DB_MPOOL_NEW)) {
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
- __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
- 1, mfp->stat.st_pagesize)) != 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
- }
- ++mfp->last_pgno;
- }
- if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
- __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
- (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
- }
- mfp->last_pgno += *pgnoaddr;
- }
- *pgnoaddr = mfp->last_pgno;
- }
-
- /*
- * Determine the hash bucket where this page will live, and get local
- * pointers to the cache and its hash table.
- */
- n_cache = NCACHE(mp, *pgnoaddr);
- c_mp = dbmp->reginfo[n_cache].primary;
- n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
- dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
-
- if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
- goto alloc;
-
/*
* If mmap'ing the file and the page is not past the end of the file,
* just return a pointer.
@@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* goes through the cache. All pages previously returned will be safe,
* as long as the correct locking protocol was observed.
*
- * XXX
* We don't discard the map because we don't know when all of the
* pages will have been discarded from the process' address space.
* It would be possible to do so by reference counting the open
* pages from the mmap, but it's unclear to me that it's worth it.
*/
- if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
- if (*pgnoaddr > mfp->orig_last_pgno) {
- /*
- * !!!
- * See the comment above about non-existent pages and
- * the hash access method.
- */
- if (!LF_ISSET(DB_MPOOL_CREATE)) {
- if (!LF_ISSET(DB_MPOOL_EXTENT))
- __db_err(dbenv,
- "%s: page %lu doesn't exist",
- __memp_fn(dbmfp), (u_long)*pgnoaddr);
- ret = EINVAL;
- goto err;
- }
- } else {
- *(void **)addrp =
- R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
- ++mfp->stat.st_map;
- goto done;
- }
+ if (dbmfp->addr != NULL &&
+ F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+ *(void **)addrp =
+ R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+ ++mfp->stat.st_map;
+ return (0);
}
+hb_search:
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them. These are reset on each pass through this
+ * code because the page number can change.
+ */
+ n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
+
/* Search the hash chain for the page. */
- for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
+retry: st_hsearch = 0;
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
++st_hsearch;
if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
continue;
- /* Increment the reference count. */
+ /*
+ * Increment the reference count. We may discard the hash
+ * bucket lock as we evaluate and/or read the buffer, so we
+ * need to ensure it doesn't move and its contents remain
+ * unchanged.
+ */
if (bhp->ref == UINT16_T_MAX) {
__db_err(dbenv,
"%s: page %lu: reference count overflow",
__memp_fn(dbmfp), (u_long)bhp->pgno);
ret = EINVAL;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
goto err;
}
-
- /*
- * Increment the reference count. We may discard the region
- * lock as we evaluate and/or read the buffer, so we need to
- * ensure that it doesn't move and that its contents remain
- * unchanged.
- */
++bhp->ref;
b_incr = 1;
/*
- * Any buffer we find might be trouble.
- *
* BH_LOCKED --
- * I/O is in progress. Because we've incremented the buffer
- * reference count, we know the buffer can't move. Unlock
- * the region lock, wait for the I/O to complete, and reacquire
- * the region.
+ * I/O is in progress or sync is waiting on the buffer to write
+ * it. Because we've incremented the buffer reference count,
+ * we know the buffer can't move. Unlock the bucket lock, wait
+ * for the buffer to become available, reacquire the bucket.
*/
- for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
+ for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
+ !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
+ /*
+ * If someone is trying to sync this buffer and the
+ * buffer is hot, they may never get in. Give up
+ * and try again.
+ */
+ if (!first && bhp->ref_sync != 0) {
+ --bhp->ref;
+ b_incr = 0;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ __os_yield(dbenv, 1);
+ goto retry;
+ }
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
/*
- * Explicitly yield the processor if it's not the first
- * pass through this loop -- if we don't, we might end
- * up running to the end of our CPU quantum as we will
- * simply be swapping between the two locks.
+ * Explicitly yield the processor if not the first pass
+ * through this loop -- if we don't, we might run to the
+ * end of our CPU quantum as we will simply be swapping
+ * between the two locks.
*/
if (!first)
__os_yield(dbenv, 1);
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
/* Wait for I/O to finish... */
MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ }
+
+ ++mfp->stat.st_cache_hit;
+ break;
+ }
+
+ /*
+ * Update the hash bucket search statistics -- do this now because our
+ * next search may be for a different bucket.
+ */
+ ++c_mp->stat.st_hash_searches;
+ if (st_hsearch > c_mp->stat.st_hash_longest)
+ c_mp->stat.st_hash_longest = st_hsearch;
+ c_mp->stat.st_hash_examined += st_hsearch;
+
+ /*
+ * There are 4 possible paths to this location:
+ *
+ * FIRST_MISS:
+ * Didn't find the page in the hash bucket on our first pass:
+ * bhp == NULL, alloc_bhp == NULL
+ *
+ * FIRST_FOUND:
+ * Found the page in the hash bucket on our first pass:
+ * bhp != NULL, alloc_bhp == NULL
+ *
+ * SECOND_FOUND:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and found the page in the hash bucket on
+ * our second pass:
+ * bhp != NULL, alloc_bhp != NULL
+ *
+ * SECOND_MISS:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and didn't find the page in the hash bucket
+ * on our second pass:
+ * bhp == NULL, alloc_bhp != NULL
+ */
+ state = bhp == NULL ?
+ (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+ (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+ switch (state) {
+ case FIRST_FOUND:
+ /* We found the buffer in our first check -- we're done. */
+ break;
+ case FIRST_MISS:
+ /*
+ * We didn't find the buffer in our first check. Figure out
+ * if the page exists, and allocate structures so we can add
+ * the page to the buffer pool.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+alloc: /*
+ * If DB_MPOOL_NEW is set, we have to allocate a page number.
+ * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+ * it's an error to try to get a page past the end of the file.
+ */
+ COMPQUIET(n_cache, 0);
+
+ extending = ret = 0;
+ R_LOCK(dbenv, dbmp->reginfo);
+ switch (flags) {
+ case DB_MPOOL_NEW:
+ extending = 1;
+ *pgnoaddr = mfp->last_pgno + 1;
+ break;
+ case DB_MPOOL_CREATE:
+ extending = *pgnoaddr > mfp->last_pgno;
+ break;
+ default:
+ ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+ break;
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
+ goto err;
/*
- * BH_TRASH --
- * The contents of the buffer are garbage. Shouldn't happen,
- * and this read is likely to fail, but might as well try.
+ * !!!
+ * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+ * not yet been initialized.
*/
- if (F_ISSET(bhp, BH_TRASH))
- goto reread;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+ /* Allocate a new buffer header and data space. */
+ if ((ret = __memp_alloc(dbmp,
+ &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+#ifdef DIAGNOSTIC
+ if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+ __db_err(dbenv,
+ "Error: buffer data is NOT size_t aligned");
+ ret = EINVAL;
+ goto err;
+ }
+#endif
/*
- * BH_CALLPGIN --
- * The buffer was converted so it could be written, and the
- * contents need to be converted again.
+ * If we are extending the file, we'll need the region lock
+ * again.
*/
- if (F_ISSET(bhp, BH_CALLPGIN)) {
- if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+ if (extending)
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control. (That guarantee is interesting
+ * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+ * did not specify the page number and so may reasonably not
+ * have any way to lock the page outside of mpool.) Regardless,
+ * if we allocate the page, and some other thread of control
+ * requests the page by number, we will not detect that and the
+ * thread of control that allocated using DB_MPOOL_NEW may not
+ * have a chance to initialize the page. (Note: we *could*
+ * detect this case if we set a flag in the buffer header which
+ * guaranteed that no gets of the page would succeed until the
+ * reference count went to 0, that is, until the creating page
+ * put the page.) What we do guarantee is that if two threads
+ * of control are both doing DB_MPOOL_NEW calls, they won't
+ * collide, that is, they won't both get the same page.
+ *
+ * There's a possibility that another thread allocated the page
+ * we were planning to allocate while we were off doing buffer
+ * allocation. We detect that by making sure the page number
+ * we were going to use is still available. If it's not, then
+ * we check to see if the next available page number hashes to
+ * the same mpool region as the old one -- if it does, we can
+ * continue, otherwise, we have to start over.
+ */
+ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+ *pgnoaddr = mfp->last_pgno + 1;
+ if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+ __db_shalloc_free(
+ dbmp->reginfo[n_cache].addr, alloc_bhp);
+ /*
+ * flags == DB_MPOOL_NEW, so extending is set
+ * and we're holding the region locked.
+ */
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ alloc_bhp = NULL;
+ goto alloc;
+ }
+ }
+
+ /*
+ * We released the region lock, so another thread might have
+ * extended the file. Update the last_pgno and initialize
+ * the file, as necessary, if we extended the file.
+ */
+ if (extending) {
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (*pgnoaddr > mfp->last_pgno &&
+ __os_fs_notzero() &&
+ F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ ret = __memp_fs_notzero(
+ dbenv, dbmfp, mfp, pgnoaddr);
+ else
+ ret = 0;
+#endif
+ if (ret == 0 && *pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
goto err;
- F_CLR(bhp, BH_CALLPGIN);
}
+ goto hb_search;
+ case SECOND_FOUND:
+ /*
+ * We allocated buffer space for the requested page, but then
+ * found the page in the buffer cache on our second check.
+ * That's OK -- we can use the page we found in the pool,
+ * unless DB_MPOOL_NEW is set.
+ *
+ * Free the allocated memory, we no longer need it. Since we
+ * can't acquire the region lock while holding the hash bucket
+ * lock, we have to release the hash bucket and re-acquire it.
+ * That's OK, because we have the buffer pinned down.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+ __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
+ alloc_bhp = NULL;
+ R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
- ++mfp->stat.st_cache_hit;
- *(void **)addrp = bhp->buf;
- goto done;
- }
+ /*
+ * We can't use the page we found in the pool if DB_MPOOL_NEW
+ * was set. (For details, see the above comment beginning
+ * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control".) If DB_MPOOL_NEW is set, we
+ * release our pin on this particular buffer, and try to get
+ * another one.
+ */
+ if (flags == DB_MPOOL_NEW) {
+ --bhp->ref;
+ b_incr = 0;
+ goto alloc;
+ }
+ break;
+ case SECOND_MISS:
+ /*
+ * We allocated buffer space for the requested page, and found
+ * the page still missing on our second pass through the buffer
+ * cache. Instantiate the page.
+ */
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
-alloc: /* Allocate new buffer header and data space. */
- if ((ret = __memp_alloc(dbmp,
- &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0)
- goto err;
+ /*
+ * Initialize all the BH and hash bucket fields so we can call
+ * __memp_bhfree if an error occurs.
+ *
+ * Append the buffer to the tail of the bucket list and update
+ * the hash bucket's priority.
+ */
+ b_incr = 1;
+
+ memset(bhp, 0, sizeof(BH));
+ bhp->ref = 1;
+ bhp->priority = UINT32_T_MAX;
+ bhp->pgno = *pgnoaddr;
+ bhp->mf_offset = mf_offset;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+ /* If we extended the file, make sure the page is never lost. */
+ if (extending) {
+ ++hp->hash_page_dirty;
+ F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
- ++c_mp->stat.st_page_clean;
+ /*
+ * If we created the page, zero it out. If we didn't create
+ * the page, read from the backing file.
+ *
+ * !!!
+ * DB_MPOOL_NEW doesn't call the pgin function.
+ *
+ * If DB_MPOOL_CREATE is used, then the application's pgin
+ * function has to be able to handle pages of 0's -- if it
+ * uses DB_MPOOL_NEW, it can detect all of its page creates,
+ * and not bother.
+ *
+ * If we're running in diagnostic mode, smash any bytes on the
+ * page that are unknown quantities for the caller.
+ *
+ * Otherwise, read the page into memory, optionally creating it
+ * if DB_MPOOL_CREATE is set.
+ */
+ if (extending) {
+ if (mfp->clear_len == 0)
+ memset(bhp->buf, 0, mfp->stat.st_pagesize);
+ else {
+ memset(bhp->buf, 0, mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+ mfp->stat.st_pagesize - mfp->clear_len);
+#endif
+ }
- /*
- * Initialize the BH fields so that we can call the __memp_bhfree
- * routine if an error occurs.
- */
- memset(bhp, 0, sizeof(BH));
- bhp->ref = 1;
- bhp->pgno = *pgnoaddr;
- bhp->mf_offset = mf_offset;
+ if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
+ F_SET(bhp, BH_CALLPGIN);
- /* Increment the count of buffers referenced by this MPOOLFILE. */
- ++mfp->block_cnt;
+ ++mfp->stat.st_page_create;
+ } else {
+ F_SET(bhp, BH_TRASH);
+ ++mfp->stat.st_cache_miss;
+ }
- /*
- * Prepend the bucket header to the head of the appropriate MPOOL
- * bucket hash list. Append the bucket header to the tail of the
- * MPOOL LRU chain.
- */
- SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh);
- SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+ /* Increment buffer count referenced by MPOOLFILE. */
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
-#ifdef DIAGNOSTIC
- if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) {
- __db_err(dbenv, "Internal error: BH data NOT size_t aligned.");
- ret = EINVAL;
- __memp_bhfree(dbmp, bhp, 1);
- goto err;
+ /*
+ * Initialize the mutex. This is the last initialization step,
+ * because it's the only one that can fail, and everything else
+ * must be set up before we can safely jump to the err label,
+ * which calls __memp_bhfree.
+ */
+ if ((ret = __db_mutex_setup(dbenv,
+ &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
+ goto err;
}
-#endif
- if ((ret = __db_shmutex_init(dbenv, &bhp->mutex,
- R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL,
- 0, &dbmp->reginfo[n_cache],
- (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off)))
- != 0) {
- __memp_bhfree(dbmp, bhp, 1);
- goto err;
+ DB_ASSERT(bhp->ref != 0);
+
+ /*
+ * If we're the only reference, update buffer and bucket priorities.
+ * We may be about to release the hash bucket lock, so everything
+ * should be correct first. (We've already done this if we created
+ * the buffer, so there is no need to do it again.)
+ */
+ if (state != SECOND_MISS && bhp->ref == 1) {
+ bhp->priority = UINT32_T_MAX;
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
/*
- * If we created the page, zero it out and continue.
- *
- * !!!
- * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
- * If DB_MPOOL_CREATE is used, then the application's pgin function
- * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
- * it can detect all of its page creates, and not bother.
+ * BH_TRASH --
+ * The buffer we found may need to be filled from the disk.
*
- * If we're running in diagnostic mode, smash any bytes on the
- * page that are unknown quantities for the caller.
- *
- * Otherwise, read the page into memory, optionally creating it if
- * DB_MPOOL_CREATE is set.
+ * It's possible for the read function to fail, which means we fail as
+ * well. Note, the __memp_pgread() function discards and reacquires
+ * the hash lock, so the buffer must be pinned down so that it cannot
+ * move and its contents are unchanged. Discard the buffer on failure
+ * unless another thread is waiting on our I/O to complete. It's OK to
+ * leave the buffer around, as the waiting thread will see the BH_TRASH
+ * flag set, and will also attempt to discard it. If there's a waiter,
+ * we need to decrement our reference count.
*/
- if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
- if (mfp->clear_len == 0)
- memset(bhp->buf, 0, mfp->stat.st_pagesize);
- else {
- memset(bhp->buf, 0, mfp->clear_len);
-#ifdef DIAGNOSTIC
- memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
- mfp->stat.st_pagesize - mfp->clear_len);
-#endif
- }
+ if (F_ISSET(bhp, BH_TRASH) &&
+ (ret = __memp_pgread(dbmfp,
+ &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
+ goto err;
- ++mfp->stat.st_page_create;
- } else {
- /*
- * It's possible for the read function to fail, which means
- * that we fail as well. Note, the __memp_pgread() function
- * discards the region lock, so the buffer must be pinned
- * down so that it cannot move and its contents are unchanged.
- */
-reread: if ((ret = __memp_pgread(dbmfp,
- bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) {
- /*
- * !!!
- * Discard the buffer unless another thread is waiting
- * on our I/O to complete. Regardless, the header has
- * the BH_TRASH flag set.
- */
- if (bhp->ref == 1)
- __memp_bhfree(dbmp, bhp, 1);
+ /*
+ * BH_CALLPGIN --
+ * The buffer was converted so it could be written to disk, and now
+ * has to be converted back before use.
+ */
+ if (F_ISSET(bhp, BH_CALLPGIN)) {
+ if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
goto err;
- }
-
- ++mfp->stat.st_cache_miss;
+ F_CLR(bhp, BH_CALLPGIN);
}
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+#ifdef DIAGNOSTIC
+ /* Update the file's pinned reference count. */
+ R_LOCK(dbenv, dbmp->reginfo);
+ ++dbmfp->pinref;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
/*
- * If we're returning a page after our current notion of the last-page,
- * update our information. Note, there's no way to un-instantiate this
- * page, it's going to exist whether it's returned to us dirty or not.
+ * We want to switch threads as often as possible, and at awkward
+ * times. Yield every time we get a new page to ensure contention.
*/
- if (bhp->pgno > mfp->last_pgno)
- mfp->last_pgno = bhp->pgno;
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(dbenv, 1);
+#endif
*(void **)addrp = bhp->buf;
+ return (0);
-done: /* Update the chain search statistics. */
- if (st_hsearch) {
- ++c_mp->stat.st_hash_searches;
- if (st_hsearch > c_mp->stat.st_hash_longest)
- c_mp->stat.st_hash_longest = st_hsearch;
- c_mp->stat.st_hash_examined += st_hsearch;
+err: /*
+ * Discard our reference. If we hold the only reference, discard
+ * the buffer entirely. If we held a reference to a buffer, we are
+ * also still holding the hash bucket mutex.
+ */
+ if (b_incr) {
+ if (bhp->ref == 1)
+ (void)__memp_bhfree(dbmp, hp, bhp, 1);
+ else {
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
}
- ++dbmfp->pinref;
+ /* If alloc_bhp is set, free the memory. */
+ if (alloc_bhp != NULL)
+ __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ return (ret);
+}
- return (0);
+#ifdef HAVE_FILESYSTEM_NOTZERO
+/*
+ * __memp_fs_notzero --
+ * Initialize the underlying allocated pages in the file.
+ */
+static int
+__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ db_pgno_t *pgnoaddr;
+{
+ DB_IO db_io;
+ u_int32_t i, npages;
+ size_t nw;
+ int ret;
+ u_int8_t *page;
+ char *fail;
-err: /* Discard our reference. */
- if (b_incr)
- --bhp->ref;
- R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+ * On some systems, pages allocated by writing past end-of-file are
+ * not zeroed. Recovery could theoretically be fooled by a page
+ * showing up that contained garbage. To avoid this, we have to
+ * write the pages out to disk and flush them. The flush is needed
+ * because if we don't sync, the allocation of another page after
+ * this one might reach the disk first; if we crashed at the right
+ * moment, this page would be the one implicitly allocated by that
+ * later write -- that is, left full of garbage.
+ *
+ * Hash is the only access method that allocates groups of pages. We
+ * know that it will use the existence of the last page in a group to
+ * signify that the entire group is OK; so we write all the pages but
+ * the last one in the group, flush them to disk, and then write the
+ * last one to disk and flush it.
+ */
+ if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
+ return (ret);
+
+ db_io.fhp = dbmfp->fhp;
+ db_io.mutexp = dbmfp->mutexp;
+ db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+ db_io.buf = page;
+
+ npages = *pgnoaddr - mfp->last_pgno;
+ for (i = 1; i < npages; ++i) {
+ db_io.pgno = mfp->last_pgno + i;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ fail = "write";
+ goto err;
+ }
+ }
+ if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+ fail = "sync";
+ goto err;
+ }
- *(void **)addrp = NULL;
+ db_io.pgno = mfp->last_pgno + npages;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ fail = "write";
+ goto err;
+ }
+ if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+ fail = "sync";
+err: __db_err(dbenv, "%s: %s failed for page %lu",
+ __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
+ }
+
+ __os_free(dbenv, page);
return (ret);
}
+#endif
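
The four-state logic in __memp_fget() above (FIRST_FOUND, FIRST_MISS,
SECOND_FOUND, SECOND_MISS) reduces to a search/allocate/re-search loop. A
simplified, self-contained sketch of that loop follows; hash_lookup(),
cache_alloc(), cache_free(), bucket_lock() and bucket_unlock() are
hypothetical stand-ins for the hash-chain walk, __memp_alloc(),
__db_shalloc_free() and the hash-bucket mutex, and details such as the
BH_LOCKED wait loop, the DB_MPOOL_NEW page-number race, and the
BH_TRASH/BH_CALLPGIN fixups are deliberately omitted:

	struct bh {
		int pgno;
		int ref;
	};

	extern struct bh *hash_lookup(int pgno);
	extern struct bh *cache_alloc(void);
	extern void cache_free(struct bh *bhp);
	extern void bucket_lock(void);
	extern void bucket_unlock(void);

	static struct bh *
	fget_sketch(int pgno)
	{
		struct bh *alloc_bhp, *bhp;

		alloc_bhp = NULL;
		for (;;) {
			bucket_lock();
			bhp = hash_lookup(pgno);
			if (bhp != NULL) {
				/* FIRST_FOUND or SECOND_FOUND: pin the buffer. */
				++bhp->ref;
				if (alloc_bhp != NULL)	/* Lost the race: free ours. */
					cache_free(alloc_bhp);
				bucket_unlock();
				return (bhp);
			}
			if (alloc_bhp != NULL) {
				/* SECOND_MISS: install the buffer we allocated. */
				alloc_bhp->pgno = pgno;
				alloc_bhp->ref = 1;
				/* ...link into the bucket, zero or read the page... */
				bucket_unlock();
				return (alloc_bhp);
			}
			/* FIRST_MISS: can't allocate while holding the bucket lock. */
			bucket_unlock();
			if ((alloc_bhp = cache_alloc()) == NULL)
				return (NULL);
		}
	}

The key invariant matches the real code: buffer memory is never allocated
while the hash bucket mutex is held, so a second pass over the bucket is
always required to detect a racing thread that installed the same page first.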