/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 2015 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

/*
 * This configuration parameter limits the number of hash buckets which
 * __memp_alloc() searches through while excluding buffers with a 'high'
 * priority.
 */
#if !defined(MPOOL_ALLOC_SEARCH_LIMIT)
#define	MPOOL_ALLOC_SEARCH_LIMIT	500
#endif

/*
 * __memp_bh_unreachable --
 *
 *	Determine whether this buffer can not ever be seen again: is the next
 *	newer version visible to the same transaction which sees this one?
 *	If both versions are visibile to the same transaction, there is no
 *	reason to keep the older one: it can be purged.
 *
 *	If this buffer has a more recent version, and there is a transaction
 *	with a read_lsn between this buffer's and that more recent version's,
 *	the buffer is visible to at least that transaction, so return FALSE.
 *	Otherwise return TRUE.
 *
 *	txns:	   3/10		       2/10	   2/5 2/1          1/10
 *	vers: 3/15	 2/15  2/14    2/10   2/8	     1/150
 *	      vis	 vis  unreach   vis  unreach	     vis
 *	who  new txns	 3/10	       2/10		    2/5, 2/1
 *	sees
 *
 *	Note: in the abvove example, the page was allocated after txn 1/10
 *	started. 1/10 would not see any version of the page.
 *
 * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
 */
int
__memp_bh_unreachable(env, bhp, snapshots, n_snapshots)
	ENV *env;
	BH *bhp;
	DB_LSN *snapshots;
	int n_snapshots;
{
	BH *newer_bhp;
	DB_LSN b_vlsn, n_vlsn;
	int i, ret;
#ifdef DIAGNOSTIC
	DB_MPOOL *dbmp;
	DB_MSGBUF mb;
	MPOOLFILE *bh_mfp;
#endif

	/*
	 * The buffer can't be purged if it is being used, or is the most recent
	 * version, or the next newer version isn't a copy yet.
	 */
	if (BH_REFCOUNT(bhp) != 0 ||
	    (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL ||
	    newer_bhp->td_off == INVALID_ROFF)
		return (FALSE);

	/*
	 * Find the visiblity LSNs for this buffer (b_vlsn) and the more recent,
	 * newer buffer (n_vlsn). If the newer version hasn't committed yet the
	 * bhp could be needed.
	 */
	n_vlsn = *VISIBLE_LSN(env, newer_bhp);
	if (IS_MAX_LSN(n_vlsn))
		return (FALSE);
	if (bhp->td_off == INVALID_ROFF)
		INIT_LSN(b_vlsn);
	else
		b_vlsn = *VISIBLE_LSN(env, bhp);

	ret = TRUE;
	/*
	 * Look for a transaction which is between n_lsn and b_lsn - determining
	 * that bhp is reachable. Stop looking once the transactions get so
	 * small (old) that they precede the buffer's version; no earlier txn
	 * could be between n_vlsn and b_vlsn.
	 */
	for (i = 0;
	     i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0;
	     i++) {
		if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) {
			/*
			 * This txn can see (started after) bhp, but not
			 * newer_bhp (which committed after this txn started).
			 */
			ret = FALSE;
			break;
		}
	}

#ifdef DIAGNOSTIC
	if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) {
		dbmp = env->mp_handle;
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
		DB_MSGBUF_INIT(&mb);
		__db_msgadd(env, &mb,
    "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n",
		    __memp_fns(dbmp, bh_mfp), bhp->pgno,
		    ret ? "purgeable" : "needed",
		    (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags,
		    (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i);
		for (i = 0; i != n_snapshots; i++)
			__db_msgadd(env, &mb, " %lu/%lu",
			    (u_long)snapshots[i].file,
			    (u_long)snapshots[i].offset);
		DB_MSGBUF_FLUSH(env, &mb);
	}
#endif
	return (ret);
}

/*
 * __memp_alloc --
 *	Allocate some space from a cache region. If the region is full then
 *	reuse one or more cache buffers.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	MPOOLFILE *mfp;
	size_t len;
	roff_t *offsetp;
	void *retp;
{
	BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
	BH_FROZEN_PAGE *frozen_bhp;
	DB_LSN *snapshots, vlsn;
	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	size_t freed_space;
	u_int32_t buckets, bucket_priority, buffers, cache_reduction;
	u_int32_t dirty_eviction, high_priority, priority, versions;
	u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
	int aggressive, alloc_freeze, b_lock, giveup;
	int h_locked, need_free, n_snapshots, obsolete, ret, write_error;
	u_int8_t *endp;
	void *p;

	env = dbmp->env;
	c_mp = infop->primary;
	dbht = R_ADDR(infop, c_mp->htab);
	hp_end = &dbht[c_mp->htab_buckets];
	hp_saved = NULL;
	snapshots = NULL;
	priority_saved = write_error = 0;
	buckets = buffers = put_counter = total_buckets = versions = 0;
	aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0;

	/*
	 * If we're allocating a buffer, and the one we're discarding is the
	 * same size, we don't want to waste the time to re-integrate it into
	 * the shared memory free list.  If the DB_MPOOLFILE argument isn't
	 * NULL, we'll compare the underlying page sizes of the two buffers
	 * before free-ing and re-allocating buffers.
	 */
	if (mfp != NULL) {
		len = SSZA(BH, buf) + mfp->pagesize;
		/* Add space for alignment padding for MVCC diagnostics. */
		MVCC_BHSIZE(mfp, len);
	}

	STAT_INC(env, mpool, nallocs, c_mp->stat.st_alloc, len);

	MPOOL_REGION_LOCK(env, infop);

	/*
	 * First we try to allocate from free memory.  If that fails, scan the
	 * buffer pool to find buffers with low priorities.  We consider small
	 * sets of hash buckets each time to limit the amount of work needing
	 * to be done.  This approximates LRU, but not very well.  We either
	 * find a buffer of the same size to use, or we will free 3 times what
	 * we need in the hopes it will coalesce into a contiguous chunk of the
	 * right size.  In the latter case we branch back here and try again.
	 */
alloc:	if ((ret = __env_alloc(infop, len, &p)) == 0) {
		if (mfp != NULL) {
			/*
			 * For MVCC diagnostics, align the pointer so that the
			 * buffer starts on a page boundary.
			 */
			MVCC_BHALIGN(p);
			bhp = (BH *)p;

			if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
			    DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) {
				MVCC_BHUNALIGN(bhp);
				__env_alloc_free(infop, bhp);
				goto search;
			}
			c_mp->pages++;
		}
		MPOOL_REGION_UNLOCK(env, infop);
found:		if (offsetp != NULL)
			*offsetp = R_OFFSET(infop, p);
		*(void **)retp = p;

		/*
		 * Update the search statistics.
		 *
		 * We're not holding the region locked here, these statistics
		 * can't be trusted.
		 */
#ifdef HAVE_STATISTICS
		total_buckets += buckets;
		if (total_buckets != 0) {
			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
				STAT_SET(env, mpool, alloc_max_buckets,
				    c_mp->stat.st_alloc_max_buckets,
				    total_buckets, infop->id);
			STAT_ADJUST(env, mpool, alloc_buckets,
			    c_mp->stat.st_alloc_buckets,
			    total_buckets, infop->id);
		}
		if (buffers != 0) {
			if (buffers > c_mp->stat.st_alloc_max_pages)
				STAT_SET(env, mpool, alloc_max_pages,
				    c_mp->stat.st_alloc_max_pages,
				    buffers, infop->id);
			STAT_ADJUST(env, mpool, alloc_pages,
			    c_mp->stat.st_alloc_pages, buffers, infop->id);
		}
#endif
		goto done;
	} else if (giveup || c_mp->pages == 0) {
		MPOOL_REGION_UNLOCK(env, infop);

		__db_errx(env, DB_STR("3017",
		    "unable to allocate space from the buffer cache"));
		if (ret == ENOMEM && write_error != 0)
			ret = EIO;
		goto done;
	}

search:
	/*
	 * Anything newer than 1/10th of the buffer pool is ignored during the
	 * first MPOOL_SEARCH_ALLOC_LIMIT buckets worth of allocation.
	 */
	cache_reduction = c_mp->pages / 10;
	high_priority = aggressive ? MPOOL_LRU_MAX :
	    c_mp->lru_priority - cache_reduction;
	lru_generation = c_mp->lru_generation;

	ret = 0;

	/*
	 * We re-attempt the allocation every time we've freed 3 times what
	 * we need.  Reset our free-space counter.
	 */
	freed_space = 0;
	total_buckets += buckets;
	buckets = 0;

	/*
	 * Walk the hash buckets and find the next two with potentially useful
	 * buffers.  Free the buffer with the lowest priority from the buckets'
	 * chains.
	 */
	for (;;) {
		/* All pages have been freed, make one last try */
		if (c_mp->pages == 0)
			goto alloc;

		/* Check for wrap around. */
		hp = &dbht[c_mp->last_checked++];
		if (hp >= hp_end) {
			c_mp->last_checked = 0;
			hp = &dbht[c_mp->last_checked++];
		}

		/*
		 * The failure mode is when there are too many buffers we can't
		 * write or there's not enough memory in the system to support
		 * the number of pinned buffers.
		 *
		 * Get aggressive if we've reviewed the entire cache without
		 * freeing the needed space.  (The code resets "aggressive"
		 * when we free any space.)  Aggressive means:
		 *
		 * a: set a flag to attempt to flush high priority buffers as
		 *    well as other buffers.
		 * b: look at a buffer in every hash bucket rather than choose
		 *    the more preferable of two.
		 * c: start to think about giving up.
		 *
		 * If we get here three or more times, sync the mpool to force
		 * out queue extent pages.  While we might not have enough
		 * space for what we want and flushing is expensive, why not?
		 * Then sleep for a second, hopefully someone else will run and
		 * free up some memory.
		 *
		 * Always try to allocate memory too, in case some other thread
		 * returns its memory to the region.
		 *
		 * We don't have any way to know an allocation has no way to
		 * succeed.  Fail if no pages are returned to the cache after
		 * we've been trying for a relatively long time.
		 *
		 * !!!
		 * This test ignores pathological cases like no buffers in the
		 * system -- we check for that early on, so it isn't possible.
		 */
		if (buckets++ == c_mp->htab_buckets) {
			if (freed_space > 0)
				goto alloc;
			MPOOL_REGION_UNLOCK(env, infop);

			/* Refresh the list of mvcc reader transactions. */
			if (snapshots != NULL)
				__os_free(env, snapshots);
			if ((ret = __txn_get_readers(
			    env, &snapshots, &n_snapshots)) != 0)
				goto err;

			aggressive++;
			/*
			 * Once aggressive, we consider all buffers. By setting
			 * this to MPOOL_LRU_MAX, we'll still select a victim
			 * even if all buffers have the highest normal priority.
			 */
			high_priority = MPOOL_LRU_MAX;
			PERFMON4(env, mpool, alloc_wrap,
			    len, infop->id, aggressive, c_mp->put_counter);
			switch (aggressive) {
			case 1:
				break;
			case 2:
				put_counter = c_mp->put_counter;
				break;
			case 3:
			case 4:
			case 5:
			case 6:
				(void)__memp_sync_int(
				    env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);

				__os_yield(env, 1, 0);
				break;
			default:
				aggressive = 1;
				if (put_counter == c_mp->put_counter)
					giveup = 1;
				break;
			}

			MPOOL_REGION_LOCK(env, infop);
			goto alloc;
		}

		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking the hash
		 * bucket as we only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
			continue;

		/* Unlock the region and lock the hash bucket. */
		MPOOL_REGION_UNLOCK(env, infop);
		MUTEX_READLOCK(env, hp->mtx_hash);
		h_locked = 1;
		b_lock = 0;

		/*
		 * Set aggressive to consider all buffers if we have already
		 * searched in too many buckets.
		 */
		if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
			aggressive = 1;
			/* Once aggressive, we consider all buffers. */
			high_priority = MPOOL_LRU_MAX;
			if (snapshots == NULL && (ret = __txn_get_readers(
			    env, &snapshots, &n_snapshots)) != 0)
				goto err;
		}

		/*
		 * Find a buffer we can use.
		 * Skip over refcount > 0 buffers; we can't get rid of them.
		 *
		 * Without MVCC we use the lowest-LRU singleton buffer we find
		 * that's better than the result of another hash bucket we've
		 * reviewed.  We do not use a buffer which has a priority
		 * greater than high_priority unless we are being aggressive.
		 *
		 * MVCC requires looking at additional factors: we don't want to
		 * free a still-relevent buffer out of the middle of an MVCC
		 * chain, since that requires freezing - lots of I/O.  So,
		 * walk the buffers, looking for an obsolete buffer at the
		 * end of the MVCC chain. Once a buffer becomes obsolete, its
		 * LRU priority is irrelevant because that version can never
		 * be accessed again.
		 *
		 * If we don't find any obsolete MVCC buffers, we will get
		 * aggressive, and in that case consider the lowest priority
		 * buffer within a chain.
		 */
retry_search:	bhp = NULL;
		bucket_priority = high_priority;
		obsolete = 0;
		if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1],
		    &hp->old_reader) > 0)
			hp->old_reader = snapshots[n_snapshots - 1];
		SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
			/*
			 * First, do the standard LRU check for singletons.
			 * We can use the buffer if it is unreferenced, has a
			 * priority that isn't too high (unless we are
			 * aggressive), and is better than the best candidate
			 * we have found so far in this bucket.
			 */
#ifdef MPOOL_ALLOC_SEARCH_DYN
			if (aggressive == 0 &&
			     ++high_priority >= c_mp->lru_priority)
				aggressive = 1;
#endif

			if (SH_CHAIN_SINGLETON(current_bhp, vc)) {
				if (BH_REFCOUNT(current_bhp) != 0)
					continue;
				buffers++;
				if (bucket_priority > current_bhp->priority) {
					bucket_priority = current_bhp->priority;
					if (bhp != NULL)
						atomic_dec(env, &bhp->ref);
					bhp = current_bhp;
					atomic_inc(env, &bhp->ref);
				}
				continue;
			}

			/*
			 * For MVCC buffers, walk through the chain.  If we are
			 * aggressive, choose the best candidate from within
			 * the chain for freezing.
			 */
			for (mvcc_bhp = oldest_bhp = current_bhp;
			    mvcc_bhp != NULL;
			    oldest_bhp = mvcc_bhp,
			    mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
				DB_ASSERT(env, mvcc_bhp !=
				    SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
#ifdef MPOOL_ALLOC_SEARCH_DYN
				if (aggressive == 0 &&
				     ++high_priority >= c_mp->lru_priority) {
					aggressive = 1;
					if (snapshots == NULL && (ret =
					    __txn_readers(env,
					    &snapshots, &n_snapshots)) != 0)
						goto err;
				}
#endif
				if (n_snapshots > 0 &&
				    __memp_bh_unreachable(env,
				    mvcc_bhp, snapshots, n_snapshots)) {
					oldest_bhp = mvcc_bhp;
					goto is_obsolete;
				}
				if (bhp != NULL &&
				    mvcc_bhp->priority >= bhp->priority)
					continue;
				if (BH_REFCOUNT(mvcc_bhp) != 0)
					continue;
				/*
				 * Since taking still-relevant versions requires
				 * freezing, skip over them at low aggression
				 * levels unless we see that a high proportion
				 * of buffers (over 1/4) are MVCC copies.
				 */
				if (aggressive < 2 &&
				    ++versions < (buffers >> 2))
					continue;
				buffers++;
				if (F_ISSET(mvcc_bhp, BH_FROZEN))
					continue;
				/*
				 * Select mvcc_bhp as current best candidate,
				 * releasing the current candidate, if any.
				 */
				if (bhp != NULL)
					atomic_dec(env, &bhp->ref);
				bhp = mvcc_bhp;
				atomic_inc(env, &bhp->ref);
			}

			/*
			 * oldest_bhp is the last buffer on the MVCC chain, and
			 * an obsolete buffer at the end of the MVCC chain gets
			 * used without further search.
			 */
			if (BH_REFCOUNT(oldest_bhp) != 0)
				continue;

			if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
				if (aggressive < 2)
					buffers++;
is_obsolete:
				obsolete = 1;
				if (bhp != NULL)
					atomic_dec(env, &bhp->ref);
				bhp = oldest_bhp;
				atomic_inc(env, &bhp->ref);
				goto this_buffer;
			}
		}

		/*
		 * bhp is either NULL or the best candidate buffer.
		 * We'll use the chosen buffer only if we have compared its
		 * priority against one chosen from another hash bucket.
		 */
		if (bhp == NULL)
			goto next_hb;

		priority = bhp->priority;

		/*
		 * Compare two hash buckets and select the one with the lower
		 * priority, except mvcc at high aggression levels. Performance
		 * testing shows looking at two improves the LRU-ness and
		 * looking at more only does a little better.
		 */
		if (hp_saved == NULL) {
			/*
			 * At high aggressive levels when mvcc is active, stop
			 * looking for candidate once one has been found.
			 * Freezing takes more time than writing out to a db.
			 */
			if (aggressive > 1 && n_snapshots > 1)
				goto this_buffer;
			hp_saved = hp;
			priority_saved = priority;
			goto next_hb;
		}

		/*
		 * If the buffer we just found is a better choice than our
		 * previous choice, use it.
		 *
		 * If the previous choice was better, pretend we're moving
		 * from this hash bucket to the previous one and re-do the
		 * search.
		 *
		 * We don't worry about simply swapping between two buckets
		 * because that could only happen if a buffer was removed
		 * from the chain, or its priority updated.   If a buffer
		 * is removed from the chain, some other thread has managed
		 * to discard a buffer, so we're moving forward.  Updating
		 * a buffer's priority will make it a high-priority buffer,
		 * so we'll ignore it when we search again, and so we will
		 * eventually zero in on a buffer to use, or we'll decide
		 * there are no buffers we can use.
		 *
		 * If there's only a single hash bucket with buffers, we'll
		 * search the bucket once, choose a buffer, walk the entire
		 * list of buckets and search it again.   In the case of a
		 * system that's busy, it's possible to imagine a case where
		 * we'd loop for a long while.  For that reason, and because
		 * the test is easy, we special case and test for it.
		 */
		if (priority > priority_saved && hp != hp_saved) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			hp_tmp = hp_saved;
			hp_saved = hp;
			hp = hp_tmp;
			priority_saved = priority;
			MUTEX_READLOCK(env, hp->mtx_hash);
			h_locked = 1;
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			atomic_dec(env, &bhp->ref);
			goto retry_search;
		}

		/*
		 * If another thread has called __memp_reset_lru() while we were
		 * looking for this buffer, it is possible that we've picked a
		 * poor choice for a victim. If so toss it and start over.
		 */
		if (lru_generation != c_mp->lru_generation) {
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			atomic_dec(env, &bhp->ref);
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			hp_saved = NULL;
			goto search;
		}

this_buffer:	/*
		 * Discard any previously remembered hash bucket, we've got
		 * a winner.
		 */
		hp_saved = NULL;

		/* Drop the hash mutex and lock the buffer exclusively. */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/* Don't bother trying to latch a busy buffer. */
		if (BH_REFCOUNT(bhp) > 1)
			goto next_hb;

		/* We cannot block as the caller is probably holding locks. */
		if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
			if (ret != DB_LOCK_NOTGRANTED) {
				goto err;
			}
			ret = 0;
			goto next_hb;
		}
		F_SET(bhp, BH_EXCLUSIVE);
		if (obsolete)
			F_SET(bhp, BH_UNREACHABLE);
		b_lock = 1;

		/* Someone may have grabbed it while we got the lock. */
		if (BH_REFCOUNT(bhp) != 1)
			goto next_hb;

		/* Find the associated MPOOLFILE. */
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/* If the page is dirty, write it. */
		ret = 0;
		dirty_eviction = 0;
		if (F_ISSET(bhp, BH_DIRTY)) {
			DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
			DB_ASSERT(env, atomic_read(&bhp->ref) > 0);

			/*
			 * If a write fails for any reason, we can't proceed.
			 *
			 * If there's a write error and we're having problems
			 * finding something to allocate, avoid selecting this
			 * buffer again by maximizing its priority.
			 */
			if (ret != 0) {
				if (ret != EPERM && ret != EAGAIN) {
					write_error++;
					__db_errx(env, DB_STR_A("3018",
		"%s: unwritable page %d remaining in the cache after error %d",
					    "%s %d %d"),
					    __memp_fns(dbmp, bh_mfp),
					    bhp->pgno, ret);
				}
				bhp->priority = MPOOL_LRU_REDZONE;

				goto next_hb;
			}

			dirty_eviction = 1;
		}

		/*
		 * Freeze this buffer, if necessary.  That is, if the buffer is
		 * part of an MVCC chain and could be required by a reader.
		 */
		if (SH_CHAIN_HASPREV(bhp, vc) ||
		    (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)) {
			if (!aggressive ||
			    F_ISSET(bhp, BH_DIRTY | BH_FROZEN))
				goto next_hb;
			ret = __memp_bh_freeze(
			    dbmp, infop, hp, bhp, &alloc_freeze);
			if (ret == EIO)
				write_error++;
			if (ret == EBUSY || ret == EIO ||
			    ret == ENOMEM || ret == ENOSPC) {
				ret = 0;
				goto next_hb;
			} else if (ret != 0) {
				DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
				atomic_dec(env, &bhp->ref);
				DB_ASSERT(env, b_lock);
				F_CLR(bhp, BH_EXCLUSIVE);
				MUTEX_UNLOCK(env, bhp->mtx_buf);
				DB_ASSERT(env, !h_locked);
				goto err;
			}
		}

		MUTEX_LOCK(env, hp->mtx_hash);
		h_locked = 1;

		/*
		 * We released the hash bucket lock while doing I/O, so another
		 * thread may have acquired this buffer and incremented the ref
		 * count or dirtied the buffer or installed a new version after
		 * we wrote it, in which case we can't have it.
		 */
		if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
		    (SH_CHAIN_HASNEXT(bhp, vc) &&
		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
		    !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
			if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
				__db_msg(env,
		    "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
				    (u_long)R_OFFSET(infop, bhp), bhp->flags,
				    BH_REFCOUNT(bhp),
			(u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)),
			(u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh)));
			goto next_hb;
		}

		/*
		 * If the buffer is frozen, thaw it and look for another one
		 * we can use. (Calling __memp_bh_freeze above will not mark
		 * this bhp BH_FROZEN; it creates another frozen one.)
		 */
		if (F_ISSET(bhp, BH_FROZEN)) {
			DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||
			    obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn));
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			if (!F_ISSET(bhp, BH_THAWED)) {
				/*
				 * This call releases the hash bucket mutex.
				 * We're going to retry the search, so we need
				 * to re-lock it.
				 */
				if ((ret = __memp_bh_thaw(dbmp,
				    infop, hp, bhp, NULL)) != 0)
					goto done;
				MUTEX_READLOCK(env, hp->mtx_hash);
			} else {
				need_free = atomic_dec(env, &bhp->ref) == 0;
				F_CLR(bhp, BH_EXCLUSIVE);
				MUTEX_UNLOCK(env, bhp->mtx_buf);
				if (need_free) {
					MPOOL_REGION_LOCK(env, infop);
					SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
					    bhp, hq);
					MPOOL_REGION_UNLOCK(env, infop);
				}
			}
			bhp = NULL;
			b_lock = alloc_freeze = 0;
			goto retry_search;
		}

		/* We are certainly freeing this buf; now update statistic. */
		if (dirty_eviction)
			STAT_INC(env, mpool,
			    dirty_eviction, c_mp->stat.st_rw_evict, infop->id);
		else
			STAT_INC(env, mpool,
			    clean_eviction, c_mp->stat.st_ro_evict, infop->id);
		/*
		 * If we need some empty buffer headers for freezing, turn the
		 * buffer we've found into frozen headers and put them on the
		 * free list.  Only reset alloc_freeze if we've actually
		 * allocated some frozen buffer headers.
		 */
		if (alloc_freeze) {
			if ((ret = __memp_bhfree(dbmp,
			     infop, bh_mfp, hp, bhp, 0)) != 0)
				goto err;
			DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
			if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
				goto err;
			b_lock = 0;
			h_locked = 0;

			MVCC_MPROTECT(bhp->buf, bh_mfp->pagesize,
			    PROT_READ | PROT_WRITE | PROT_EXEC);

			MPOOL_REGION_LOCK(env, infop);
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    (BH_FROZEN_ALLOC *)bhp, links);
			frozen_bhp = (BH_FROZEN_PAGE *)
			    ((BH_FROZEN_ALLOC *)bhp + 1);
			endp = (u_int8_t *)bhp->buf + bh_mfp->pagesize;
			while ((u_int8_t *)(frozen_bhp + 1) < endp) {
				frozen_bhp->header.mtx_buf = MUTEX_INVALID;
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    (BH *)frozen_bhp, hq);
				frozen_bhp++;
			}
			MPOOL_REGION_UNLOCK(env, infop);

			alloc_freeze = 0;
			MUTEX_READLOCK(env, hp->mtx_hash);
			h_locked = 1;
			goto retry_search;
		}

		/*
		 * If the buffer is the size we're looking for, we can simply
		 * reuse it. Otherwise, free it and keep looking.
		 */
		if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
			if ((ret = __memp_bhfree(dbmp,
			     infop, bh_mfp, hp, bhp, 0)) != 0)
				goto err;
			p = bhp;
			goto found;
		}

		freed_space += sizeof(*bhp) + bh_mfp->pagesize;
		if ((ret = __memp_bhfree(dbmp,
		    infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
			goto err;

		/* Reset "aggressive" and "write_error" if we free any space. */
		if (aggressive > 1)
			aggressive = 1;
		write_error = 0;

		/*
		 * Unlock this buffer and re-acquire the region lock. If
		 * we're reaching here as a result of calling memp_bhfree, the
		 * buffer lock has already been discarded.
		 */
		if (0) {
next_hb:		if (bhp != NULL) {
				DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
				atomic_dec(env, &bhp->ref);
				if (b_lock) {
					F_CLR(bhp, BH_EXCLUSIVE);
					MUTEX_UNLOCK(env, bhp->mtx_buf);
					b_lock = 0;
				}
			}
			if (h_locked)
				MUTEX_UNLOCK(env, hp->mtx_hash);
			h_locked = 0;
		}
		obsolete = 0;
		MPOOL_REGION_LOCK(env, infop);

		/*
		 * Retry the allocation as soon as we've freed up sufficient
		 * space.  We're likely to have to coalesce of memory to
		 * satisfy the request, don't try until it's likely (possible?)
		 * we'll succeed.
		 */
		if (freed_space >= 3 * len)
			goto alloc;
	}
err:
	if (h_locked) {
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
	}
done:
	if (snapshots != NULL)
		__os_free(env, snapshots);
	return (ret);
}

/*
 * __memp_free --
 *	Free some space from a cache region.
 *
 * PUBLIC: void __memp_free __P((REGINFO *, void *));
 */
void
__memp_free(infop, buf)
	REGINFO *infop;
	void *buf;
{
	__env_alloc_free(infop, buf);
}