Diffstat (limited to 'storage/bdb/mp/mp_alloc.c')
-rw-r--r--  storage/bdb/mp/mp_alloc.c  |  442
1 files changed, 442 insertions, 0 deletions
diff --git a/storage/bdb/mp/mp_alloc.c b/storage/bdb/mp/mp_alloc.c
new file mode 100644
index 00000000000..96dd612d7ba
--- /dev/null
+++ b/storage/bdb/mp/mp_alloc.c
@@ -0,0 +1,442 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2002
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+
+typedef struct {
+ DB_MPOOL_HASH *bucket;
+ u_int32_t priority;
+} HS;
+
+static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
+
+/*
+ * __memp_alloc --
+ * Allocate some space from a cache region.
+ *
+ * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
+ * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+ */
+int
+__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
+ DB_MPOOL *dbmp;
+ REGINFO *memreg;
+ MPOOLFILE *mfp;
+ size_t len;
+ roff_t *offsetp;
+ void *retp;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
+ DB_MUTEX *mutexp;
+ MPOOL *c_mp;
+ MPOOLFILE *bh_mfp;
+ size_t freed_space;
+ u_int32_t buckets, buffers, high_priority, max_na, priority;
+ int aggressive, ret;
+ void *p;
+
+ dbenv = dbmp->dbenv;
+ c_mp = memreg->primary;
+ dbht = R_ADDR(memreg, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+
+ buckets = buffers = 0;
+ aggressive = 0;
+
+ c_mp->stat.st_alloc++;
+
+ /*
+	 * Get aggressive if we've tried to flush as many pages as there are
+	 * in the system without finding space.
+ */
+ max_na = 5 * c_mp->htab_buckets;
+
+ /*
+ * If we're allocating a buffer, and the one we're discarding is the
+ * same size, we don't want to waste the time to re-integrate it into
+ * the shared memory free list. If the DB_MPOOLFILE argument isn't
+ * NULL, we'll compare the underlying page sizes of the two buffers
+	 * before freeing and re-allocating buffers.
+ */
+ if (mfp != NULL)
+ len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
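+	/*
+	 * Assuming the usual BH layout in dbinc/mp.h (the struct ends in a
+	 * one-byte buf[] placeholder for the page image), the
+	 * sizeof(u_int8_t) subtraction avoids counting that placeholder
+	 * byte twice when the page size is added.
+	 */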
+
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * On every buffer allocation we update the buffer generation number
+ * and check for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, memreg, c_mp);
+
+ /*
+ * Anything newer than 1/10th of the buffer pool is ignored during
+ * allocation (unless allocation starts failing).
+ */
+ DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
+ high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
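+	/*
+	 * For example, with purely illustrative numbers: if lru_count is
+	 * 100000 and st_pages is 5000, high_priority is 99500, so only
+	 * buffers whose priority was set within the last 500 allocations
+	 * are treated as too new to consider for eviction.
+	 */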
+
+ /*
+ * First we try to allocate from free memory. If that fails, scan the
+ * buffer pool to find buffers with low priorities. We consider small
+ * sets of hash buckets each time to limit the amount of work needing
+ * to be done. This approximates LRU, but not very well. We either
+ * find a buffer of the same size to use, or we will free 3 times what
+ * we need in the hopes it will coalesce into a contiguous chunk of the
+ * right size. In the latter case we branch back here and try again.
+ */
+alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
+ if (mfp != NULL)
+ c_mp->stat.st_pages++;
+ R_UNLOCK(dbenv, memreg);
+
+found: if (offsetp != NULL)
+ *offsetp = R_OFFSET(memreg, p);
+ *(void **)retp = p;
+
+ /*
+ * Update the search statistics.
+ *
+		 * We're not holding the region locked here, so these statistics
+ * can't be trusted.
+ */
+ if (buckets != 0) {
+ if (buckets > c_mp->stat.st_alloc_max_buckets)
+ c_mp->stat.st_alloc_max_buckets = buckets;
+ c_mp->stat.st_alloc_buckets += buckets;
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+ c_mp->stat.st_alloc_max_pages = buffers;
+ c_mp->stat.st_alloc_pages += buffers;
+ }
+ return (0);
+ }
+
+ /*
+ * We re-attempt the allocation every time we've freed 3 times what
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
+
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+ for (hp_tmp = NULL;;) {
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+
+ /*
+ * If we've gone through all of the hash buckets, try
+ * an allocation. If the cache is small, the old page
+ * size is small, and the new page size is large, we
+ * might have freed enough memory (but not 3 times the
+ * memory).
+ */
+ goto alloc;
+ }
+
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system. We don't
+ * have a metric for deciding if allocation has no possible way
+		 * to succeed, so we never fail; we assume memory will become
+		 * available if we wait long enough.
+ *
+		 * Get aggressive if we've tried to flush 5 times as many hash
+		 * buckets as there are in the system -- it's possible we have
+ * been repeatedly trying to flush the same buffers, although
+ * it's unlikely. Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: sync the mpool to force out queue extent pages. While we
+ * might not have enough space for what we want and flushing
+ * is expensive, why not?
+ * c: sleep for a second -- hopefully someone else will run and
+ * free up some memory. Try to allocate memory too, in case
+ * the other thread returns its memory to the region.
+ * d: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- that shouldn't be possible.
+ */
+ if ((++buckets % max_na) == 0) {
+ aggressive = 1;
+
+ R_UNLOCK(dbenv, memreg);
+
+ (void)__memp_sync_int(
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+ (void)__os_sleep(dbenv, 1, 0);
+
+ R_LOCK(dbenv, memreg);
+ goto alloc;
+ }
+
+ if (!aggressive) {
+ /* Skip high priority buckets. */
+ if (hp->hash_priority > high_priority)
+ continue;
+
+ /*
+ * Find two buckets and select the one with the lowest
+ * priority. Performance testing shows that looking
+ * at two improves the LRUness and looking at more only
+ * does a little better.
+ */
+ if (hp_tmp == NULL) {
+ hp_tmp = hp;
+ continue;
+ }
+ if (hp->hash_priority > hp_tmp->hash_priority)
+ hp = hp_tmp;
+ hp_tmp = NULL;
+ }
+
+ /* Remember the priority of the buffer we're looking for. */
+ priority = hp->hash_priority;
+
+ /* Unlock the region and lock the hash bucket. */
+ R_UNLOCK(dbenv, memreg);
+ mutexp = &hp->hash_mutex;
+ MUTEX_LOCK(dbenv, mutexp);
+
+#ifdef DIAGNOSTIC
+ __memp_check_order(hp);
+#endif
+ /*
+ * The lowest priority page is first in the bucket, as they are
+ * maintained in sorted order.
+ *
+ * The buffer may have been freed or its priority changed while
+ * we switched from the region lock to the hash lock. If so,
+ * we have to restart. We will still take the first buffer on
+ * the bucket's list, though, if it has a low enough priority.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
+ bhp->ref != 0 || bhp->priority > priority)
+ goto next_hb;
+
+ buffers++;
+
+ /* Find the associated MPOOLFILE. */
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /* If the page is dirty, pin it and write it. */
+ ret = 0;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ ++bhp->ref;
+ ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
+ --bhp->ref;
+ if (ret == 0)
+ ++c_mp->stat.st_rw_evict;
+ } else
+ ++c_mp->stat.st_ro_evict;
+
+ /*
+ * If a write fails for any reason, we can't proceed.
+ *
+ * We released the hash bucket lock while doing I/O, so another
+ * thread may have acquired this buffer and incremented the ref
+ * count after we wrote it, in which case we can't have it.
+ *
+ * If there's a write error, avoid selecting this buffer again
+ * by making it the bucket's least-desirable buffer.
+ */
+ if (ret != 0 || bhp->ref != 0) {
+ if (ret != 0 && aggressive)
+ __memp_bad_buffer(hp);
+ goto next_hb;
+ }
+
+ /*
+ * Check to see if the buffer is the size we're looking for.
+		 * If so, we can simply reuse it.  Otherwise, free the buffer
+		 * and its space and keep looking.
+ */
+ if (mfp != NULL &&
+ mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
+ __memp_bhfree(dbmp, hp, bhp, 0);
+
+ p = bhp;
+ goto found;
+ }
+
+ freed_space += __db_shsizeof(bhp);
+ __memp_bhfree(dbmp, hp, bhp, 1);
+
+ /*
+ * Unlock this hash bucket and re-acquire the region lock. If
+ * we're reaching here as a result of calling memp_bhfree, the
+ * hash bucket lock has already been discarded.
+ */
+ if (0) {
+next_hb: MUTEX_UNLOCK(dbenv, mutexp);
+ }
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * Retry the allocation as soon as we've freed up sufficient
+		 * space.  We're likely to have to coalesce chunks of memory to
+		 * satisfy the request, so don't try until it's likely (possible?)
+ * we'll succeed.
+ */
+ if (freed_space >= 3 * len)
+ goto alloc;
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __memp_bad_buffer --
+ * Make the first buffer in a hash bucket the least desirable buffer.
+ */
+static void
+__memp_bad_buffer(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp, *t_bhp;
+ u_int32_t priority;
+
+ /* Remove the first buffer from the bucket. */
+ bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+
+ /*
+ * Find the highest priority buffer in the bucket. Buffers are
+ * sorted by priority, so it's the last one in the bucket.
+ *
+ * XXX
+ * Should use SH_TAILQ_LAST, but I think that macro is broken.
+ */
+ priority = bhp->priority;
+ for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh))
+ priority = t_bhp->priority;
+
+ /*
+ * Set our buffer's priority to be just as bad, and append it to
+ * the bucket.
+ */
+ bhp->priority = priority;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+
+ /* Reset the hash bucket's priority. */
+ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+}
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+static void
+__memp_reset_lru(dbenv, memreg, c_mp)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+ MPOOL *c_mp;
+{
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ int bucket;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
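+	/*
+	 * Decrementing the counter here and (below) each buffer's priority
+	 * by the same fixed amount keeps recently used buffers in the same
+	 * relative LRU order; buffers already at or below
+	 * MPOOL_BASE_DECREMENT are left alone, so they remain early
+	 * eviction candidates.
+	 */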
+
+ /* Release the region lock. */
+ R_UNLOCK(dbenv, memreg);
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
+
+ /* Reacquire the region lock. */
+ R_LOCK(dbenv, memreg);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __memp_check_order --
+ * Verify the priority ordering of a hash bucket chain.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
+ * PUBLIC: #endif
+ */
+void
+__memp_check_order(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp;
+ u_int32_t priority;
+
+ /*
+ * Assumes the hash bucket is locked.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
+ return;
+
+ DB_ASSERT(bhp->priority == hp->hash_priority);
+
+ for (priority = bhp->priority;
+ (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
+ priority = bhp->priority)
+ DB_ASSERT(priority <= bhp->priority);
+}
+#endif